Did you hear the news?
import datetime as dt

import numpy as np
import pandas as pd
import plotly.graph_objects as go
def news_table(dates: np.ndarray,
               titles: np.ndarray,
               summaries: np.ndarray,
               urls: np.ndarray,
               autosize=True,
               width=500,
               height=550):
    """Show the given news items as a header-less, two-column Plotly table.

    The first column holds the publication date formatted as ``'%d-%b'``; the
    second holds the bold, quoted title followed by the summary, ``'... '`` and
    a "Link" anchor pointing at the article URL.

    Parameters
    ----------
    dates : np.ndarray
        One entry per article. Each element must support ``.item()`` returning
        a number that, divided by 10**9, is a POSIX timestamp in seconds
        (assumes ``datetime64[ns]`` values -- TODO confirm against the caller).
    titles, summaries, urls : np.ndarray
        Article titles, summaries and links, in the same order as ``dates``.
    autosize, width, height
        Passed straight through to ``fig.update_layout``.

    Returns
    -------
    Whatever ``fig.show`` returns; the figure is displayed as a side effect.
    """
    df = pd.DataFrame([])
    # datetime.fromtimestamp converts to the machine's local timezone.
    df['Date'] = [
        dt.datetime.fromtimestamp(date.item() / 10**9).strftime('%d-%b')
        for date in dates
    ]
    df['URL'] = [f'<a href="{url}">Link</a>' for url in urls]
    # The title is made bold here, once; it is not wrapped again below.
    df['Title'] = [f'<b>"{title}"</b> ' for title in titles]
    df['Text'] = df['Title'] + summaries + '... ' + df['URL']

    values = [df['Date'], df['Text']]
    text_colors = ['#72bf44', '#414141', '#313c59']

    table = go.Table(
        columnwidth=[1.4, 8],
        # Empty header cells with zero height: the table is shown without a header row.
        header=dict(
            values=[[''], ['']],
            fill_color='rgba(0,0,0,0)',
            font=dict(color='#414141', size=12, family='Verdana'),
            height=0,
        ),
        cells=dict(
            values=values,
            fill_color='rgba(0,0,0,0)',
            align='left',
            font=dict(color=text_colors, size=[10, 12], family='Verdana'),
            height=40,
        ),
    )
    fig = go.Figure(data=[table])
    fig.update_layout(
        title=dict(text='<b>Latest Aviation news</b>',
                   font=dict(color='#414141', size=18, family='Verdana')),
        title_x=0.5,
        autosize=autosize,
        width=width,
        height=height,
    )
    return fig.show(config={'displayModeBar': False})
# One row per market code, one column per metadata field.
# NOTE(review): `markets_dict` is assigned further down in this file, so that
# assignment has to be executed before this line.
# `from_dict` is a classmethod, so no throwaway DataFrame instance is needed.
market_keywords_df = pd.DataFrame.from_dict(markets_dict, orient='index')
market_keywords_df
How do you create the queries for the news API?
First follow the documentation over at Bing: create an account, activate billing and set up an API key. Then you need to build the query for the API request, which consists of:
- Market codes
- Keywords (one string, keywords separated by spaces)
- freshness
- count
import os

import requests

# Example query: Dutch aviation news from the past week.
mkt = 'nl-NL'
keywords = 'Luchtvaart vliegmaatschappij Vluchten Luchthaven Vliegtuig'
search_url = "https://api.cognitive.microsoft.com/bing/v7.0/news/search"
freshness = 'Week'
# The later cells read BING_NEWS_API_KEY; this cell originally read
# BINGS_NEWS_API_KEY. Accept either so both setups keep working; a KeyError is
# still raised when neither variable is set.
api_key = os.environ.get('BING_NEWS_API_KEY') or os.environ['BINGS_NEWS_API_KEY']
headers = {"Ocp-Apim-Subscription-Key": api_key}
params = {
    "q": keywords,
    'freshness': freshness,
    'count': 10,
    'mkt': mkt,
    # "textDecorations": True, "textFormat": "RAW"
}
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(search_url, headers=headers, params=params, timeout=30)
# Fail loudly on 4xx/5xx instead of displaying an error payload as if it were news.
response.raise_for_status()
response.json()
Our list of keywords to search for:
# English seed terms; these are passed to generate_market_keywords for translation.
keywords = 'Aviation Airline Flights Airport Planes'.split()
You will also need to decide in which markets you want to fetch the news. Since we want a truly international perspective on aviation news, we query every market for which the API offers news. So we need a list of markets to search in.
# Markets to query, written as <language alpha-2>-<COUNTRY ALPHA-2> and grouped
# here by language.
market_codes = [
    'da-DK',
    'de-AT', 'de-CH', 'de-DE',
    'en-AU', 'en-CA', 'en-GB', 'en-ID', 'en-IE', 'en-IN',
    'en-MY', 'en-NZ', 'en-PH', 'en-SG', 'en-US', 'en-ZA',
    'es-AR', 'es-CL', 'es-ES', 'es-MX',
    'fi-FI',
    'fr-FR',
    'it-IT',
    'ja-JP',
    'ko-KR',
    'nl-BE', 'nl-NL',
    'pl-PL',
    'pt-BR',
    'ru-RU',
    'sv-SE',
    'tr-TR',
    'zh-CN', 'zh-HK', 'zh-TW',
]
You can copy and paste the table with all the markets, but remember that pandas has a very neat table-scraping function: a little pd.read_html, followed by some filtering, gets you the same list.
import pandas as pd
# Scrape every HTML table on the Bing News API reference page and keep the one
# at position 23.
# NOTE(review): presumably that is the market-codes table -- the position
# depends on the page layout, so confirm it still holds.
_market_codes_page = (
    'https://docs.microsoft.com/en-us/rest/api/cognitiveservices-bingsearch/'
    'bing-news-api-v7-reference#market-codes'
)
_scraped_tables = pd.read_html(_market_codes_page)
markets_df = _scraped_tables[23]
markets_df
market_keywords_df.index.tolist()
import pycountry
from google_trans_new import google_translator
def generate_market_keywords(keywords: list, market_codes: list) -> dict:
    """Translate the English keywords once per language and attach them to each market.

    Parameters
    ----------
    keywords : list
        English keywords to translate into each target market's language.
    market_codes : list
        Market codes formatted as ``<language alpha-2>-<COUNTRY ALPHA-2>``,
        e.g. ``'da-DK'``.

    Returns
    -------
    dict
        One entry per market code. Each value holds the fields returned by
        ``get_country_language_iso_alpha`` for that market plus a ``'keywords'``
        string: the translated keywords joined by single spaces. For example::

            {
                'da-DK': {
                    'country': 'Denmark',
                    'language': 'Danish',
                    'language_iso_alpha_2': 'da',
                    'iso_alpha': 'DNK',
                    'keywords': 'Luftfart Flyselskab Fly Lufthavn Planer',
                },
            }
    """
    # Resolve every market exactly once; the result is reused for both the
    # language set and the final dictionary.
    market_info = {
        market_code: get_country_language_iso_alpha(market_code)
        for market_code in market_codes
    }

    # One translator instance serves every request.
    translator = google_translator()

    # Translate per language, not per market: several markets share a language.
    keywords_translated = {}
    for info in market_info.values():
        language = info['language_iso_alpha_2']
        if language in keywords_translated:
            continue
        keywords_translated[language] = " ".join(
            translator.translate(keyword, lang_tgt=language, lang_src='en')
            for keyword in keywords
        )

    return {
        market_code: {
            **info,
            'keywords': keywords_translated[info['language_iso_alpha_2']],
        }
        for market_code, info in market_info.items()
    }
# Build the keyword table for every market code listed above; this invokes the
# translator for each keyword in each target language.
generate_market_keywords(keywords,market_codes)
# Per-market metadata keyed by market code. Each entry holds the country/region
# name, the language name, a three-letter country code ('iso_alpha') and a
# space-separated string of search keywords in that market's language.
# NOTE(review): presumably a saved snapshot of translated keywords -- confirm
# how it was produced before regenerating it.
markets_dict = {'da-DK': {'Country/Region': 'Denmark',
'Language': 'Danish',
'iso_alpha': 'DNK',
'keywords': 'Luftfart Flyselskab Fly Lufthavn Plane'},
'de-AT': {'Country/Region': 'Austria',
'Language': 'German',
'iso_alpha': 'AUT',
'keywords': 'Luftfahrt Fluggesellschaft Flüge Flughafen Flugzeug'},
'de-CH': {'Country/Region': 'Switzerland',
'Language': 'German',
'iso_alpha': 'CHE',
'keywords': 'Luftfahrt Fluggesellschaft Flüge Flughafen Flugzeug'},
'de-DE': {'Country/Region': 'Germany',
'Language': 'German',
'iso_alpha': 'DEU',
'keywords': 'Luftfahrt Fluggesellschaft Flüge Flughafen Flugzeug'},
'en-AU': {'Country/Region': 'Australia',
'Language': 'English',
'iso_alpha': 'AUS',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-CA': {'Country/Region': 'Canada',
'Language': 'English',
'iso_alpha': 'CAN',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-GB': {'Country/Region': 'United Kingdom',
'Language': 'English',
'iso_alpha': 'GBR',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-ID': {'Country/Region': 'Indonesia',
'Language': 'English',
'iso_alpha': 'IDN',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-IE': {'Country/Region': 'Ireland',
'Language': 'English',
'iso_alpha': 'IRL',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-IN': {'Country/Region': 'India',
'Language': 'English',
'iso_alpha': 'IND',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-MY': {'Country/Region': 'Malaysia',
'Language': 'English',
'iso_alpha': 'MYS',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-NZ': {'Country/Region': 'New Zealand',
'Language': 'English',
'iso_alpha': 'NZL',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-PH': {'Country/Region': 'Republic of the Philippines',
'Language': 'English',
'iso_alpha': 'PHL',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-SG': {'Country/Region': 'Singapore',
'Language': 'English',
'iso_alpha': 'SGP',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-US': {'Country/Region': 'United States',
'Language': 'English',
'iso_alpha': 'USA',
'keywords': 'Aviation Airline Flights Airport Plane'},
'en-ZA': {'Country/Region': 'South Africa',
'Language': 'English',
'iso_alpha': 'ZAF',
'keywords': 'Aviation Airline Flights Airport Plane'},
'es-AR': {'Country/Region': 'Argentina',
'Language': 'Spanish',
'iso_alpha': 'ARG',
'keywords': 'Aviación Aerolínea Vuelos Aeropuerto Avión'},
'es-CL': {'Country/Region': 'Chile',
'Language': 'Spanish',
'iso_alpha': 'CHL',
'keywords': 'Aviación Aerolínea Vuelos Aeropuerto Avión'},
'es-ES': {'Country/Region': 'Spain',
'Language': 'Spanish',
'iso_alpha': 'ESP',
'keywords': 'Aviación Aerolínea Vuelos Aeropuerto Avión'},
'es-MX': {'Country/Region': 'Mexico',
'Language': 'Spanish',
'iso_alpha': 'MEX',
'keywords': 'Aviación Aerolínea Vuelos Aeropuerto Avión'},
'fi-FI': {'Country/Region': 'Finland',
'Language': 'Finnish',
'iso_alpha': 'FIN',
'keywords': 'ilmailu lentoyhtiö lennot Lentokenttä kone'},
'fr-FR': {'Country/Region': 'France',
'Language': 'French',
'iso_alpha': 'FRA',
'keywords': 'Aviation Compagnie aérienne Vols Aéroport Avion'},
'it-IT': {'Country/Region': 'Italy',
'Language': 'Italian',
'iso_alpha': 'ITA',
'keywords': 'aviazione linea aerea voli Aeroporto Aereo'},
'ja-JP': {'Country/Region': 'Japan',
'Language': 'Japanese',
'iso_alpha': 'JPN',
'keywords': '航空 エアライン フライト 空港 飛行機'},
'ko-KR': {'Country/Region': 'Korea',
'Language': 'Korean',
'iso_alpha': 'KOR',
'keywords': '비행 공기 호스 항공편 공항 비행기'},
'nl-BE': {'Country/Region': 'Belgium',
'Language': 'Dutch',
'iso_alpha': 'BEL',
'keywords': 'Luchtvaart vliegmaatschappij Vluchten Luchthaven Vliegtuig'},
'nl-NL': {'Country/Region': 'Netherlands',
'Language': 'Dutch',
'iso_alpha': 'NLD',
'keywords': 'Luchtvaart vliegmaatschappij Vluchten Luchthaven Vliegtuig'},
'pl-PL': {'Country/Region': 'Poland',
'Language': 'Polish',
'iso_alpha': 'POL',
'keywords': 'Lotnictwo Linia lotnicza Loty Lotnisko Samolot'},
'pt-BR': {'Country/Region': 'Brazil',
'Language': 'Portuguese',
'iso_alpha': 'BRA',
'keywords': 'Aviação CIA aérea Voos Aeroporto Avião'},
'ru-RU': {'Country/Region': 'Russia',
'Language': 'Russian',
'iso_alpha': 'RUS',
'keywords': 'авиации авиакомпания рейсы аэропорт Плане'},
'sv-SE': {'Country/Region': 'Sweden',
'Language': 'Swedish',
'iso_alpha': 'SWE',
'keywords': 'Flyg Flygbolag Flyg Flygplats Plan'},
'tr-TR': {'Country/Region': 'Turkey',
'Language': 'Turkish',
'iso_alpha': 'TUR',
'keywords': 'Havacılık Havayolu Uçak Havalimanı uçak'},
'zh-CN': {'Country/Region': "People's republic of China",
'Language': 'Chinese (simplified)',
'iso_alpha': 'CHN',
'keywords': '航空 航空公司 机票 飞机场 平面'},
'zh-HK': {'Country/Region': 'Hong Kong SAR',
'Language': 'Chinese (traditional)',
'iso_alpha': 'HKG',
'keywords': '航空 航空公司 機票 飛機場 平面'},
'zh-TW': {'Country/Region': 'Taiwan',
'Language': 'Chinese (traditional)',
'iso_alpha': 'TWN',
'keywords': '航空 航空公司 機票 飛機場 平面'}}
import pycountry
def get_country_language_iso_alpha(market_code: str) -> dict:
    """Look up country and language details for a market code via pycountry.

    Parameters
    ----------
    market_code : str
        ``<language alpha-2>-<COUNTRY ALPHA-2>``, e.g. ``'da-DK'``.

    Returns
    -------
    dict
        ``'country'`` (country name), ``'language'`` (language name),
        ``'language_iso_alpha_2'`` (the language part of ``market_code``) and
        ``'iso_alpha'`` (the country's alpha-3 code).

    Raises
    ------
    ValueError
        If ``market_code`` does not consist of exactly two ``-``-separated
        parts, or if pycountry returns no match for the country or language.
    """
    try:
        language_iso_alpha_2, country_alpha_2 = market_code.split('-')
    except ValueError:
        raise ValueError(
            f"market code must look like 'da-DK', got {market_code!r}"
        ) from None

    # Fetch each record once and reuse it for every field derived from it.
    country = pycountry.countries.get(alpha_2=country_alpha_2)
    language = pycountry.languages.get(alpha_2=language_iso_alpha_2)
    if country is None:
        raise ValueError(
            f"unknown country code {country_alpha_2!r} in market code {market_code!r}"
        )
    if language is None:
        raise ValueError(
            f"unknown language code {language_iso_alpha_2!r} in market code {market_code!r}"
        )

    return {
        'country': country.name,
        'language': language.name,
        'language_iso_alpha_2': language_iso_alpha_2,
        'iso_alpha': country.alpha_3,
    }
!pip install pycountry google_trans_new
market_keywords_df
market_keywords_dict = market_keywords_df['keywords'].to_dict()
market_keywords_dict
pip install backoff aiohttp asyncio
market_keywords_dict
markets_df['Country/Region'].unique()
markets_df['Market Code'].apply(lambda s:s.split('-')[1]).unique()
!pip install unsync
from unsync import unsync
from google_trans_new import google_translator
@unsync
def async_series_translator(sentence, input_language_iso_alpha_2):
    """Translate one sentence into English, returning an unsync future.

    The translator call is blocking, so this is declared as a plain ``def``:
    unsync runs regular functions on its thread pool, which lets several
    translations be in flight at once. Declared as ``async def`` the blocking
    call would run on unsync's event-loop thread and hold up everything else
    scheduled there.

    Parameters
    ----------
    sentence
        Text to translate.
    input_language_iso_alpha_2
        Source language, passed to the translator as ``lang_src``.

    Returns
    -------
    The future created by ``@unsync``; call ``.result()`` (or await it) to
    obtain the translated text.
    """
    # A fresh translator per call, as before, so no instance is shared between threads.
    return google_translator().translate(
        sentence,
        lang_tgt='en',
        lang_src=input_language_iso_alpha_2,
    )
%%timeit
# def translate_news_df_to_english(news_df:pd.DataFrame)->pd.DataFrame:
input_language_iso_alpha_2 = news_df['market'].unique()[0]
news_df[['name','description']].applymap(lambda sentence: (async_series_translator(sentence,input_language_iso_alpha_2)).result())
# return news_df
# translate_news_df_to_english(news_df)
input_language_iso_alpha_2 = news_df['market'].unique()[0]
# Use the language found in the data instead of a hard-coded 'nl'. Taking the
# part before any '-' works whether the column holds 'nl' or 'nl-NL'.
# NOTE(review): confirm which of the two forms news_df['market'] contains.
source_language = input_language_iso_alpha_2.split('-')[0]
# One translator instance for the whole frame.
translator = google_translator()
news_df[['name','description']] = news_df[['name','description']].applymap(
    lambda sentence: translator.translate(
        sentence,
        lang_tgt='en',
        lang_src=source_language,
    )
)
def create_news_df(response):
    """Placeholder for a function that was left unfinished.

    The original line had no colon and no body, which is a syntax error. The
    signature is kept and the function fails explicitly until it is written.
    NOTE(review): presumably meant to build a news DataFrame from an API
    response -- confirm the intended behaviour before implementing.
    """
    raise NotImplementedError("create_news_df has not been implemented yet")
import backoff
import logging
import aiohttp
import asyncio
import os

# Example market and keyword string used for the single request below.
mkt = 'nl-NL'
keywords = 'Luchtvaart vliegmaatschappij Vluchten Luchthaven Vliegtuig'
# Send backoff's log records to a stream handler so retries are visible.
logging.getLogger('backoff').addHandler(logging.StreamHandler())
search_url = "https://api.cognitive.microsoft.com/bing/v7.0/news/search"
freshness = 'Week'
category = 'Business'
# BING_NEWS_API_KEY is the name used here; the earlier synchronous cell read
# BINGS_NEWS_API_KEY, so that spelling is accepted as a fallback.
headers = {
    "Ocp-Apim-Subscription-Key": os.getenv('BING_NEWS_API_KEY') or os.getenv('BINGS_NEWS_API_KEY')
}
@unsync
@backoff.on_exception(backoff.expo, aiohttp.ClientError, max_time=60)
async def get_bing_news(market_code: str, keywords: str):
    """Fetch news for one market and return it as a DataFrame.

    Reads the module-level ``search_url``, ``headers`` and ``freshness``.

    Parameters
    ----------
    market_code : str
        Sent as the ``mkt`` query parameter, e.g. ``'nl-NL'``.
    keywords : str
        Sent as the ``q`` query parameter.

    Returns
    -------
    Because of ``@unsync``, calling this returns a future; its result is a
    DataFrame with the columns ``name``, ``url`` and ``datePublished`` (empty
    when the response's ``'value'`` list is empty).

    Raises
    ------
    aiohttp.ClientError
        On connection problems or a 4xx/5xx response, once the backoff
        decorator stops retrying.
    KeyError
        If the response body has no ``'value'`` entry.
    """
    params = {
        "q": keywords,
        'freshness': freshness,
        'count': 10,
        # Use the argument, not the notebook-level `mkt`, so every call
        # queries the market it was asked for.
        'mkt': market_code,
        # 'category':category,
        # "textDecorations": True, "textFormat": "RAW"
    }
    async with aiohttp.ClientSession() as session:
        async with session.get(search_url, headers=headers, params=params) as response:
            # Turn HTTP error statuses into ClientResponseError (a ClientError)
            # so the backoff decorator can retry them.
            response.raise_for_status()
            # response.json() is a coroutine and has to be awaited.
            payload = await response.json()
    return pd.DataFrame(payload['value'], columns=['name', 'url', 'datePublished'])
# Run one request for the example market and keywords and take the result of
# the object returned by the decorated function.
get_bing_news(mkt,keywords).result()
# def bing_aviation_news(market_keywords:dict):
# search_url = "https://api.cognitive.microsoft.com/bing/v7.0/news/search"
# freshness = 'Week'
# category = 'Business'
# headers = {"Ocp-Apim-Subscription-Key": os.getenv('BING_NEWS_API_KEY')}
# params = {
# "q": keywords,
# 'freshness': freshness,
# 'count': 10,
# 'mkt': mkt,
# # 'category':category,
# # "textDecorations": True, "textFormat": "RAW"
# }
# translator = google_translator()
# def bing_news_country(keywords, mkt,iso_alpha):
# try:
# resp = req.get(search_url, headers=headers, params=params)
# resp.raise_for_status()
# df = pd.DataFrame.from_dict(resp.json()['value'])[['name', 'url', 'datePublished']]
# if mkt.split('-')[0] != 'en':
# try:
# df['name'] = [translator.translate(title, lang_tgt='en') for title in df['name']]
# except Exception as e:
# print(e)
# except:
# df = pd.DataFrame([['no_news_found', None, None]], columns=['name','url','datePublished'])
# df['date'] = today.date().strftime(dt_format)
# df['iso_alpha'] = iso_alpha
# return df
import aiohttp
import asyncio
import time
# perf_counter is a monotonic clock intended for measuring elapsed time; the
# wall clock can jump if the system time is adjusted.
start_time = time.perf_counter()


async def get_pokemon(session, url):
    """Request `url` with the shared session and return the JSON ``'name'`` field."""
    async with session.get(url) as resp:
        # Surface HTTP errors directly instead of failing later while parsing.
        resp.raise_for_status()
        pokemon = await resp.json()
        return pokemon['name']


async def main():
    """Fetch Pokémon 1 through 150 concurrently and print each name in order."""
    async with aiohttp.ClientSession() as session:
        urls = [
            f'https://pokeapi.co/api/v2/pokemon/{number}'
            for number in range(1, 151)
        ]
        # gather schedules the coroutines itself and returns results in input order.
        original_pokemon = await asyncio.gather(
            *(get_pokemon(session, url) for url in urls)
        )
        for pokemon in original_pokemon:
            print(pokemon)


# NOTE(review): asyncio.run cannot be called while an event loop is already
# running; if this is executed inside a notebook kernel, `await main()` may be
# needed instead -- confirm for the target environment.
asyncio.run(main())
print("--- %s seconds ---" % (time.perf_counter() - start_time))
Why is this so much faster?
In the first version of this code, there was a single event loop which first did 35 queries one after the other, so the next request would only be sent once the previous one had finished. After that, all the entries were translated, which is 10 × 35 × 2 = 700 calls one after the other. Shuffling the order around would not have made this any faster. When using async, all 35 requests were dispatched and serviced simultaneously, and for each request, as soon as the server responded, the 10 × 2 entries were immediately translated simultaneously as well. By including the backoff package and connecting it to the async requests and their logging system, we made our code 10 times faster, more reliable and easier to maintain down the line, all by adding just 10 lines of code.