Looking at COVID-19 Data
Lets work with building a API that takes covid data in csv form from a website (John Hopkins Data from Github) and into a database. Then lets plot some of the data using python in jupyter notebook.
import requests
import pandas as pd
import numpy as np
import os
import sqlite3
from tqdm.auto import tqdm # Progress Bars
OWNER = 'CSSEGISandData'
REPO = 'COVID-19'
PATH = 'csse_covid_19_data/csse_covid_19_daily_reports'
URL = f'https://api.github.com/repos/{OWNER}/{REPO}/contents/{PATH}'
print(f'Downloading paths from {URL}')
download_urls = []
response = requests.get(URL)
for data in tqdm(response.json()):
if data['name'].endswith('.csv'):
download_urls.append(data['download_url'])
# List of labels to be renamed!
relabel = {
# 'Last Update': 'Last_Update',
'Country/Region': 'Country_Region',
'Lat': 'Latitude', # Latitude
'Long_': 'Longitude', # longitude
'Province/State': 'Province_State',
}
Code to Update Column Names
def factor_dataframe(dat, filename):
""" Refactor the dataframe to be uploaded into a SQL database
as a pandas DataFrame
"""
# rename labels
for label in dat:
if label in relabel:
dat = dat.rename(columns = {label: relabel[label]})
# return a dataframe with these parameters
labels = ['Province_State', 'Country_Region', 'Last_Update', 'Confirmed', 'Deaths', 'Recovered', 'Latitude', 'Longitude']
# filename is datetime
if 'Last_Update' not in dat:
dat['Last_Update'] = pd.to_datetime(filename)
# replace columns not in dataframe with nan
for label in labels:
if label not in dat:
dat[label] = np.nan
return dat[labels]
def upload_to_sql(filenames, db_name, debug=False):
""" Given a list of paths, upload to a database
"""
conn = sqlite3.connect(f"{db_name}.db")
if debug:
print("Uploading into database")
for i, file_path in tqdm(list(enumerate(filenames))):
dat = pd.read_csv(file_path)
# rename labels
filename = os.path.basename(file_path).split('.')[0]
dat = factor_dataframe(dat, filename)
# write records to sql database
if i == 0: # if first entry, and table name already exist, replace
dat.to_sql(db_name, con=conn, index = False, if_exists='replace')
else: # otherwise append to current table given db_name
dat.to_sql(db_name, con=conn, index = False, if_exists='append')
# upload into sql database
upload_to_sql(download_urls, 'covid_19', debug=True)
The Code above will create a database for use to use.
Looking at some Covid Data, lets connect to the Covid-19 database that we worked with earlier
import sqlite3, csv, pandas
import matplotlib.pyplot as plt
## contecting to a database using sqlite3
db = sqlite3.connect('covid_19.db')
Lets create a helper function that we can write our queries.
def Q(sql):
res = pandas.read_sql_query(sql, db, chunksize = 100_000)
return next(res)
Lets write some SQL queries to.
view = Q("SELECT * FROM [Michigan Covid]")
Output:
Province_State | Latitude | Longitude | |
---|---|---|---|
0 | Michigan | 44.684686 | -83.595079 |
1 | Michigan | 46.412929 | -86.602601 |
2 | Michigan | 42.591470 | -85.891029 |
3 | Michigan | 45.034777 | -83.622124 |
4 | Michigan | 44.996902 | -85.155031 |
… | … | … | … |
56528 | Michigan | 43.466068 | -83.418970 |
56529 | Michigan | 42.251902 | -86.019391 |
56530 | Michigan | 42.253105 | -83.838532 |
56531 | Michigan | 42.280984 | -83.281255 |
56532 | Michigan | 44.338536 | -85.577127 |
Lets Plot!
import seaborn
import matplotlib.pyplot as plot
from matplotlib.pyplot import figure
figure(figsize=(20, 15), dpi=80)
plot.scatter(new.Longitude, new.Latitude)
plot.title('Long vs Lat')
plot.ylabel("Lat")
plot.xlabel("Long")
plot.grid()
Output
Lets Graph Texas!
Texas = Q('''SELECT *
FROM [Texas Covid]
'''
)
figure(figsize=(15, 12), dpi=80)
plot.scatter(Texas.Longitude, Texas.Latitude)
plot.title('Long vs Lat in TEXAS')
plot.ylabel("Latitude")
plot.xlabel("Longitude")
plot.grid()
Output
Pretty cool how using the Lat and Long to plot we can see the shape of two states!