Looking at COVID-19 Data

2 minute read

Lets work with building a API that takes covid data in csv form from a website (John Hopkins Data from Github) and into a database. Then lets plot some of the data using python in jupyter notebook.

import requests
import pandas as pd
import numpy as np
import os
import sqlite3
from tqdm.auto import tqdm # Progress Bars

PATH = 'csse_covid_19_data/csse_covid_19_daily_reports'
URL = f'https://api.github.com/repos/{OWNER}/{REPO}/contents/{PATH}'
print(f'Downloading paths from {URL}')

download_urls = []
response = requests.get(URL)
for data in tqdm(response.json()):
    if data['name'].endswith('.csv'):

# List of labels to be renamed!
relabel = {
    # 'Last Update': 'Last_Update',
    'Country/Region': 'Country_Region',             
    'Lat': 'Latitude',                            # Latitude
    'Long_': 'Longitude',                           # longitude
    'Province/State': 'Province_State',

Code to Update Column Names

def factor_dataframe(dat, filename):
    """ Refactor the dataframe to be uploaded into a SQL database
    as a pandas DataFrame
    # rename labels
    for label in dat:
        if label in relabel:
            dat = dat.rename(columns = {label: relabel[label]})
    # return a dataframe with these parameters
    labels = ['Province_State', 'Country_Region', 'Last_Update', 'Confirmed', 'Deaths', 'Recovered', 'Latitude', 'Longitude']
    # filename is datetime
    if 'Last_Update' not in dat:
        dat['Last_Update'] = pd.to_datetime(filename)

    # replace columns not in dataframe with nan
    for label in labels:
        if label not in dat:
            dat[label] = np.nan

    return dat[labels]

def upload_to_sql(filenames, db_name, debug=False):
    """ Given a list of paths, upload to a database
    conn = sqlite3.connect(f"{db_name}.db")
    if debug:
        print("Uploading into database")
    for i, file_path in tqdm(list(enumerate(filenames))):
        dat = pd.read_csv(file_path)

        # rename labels
        filename = os.path.basename(file_path).split('.')[0] 
        dat = factor_dataframe(dat, filename)

        # write records to sql database
        if i == 0: # if first entry, and table name already exist, replace
            dat.to_sql(db_name, con=conn, index = False, if_exists='replace')
        else: # otherwise append to current table given db_name
            dat.to_sql(db_name, con=conn, index = False, if_exists='append')

# upload into sql database
upload_to_sql(download_urls, 'covid_19', debug=True)

The Code above will create a database for use to use.

Looking at some Covid Data, lets connect to the Covid-19 database that we worked with earlier

import sqlite3, csv, pandas
import matplotlib.pyplot as plt

## contecting to a database using sqlite3
db = sqlite3.connect('covid_19.db')

Lets create a helper function that we can write our queries.

def Q(sql):
    res = pandas.read_sql_query(sql, db, chunksize = 100_000)
    return next(res)

Lets write some SQL queries to.

view = Q("SELECT * FROM [Michigan Covid]")


  Province_State Latitude Longitude
0 Michigan 44.684686 -83.595079
1 Michigan 46.412929 -86.602601
2 Michigan 42.591470 -85.891029
3 Michigan 45.034777 -83.622124
4 Michigan 44.996902 -85.155031
56528 Michigan 43.466068 -83.418970
56529 Michigan 42.251902 -86.019391
56530 Michigan 42.253105 -83.838532
56531 Michigan 42.280984 -83.281255
56532 Michigan 44.338536 -85.577127

Lets Plot!

import seaborn
import matplotlib.pyplot as plot

from matplotlib.pyplot import figure

figure(figsize=(20, 15), dpi=80)
plot.scatter(new.Longitude, new.Latitude)
plot.title('Long vs Lat')



Lets Graph Texas!

Texas = Q('''SELECT * 
             FROM [Texas Covid]
figure(figsize=(15, 12), dpi=80)
plot.scatter(Texas.Longitude, Texas.Latitude)
plot.title('Long vs Lat in TEXAS')



Pretty cool how using the Lat and Long to plot we can see the shape of two states!