# Libraries necessary for Python operations:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn import linear_model
import statsmodels.formula.api as smf


# Retrieve data from opendata.maryland.gov
cumulative = pd.read_csv('https://opendata.maryland.gov/resource/raxi-hzpn.csv')
daily = pd.read_csv('https://opendata.maryland.gov/resource/36md-srvk.csv')
gender = pd.read_csv('https://opendata.maryland.gov/resource/w2wg-xw5p.csv')
race = pd.read_csv('https://opendata.maryland.gov/resource/xtrn-8azc.csv')
age = pd.read_csv('https://opendata.maryland.gov/resource/daz6-3c89.csv')

# Merge DataFrames containing cumulative total to date and daily total
pdf = cumulative.merge(daily, how='inner', on='date')
pdf.drop(['objectid_x', 'objectid_y'], axis=1, inplace=True)
pdf.rename(columns={'count_x': 'total', 'count_y': 'dateTotal'}, inplace=True)

# Merge DataFrame containing cumulative gender data
pdf = pdf.merge(gender, how='inner', on='date')
pdf.drop(['objectid'], axis=1, inplace=True)
pdf.rename(columns={'unknown': 'genderUnknown'}, inplace=True)

# Merge DataFrame containing race data
pdf = pdf.merge(race, how='inner', on='date')
pdf.drop(['objectid'], axis=1, inplace=True)
pdf.rename(columns={'not_available': 'raceUnknown'}, inplace=True)

# Merge DataFrame containing age data
pdf = pdf.merge(age, how='inner', on='date')
pdf.drop(['objectid'], axis=1, inplace=True)
pdf.rename(columns={'age_unknown': 'ageUnknown'})

# Convert columns from type float to type int
for col in pdf:
    if col != 'date': # Don't want to attempt to convert datetime column
        pdf[col] = pdf[col].fillna(0.0).astype(int)
        
# Remove time from datetime for readability and to remove unecessary data
pdf['date'] = pd.to_datetime(pdf['date']).dt.date
        
# Remove first two rows which mainly consist of missing data values
# as well as the last few rows containing Decmeber data since the majority of 
# December to present has been unreported due to cyberattacks
pdf.drop([0, 1, 597, 598, 599, 600], axis=0, inplace=True)


pdf.head(5)


# Retrieve data from opendata.maryland.gov
cumulative2 = pd.read_csv('https://opendata.maryland.gov/resource/w9rb-g7zs.csv')
daily2 = pd.read_csv('https://opendata.maryland.gov/resource/65qq-j35q.csv')
gender2 = pd.read_csv('https://opendata.maryland.gov/resource/6wn5-z595.csv')
race2 = pd.read_csv('https://opendata.maryland.gov/resource/qwhp-7983.csv')
age2 = pd.read_csv('https://opendata.maryland.gov/resource/ix2d-fenx.csv')

# Merge DataFrames containing cumulative total to date and daily total
cdf = cumulative2.merge(daily2, how='inner', on='date')
cdf.drop(['objectid_x', 'objectid_y'], axis=1, inplace=True)
cdf.rename(columns={'count_x': 'total', 'count_y': 'dateTotal'}, inplace=True)

# Merge DataFrame containing cumulative gender data
cdf = cdf.merge(gender2, how='inner', on='date')
cdf.drop(['objectid'], axis=1, inplace=True)
cdf.rename(columns={'unknown': 'genderUnknown'}, inplace=True)

# Merge DataFrame containing race data
cdf = cdf.merge(race2, how='inner', on='date')
cdf.drop(['objectid'], axis=1, inplace=True)
cdf.rename(columns={'not_available': 'raceUnknown'}, inplace=True)

# Merge DataFrame containing age data
cdf = cdf.merge(age2, how='inner', on='date')
cdf.drop(['objectid'], axis=1, inplace=True)
cdf.rename(columns={'age_unknown': 'ageUnknown'})

# Convert columns from type float to type int
for col in cdf:
    if col != 'date': # Don't want to attempt to convert datetime column
        cdf[col] = cdf[col].fillna(0.0).astype(int)

# Remove time from datetime for readability and to remove unecessary data
cdf['date'] = pd.to_datetime(cdf['date']).dt.date        

# Remove first two rows which mainly consist of missing data values
# as well as the last few rows containing Decmeber data since the majority of 
# December to present has been unreported due to cyberattacks
cdf.drop([602, 603, 604, 605], axis=0, inplace=True)


cdf.head(5)


# Merge dataframes so NAN values can be treated as 0 during summation
temp_df = pdf.merge(cdf, how='right', on='date', suffixes=('_pdf', '_cdf')).fillna(0)

jdict = {} # Temporarily holds summations before creating a DataFrame
jdict['date'] = temp_df['date']

# Iterate through column pairs, retrieving values from merged DataFrame
for i in range(1, len(pdf.columns)):
    jdict[pdf.columns[i]] = temp_df[pdf.columns[i]+'_pdf'] + temp_df[cdf.columns[i]+'_cdf']
    
jdf = pd.DataFrame(jdict)


jdf.head(5)


plt.plot(jdf['date'], jdf['male'], label='Male')
plt.plot(jdf['date'], jdf['female'], label='Female')
plt.grid(b=True, which='major', axis='both')
plt.legend(['Male', 'Female'], loc='upper left')
plt.xlabel('Date')
plt.ylabel('Cumulative Deaths')
plt.title('Cumulative Deaths by Gender Over Time')
plt.show()


plt.plot(jdf['date'], jdf['african_american'], label='African American')
plt.plot(jdf['date'], jdf['white'], label='White')
plt.plot(jdf['date'], jdf['hispanic'], label='Hispanic')
plt.plot(jdf['date'], jdf['asian'], label='Asian')
plt.plot(jdf['date'], jdf['other'], label='Other')
plt.plot(jdf['date'], jdf['raceUnknown'], label='Race Unknown')
plt.grid(b=True, which='major', axis='both')
plt.legend(['African American', 'White', 'Hispanic', 'Asian', 'Other', 'Race Unknown'], loc='upper left')
plt.xlabel('Date')
plt.ylabel('Cumulative Deaths')
plt.title('Cumulative Deaths by Race Over Time')
plt.show()


plt.plot(jdf['date'], jdf['age_0_to_9'], label='0-9')
plt.plot(jdf['date'], jdf['age_10_to_19'], label='10-19')
plt.plot(jdf['date'], jdf['age_20_to_29'], label='20-19')
plt.plot(jdf['date'], jdf['age_30_to_39'], label='30-39')
plt.plot(jdf['date'], jdf['age_40_to_49'], label='40-49')
plt.plot(jdf['date'], jdf['age_50_to_59'], label='50-59')
plt.plot(jdf['date'], jdf['age_60_to_69'], label='60-69')
plt.plot(jdf['date'], jdf['age_70_to_79'], label='70-79')
plt.plot(jdf['date'], jdf['age_80plus'], label='80+')
plt.plot(jdf['date'], jdf['age_unknown'], label='age_unknown')
plt.grid(b=True, which='major', axis='both')
plt.legend(['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80+', 'unknown'], loc='upper left')
plt.xlabel('Date')
plt.ylabel('Cumulative Deaths')
plt.title('Cumulative Deaths by Age Over Time')
plt.show()

# Find mean age
ageData = jdf.iloc[-1][12:21]
avrgAge = 5*ageData[0]+15*ageData[1]+25*ageData[2]+35*ageData[3]+45*ageData[4]+55*ageData[5]+65*ageData[6]+ \
          75*ageData[7]+85*ageData[8]
avrgAge = avrgAge/sum(ageData)
print('Average age of individuals lost to COVID-19: ' + str(avrgAge))

Average age of individuals lost to COVID-19: 73.717307863965


# Store race makeup of population as percent
raceToPercent = {'african_american': .311, 'white': .5, 'hispanic': .106, 'asian': .067, 'other': .036}
raceToPopulation = {}
totalPopulation = 6177224 # Maryland's total population as of 2020 census

# Calculate total population of each race
for race in raceToPercent.keys():
    raceToPopulation[race] = totalPopulation*raceToPercent[race]
    
# Calculate deaths as a percentage of population and create new dataframe
raceToPercentOfPop = {'african_americanPercent': [], 'whitePercent': [], 'hispanicPercent': [], 
                      'asianPercent': [], 'otherPercent': []}

for i in range(len(jdf)):
    for race in raceToPopulation.keys():
        curr = jdf.iloc[i][race]
        pop = raceToPopulation[race]
        percentOfPop = float(curr) / pop
        raceToPercentOfPop[str(race)+'Percent'].append(float(percentOfPop))

# Convert dict to DataFrame and add date column
rdf = pd.DataFrame(raceToPercentOfPop)
rdf.insert(loc=0, column='date', value=jdf['date'])


# For labeling
races = ['African American', 'White', 'Hispanic', 'Asian', 'Other']

# Graph plot
for rp in raceToPercentOfPop.keys():
    plt.plot(rdf['date'], rdf[rp], label='rp')

plt.grid(b=True, which='major', axis='both')
plt.legend(races, loc='upper left')
plt.xlabel('Date')
plt.ylabel('Cumulative Deaths as a Percent of Race Population')
plt.title('Cumulative Deaths as a Percent of Race Population Over Time')
plt.show()

# Bar Chart
sizes = rdf.iloc[len(rdf)-1][1:]
print(sizes)

fig1, ax1 = plt.subplots()
ax1.bar(races, sizes, color=['blue', 'orange', 'green', 'red', 'purple'])
plt.title('Cumulative Deaths as a Percent of Race Population as of November 30th')
plt.show()

# Table
rdf.head(5)

african_americanPercent     0.00206494
whitePercent                0.00187754
hispanicPercent             0.00139435
asianPercent               0.000879495
otherPercent                0.00054861
Name: 601, dtype: object


# Do the following for each racial group
for race in raceToPopulation.keys():
    # Initialize training and testing lists
    trainX = []
    trainY = []
    testX = []
    testY = []

    # Create training and testing lists
    for i, r in jdf.iterrows():
        trainX.append(r['total'])
        trainY.append(r[race])
        if i >= 420: # Train on 70% of data and test on 30%
            testX.append(r['total'])
            testY.append(r[race])

    # Train regression model
    npTrainX = np.array(trainX)
    npTrainX = npTrainX.reshape(-1, 1)
    reg = linear_model.LinearRegression()
    reg.fit(npTrainX, trainY)

    fig, ax = plt.subplots()
    npTestX = np.array(testX)
    npTestX = npTestX.reshape(-1, 1)

    # Plot actual proportion with predicted proportion on test data
    plt.plot(testX, testY)
    ax.plot(npTestX, reg.predict(npTestX))
    plt.grid(b=True, which='major', axis='both')
    ax.set_xlabel('Total COVID-19 Deaths')
    ax.set_ylabel('Proportion of {} Deaths'.format(race.replace('_', ' ').title()))
    ax.set_title('Proportion of {} Deaths of Total COVID-19 Deaths'.format(race.replace('_', ' ').title()))
    plt.show()


# Do the following for each racial group
for race in raceToPopulation.keys():
    mod = smf.ols(formula='{} ~ total'.format(race), data=jdf)
    res = mod.fit()
    
    print(race.replace('_', ' ').title() + '\n~~~~~~~~~~~~~~~~~~~')
    print(res.summary())
    print('\n\n')

African American
~~~~~~~~~~~~~~~~~~~
                            OLS Regression Results                            
==============================================================================
Dep. Variable:       african_american   R-squared:                       0.995
Model:                            OLS   Adj. R-squared:                  0.995
Method:                 Least Squares   F-statistic:                 1.162e+05
Date:                Mon, 20 Dec 2021   Prob (F-statistic):               0.00
Time:                        17:07:29   Log-Likelihood:                -3467.7
No. Observations:                 602   AIC:                             6939.
Df Residuals:                     600   BIC:                             6948.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    184.5021      7.274     25.365      0.000     170.217     198.788
total          0.3383      0.001    340.904      0.000       0.336       0.340
==============================================================================
Omnibus:                       62.884   Durbin-Watson:                   0.014
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               71.098
Skew:                          -0.804   Prob(JB):                     3.64e-16
Kurtosis:                       2.500   Cond. No.                     1.70e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.7e+04. This might indicate that there are
strong multicollinearity or other numerical problems.


White
~~~~~~~~~~~~~~~~~~~
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  white   R-squared:                       0.997
Model:                            OLS   Adj. R-squared:                  0.997
Method:                 Least Squares   F-statistic:                 2.268e+05
Date:                Mon, 20 Dec 2021   Prob (F-statistic):               0.00
Time:                        17:07:29   Log-Likelihood:                -3548.7
No. Observations:                 602   AIC:                             7101.
Df Residuals:                     600   BIC:                             7110.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -334.3157      8.322    -40.173      0.000    -350.659    -317.972
total          0.5407      0.001    476.259      0.000       0.538       0.543
==============================================================================
Omnibus:                       53.409   Durbin-Watson:                   0.007
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               65.426
Skew:                           0.769   Prob(JB):                     6.21e-15
Kurtosis:                       3.494   Cond. No.                     1.70e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.7e+04. This might indicate that there are
strong multicollinearity or other numerical problems.


Hispanic
~~~~~~~~~~~~~~~~~~~
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               hispanic   R-squared:                       0.972
Model:                            OLS   Adj. R-squared:                  0.972
Method:                 Least Squares   F-statistic:                 2.069e+04
Date:                Mon, 20 Dec 2021   Prob (F-statistic):               0.00
Time:                        17:07:29   Log-Likelihood:                -3104.9
No. Observations:                 602   AIC:                             6214.
Df Residuals:                     600   BIC:                             6223.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     92.5682      3.981     23.251      0.000      84.749     100.387
total          0.0781      0.001    143.835      0.000       0.077       0.079
==============================================================================
Omnibus:                       88.686   Durbin-Watson:                   0.004
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              125.586
Skew:                          -1.063   Prob(JB):                     5.36e-28
Kurtosis:                       3.695   Cond. No.                     1.70e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.7e+04. This might indicate that there are
strong multicollinearity or other numerical problems.


Asian
~~~~~~~~~~~~~~~~~~~
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  asian   R-squared:                       0.996
Model:                            OLS   Adj. R-squared:                  0.996
Method:                 Least Squares   F-statistic:                 1.419e+05
Date:                Mon, 20 Dec 2021   Prob (F-statistic):               0.00
Time:                        17:07:29   Log-Likelihood:                -1999.3
No. Observations:                 602   AIC:                             4003.
Df Residuals:                     600   BIC:                             4011.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     14.2147      0.635     22.402      0.000      12.968      15.461
total          0.0326   8.66e-05    376.737      0.000       0.032       0.033
==============================================================================
Omnibus:                       87.621   Durbin-Watson:                   0.024
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              126.951
Skew:                          -1.125   Prob(JB):                     2.71e-28
Kurtosis:                       3.000   Cond. No.                     1.70e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.7e+04. This might indicate that there are
strong multicollinearity or other numerical problems.


Other
~~~~~~~~~~~~~~~~~~~
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  other   R-squared:                       0.987
Model:                            OLS   Adj. R-squared:                  0.987
Method:                 Least Squares   F-statistic:                 4.641e+04
Date:                Mon, 20 Dec 2021   Prob (F-statistic):               0.00
Time:                        17:07:29   Log-Likelihood:                -1642.5
No. Observations:                 602   AIC:                             3289.
Df Residuals:                     600   BIC:                             3298.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.0099      0.351      2.879      0.004       0.321       1.699
total          0.0103   4.79e-05    215.424      0.000       0.010       0.010
==============================================================================
Omnibus:                       19.086   Durbin-Watson:                   0.102
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               17.215
Skew:                          -0.355   Prob(JB):                     0.000183
Kurtosis:                       2.573   Cond. No.                     1.70e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.7e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

	date	total	dateTotal	male	female	african_american	white	asian	...	age_20_to_29	age_30_to_39	age_40_to_49	age_50_to_59	age_60_to_69	age_70_to_79	age_80plus
0	2020-04-08	124.0	29.0	78.0	46.0	55.0	39.0	6.0	...	0.0	4.0	4.0	16.0	26.0	39.0	35.0
1	2020-04-09	138.0	29.0	85.0	53.0	61.0	45.0	6.0	...	0.0	4.0	4.0	16.0	28.0	43.0	43.0
2	2020-04-10	171.0	34.0	102.0	69.0	77.0	58.0	6.0	...	0.0	5.0	5.0	20.0	32.0	51.0	58.0
3	2020-04-11	206.0	42.0	121.0	85.0	91.0	72.0	7.0	...	1.0	8.0	5.0	22.0	42.0	57.0	71.0
4	2020-04-12	235.0	31.0	136.0	99.0	104.0	83.0	8.0	...	1.0	9.0	5.0	25.0	51.0	65.0	79.0

	date	african_americanPercent	whitePercent	asianPercent	otherPercent
0	2020-04-08	0.000029	0.000013	0.000014	0.000013
1	2020-04-09	0.000032	0.000015	0.000014	0.000022
2	2020-04-10	0.000040	0.000019	0.000014	0.000031
3	2020-04-11	0.000047	0.000023	0.000017	0.000040
4	2020-04-12	0.000054	0.000027	0.000019	0.000054

Impacts of the Coronavirus on Various Demographics in Maryland from 04/2020 to 11/2021¶

Introduction¶

Definitions¶

About the Data¶

Data Retrieval and Cleaning¶

Probable Death Datasets¶

Confirmed Death Datasets¶

Joint Probable Death and Confirmed Death Dataset¶

Data Analysis¶

Gender¶

Race¶

Age¶

Digging Deeper¶

Organizing Race Data¶

Visualizing Race Data¶

Analyzing Race Data¶

Predicting Trends¶

Visual Analysis¶

Numerical Analysis¶

Conclusion¶

	date	total	dateTotal	male	female	african_american	white	hispanic	asian	...	age_50_to_59	age_60_to_69	age_70_to_79	age_80plus	age_unknown
2	2020-04-15	64	2	31	33	13	34	1	1	...	3	9	12	26	14
3	2020-04-16	67	0	34	33	13	39	0	1	...	3	9	12	30	13
4	2020-04-17	69	1	34	35	16	40	0	1	...	3	9	13	33	11
5	2020-04-18	71	4	34	37	18	40	0	1	...	4	12	11	33	11
6	2020-04-19	62	1	30	32	16	34	0	1	...	5	10	10	27	10