Instructions
Requirements and Specifications
Source Code
import pandas as pd
import numpy as np
data = pd.read_csv('COVID-19_Case_Surveillance_Public_Use_Data_with_Geography.csv')
data.head()
# Part a)
cases_total = data.groupby(['case_month']).size().reset_index().rename(columns={0:'count'})
# Delete where case_month has the following values
invalid_vals = [' "error" : true', ' "message" : "Internal error"', ' "status" : 500', '}']
cases_total = cases_total[~cases_total['case_month'].isin(invalid_vals)]
# Set a new column named diff
cases_total['diff'] = 0
# For the first row set NaN
cases_total.loc[3,'diff'] = np.nan
cases_total.reset_index(drop=True, inplace=True)
# Calculate diff
cases_total['diff']=cases_total['count'].pct_change()*100
cases_total.head()
### Now, save dataframe to cases_total.csv
cases_total.to_csv('cases_total.csv', index = False)
# Part b)
cases_by_state = data.groupby(['case_month', 'res_state']).count().reset_index().rename(columns={0:'count'})
cases_by_state = cases_by_state.rename(columns = {'state_fips_code': 'count'})
cases_by_state = cases_by_state[['case_month', 'res_state', 'count']]
cases_by_state.tail()
cases_by_state = cases_by_state.rename(columns = {'state_fips_code': 'count'})
cases_by_state = cases_by_state[['case_month', 'res_state', 'count']]
cases_by_state.tail()
state_diff = cases_by_state.groupby(['case_month']).sum().reset_index()
state_diff.head()
state_diff['diff']=state_diff['count'].pct_change()*100
state_diff.head()
for i in range(len(state_diff)):
month = state_diff.loc[i, 'case_month']
diff = state_diff.loc[i, 'diff']
cases_by_state.loc[cases_by_state['case_month'] == month, 'diff'] = diff
cases_by_state.tail()
## Population by state
Dataset with population by state
pop_state = pd.read_csv('NST-EST2021-alldata.csv')
# Take only the columns we are interested
pop_state = pop_state[['NAME', 'POPESTIMATE2021']]
# Add a column named Abbrev
pop_state['Abbrev'] = ''
pop_state.head()
## Read states and their abreviations
states_abbrev = pd.read_csv('StatesAbbrev.csv')
states_abbrev.head()
## For each state in pop_state, get their abbreviation in states_abbrev and add to pop_state
for i in range(len(pop_state)):
state = pop_state.loc[i, 'NAME']
# get abbrev
abbrev = states_abbrev[states_abbrev['State'] == state]['Code']
if len(abbrev) > 0:
pop_state.loc[pop_state['NAME'] == state, 'Abbrev'] = abbrev.values[0]
# Drop rows where abbrev is empty
pop_state = pop_state.dropna()
pop_state.head()
### Now for each state-code in cases_by_state, compute the number of cases per 100000
for i in range(len(cases_by_state)):
state_code = cases_by_state.loc[i, 'res_state']
# Get the population for this state
pop = pop_state[pop_state['Abbrev'] == state_code]['POPESTIMATE2021']
if len(pop) > 0:
pop = pop.values[0]
# Update 'count
cases_by_state.loc[i, 'count'] = cases_by_state.loc[i, 'count']*100000/pop
cases_by_state.head()
# Sort by month and count
cases_by_state = cases_by_state.sort_values(by=['case_month', 'count'])
cases_by_state.head()
# Recalculate diff
states_diff = cases_by_state.groupby(['case_month']).sum().reset_index()
states_diff.head()
state_diff['diff']=state_diff['count'].pct_change()*100
state_diff.head()
for i in range(len(state_diff)):
month = state_diff.loc[i, 'case_month']
diff = state_diff.loc[i, 'diff']
cases_by_state.loc[cases_by_state['case_month'] == month, 'diff'] = diff
cases_by_state.tail()
### Write csv
with open('cases_by_state.csv', 'w') as f:
for month in cases_by_state['case_month'].unique():
rows = cases_by_state[cases_by_state['case_month'] == month]
rows = rows.to_numpy()
for row in rows:
row = list(map(str, row))
line = ','.join(row)
f.write(line + '\n')
f.write('\n')
for row in rows.to_numpy():
row = list(map(str, row))
print(','.join(row))
# Part c) CSV File with population