This repository was archived by the owner on Apr 22, 2024. It is now read-only.

Commit 849ec94

Covid experiments
1 parent 212ca51 commit 849ec94

File tree

13 files changed: +33,127 −0 lines


README.md

Lines changed: 6 additions & 0 deletions
@@ -32,6 +32,12 @@ Apps and API for exploring open data sources including [AWS Registry of Open Dat

![](https://github.com/aws-samples/cloud-experiments/blob/master/open-data-explorer/s3-app-start.png)

+## [COVID EDA and Models](https://github.com/aws-samples/cloud-experiments/tree/master/covid-insights)
+
+Experiments for running exploratory data analysis (EDA) and models on COVID-related open datasets. These include a Case Fatality Rate (CFR) model on country-level data from Johns Hopkins. EDA techniques include growth factor analysis, case growth rate, doubling rate, recovery and mortality rate, and country-wise analysis.
+
+![](https://github.com/aws-samples/cloud-experiments/blob/master/covid-insights/cfr.png)
+
## Amazon SageMaker Notebooks

You may want to run these notebooks using [Amazon SageMaker](https://aws.amazon.com/sagemaker/). Amazon SageMaker is a fully managed service that covers the entire machine learning workflow: label and prepare your data, choose an algorithm, train the model, tune and optimize it for deployment, make predictions, and take action.

covid-insights/README.md

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@

# COVID Insights

Experiments for running exploratory data analysis (EDA) and models on COVID-related open datasets. These include a Case Fatality Rate (CFR) model on country-level data from Johns Hopkins. EDA techniques include growth factor analysis, case growth rate, doubling rate, recovery and mortality rate, and country-wise analysis.

Start the first experiment with `streamlit run covid-insights/cov-dash.py` to run the Case Fatality Rate model directly on the latest country data pulled from the Johns Hopkins repository.

![](cfr.png)

The second experiment runs with `streamlit run covid-insights/covid-app.py` on an archived dataset from Kaggle.

![](covid-app.png)
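The CFR plotted in cfr.png reduces to cumulative reported deaths divided by cumulative confirmed cases. Here is a minimal sketch of that calculation, assuming the same Johns Hopkins time-series CSVs that cov-dash.py (further down this page) reads; the `BASEURL` and the `Country/Region` column are taken from that script, everything else is illustrative and not part of this commit:

```python
# Hedged sketch: per-country case fatality rate from the JHU CSSE time series.
# URL and column names follow cov-dash.py below; the rest is illustrative only.
import pandas as pd

BASEURL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series"
confirmed = pd.read_csv(f"{BASEURL}/time_series_covid19_confirmed_global.csv")
deaths = pd.read_csv(f"{BASEURL}/time_series_covid19_deaths_global.csv")

# Collapse provinces/territories into one row per country; keep only the date columns.
confirmed = confirmed.groupby("Country/Region").sum(numeric_only=True).drop(columns=["Lat", "Long"])
deaths = deaths.groupby("Country/Region").sum(numeric_only=True).drop(columns=["Lat", "Long"])

# CFR [%] = cumulative deaths / cumulative confirmed cases, at the latest available date.
latest = confirmed.columns[-1]
cfr = (deaths[latest] / confirmed[latest]) * 100
print(cfr.loc[["India", "US", "Italy"]].round(2))
```

As cov-dash.py itself cautions under the chart, this ratio is heavily skewed by how much testing a country does, so treat it as a descriptive statistic rather than a fatality risk.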
Binary file not shown (5.18 KB).

covid-insights/cfr.png

333 KB

covid-insights/cov-dash.py

Lines changed: 235 additions & 0 deletions
@@ -0,0 +1,235 @@

# Source: https://github.com/cwerner/covid19
# 1. Source data directly from GitHub (JHU COVID)
# 2. Configurable UI based on two variables - inhabitants and countries
# 3. Use Altair (https://altair-viz.github.io/) declarative statistical visualization charts

import datetime
from functools import reduce
import streamlit as st
from streamlit import caching
import pandas as pd
import altair as alt
import os

# numbers for 2019
inhabitants = {'India': 1352.6,
               'US': 328.2,
               'Brazil': 209.5,
               'Russia': 144.5,
               'United Kingdom': 67.1,
               'China': 1392.7,
               'Italy': 60.23}

@st.cache
def read_data():
    BASEURL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series"
    url_confirmed = f"{BASEURL}/time_series_covid19_confirmed_global.csv"
    url_deaths = f"{BASEURL}/time_series_covid19_deaths_global.csv"
    url_recovered = f"{BASEURL}/time_series_covid19_recovered_global.csv"

    confirmed = pd.read_csv(url_confirmed, index_col=0)
    deaths = pd.read_csv(url_deaths, index_col=0)
    recovered = pd.read_csv(url_recovered, index_col=0)

    # sum over potentially duplicate rows (France and their territories)
    confirmed = confirmed.groupby("Country/Region").sum().reset_index()
    deaths = deaths.groupby("Country/Region").sum().reset_index()
    recovered = recovered.groupby("Country/Region").sum().reset_index()

    return (confirmed, deaths, recovered)

def transform(df, collabel='confirmed'):
    dfm = pd.melt(df)
    dfm["date"] = pd.to_datetime(dfm.variable, infer_datetime_format=True)
    dfm = dfm.set_index("date")
    dfm = dfm[["value"]]
    dfm.columns = [collabel]
    return dfm

def transform2(df, collabel='confirmed'):
    dfm = pd.melt(df, id_vars=["Country/Region"])
    dfm["date"] = pd.to_datetime(dfm.variable, infer_datetime_format=True)
    dfm = dfm.set_index("date")
    dfm = dfm[["Country/Region", "value"]]
    dfm.columns = ["country", collabel]
    return dfm

def app():
    st.title("🦠 Covid-19 Data Explorer")
    st.markdown("""\
        This app illustrates the spread of COVID-19 in select countries over time.
        """)

    #st.error("⚠️ There is currently an issue in the data source of JHU. Data for 03/13 is invalid and thus removed!")

    countries = ["India", "US", "Russia", "Brazil", "China", "Italy", "United Kingdom"]

    analysis = st.sidebar.selectbox("Choose Analysis", ["Overview", "By Country"])

    if analysis == "Overview":

        st.header("COVID-19 cases and fatality rate")
        st.markdown("""\
            These are the reported case numbers for a selection of countries"""
            f""" (currently only {', '.join(countries)}). """
            """The case fatality rate (CFR) is calculated as:
            $$
            CFR[\%] = \\frac{fatalities}{\\textit{all cases}}
            $$

            ℹ️ You can select/deselect countries and switch between linear and log scales.
            """)

        confirmed, deaths, recovered = read_data()

        multiselection = st.multiselect("Select countries:", countries, default=countries)
        logscale = st.checkbox("Log scale", False)

        confirmed = confirmed[confirmed["Country/Region"].isin(multiselection)]
        confirmed = confirmed.drop(["Lat", "Long"], axis=1)
        confirmed = transform2(confirmed, collabel="confirmed")

        deaths = deaths[deaths["Country/Region"].isin(multiselection)]
        deaths = deaths.drop(["Lat", "Long"], axis=1)
        deaths = transform2(deaths, collabel="deaths")

        frate = confirmed[["country"]]
        frate["frate"] = (deaths.deaths / confirmed.confirmed) * 100

        # safeguard for empty selection
        if len(multiselection) == 0:
            return

        SCALE = alt.Scale(type='linear')
        if logscale:
            confirmed["confirmed"] += 0.00001

            confirmed = confirmed[confirmed.index > '2020-02-16']
            frate = frate[frate.index > '2020-02-16']

            SCALE = alt.Scale(type='log', domain=[10, int(max(confirmed.confirmed))], clamp=True)

        c2 = alt.Chart(confirmed.reset_index()).properties(height=150).mark_line().encode(
            x=alt.X("date:T", title="Date"),
            y=alt.Y("confirmed:Q", title="Cases", scale=SCALE),
            color=alt.Color('country:N', title="Country")
        )

        # case fatality rate...
        c3 = alt.Chart(frate.reset_index()).properties(height=100).mark_line().encode(
            x=alt.X("date:T", title="Date"),
            y=alt.Y("frate:Q", title="Fatality rate [%]", scale=alt.Scale(type='linear')),
            color=alt.Color('country:N', title="Country")
        )

        per100k = confirmed.loc[[confirmed.index.max()]].copy()
        per100k.loc[:, 'inhabitants'] = per100k.apply(lambda x: inhabitants[x['country']], axis=1)
        per100k.loc[:, 'per100k'] = per100k.confirmed / (per100k.inhabitants * 1_000_000) * 100_000
        per100k = per100k.set_index("country")
        per100k = per100k.sort_values(ascending=False, by='per100k')
        per100k.loc[:, 'per100k'] = per100k.per100k.round(2)

        c4 = alt.Chart(per100k.reset_index()).properties(width=75).mark_bar().encode(
            x=alt.X("per100k:Q", title="Cases per 100k inhabitants"),
            y=alt.Y("country:N", title="Countries", sort=None),
            color=alt.Color('country:N', title="Country"),
            tooltip=[alt.Tooltip('country:N', title='Country'),
                     alt.Tooltip('per100k:Q', title='Cases per 100k'),
                     alt.Tooltip('inhabitants:Q', title='Inhabitants [mio]')]
        )

        st.altair_chart(alt.hconcat(c4, alt.vconcat(c2, c3)), use_container_width=True)

        st.markdown(f"""\
            <div style="font-size: small">
            ⚠️ Please take the CFR with a grain of salt. The ratio is
            highly dependent on the total number of tests conducted in a country. In the early stages
            of the outbreak often mainly severe cases with clear symptoms are detected. Thus mild cases
            are not recorded, which skews the CFR.
            </div><br/>

            """, unsafe_allow_html=True)


    elif analysis == "By Country":

        confirmed, deaths, recovered = read_data()

        st.header("Country statistics")
        st.markdown("""\
            The reported number of active, recovered and deceased COVID-19 cases by country """
            f""" (currently only {', '.join(countries)}).
            """
            """
            ℹ️ You can select countries and plot data as cumulative counts or new active cases per day.
            """)

        # selections
        selection = st.selectbox("Select country:", countries)
        cummulative = st.radio("Display type:", ["total", "new cases"])
        #scaletransform = st.radio("Plot y-axis", ["linear", "pow"])

        confirmed = confirmed[confirmed["Country/Region"] == selection].iloc[:, 3:]
        confirmed = transform(confirmed, collabel="confirmed")

        deaths = deaths[deaths["Country/Region"] == selection].iloc[:, 3:]
        deaths = transform(deaths, collabel="deaths")

        recovered = recovered[recovered["Country/Region"] == selection].iloc[:, 3:]
        recovered = transform(recovered, collabel="recovered")

        df = reduce(lambda a, b: pd.merge(a, b, on='date'), [confirmed, recovered, deaths])
        df["active"] = df.confirmed - (df.deaths + df.recovered)

        variables = ["recovered", "active", "deaths"]
        colors = ["steelblue", "orange", "black"]

        value_vars = variables
        SCALE = alt.Scale(domain=variables, range=colors)
        if cummulative == 'new cases':
            value_vars = ["new"]
            df["new"] = df.confirmed - df.shift(1).confirmed
            df["new"].loc[df.new < 0] = 0
            SCALE = alt.Scale(domain=["new"], range=["orange"])

        dfm = pd.melt(df.reset_index(), id_vars=["date"], value_vars=value_vars)

        # introduce order col as altair does auto-sort on stacked elements
        dfm['order'] = dfm['variable'].replace(
            {val: i for i, val in enumerate(variables[::-1])}
        )

        c = alt.Chart(dfm.reset_index()).mark_bar().properties(height=200).encode(
            x=alt.X("date:T", title="Date"),
            y=alt.Y("sum(value):Q", title="Cases", scale=alt.Scale(type='linear')),
            color=alt.Color('variable:N', title="Category", scale=SCALE), #, sort=alt.EncodingSortField('value', order='ascending')),
            order='order'
        )

        if cummulative != 'new cases':
            st.altair_chart(c, use_container_width=True)
        else:
            # add smooth 7-day trend
            rm_7day = df[['new']].rolling('7D').mean().rename(columns={'new': 'value'})
            c_7day = alt.Chart(rm_7day.reset_index()).properties(height=200).mark_line(strokeDash=[1, 1], color='red').encode(
                x=alt.X("date:T", title="Date"),
                y=alt.Y("value:Q", title="Cases", scale=alt.Scale(type='linear')),
            )
            st.altair_chart((c + c_7day), use_container_width=True)
            st.markdown(f"""\
                <div style="font-size: small">Daily reported new cases (incl. 7-day average).</div><br/>
                """, unsafe_allow_html=True)

    st.info("""\

        by: [C. Werner](https://www.christianwerner.net) | source: [GitHub](https://www.github.com/cwerner/covid19)
        | data source: [Johns Hopkins University (GitHub)](https://github.com/CSSEGISandData/COVID-19).
        """)


# ----------------------

app()

covid-insights/covid-app.png

262 KB

covid-insights/covid-app.py

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import covid as cov

st.title('COVID Exploratory Data Analysis')

# Data from https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset?select=covid_19_data.csv

covid = pd.read_csv('data/494724_1196190_compressed_covid_19_data.csv.zip')

# Dropping column as SNo is of no use, and 'Province/State' contains too many missing values
covid.drop(['SNo'], axis=1, inplace=True)

st.header('Dataset')
st.write(covid)

# Converting 'ObservationDate' into Datetime format
covid['ObservationDate'] = pd.to_datetime(covid['ObservationDate'])

# Grouping different types of cases as per the date
datewise = covid.groupby(['ObservationDate']).agg({
    'Confirmed': 'sum',
    'Recovered': 'sum',
    'Deaths': 'sum'
})

datewise['Days Since'] = datewise.index - datewise.index.min()
datewise["WeekOfYear"] = datewise.index.weekofyear

india_data = covid[covid["Country/Region"] == "India"]
datewise_india = india_data.groupby(["ObservationDate"]).agg({"Confirmed": 'sum', "Recovered": 'sum', "Deaths": 'sum'})
datewise_india['Days Since'] = datewise_india.index - datewise.index.min()
datewise_india["WeekOfYear"] = datewise_india.index.weekofyear

st.header('Global Analysis')

st.line_chart(datewise[['Confirmed', 'Deaths', 'Recovered']])

st.subheader('Global Growth Factor')
cov.growth_factor(datewise)

st.subheader('India Growth Factor')
cov.growth_factor(datewise_india)

st.subheader('Global Weekly Growth of Cases')
cov.weekly_increase(datewise)

st.subheader('India Weekly Growth of Cases')
cov.weekly_increase(datewise_india)

st.subheader('Global Doubling Rate')
cov.double_days(datewise)

st.subheader('India Doubling Rate')
cov.double_days(datewise_india)

st.subheader('Daily Growth')
cov.growth_scatter(datewise)

st.subheader('Recovery and Mortality')
cov.mortality(datewise)

st.subheader('Daily Increases Stats')
cov.daily_increase(datewise)

st.header('Countrywise Analysis')

# Calculating countrywise Mortality and Recovery Rate
countrywise = covid[covid["ObservationDate"] == covid["ObservationDate"].max()].groupby(["Country/Region"]).agg({"Confirmed": 'sum', "Recovered": 'sum', "Deaths": 'sum'}).sort_values(["Confirmed"], ascending=False)
countrywise["Mortality"] = (countrywise["Deaths"] / countrywise["Confirmed"]) * 100
countrywise["Recovery"] = (countrywise["Recovered"] / countrywise["Confirmed"]) * 100

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
top_15_confirmed = countrywise.sort_values(["Confirmed"], ascending=False).head(15)
top_15_deaths = countrywise.sort_values(["Deaths"], ascending=False).head(15)
sns.barplot(x=top_15_confirmed["Confirmed"], y=top_15_confirmed.index, ax=ax1)
ax1.set_title("Top 15 countries as per Number of Confirmed Cases")
sns.barplot(x=top_15_deaths["Deaths"], y=top_15_deaths.index, ax=ax2)
ax2.set_title("Top 15 countries as per Number of Death Cases")

st.pyplot(fig)

st.header('India Analysis')

st.line_chart(datewise_india[['Confirmed', 'Deaths', 'Recovered']])

st.write(datewise_india.iloc[-1])
st.write("Total Active Cases: ", datewise_india["Confirmed"].iloc[-1] - datewise_india["Recovered"].iloc[-1] - datewise_india["Deaths"].iloc[-1])
st.write("Total Closed Cases: ", datewise_india["Recovered"].iloc[-1] + datewise_india["Deaths"].iloc[-1])

st.subheader('India Growth Daily')
cov.growth_scatter(datewise_india)

st.subheader('India Daily Increase in Cases')
cov.daily_increase(datewise_india)

st.subheader('India Recovery and Mortality')
cov.mortality(datewise_india)

st.subheader('India Compared with Other Countries')

Italy_data = covid[covid["Country/Region"] == "Italy"]
US_data = covid[covid["Country/Region"] == "US"]
spain_data = covid[covid["Country/Region"] == "Spain"]
datewise_Italy = Italy_data.groupby(["ObservationDate"]).agg({"Confirmed": 'sum', "Recovered": 'sum', "Deaths": 'sum'})
datewise_US = US_data.groupby(["ObservationDate"]).agg({"Confirmed": 'sum', "Recovered": 'sum', "Deaths": 'sum'})
datewise_Spain = spain_data.groupby(["ObservationDate"]).agg({"Confirmed": 'sum', "Recovered": 'sum', "Deaths": 'sum'})

max_ind = datewise_india["Confirmed"].max()
fig = plt.figure(figsize=(12, 6))
plt.plot(datewise_Italy[(datewise_Italy["Confirmed"] > 0) & (datewise_Italy["Confirmed"] <= max_ind)]["Confirmed"], label="Confirmed Cases Italy", linewidth=3)
plt.plot(datewise_US[(datewise_US["Confirmed"] > 0) & (datewise_US["Confirmed"] <= max_ind)]["Confirmed"], label="Confirmed Cases USA", linewidth=3)
plt.plot(datewise_Spain[(datewise_Spain["Confirmed"] > 0) & (datewise_Spain["Confirmed"] <= max_ind)]["Confirmed"], label="Confirmed Cases Spain", linewidth=3)
plt.plot(datewise_india[datewise_india["Confirmed"] > 0]["Confirmed"], label="Confirmed Cases India", linewidth=3)
plt.xlabel("Date")
plt.ylabel("Number of Confirmed Cases")
plt.title("Growth of Confirmed Cases")
plt.legend()
plt.xticks(rotation=90)

st.write("It took", datewise_Italy[(datewise_Italy["Confirmed"] > 0) & (datewise_Italy["Confirmed"] <= max_ind)].shape[0], "days in Italy to reach number of Confirmed Cases equivalent to India")
st.write("It took", datewise_US[(datewise_US["Confirmed"] > 0) & (datewise_US["Confirmed"] <= max_ind)].shape[0], "days in USA to reach number of Confirmed Cases equivalent to India")
st.write("It took", datewise_Spain[(datewise_Spain["Confirmed"] > 0) & (datewise_Spain["Confirmed"] <= max_ind)].shape[0], "days in Spain to reach number of Confirmed Cases equivalent to India")
st.write("It took", datewise_india[datewise_india["Confirmed"] > 0].shape[0], "days in India to reach", max_ind, "Confirmed Cases")

st.pyplot(fig)
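The cov.* helpers called above (growth_factor, weekly_increase, double_days, growth_scatter, mortality, daily_increase) are defined in covid.py, presumably one of the 13 files in this commit, which this page does not render. As a rough illustration of the growth-factor idea named in the README, here is a minimal sketch using the common definition (today's new cases divided by yesterday's new cases); it is not necessarily how covid.py implements it:

```python
# Illustrative sketch only -- the actual implementation lives in covid.py (not shown here).
import pandas as pd

def growth_factor_sketch(datewise: pd.DataFrame) -> pd.Series:
    # 'Confirmed' is cumulative, so the day-over-day difference gives daily new cases.
    new_cases = datewise["Confirmed"].diff()
    # Growth factor: today's new cases / yesterday's new cases.
    gf = new_cases / new_cases.shift(1)
    # Days with zero new cases yesterday produce inf; mark them as missing instead.
    return gf.replace([float("inf"), -float("inf")], float("nan"))

# Hypothetical usage inside the app above:
# st.line_chart(growth_factor_sketch(datewise).rename("growth factor"))
```

A growth factor persistently above 1 means daily new cases are still accelerating; values settling below 1 indicate the wave is slowing.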
