This repository was archived by the owner on Apr 22, 2024. It is now read-only.

Commit 849ec94

Covid experiments
1 parent 212ca51 commit 849ec94

File tree

13 files changed: +33,127 −0 lines


README.md

Lines changed: 6 additions & 0 deletions
@@ -32,6 +32,12 @@ Apps and API for exploring open data sources including [AWS Registry of Open Dat

![](https://github.com/aws-samples/cloud-experiments/blob/master/open-data-explorer/s3-app-start.png)

+## [COVID EDA and Models](https://github.com/aws-samples/cloud-experiments/tree/master/covid-insights)
+
+Experiments for running exploratory data analysis (EDA) and models on COVID-related open datasets. These include a Case Fatality Rate (CFR) model on country-level data from Johns Hopkins. EDA techniques include growth factor analysis, case growth rate, doubling rate, recovery and mortality rate, and country-wise analysis.
+
+![](https://github.com/aws-samples/cloud-experiments/blob/master/covid-insights/cfr.png)
+
## Amazon SageMaker Notebooks

You may want to run these notebooks using [Amazon SageMaker](https://aws.amazon.com/sagemaker/). Amazon SageMaker is a fully managed service that covers the entire machine learning workflow: label and prepare your data, choose an algorithm, train the model, tune and optimize it for deployment, make predictions, and take action.

covid-insights/README.md

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@

# COVID Insights

Experiments for running exploratory data analysis (EDA) and models on COVID-related open datasets. These include a Case Fatality Rate (CFR) model on country-level data from Johns Hopkins. EDA techniques include growth factor analysis, case growth rate, doubling rate, recovery and mortality rate, and country-wise analysis.

Start the first experiment with `streamlit run covid-insights/cov-dash.py` to run the Case Fatality Rate model directly on the latest country data pulled from the Johns Hopkins repository.

![](cfr.png)

The second experiment runs with `streamlit run covid-insights/covid-app.py` on an archived dataset from Kaggle.

![](covid-app.png)
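The CFR plotted in cfr.png reduces to cumulative reported deaths divided by cumulative confirmed cases. Here is a minimal sketch of that calculation, assuming the same Johns Hopkins time-series CSVs that cov-dash.py (further down this page) reads; the `BASEURL` and the `Country/Region` column are taken from that script, everything else is illustrative and not part of this commit:

```python
# Hedged sketch: per-country case fatality rate from the JHU CSSE time series.
# URL and column names follow cov-dash.py below; the rest is illustrative only.
import pandas as pd

BASEURL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series"
confirmed = pd.read_csv(f"{BASEURL}/time_series_covid19_confirmed_global.csv")
deaths = pd.read_csv(f"{BASEURL}/time_series_covid19_deaths_global.csv")

# Collapse provinces/territories into one row per country; keep only the date columns.
confirmed = confirmed.groupby("Country/Region").sum(numeric_only=True).drop(columns=["Lat", "Long"])
deaths = deaths.groupby("Country/Region").sum(numeric_only=True).drop(columns=["Lat", "Long"])

# CFR [%] = cumulative deaths / cumulative confirmed cases, at the latest available date.
latest = confirmed.columns[-1]
cfr = (deaths[latest] / confirmed[latest]) * 100
print(cfr.loc[["India", "US", "Italy"]].round(2))
```

As cov-dash.py itself cautions under the chart, this ratio is heavily skewed by how much testing a country does, so treat it as a descriptive statistic rather than a fatality risk.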
Binary file not shown (5.18 KB).

covid-insights/cfr.png

333 KB

covid-insights/cov-dash.py

Lines changed: 235 additions & 0 deletions
@@ -0,0 +1,235 @@

# Source: https://github.com/cwerner/covid19
# 1. Source data directly from GitHub (JHU COVID)
# 2. Configurable UI based on two variables - inhabitants and countries
# 3. Use Altair (https://altair-viz.github.io/) declarative statistical visualization charts

import datetime
from functools import reduce
import streamlit as st
from streamlit import caching
import pandas as pd
import altair as alt
import os

# numbers for 2019
inhabitants = {'India': 1352.6,
               'US': 328.2,
               'Brazil': 209.5,
               'Russia': 144.5,
               'United Kingdom': 67.1,
               'China': 1392.7,
               'Italy': 60.23}

@st.cache
def read_data():
    BASEURL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series"
    url_confirmed = f"{BASEURL}/time_series_covid19_confirmed_global.csv"
    url_deaths = f"{BASEURL}/time_series_covid19_deaths_global.csv"
    url_recovered = f"{BASEURL}/time_series_covid19_recovered_global.csv"

    confirmed = pd.read_csv(url_confirmed, index_col=0)
    deaths = pd.read_csv(url_deaths, index_col=0)
    recovered = pd.read_csv(url_recovered, index_col=0)

    # sum over potentially duplicate rows (France and their territories)
    confirmed = confirmed.groupby("Country/Region").sum().reset_index()
    deaths = deaths.groupby("Country/Region").sum().reset_index()
    recovered = recovered.groupby("Country/Region").sum().reset_index()

    return (confirmed, deaths, recovered)

def transform(df, collabel='confirmed'):
    dfm = pd.melt(df)
    dfm["date"] = pd.to_datetime(dfm.variable, infer_datetime_format=True)
    dfm = dfm.set_index("date")
    dfm = dfm[["value"]]
    dfm.columns = [collabel]
    return dfm

def transform2(df, collabel='confirmed'):
    dfm = pd.melt(df, id_vars=["Country/Region"])
    dfm["date"] = pd.to_datetime(dfm.variable, infer_datetime_format=True)
    dfm = dfm.set_index("date")
    dfm = dfm[["Country/Region", "value"]]
    dfm.columns = ["country", collabel]
    return dfm

def app():
    st.title("🦠 Covid-19 Data Explorer")
    st.markdown("""\
        This app illustrates the spread of COVID-19 in select countries over time.
        """)

    #st.error("⚠️ There is currently an issue in the data source of JHU. Data for 03/13 is invalid and thus removed!")

    countries = ["India", "US", "Russia", "Brazil", "China", "Italy", "United Kingdom"]

    analysis = st.sidebar.selectbox("Choose Analysis", ["Overview", "By Country"])

    if analysis == "Overview":

        st.header("COVID-19 cases and fatality rate")
        st.markdown("""\
            These are the reported case numbers for a selection of countries"""
            f""" (currently only {', '.join(countries)}). """
            """The case fatality rate (CFR) is calculated as:
            $$
            CFR[\%] = \\frac{fatalities}{\\textit{all cases}}
            $$

            ℹ️ You can select/deselect countries and switch between linear and log scales.
            """)

        confirmed, deaths, recovered = read_data()

        multiselection = st.multiselect("Select countries:", countries, default=countries)
        logscale = st.checkbox("Log scale", False)

        confirmed = confirmed[confirmed["Country/Region"].isin(multiselection)]
        confirmed = confirmed.drop(["Lat", "Long"], axis=1)
        confirmed = transform2(confirmed, collabel="confirmed")

        deaths = deaths[deaths["Country/Region"].isin(multiselection)]
        deaths = deaths.drop(["Lat", "Long"], axis=1)
        deaths = transform2(deaths, collabel="deaths")

        frate = confirmed[["country"]]
        frate["frate"] = (deaths.deaths / confirmed.confirmed) * 100

        # safeguard for empty selection
        if len(multiselection) == 0:
            return

        SCALE = alt.Scale(type='linear')
        if logscale:
            confirmed["confirmed"] += 0.00001

            confirmed = confirmed[confirmed.index > '2020-02-16']
            frate = frate[frate.index > '2020-02-16']

            SCALE = alt.Scale(type='log', domain=[10, int(max(confirmed.confirmed))], clamp=True)

        c2 = alt.Chart(confirmed.reset_index()).properties(height=150).mark_line().encode(
            x=alt.X("date:T", title="Date"),
            y=alt.Y("confirmed:Q", title="Cases", scale=SCALE),
            color=alt.Color('country:N', title="Country")
        )

        # case fatality rate...
        c3 = alt.Chart(frate.reset_index()).properties(height=100).mark_line().encode(
            x=alt.X("date:T", title="Date"),
            y=alt.Y("frate:Q", title="Fatality rate [%]", scale=alt.Scale(type='linear')),
            color=alt.Color('country:N', title="Country")
        )

        per100k = confirmed.loc[[confirmed.index.max()]].copy()
        per100k.loc[:, 'inhabitants'] = per100k.apply(lambda x: inhabitants[x['country']], axis=1)
        per100k.loc[:, 'per100k'] = per100k.confirmed / (per100k.inhabitants * 1_000_000) * 100_000
        per100k = per100k.set_index("country")
        per100k = per100k.sort_values(ascending=False, by='per100k')
        per100k.loc[:, 'per100k'] = per100k.per100k.round(2)

        c4 = alt.Chart(per100k.reset_index()).properties(width=75).mark_bar().encode(
            x=alt.X("per100k:Q", title="Cases per 100k inhabitants"),
            y=alt.Y("country:N", title="Countries", sort=None),
            color=alt.Color('country:N', title="Country"),
            tooltip=[alt.Tooltip('country:N', title='Country'),
                     alt.Tooltip('per100k:Q', title='Cases per 100k'),
                     alt.Tooltip('inhabitants:Q', title='Inhabitants [mio]')]
        )

        st.altair_chart(alt.hconcat(c4, alt.vconcat(c2, c3)), use_container_width=True)

        st.markdown(f"""\
            <div style="font-size: small">
            ⚠️ Please take the CFR with a grain of salt. The ratio is
            highly dependent on the total number of tests conducted in a country. In the early stages
            of the outbreak often mainly severe cases with clear symptoms are detected. Thus mild cases
            are not recorded, which skews the CFR.
            </div><br/>

            """, unsafe_allow_html=True)


    elif analysis == "By Country":

        confirmed, deaths, recovered = read_data()

        st.header("Country statistics")
        st.markdown("""\
            The reported number of active, recovered and deceased COVID-19 cases by country """
            f""" (currently only {', '.join(countries)}).
            """
            """
            ℹ️ You can select countries and plot data as cumulative counts or new active cases per day.
            """)

        # selections
        selection = st.selectbox("Select country:", countries)
        cummulative = st.radio("Display type:", ["total", "new cases"])
        #scaletransform = st.radio("Plot y-axis", ["linear", "pow"])

        confirmed = confirmed[confirmed["Country/Region"] == selection].iloc[:, 3:]
        confirmed = transform(confirmed, collabel="confirmed")

        deaths = deaths[deaths["Country/Region"] == selection].iloc[:, 3:]
        deaths = transform(deaths, collabel="deaths")

        recovered = recovered[recovered["Country/Region"] == selection].iloc[:, 3:]
        recovered = transform(recovered, collabel="recovered")

        df = reduce(lambda a, b: pd.merge(a, b, on='date'), [confirmed, recovered, deaths])
        df["active"] = df.confirmed - (df.deaths + df.recovered)

        variables = ["recovered", "active", "deaths"]
        colors = ["steelblue", "orange", "black"]

        value_vars = variables
        SCALE = alt.Scale(domain=variables, range=colors)
        if cummulative == 'new cases':
            value_vars = ["new"]
            df["new"] = df.confirmed - df.shift(1).confirmed
            df["new"].loc[df.new < 0] = 0
            SCALE = alt.Scale(domain=["new"], range=["orange"])

        dfm = pd.melt(df.reset_index(), id_vars=["date"], value_vars=value_vars)

        # introduce order col as altair does auto-sort on stacked elements
        dfm['order'] = dfm['variable'].replace(
            {val: i for i, val in enumerate(variables[::-1])}
        )

        c = alt.Chart(dfm.reset_index()).mark_bar().properties(height=200).encode(
            x=alt.X("date:T", title="Date"),
            y=alt.Y("sum(value):Q", title="Cases", scale=alt.Scale(type='linear')),
            color=alt.Color('variable:N', title="Category", scale=SCALE), #, sort=alt.EncodingSortField('value', order='ascending')),
            order='order'
        )

        if cummulative != 'new cases':
            st.altair_chart(c, use_container_width=True)
        else:
            # add smooth 7-day trend
            rm_7day = df[['new']].rolling('7D').mean().rename(columns={'new': 'value'})
            c_7day = alt.Chart(rm_7day.reset_index()).properties(height=200).mark_line(strokeDash=[1, 1], color='red').encode(
                x=alt.X("date:T", title="Date"),
                y=alt.Y("value:Q", title="Cases", scale=alt.Scale(type='linear')),
            )
            st.altair_chart((c + c_7day), use_container_width=True)
            st.markdown(f"""\
                <div style="font-size: small">Daily reported new cases (incl. 7-day average).</div><br/>
                """, unsafe_allow_html=True)

    st.info("""\

        by: [C. Werner](https://www.christianwerner.net) | source: [GitHub](https://www.github.com/cwerner/covid19)
        | data source: [Johns Hopkins University (GitHub)](https://github.com/CSSEGISandData/COVID-19).
        """)


# ----------------------

app()

covid-insights/covid-app.png

262 KB

covid-insights/covid-app.py

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import covid as cov

st.title('COVID Exploratory Data Analysis')

# Data from https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset?select=covid_19_data.csv

covid = pd.read_csv('data/494724_1196190_compressed_covid_19_data.csv.zip')

# Dropping column as SNo is of no use, and 'Province/State' contains too many missing values
covid.drop(['SNo'], axis=1, inplace=True)

st.header('Dataset')
st.write(covid)

# Converting 'ObservationDate' into Datetime format
covid['ObservationDate'] = pd.to_datetime(covid['ObservationDate'])

# Grouping different types of cases as per the date
datewise = covid.groupby(['ObservationDate']).agg({
    'Confirmed': 'sum',
    'Recovered': 'sum',
    'Deaths': 'sum'
})

datewise['Days Since'] = datewise.index - datewise.index.min()
datewise["WeekOfYear"] = datewise.index.weekofyear

india_data = covid[covid["Country/Region"] == "India"]
datewise_india = india_data.groupby(["ObservationDate"]).agg({"Confirmed": 'sum', "Recovered": 'sum', "Deaths": 'sum'})
datewise_india['Days Since'] = datewise_india.index - datewise.index.min()
datewise_india["WeekOfYear"] = datewise_india.index.weekofyear

st.header('Global Analysis')

st.line_chart(datewise[['Confirmed', 'Deaths', 'Recovered']])

st.subheader('Global Growth Factor')
cov.growth_factor(datewise)

st.subheader('India Growth Factor')
cov.growth_factor(datewise_india)

st.subheader('Global Weekly Growth of Cases')
cov.weekly_increase(datewise)

st.subheader('India Weekly Growth of Cases')
cov.weekly_increase(datewise_india)

st.subheader('Global Doubling Rate')
cov.double_days(datewise)

st.subheader('India Doubling Rate')
cov.double_days(datewise_india)

st.subheader('Daily Growth')
cov.growth_scatter(datewise)

st.subheader('Recovery and Mortality')
cov.mortality(datewise)

st.subheader('Daily Increases Stats')
cov.daily_increase(datewise)

st.header('Countrywise Analysis')

# Calculating countrywise Mortality and Recovery Rate
countrywise = covid[covid["ObservationDate"] == covid["ObservationDate"].max()].groupby(["Country/Region"]).agg({"Confirmed": 'sum', "Recovered": 'sum', "Deaths": 'sum'}).sort_values(["Confirmed"], ascending=False)
countrywise["Mortality"] = (countrywise["Deaths"] / countrywise["Confirmed"]) * 100
countrywise["Recovery"] = (countrywise["Recovered"] / countrywise["Confirmed"]) * 100

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
top_15_confirmed = countrywise.sort_values(["Confirmed"], ascending=False).head(15)
top_15_deaths = countrywise.sort_values(["Deaths"], ascending=False).head(15)
sns.barplot(x=top_15_confirmed["Confirmed"], y=top_15_confirmed.index, ax=ax1)
ax1.set_title("Top 15 countries as per Number of Confirmed Cases")
sns.barplot(x=top_15_deaths["Deaths"], y=top_15_deaths.index, ax=ax2)
ax2.set_title("Top 15 countries as per Number of Death Cases")

st.pyplot(fig)

st.header('India Analysis')

st.line_chart(datewise_india[['Confirmed', 'Deaths', 'Recovered']])

st.write(datewise_india.iloc[-1])
st.write("Total Active Cases: ", datewise_india["Confirmed"].iloc[-1] - datewise_india["Recovered"].iloc[-1] - datewise_india["Deaths"].iloc[-1])
st.write("Total Closed Cases: ", datewise_india["Recovered"].iloc[-1] + datewise_india["Deaths"].iloc[-1])

st.subheader('India Growth Daily')
cov.growth_scatter(datewise_india)

st.subheader('India Daily Increase in Cases')
cov.daily_increase(datewise_india)

st.subheader('India Recovery and Mortality')
cov.mortality(datewise_india)

st.subheader('India Compared with Other Countries')

Italy_data = covid[covid["Country/Region"] == "Italy"]
US_data = covid[covid["Country/Region"] == "US"]
spain_data = covid[covid["Country/Region"] == "Spain"]
datewise_Italy = Italy_data.groupby(["ObservationDate"]).agg({"Confirmed": 'sum', "Recovered": 'sum', "Deaths": 'sum'})
datewise_US = US_data.groupby(["ObservationDate"]).agg({"Confirmed": 'sum', "Recovered": 'sum', "Deaths": 'sum'})
datewise_Spain = spain_data.groupby(["ObservationDate"]).agg({"Confirmed": 'sum', "Recovered": 'sum', "Deaths": 'sum'})

max_ind = datewise_india["Confirmed"].max()
fig = plt.figure(figsize=(12, 6))
plt.plot(datewise_Italy[(datewise_Italy["Confirmed"] > 0) & (datewise_Italy["Confirmed"] <= max_ind)]["Confirmed"], label="Confirmed Cases Italy", linewidth=3)
plt.plot(datewise_US[(datewise_US["Confirmed"] > 0) & (datewise_US["Confirmed"] <= max_ind)]["Confirmed"], label="Confirmed Cases USA", linewidth=3)
plt.plot(datewise_Spain[(datewise_Spain["Confirmed"] > 0) & (datewise_Spain["Confirmed"] <= max_ind)]["Confirmed"], label="Confirmed Cases Spain", linewidth=3)
plt.plot(datewise_india[datewise_india["Confirmed"] > 0]["Confirmed"], label="Confirmed Cases India", linewidth=3)
plt.xlabel("Date")
plt.ylabel("Number of Confirmed Cases")
plt.title("Growth of Confirmed Cases")
plt.legend()
plt.xticks(rotation=90)

st.write("It took", datewise_Italy[(datewise_Italy["Confirmed"] > 0) & (datewise_Italy["Confirmed"] <= max_ind)].shape[0], "days in Italy to reach number of Confirmed Cases equivalent to India")
st.write("It took", datewise_US[(datewise_US["Confirmed"] > 0) & (datewise_US["Confirmed"] <= max_ind)].shape[0], "days in USA to reach number of Confirmed Cases equivalent to India")
st.write("It took", datewise_Spain[(datewise_Spain["Confirmed"] > 0) & (datewise_Spain["Confirmed"] <= max_ind)].shape[0], "days in Spain to reach number of Confirmed Cases equivalent to India")
st.write("It took", datewise_india[datewise_india["Confirmed"] > 0].shape[0], "days in India to reach", max_ind, "Confirmed Cases")

st.pyplot(fig)
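The cov.* helpers called above (growth_factor, weekly_increase, double_days, growth_scatter, mortality, daily_increase) are defined in covid.py, presumably one of the 13 files in this commit, which this page does not render. As a rough illustration of the growth-factor idea named in the README, here is a minimal sketch using the common definition (today's new cases divided by yesterday's new cases); it is not necessarily how covid.py implements it:

```python
# Illustrative sketch only -- the actual implementation lives in covid.py (not shown here).
import pandas as pd

def growth_factor_sketch(datewise: pd.DataFrame) -> pd.Series:
    # 'Confirmed' is cumulative, so the day-over-day difference gives daily new cases.
    new_cases = datewise["Confirmed"].diff()
    # Growth factor: today's new cases / yesterday's new cases.
    gf = new_cases / new_cases.shift(1)
    # Days with zero new cases yesterday produce inf; mark them as missing instead.
    return gf.replace([float("inf"), -float("inf")], float("nan"))

# Hypothetical usage inside the app above:
# st.line_chart(growth_factor_sketch(datewise).rename("growth factor"))
```

A growth factor persistently above 1 means daily new cases are still accelerating; values settling below 1 indicate the wave is slowing.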
