Commit 1207fd8b authored by Pascal Berger's avatar Pascal Berger
Browse files

fixed jupyter notebook

parent afbe0a33
kaggle.json
\ No newline at end of file
kaggle.json
*.ipynb
\ No newline at end of file
No preview for this file type
......@@ -15,6 +15,7 @@ ipywidgets = "*"
folium = "*"
bokeh = "*"
pyshp = "*"
ipynb-py-convert = "*"
[dev-packages]
ipykernel = "*"
......
{
"_meta": {
"hash": {
"sha256": "d993b78d8af164f802a2bb43bf2f2b742a7655de3d94224daec19680f149c056"
"sha256": "ee9d716043ab89202747397a76af2379cb8e87f4f23bb529d026e6b914e3a055"
},
"pipfile-spec": 6,
"requires": {
......@@ -74,10 +74,10 @@
},
"certifi": {
"hashes": [
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.5.30"
"version": "==2021.10.8"
},
"cffi": {
"hashes": [
......@@ -131,11 +131,11 @@
},
"charset-normalizer": {
"hashes": [
"sha256:5d209c0a931f215cee683b6445e2d77677e7e75e159f78def0db09d68fafcaa6",
"sha256:5ec46d183433dcbd0ab716f2d7f29d8dee50505b3fdb40c6b985c7c4f5a3591f"
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
],
"markers": "python_version >= '3'",
"version": "==2.0.6"
"version": "==2.0.7"
},
"colorama": {
"hashes": [
......@@ -254,6 +254,13 @@
"markers": "python_version >= '3.7'",
"version": "==6.4.1"
},
"ipynb-py-convert": {
"hashes": [
"sha256:06c09644f85949939fa749a00fb7f398a0bfcad0e34320ca6666e0cb44b7ea74"
],
"index": "gdv",
"version": "==0.4.6"
},
"ipython": {
"hashes": [
"sha256:2097be5c814d1b974aea57673176a924c4c8c9583890e7a5f082f547b9975b11",
......@@ -303,11 +310,11 @@
},
"jsonschema": {
"hashes": [
"sha256:48f4e74f8bec0c2f75e9fcfffa264e78342873e1b57e2cfeae54864cc5e9e4dd",
"sha256:9938802041347f2c62cad2aef59e9a0826cd34584f3609db950efacb4dbf6518"
"sha256:2b3cca28580511d44326f0e7fc582eab3cbe31aabd1a1c2cfa74a399796ffd84",
"sha256:9dd7c33b4a96138dc37bb86b3610d3b12d30d96433d4d73435ca3025804154a8"
],
"markers": "python_version >= '3.7'",
"version": "==4.0.1"
"version": "==4.1.0"
},
"jupyter-client": {
"hashes": [
......@@ -935,19 +942,19 @@
},
"pywin32": {
"hashes": [
"sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe",
"sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf",
"sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17",
"sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96",
"sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7",
"sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72",
"sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b",
"sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0",
"sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78",
"sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"
"sha256:2393c1a40dc4497fd6161b76801b8acd727c5610167762b7c3e9fd058ef4a6ab",
"sha256:251b7a9367355ccd1a4cd69cd8dd24bd57b29ad83edb2957cfa30f7ed9941efa",
"sha256:48dd4e348f1ee9538dd4440bf201ea8c110ea6d9f3a5010d79452e9fa80480d9",
"sha256:496df89f10c054c9285cc99f9d509e243f4e14ec8dfc6d78c9f0bf147a893ab1",
"sha256:543552e66936378bd2d673c5a0a3d9903dba0b0a87235ef0c584f058ceef5872",
"sha256:79cf7e6ddaaf1cd47a9e50cc74b5d770801a9db6594464137b1b86aa91edafcc",
"sha256:af5aea18167a31efcacc9f98a2ca932c6b6a6d91ebe31f007509e293dea12580",
"sha256:d3761ab4e8c5c2dbc156e2c9ccf38dd51f936dc77e58deb940ffbc4b82a30528",
"sha256:e372e477d938a49266136bff78279ed14445e00718b6c75543334351bf535259",
"sha256:fe21c2fb332d03dac29de070f191bdbf14095167f8f2165fdc57db59b1ecc006"
],
"markers": "sys_platform == 'win32' and platform_python_implementation != 'PyPy'",
"version": "==301"
"version": "==302"
},
"pywinpty": {
"hashes": [
......@@ -1438,19 +1445,19 @@
},
"pywin32": {
"hashes": [
"sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe",
"sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf",
"sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17",
"sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96",
"sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7",
"sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72",
"sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b",
"sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0",
"sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78",
"sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"
"sha256:2393c1a40dc4497fd6161b76801b8acd727c5610167762b7c3e9fd058ef4a6ab",
"sha256:251b7a9367355ccd1a4cd69cd8dd24bd57b29ad83edb2957cfa30f7ed9941efa",
"sha256:48dd4e348f1ee9538dd4440bf201ea8c110ea6d9f3a5010d79452e9fa80480d9",
"sha256:496df89f10c054c9285cc99f9d509e243f4e14ec8dfc6d78c9f0bf147a893ab1",
"sha256:543552e66936378bd2d673c5a0a3d9903dba0b0a87235ef0c584f058ceef5872",
"sha256:79cf7e6ddaaf1cd47a9e50cc74b5d770801a9db6594464137b1b86aa91edafcc",
"sha256:af5aea18167a31efcacc9f98a2ca932c6b6a6d91ebe31f007509e293dea12580",
"sha256:d3761ab4e8c5c2dbc156e2c9ccf38dd51f936dc77e58deb940ffbc4b82a30528",
"sha256:e372e477d938a49266136bff78279ed14445e00718b6c75543334351bf535259",
"sha256:fe21c2fb332d03dac29de070f191bdbf14095167f8f2165fdc57db59b1ecc006"
],
"markers": "sys_platform == 'win32' and platform_python_implementation != 'PyPy'",
"version": "==301"
"version": "==302"
},
"pyzmq": {
"hashes": [
......
This diff is collapsed.
@ECHO OFF
REM Convert every *.py file under the current directory tree into a Jupyter
REM notebook (*.ipynb) using ipynb-py-convert from the project's pipenv.
setlocal ENABLEDELAYEDEXPANSION
set end_from=.py
set end_to=.ipynb
REM Make sure the pipenv (including ipynb-py-convert) is installed first.
pipenv install
for /r %%a in (*!end_from!) do (
REM Build the output name: strip the source extension, append the target one.
set b=%%a
set b=!b:%end_from%=!!end_to!!!
echo Converting %%a to !b!
pipenv run ipynb-py-convert %%a !b!
)
pause
@ECHO OFF
REM Convert every *.ipynb notebook under the current directory tree back into
REM a plain Python file (*.py) using ipynb-py-convert from the pipenv.
setlocal ENABLEDELAYEDEXPANSION
set end_from=.ipynb
set end_to=.py
REM pipenv install
for /r %%a in (*!end_from!) do (
REM Build the output name: strip the source extension, append the target one.
set b=%%a
set b=!b:%end_from%=!!end_to!!!
echo Converting %%a to !b!
pipenv run ipynb-py-convert %%a !b!
)
pause
This diff is collapsed.
This diff is collapsed.
# %%
"""
# Analyse Covid 19 Impfdaten der Welt
## Fundamentals of Data Visualization
Zuletzt bearbeitet von:
Pascal Berger
27.03.2021
"""
# %%
"""
## Einleitung und Import
In dieser Datenanalyse werden die weltweiten COVID-19 Daten analysiert und visualisiert. Die Daten sind von der Kaggle Datenbank https://www.kaggle.com/gpreda/covid-world-vaccination-progress. Die Länder sollen zusätzlich untereinander verglichen werden.
Zuerst werden hier die notwendigen Python Pakete importiert.
"""
# %%
import pandas as pd
import pdb
import requests
import json
import os
import shutil
import zipfile
## matplotlib
# import matplotlib
# import matplotlib.pyplot as plt
# import matplotlib.dates as mdates
# import matplotlib.ticker as ticker
# matplotlib.use('TkAgg')
import datetime as dt
## plotly
import plotly.express as px
## git
import git
from git import Repo
import pandas_profiling
# %%
"""
## Daten herunterladen
"""
# %%
# Install the local kaggle.json API token into ~/.kaggle so the Kaggle client
# can authenticate (Windows-style path separators throughout this script).
home = os.path.expanduser("~")
src = ".\\kaggle.json"
dst_dir = home + "\\.kaggle"
dst = home + "\\.kaggle\\kaggle.json"
if not os.path.exists(dst_dir):
    os.mkdir(dst_dir)
if not os.path.exists(dst):
    shutil.copyfile(src, dst)
# NOTE(review): imported only after the token has been copied — presumably the
# kaggle package reads the credentials file at import/authenticate time; confirm.
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
# download zip
vac_zip_path = '.\\data\\'
api.dataset_download_files('gpreda/covid-world-vaccination-progress', path=vac_zip_path)
# unzip
with zipfile.ZipFile(vac_zip_path + 'covid-world-vaccination-progress.zip', 'r') as zipref:
    zipref.extractall(vac_zip_path)
# %%
"""
## Daten einlesen
Als nächstes werden die täglichen CSV Reports aus dem Repository in pandas dataframes importiert.
"""
# %%
# Load the per-country vaccination CSV extracted in the download step above.
ch_filename = ".\\data\\country_vaccinations.csv"
vac_data = pd.read_csv(ch_filename, sep=",")
# %%
"""
## Daten analysieren
"""
# %%
# Quick structural overview: dimensions and column names.
print(vac_data.shape)
print(list(vac_data.columns))
# check for duplicated reports (same country reported twice on one date)
len(vac_data[vac_data.duplicated(['country', 'date'])])
# list some values
vac_data.tail()
# %%
"""
Das Datenset besteht aus 15 Eigenschaften und 5824 Beobachtungen. Man sieht in den ersten Beobachtungen, dass viele fehlende Werte vorkommen. Es macht den Anschein, als wurden die Daten nicht zeitgleich publiziert, weshalb sie in mehreren Zeilen erscheinen.
"""
# %%
#print(vac_data.shape)
#print(list(vac_data.columns))
# Generate a pandas-profiling overview of all columns and save it as HTML.
report = vac_data.profile_report(sort=None, html={'style':{'full_width':True}})
report.to_file(output_file="report_vac.html")
#report
# %%
"""
In diesem detaillierten Report sieht man die wichtigsten Eigenschaften der Variablen.
- Die Länder liegen in vollem Namen vor, aber auch als ISO-Code. Da beim ISO-Code fehlende Werte vorliegen, verwende ich den vollen Namen
- Bei der Zeitstempelspalte ist nur das Datum angegeben, es gibt dort keine fehlenden Werte
"""
# %%
"""
## Country ISO-Code berechnen
"""
# %%
# Number of rows that are missing an ISO country code.
vac_data.iso_code.isna().sum()
# %%
"""
## Wieviel wurde welcher Impfstoff geimpft?
In diesen Plots wird aufgezeigt, wieviel welcher Impfstoff geimpft wurde.
Dazu werden hier zuerst die täglichen reports nach Impfstoff gruppiert und nach Vorkommnisse summiert.
"""
# %%
# Bar chart: how many daily reports mention each vaccine combination string.
px.bar(vac_data.vaccines.value_counts())
# %%
"""
Wir sehen hier, dass bei gewissen Reports mehrere Impfstoffe angegeben sind.
Da keine Daten vorliegen wieviel der jeweilige Impfstoff verwendet wurde, nehme ich an, dass jeder Impfstoff gleich viel geimpft wurde.
In folgender Grafik ist die Menge an Reports pro Impfstoff aufgezeigt.
"""
# %%
# The `vaccines` column holds comma-separated combination labels
# (e.g. "Moderna, Pfizer/BioNTech"). Split each label and spread its
# report count evenly across the individual vaccines it mentions.
vaccines_all = {}
for label, total in vac_data.vaccines.value_counts().items():
    members = label.split(', ')
    share = total / len(members)
    for member in members:
        vaccines_all[member] = vaccines_all.get(member, 0) + share
# Round the fractional shares back to whole report counts.
vaccines_all = {name: round(value) for name, value in vaccines_all.items()}
# create series from dictionary, sorted largest-first
vaccines_all_series = pd.Series(vaccines_all).sort_values(ascending=False)
# show plot
px.bar(vaccines_all_series)
# %%
"""
Nun ist es interessant zu sehen, wieviel der jeweilige Impfstoff etwa geimpft wurde. Dazu folgende Grafik.
"""
# %%
# Sum the daily vaccination counts per vaccine-combination label, then
# distribute each combination's total evenly across its member vaccines
# (the data does not break doses down by individual vaccine).
count_per_vac = vac_data[['vaccines', 'daily_vaccinations', 'daily_vaccinations_raw']]
count_per_vac = count_per_vac.groupby(by=['vaccines'], dropna=True).sum()
vaccines_all = dict()
for combo, totals in count_per_vac.iterrows():
    members = combo.split(', ')
    share = totals.daily_vaccinations / len(members)
    for member in members:
        vaccines_all[member] = vaccines_all.get(member, 0) + share
# Round the per-vaccine shares to whole doses.
for member in vaccines_all:
    vaccines_all[member] = round(vaccines_all[member])
# create series from dictionary, sorted largest-first
vaccines_all_series = pd.Series(vaccines_all).sort_values(ascending=False)
# Tidy frame for plotly: one row per vaccine with its estimated dose count.
vaccines_all_df = vaccines_all_series.to_frame().reset_index()
vaccines_all_df.columns = ['vaccine', 'doses']
# show plot
px.bar(vaccines_all_df, x='vaccine', y='doses')
# %%
# Grand total of all distributed doses across every vaccine.
total_doses = vaccines_all_df['doses'].sum()
total_doses
# %%
#ax = vaccines_all_series.sort_values(ascending = False).plot(kind='pie', figsize=(22,5), autopct='%3.1f%%', shadow=True)
#ax.set_title('Vaccinations for each vaccine type')
#ax.set_ylabel('')
# NOTE(review): `plt` here is bound to a plotly Figure, not matplotlib.pyplot —
# the name shadows the conventional alias; consider renaming to `fig`.
plt = px.pie(vaccines_all_df, names='vaccine', values='doses')
plt.show()
# %%
# cumsum vaccinations, check if previous calculations make sense
# (the distributed per-vaccine total should equal the raw daily sum)
print(vaccines_all_series.sum())
print(vac_data.daily_vaccinations.sum())
# %%
"""
# Wie schreitet die Entwicklung der Impfungen voran?
In diesen Plots sehen wir, wie die Entwicklung aller Impfungen vorangeht. Es werden einfach Geimpfte den doppelt Geimpften gegenübergestellt.
"""
# %%
# Daily vaccinations worldwide over time, smoothed with a 7-day rolling mean.
vac_vaccinated = vac_data.loc[:, ['date', 'daily_vaccinations']]
# group by date (sum over all countries per day)
vac_vaccinated = vac_vaccinated.groupby(by=['date'], dropna=True, as_index=False).sum()
# FIX: Rolling.mean() takes no `skipna` kwarg — passing it raises in pandas
# 1.x (NaNs are skipped by default anyway).
vac_vaccinated['daily_vaccinations_rolling'] = vac_vaccinated.daily_vaccinations.rolling(7).mean()
# FIX: the matplotlib imports at the top of the file are commented out, so
# `ticker` was undefined here and `plt` still pointed at the plotly pie
# figure from the previous cell. Import matplotlib locally for this
# matplotlib-based cell.
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# plot data (pandas .plot returns a matplotlib Axes)
ax = vac_vaccinated.plot.line(x='date', figsize=(22, 5))
# y axis in millions
scale_y = 1e6
ticks_y = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x/scale_y))
ax.yaxis.set_major_formatter(ticks_y)
# labels
ax.set_ylabel('vaccinations in millions')
ax.set_xlabel('date')
ax.set_title('Vaccinations over time')
# show plot
plt.show()
# %%
"""
## Folium map
"""
# %%
# https://python-visualization.github.io/folium/quickstart.html
# pip install folium
import folium
from folium import plugins
# m = folium.Map(location=[45.5236, -122.6750])
# m
# %%
"""
## Data prep per country to fit map
"""
# %%
# GeoJSON with world country borders; features are keyed by ISO-3 code.
url = ("https://raw.githubusercontent.com/python-visualization/folium/master/examples/data")
state_geo = f"{url}/world-countries.json"  # us-states.json
country_dict = json.loads(requests.get(state_geo).text)
# Latest (max) cumulative figures per country.
test = vac_data.groupby('iso_code', as_index=False).max().loc[:, ['iso_code', 'total_vaccinations', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred','people_fully_vaccinated_per_hundred']]
# million scale
test['vaccinations_millions'] = test['total_vaccinations'] / 1000000
# create a zero entry for every GeoJSON country missing from the vaccination
# data so the choropleth has a row for each map feature
for country in country_dict['features']:
    if country['id'] not in test['iso_code'].values:
        # FIX: DataFrame.append returns a new frame; the result was previously
        # discarded, so the placeholder rows were never actually added.
        test = test.append({'iso_code': country['id'], 'total_vaccinations': 0}, ignore_index=True)
# %%
#test.people_fully_vaccinated_per_hundred.describe()
# Sanity checks: spot-check a few countries against the raw data.
test.info()
print()
print(test['total_vaccinations'].loc[test['iso_code']=='USA'].values[0])
print(vac_data.loc[vac_data['iso_code']=='USA'].total_vaccinations.max())
print(test['total_vaccinations_per_hundred'].loc[test['iso_code']=='GIB'].values[0])
print(test['people_vaccinated_per_hundred'].loc[test['iso_code']=='CHE'].values[0])
# %%
# Countries ranked by people vaccinated per hundred inhabitants.
test.sort_values(by='people_vaccinated_per_hundred', ascending=False).plot(kind='bar',x="iso_code", y="people_vaccinated_per_hundred", figsize=(22, 5))
# %%
# World choropleth: total vaccinations (in millions) per country.
f = folium.Figure(width=1500, height=800)
m = folium.Map(location=[20, 0], zoom_start=2, zoom_control=False, no_touch=True).add_to(f)
folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=test,
    # join the dataframe to the GeoJSON features via the ISO-3 code
    columns=["iso_code", "vaccinations_millions"],
    key_on="feature.id",
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Total Vaccinations in Millions",
).add_to(m)
folium.LayerControl().add_to(m)
m
# %%
# World choropleth: people vaccinated per hundred inhabitants.
f = folium.Figure(width=1500, height=800)
m = folium.Map(location=[20, 0], zoom_start=2, zoom_control=False, no_touch=True).add_to(f)
folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=test,
    columns=["iso_code", "people_vaccinated_per_hundred"],
    key_on="feature.id",
    fill_color="YlGn",
    fill_opacity=0.8,
    line_opacity=0.2,
    legend_name="People vaccinated per hundred",
).add_to(m)
folium.LayerControl().add_to(m)
m
# %%
# %%
"""
## Bokeh Map tests:
"""
# %%
#https://stackoverflow.com/questions/38336061/bokeh-mapping-counties
#pip install pyshp
import shapefile
import itertools
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.tile_providers import get_provider, WIKIMEDIA, CARTODBPOSITRON, STAMEN_TERRAIN, STAMEN_TONER, ESRI_IMAGERY, OSM
from bokeh.io import output_notebook, show
import warnings
warnings.filterwarnings("ignore")
pd.set_option("max_columns", 30)
output_notebook()
# %%
def get_map_data(shape_data_file, local_file_path):
    """Download (if needed), unzip and parse a US census shapefile.

    Parameters:
        shape_data_file: base name of the census shapefile (no extension).
        local_file_path: directory prefix where the files are cached.

    Returns:
        DataFrame with one row per county: NaN-separated outline
        coordinates ('x'/'y'), state FIPS id and county name.
    """
    url = "http://www2.census.gov/geo/tiger/GENZ2015/shp/" + \
        shape_data_file + ".zip"
    zfile = local_file_path + shape_data_file + ".zip"
    sfile = local_file_path + shape_data_file + ".shp"
    dfile = local_file_path + shape_data_file + ".dbf"
    # download the archive only once
    if not os.path.exists(zfile):
        print("Getting file: ", url)
        response = requests.get(url)
        with open(zfile, "wb") as code:
            code.write(response.content)
    # extract only once
    # NOTE(review): `tar -xf` extracts into the current directory, not
    # local_file_path, and relies on a tar that understands zip archives —
    # consider zipfile.ZipFile(zfile).extractall(local_file_path) instead.
    if not os.path.exists(sfile):
        uz_cmd = 'tar -xf ' + zfile
        print("Executing command: " + uz_cmd)
        os.system(uz_cmd)
    lats = []
    lons = []
    ct_name = []
    st_id = []
    # FIX: the .shp/.dbf handles were previously opened and never closed;
    # context managers release them on all paths.
    with open(sfile, "rb") as shp, open(dfile, "rb") as dbf:
        sf = shapefile.Reader(shp=shp, dbf=dbf)
        for shprec in sf.shapeRecords():
            st_id.append(int(shprec.record[0]))
            ct_name.append(shprec.record[5])
            lat, lon = map(list, zip(*shprec.shape.points))
            indices = shprec.shape.parts.tolist()
            # insert NaN breaks between polygon parts so plots don't
            # connect separate rings with stray lines
            lat = [lat[i:j] + [float('NaN')] for i, j in zip(indices, indices[1:]+[None])]
            lon = [lon[i:j] + [float('NaN')] for i, j in zip(indices, indices[1:]+[None])]
            lat = list(itertools.chain.from_iterable(lat))
            lon = list(itertools.chain.from_iterable(lon))
            lats.append(lat)
            lons.append(lon)
    map_data = pd.DataFrame({'x': lats, 'y': lons, 'state': st_id, 'county_name': ct_name})
    return map_data
# %%
map_low_res = "cb_2015_us_county_20m"