import pandas as pd

# Load the WHO daily COVID-19 table used by the cells below.
data = pd.read_csv('WHO-COVID-19-global-data.csv')
data.head()  # notebook preview of the first rows


# Report the first and last reporting dates present in the file.
reported_dates = data['Date_reported']
first_day, last_day = reported_dates.min(), reported_dates.max()
print("Earliest day: " + str(first_day))
print("Latest day: " + str(last_day))

Earliest day: 2020-01-03
Latest day: 2023-08-30


import geopandas as gpd
import matplotlib.pyplot as plt

# Country polygons; the 'Alpha-2' attribute is exposed as 'iso2' so it can be
# used as the join key against the case data.
world_countries = gpd.read_file('world-countries.json').rename(columns={'Alpha-2': 'iso2'})

def process_data_for_date(data, date, ax):
    """Draw a world map coloured by ``New_cases`` for a single reporting date.

    Args:
        data: Case table with ``Date_reported``, ``Country``, ``Country_code``
            and ``New_cases`` columns.
        date: Value compared for equality against ``Date_reported``; also
            used as the subplot title.
        ax: Matplotlib axes that receives the map.

    Countries are matched to the module-level ``world_countries`` polygons on
    ``iso2`` with an inner join, so unmatched countries are not drawn.
    """
    on_date = data['Date_reported'] == date
    snapshot = data.loc[on_date].rename(columns={'Country_code': 'iso2'})

    # Pandas recognized the iso 'NA' as NaN, so Namibia is patched by name.
    # NOTE(review): 'NAM' is a three-letter code written into the iso2 key;
    # confirm it matches the value stored for Namibia in world_countries.
    is_namibia = snapshot['Country'] == 'Namibia'
    snapshot.loc[is_namibia, 'iso2'] = "NAM"

    cases_by_code = snapshot[['iso2', 'New_cases']]
    choropleth = world_countries.merge(cases_by_code, how='inner', on='iso2')
    choropleth.plot('New_cases', ax=ax, legend=True)
    ax.set_title(date)

# One snapshot per month of 2020, always taken on the 3rd.
dates_to_process = [
    '2020-01-03', '2020-02-03', '2020-03-03',
    '2020-04-03', '2020-05-03', '2020-06-03',
    '2020-07-03', '2020-08-03', '2020-09-03',
    '2020-10-03', '2020-11-03', '2020-12-03',
]

num_rows = 4
num_cols = 3
fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 12))
axes = axes.flatten()
# Draw at most one map per subplot; dates beyond the grid size are ignored.
for i, date in enumerate(dates_to_process[:num_rows * num_cols]):
    process_data_for_date(data, date, axes[i])
plt.tight_layout()
plt.show()


def process_data_for_year(data, year, ax):
    """Draw a world map coloured by each country's mean ``New_cases`` in a year.

    Args:
        data: Case table whose ``Date_reported`` column holds strings (the
            ``.str`` accessor is used), plus ``Country``, ``Country_code``
            and ``New_cases`` columns.
        year: String prefix matched against ``Date_reported`` (e.g. '2021');
            also used as the subplot title.
        ax: Matplotlib axes that receives the map.

    Countries are matched to the module-level ``world_countries`` polygons on
    ``iso2`` with an inner join, so unmatched countries are not drawn.
    """
    in_year = data['Date_reported'].str.startswith(year)
    yearly = data.loc[in_year].rename(columns={'Country_code': 'iso2'})

    # Replace the 'NA' iso2 (read as NaN) with 'NAM' for Namibia.
    # NOTE(review): confirm 'NAM' matches the iso2 value in world_countries.
    is_namibia = yearly['Country'] == 'Namibia'
    yearly.loc[is_namibia, 'iso2'] = "NAM"

    # Series indexed by iso2 and named 'New_cases'; merged on its index name.
    mean_cases = yearly.groupby('iso2')['New_cases'].mean()
    choropleth = world_countries.merge(mean_cases, how='inner', on='iso2')
    choropleth.plot('New_cases', ax=ax, legend=True)
    ax.set_title(year)

# One map per calendar year covered by the data.
years_to_process = ['2020', '2021', '2022', '2023']

num_rows = 2
num_cols = 2
fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 12))
axes = axes.flatten()
# Draw at most one map per subplot; years beyond the grid size are ignored.
for i, year in enumerate(years_to_process[:num_rows * num_cols]):
    process_data_for_year(data, year, axes[i])
plt.tight_layout()
plt.show()


# Map of new cases reported on the last day in the file (2023-08-30).
is_last_day = data['Date_reported'] == '2023-08-30'
later_data = data.loc[is_last_day].rename(columns={'Country_code': 'iso2'})
# Pandas recognized the iso 'NA' as NaN, so Namibia is patched by name.
# NOTE(review): confirm 'NAM' matches the iso2 value in world_countries.
later_data.loc[later_data['Country'] == 'Namibia', 'iso2'] = "NAM"
first_cases = later_data[['iso2', 'New_cases']]
merged_df = world_countries.merge(first_cases, how='inner', on='iso2')
merged_df.plot('New_cases', legend=True)

<Axes: >


# Parse the date strings so the x-axis is a real time axis.
data['Date_reported'] = pd.to_datetime(data['Date_reported'])

# Worldwide new cases per reporting day (sum over all countries).
global_cases = data.groupby('Date_reported')['New_cases'].sum().reset_index()

trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
trend_ax.plot(
    global_cases['Date_reported'],
    global_cases['New_cases'],
    marker='o',
    linestyle='-',
)
trend_ax.set_xlabel('Date Reported')
trend_ax.set_ylabel('Global New Cases')
trend_ax.set_title('Daily Global COVID-19 Cases Over Time')
trend_ax.grid(True)
trend_fig.tight_layout()

plt.show()


# Two ways to find the current total cases.
# 1) Add up every daily increment.
total_cases = data['New_cases'].sum()
# 2) Add up each country's running total on the last reporting day
#    (this second value is the one that gets printed).
last_day_rows = data[data['Date_reported'] == '2023-08-30']
total_cases = last_day_rows['Cumulative_cases'].sum()
print(f'Up to 2023-08-30, there have been a total of {total_cases} cases of Covid-19 worldwide.')

Up to 2023-08-30, there have been a total of 770085713 cases of Covid-19 worldwide.


# Two ways to find the current total deaths by Covid-19.
# 1) Add up every daily increment.
total_deaths = data['New_deaths'].sum()
# 2) Add up each country's running total on the last reporting day.
#    The trailing .sum() is required: without it `total_deaths` is a Series
#    of per-country values and the message below prints the whole Series
#    instead of a single worldwide number.
total_deaths = data[data['Date_reported'] == '2023-08-30']['Cumulative_deaths'].sum()
print(f'Up to 2023-08-30, there have been a total of {total_deaths} deaths worldwide due to Covid-19.')

Up to 2023-08-30, there have been a total of 6956173 deaths worldwide due to Covid-19.


# Top 10 Affected Countries

# Every ranking below is computed from the rows of the last reporting day.
latest = data[data['Date_reported'] == '2023-08-30']

# Most COVID-19 Cases by Country (up to 2023-08-30)
cases_by_country = latest[['Country', 'Cumulative_cases']].sort_values(
    by='Cumulative_cases', ascending=False)
top_countries1 = cases_by_country.head(10)

# Most COVID-19 Deaths by Country (up to 2023-08-30)
deaths_by_country = latest[['Country', 'Cumulative_deaths']].sort_values(
    by='Cumulative_deaths', ascending=False)
top_countries2 = deaths_by_country.head(10)

# Highest COVID-19 Case-Fatality Ratios by Country (up to 2023-08-30)
# assign() works on a copy, so `data` itself does not gain the ratio column.
data_selected = latest.assign(
    Death_to_Case_Ratio=latest['Cumulative_deaths'] / latest['Cumulative_cases']
)
data_selected = data_selected.sort_values(by='Death_to_Case_Ratio', ascending=False)
top_countries3 = data_selected.head(10)

# Highest COVID-19 Deaths-Population Ratios by Country (up to 2023-08-30)
# Extract population data from 'world_population.csv' https://www.kaggle.com/datasets/iamsouravbanerjee/world-population-dataset?resource=download
world_population = pd.read_csv('world_population.csv').rename(columns={'CCA3': 'iso3'})
world_population = world_population[['iso3', '2022 Population']]
# The polygons' 'id' column is renamed in place to serve as the iso3 join key.
world_countries.rename(columns={'id': 'iso3'}, inplace=True)
merged_df = world_countries.merge(world_population, how='inner', on='iso3')
# Population is attached to the case rows via iso3 -> iso2 through the polygons table.
data_selected = latest.rename(columns={'Country_code': 'iso2'})
merged_df2 = data_selected.merge(merged_df, how='inner', on='iso2')
merged_df2['Death_to_Population_Ratio'] = merged_df2['Cumulative_deaths'] / merged_df2['2022 Population']
merged_df2 = merged_df2.sort_values(by='Death_to_Population_Ratio', ascending=False)
top_countries4 = merged_df2.head(10)

fig, axes = plt.subplots(2, 2, figsize=(12, 12))

# (axes, top-10 frame, value column, title) for each of the four pies.
pie_specs = [
    (axes[0, 0], top_countries1, 'Cumulative_cases', 'Most COVID-19 Cases'),
    (axes[0, 1], top_countries2, 'Cumulative_deaths', 'Most COVID-19 Deaths'),
    (axes[1, 0], top_countries3, 'Death_to_Case_Ratio', 'Highest COVID-19 Case-Fatality Ratios'),
    (axes[1, 1], top_countries4, 'Death_to_Population_Ratio', 'Highest COVID-19 Deaths-Population Ratios'),
]
for pie_ax, top_frame, value_column, pie_title in pie_specs:
    pie_ax.pie(
        top_frame[value_column],
        labels=top_frame['Country'],
        autopct='%1.1f%%',
        startangle=140,
        textprops={'fontsize': 9},
    )
    pie_ax.set_title(pie_title, fontsize=12)

plt.subplots_adjust(wspace=0.9, hspace=-0.2)
plt.show()


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
from datetime import timedelta

# Load the WHO table again and keep only the United States rows,
# indexed by reporting date.
data = pd.read_csv('WHO-COVID-19-global-data.csv')
data['Date_reported'] = pd.to_datetime(data['Date_reported'])
is_us = data['Country'] == 'United States of America'
US_data = data.loc[is_us].set_index('Date_reported')

# Forecast horizon: the following year, in days.
forecast_period = 365

# Prophet input frame: date column named 'ds', value column named 'y'.
df = (
    US_data.reset_index()[['Date_reported', 'New_cases']]
    .rename(columns={'Date_reported': 'ds', 'New_cases': 'y'})
)

# Fit a default Prophet model on the daily new-case series.
model = Prophet()
model.fit(df)

# Extend the history by the horizon and predict over the whole range.
future = model.make_future_dataframe(periods=forecast_period)
forecast = model.predict(future)

# Plot history and forecast together.
fig = model.plot(forecast)
plt.xlabel('Year')
plt.ylabel('New Cases')
plt.title('COVID-19 Cases Forecast for the United States')
plt.show()

14:34:54 - cmdstanpy - INFO - Chain [1] start processing
14:34:54 - cmdstanpy - INFO - Chain [1] done processing


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
from datetime import timedelta

# Load the WHO table and keep only the United States rows, indexed by
# reporting date. set_index returns a new frame here rather than mutating the
# filtered slice in place, which avoids pandas' chained-assignment warning.
data = pd.read_csv('WHO-COVID-19-global-data.csv')
data['Date_reported'] = pd.to_datetime(data['Date_reported'])
US_data = data[data['Country'] == 'United States of America'].set_index('Date_reported')

# Forecasting the following year (horizon in days)
forecast_period = 365

# Prepare data for Prophet: date column named 'ds', value column named 'y'.
# This cell models daily deaths, not cases.
df = US_data.reset_index()[['Date_reported', 'New_deaths']]
df = df.rename(columns={'Date_reported': 'ds', 'New_deaths': 'y'})

# Create and fit the Prophet model
model = Prophet()
model.fit(df)

# Make future predictions
future = model.make_future_dataframe(periods=forecast_period)
forecast = model.predict(future)

# Visualize the forecast. The labels name deaths, matching the New_deaths
# series the model was fit on (they previously said "Cases").
fig = model.plot(forecast)
plt.xlabel('Year')
plt.ylabel('New Deaths')
plt.title('COVID-19 Deaths Forecast for the United States')
plt.show()

14:34:59 - cmdstanpy - INFO - Chain [1] start processing
14:34:59 - cmdstanpy - INFO - Chain [1] done processing


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet

# Prepare the data: worldwide cumulative deaths per reporting day, in the
# 'ds' (date) / 'y' (value) layout that is passed to Prophet.
data = pd.read_csv('WHO-COVID-19-global-data.csv')
data_cumulative_deaths = data.groupby('Date_reported')['Cumulative_deaths'].sum().reset_index()
data_cumulative_deaths.columns = ['ds', 'y']

# Derive the forecast horizon from the target date instead of a fixed day
# count, so the forecast actually reaches the target (a fixed 2312 days from
# a 2023-08-30 last report ends on 2029-12-28). Never request a negative
# horizon if the data already extends past the target.
target_date = '2030-01-01'
target_day = pd.to_datetime(target_date)
last_reported_day = pd.to_datetime(data_cumulative_deaths['ds']).max()
forecast_horizon_days = max((target_day - last_reported_day).days, 0)

# Create a Prophet model
model = Prophet()

# Fit the model to the historical data
model.fit(data_cumulative_deaths)

# Make a DataFrame for future predictions, extended up to the target date
future = model.make_future_dataframe(periods=forecast_horizon_days)

# Make predictions for the future
forecast = model.predict(future)

# Plot the historical data and future predictions
fig = model.plot(forecast, xlabel='Year', ylabel='Cumulative Deaths')
plt.title('Historical and Predicted Cumulative Deaths')
plt.show()

# Pick the forecast row whose date is nearest to the target date.
closest_row = forecast.iloc[(forecast['ds'] - target_day).abs().argsort()[:1]]

# Get the projected cumulative deaths for the closest date
cumulative_deaths_2030 = closest_row['yhat'].values[0]

# groupby() returns the dates in sorted order, so the last row holds the most
# recent reported worldwide total.
latest_reported_deaths = int(data_cumulative_deaths['y'].iloc[-1])
# Computed from the forecast and the data rather than hard-coded, so the
# message stays correct when the CSV or the model output changes.
additional_deaths = int(cumulative_deaths_2030) - latest_reported_deaths

print("Projected cumulative deaths by the start of 2030:", int(cumulative_deaths_2030))
print(f'Unfortunately, {additional_deaths} more people are expected to die from Covid-19 by 2030.')

14:40:17 - cmdstanpy - INFO - Chain [1] start processing
14:40:18 - cmdstanpy - INFO - Chain [1] done processing

Projected cumulative deaths by the start of 2030: 9261555
Unfortunatelly, 2305382 more people are expected to die from Covid-19 by 2030.

	Date_reported	Country_code	Country	WHO_region
0	2020-01-03	AF	Afghanistan	EMRO
1	2020-01-04	AF	Afghanistan	EMRO
2	2020-01-05	AF	Afghanistan	EMRO
3	2020-01-06	AF	Afghanistan	EMRO
4	2020-01-07	AF	Afghanistan	EMRO

COVID-19 DATA VISUALIZATION AND ANALYSIS (up to August 30, 2023)

The progression of the Covid-19 pandemic

Countries most severely impacted by the Covid-19 pandemic

Time Series Forecasting of COVID-19 Progression