1) Case Data

2) Patient Data

3) Time Series Data

4) Additional Data

Before the Start

Levels of administrative divisions in South Korea

Upper Level (Provincial-level divisions)

Lower Level (Municipal-level divisions)

List of cities in South Korea

List of counties of South Korea

List of districts in South Korea

Sources

1) Case

Data of COVID-19 infection cases in South Korea

  1. case_id: the ID of the infection case

    • case_id(7) = region_code(5) + case_number(2)
    • You can check the region_code in 'Region.csv'

2) PatientInfo

Epidemiological data of COVID-19 patients in South Korea

  1. patient_id: the ID of the patient

    • patient_id(10) = region_code(5) + patient_number(5)
    • You can check the region_code in 'Region.csv'
    • There are two types of patient_number:

      1) local_num: The number given by the local government.  
      2) global_num: The number given by the KCDC

3) Time

Time series data of COVID-19 status in South Korea

-       A test is a diagnosis of an infection.

4) TimeAge

Time series data of COVID-19 status in terms of the age in South Korea

-       The status in terms of the age has been presented since March 2nd.

5) TimeGender

Time series data of COVID-19 status in terms of the gender in South Korea

-       The status in terms of the gender has been presented since March 2nd.

6) TimeProvince

Time series data of COVID-19 status in terms of the Province in South Korea

-     The confirmed status by province has been provided since February 21st.
-     Values before February 21st may differ.


7) Region

Location and statistical data of the regions in South Korea

Source of the statistic: KOSTAT (Statistics Korea)

8) Weather

Data of the weather in the regions of South Korea

Source of the weather data: KMA (Korea Meteorological Administration)

9) SearchTrend

Trend data of keywords searched in NAVER, one of the largest web portals in South Korea

-       The unit is a relative value, scaled so that the highest search volume in the period equals 100.

Source of the data: NAVER DataLab

10) SeoulFloating

Data of floating population in Seoul, South Korea (from SK Telecom Big Data Hub)

Source of the data: SKT Big Data Hub

11) Policy

Data of the government policy for COVID-19 in South Korea

1) Case DataFrame:

Initial plan:

Let's check if there are any missing values.

After checking for missing values in the Case dataframe, we can see that there are no missing values, which is great.

Next, we check the correctness of the case_id format, which follows the formula: case_id(7) = region_code(5) + case_number(2)

That is, 5 characters plus 2 characters should result in 7 characters. For example, region_code = 12345 and case_number = 24 combine into the 7-character case_id 1234524.
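The format check can be sketched as follows; the inline sample is hypothetical and stands in for reading 'Case.csv':

```python
import pandas as pd

# Hypothetical sample; in the notebook this comes from pd.read_csv('Case.csv')
case = pd.DataFrame({"case_id": ["1000001", "1000002", "6100001"]})

# case_id(7) = region_code(5) + case_number(2)
case["case_id"] = case["case_id"].astype(str).str.strip()
bad_length = case[case["case_id"].str.len() != 7]  # rows violating the 7-char format

# Split the ID into its two components for later validation
region_code = case["case_id"].str[:5]
case_number = case["case_id"].str[5:]
```

If bad_length is empty, every ID respects the 5 + 2 structure.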

Clear whitespaces, and then check for duplicates.

We fixed spacing errors, checked for duplicates, continued with feature engineering, and used 'case_number' as a reference instead of the index.

Although the default index would have served the purpose, the 'case_number' column offers a more straightforward and convenient way to identify specific cases.

Now, let's move on to validating the case_id logic by utilizing the 'region.csv' and 'case.csv' tables.

The first five digits represent the region code, which is determined based on the province. The initial sequence of five unique digits corresponds to the province, followed by the case number.

After successfully identifying the unique province codes, we can proceed to decipher the case_id logic.

We write code that aims to identify and correct any mismatches between the 'case_id' column in the 'case' table and the expected format for case IDs.

It does this by creating a dictionary to map provinces to codes, adding region codes to the 'case' table, converting columns to strings, padding case numbers with zeros, creating a 'correct_id' column, identifying rows with mismatched case IDs, and checking for mismatched rows and printing results.
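The mapping-and-validation logic can be sketched like this; the two-row sample and the province codes are hypothetical stand-ins for the real 'Case.csv' and 'Region.csv' contents:

```python
import pandas as pd

# Hypothetical sample standing in for Case.csv
case = pd.DataFrame({
    "case_id": ["1000001", "6100002"],
    "province": ["Seoul", "Daegu"],
    "case_number": [1, 2],
})

# Dictionary mapping provinces to their 5-digit region codes
province_codes = {"Seoul": "10000", "Daegu": "61000"}
case["region_code"] = case["province"].map(province_codes)

# Zero-pad the case number to 2 digits and rebuild the expected ID
case["correct_id"] = case["region_code"] + case["case_number"].astype(str).str.zfill(2)

# Rows where the stored ID disagrees with the reconstructed one
mismatched = case[case["case_id"] != case["correct_id"]]
```

An empty mismatched frame means every case_id matches its province's region code.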

The output indicates that all case IDs match their respective provinces, and no outliers exist.

With that in mind, it is time to check which regions and cities are the most affected.

The top five affected regions are displayed, indicating that Daegu is the province most severely impacted by COVID-19. Let's delve into the specific cities within Daegu that are affected.

For better clarity, we should rename the '-' to 'unknown'.
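A minimal sketch of the rename, on a hypothetical city column:

```python
import pandas as pd

city = pd.Series(["Nam-gu", "-", "Dalseong-gun", "-"])

# Replace the '-' placeholder with a readable label
city = city.replace("-", "unknown")
```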

We can see that Nam-gu, a district of Daegu, is the most affected area, followed by entries with unidentified names, which we will label 'Other city' for visualization purposes.

It is evident that Nam-gu leads in case counts compared to the other, unidentified cities.

Time to identify outliers in latitude and longitude and then draw a map of cases.

Since there appear to be no outliers, we can proceed with creating the map and identifying the 'Other city' entries based on their coordinates (latitude and longitude).

Now that we can visually see which areas have higher numbers of COVID-19 cases, it would be beneficial to examine the 'infection_case' column, again looking at the top five, for a closer look at case origins.

But before that, a quick reminder of the infection_case column:

The first category, "etc.," signifies cases under investigation and can be set aside for the time being.

Among the remaining four categories making up the top five, most cases stem from overseas travel and have surged rapidly in terms of confirmed cases.

Contact with infected individuals is another substantial factor, but crowded social gathering venues like churches and clubs also contribute to the high case numbers.

In light of this, we can investigate the proportion of cases that represent group infections.

We observe that there are 124 group cases, so it is time to visualize the percentage of cases that are grouped versus not grouped.

The majority of cases are grouped ones, indicating that COVID-19 spreads readily within groups; on the other hand, such clusters are easier to trace and contain.

With that in mind, let's identify the patient profile from the patientinfo dataframe.

2) PatientInfo DataFrame:

Initial plan:

  1. What is the distribution of patients by age and sex?
  2. How many cases are related to overseas inflow?
  3. Explore the timeline of patient states (isolated, released, deceased).

We observe that the 'age' column has 1380 missing values. Since the 'age' column is categorical, we can impute the missing values with the mode, unlike numerical columns where the median would be used.

We can do the same for the 'sex' column.

For the 'infection_case' column, we can fill the missing values with 'Unknown' to improve clarity.
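The three imputations can be sketched as follows, with a hypothetical sample in place of the real PatientInfo data:

```python
import numpy as np
import pandas as pd

# Hypothetical sample; the real frame comes from PatientInfo.csv
patient = pd.DataFrame({
    "age": ["20s", "20s", np.nan, "50s"],
    "sex": ["female", np.nan, "male", "female"],
    "infection_case": [np.nan, "overseas inflow", np.nan, "contact with patient"],
})

# Categorical columns: impute missing values with the mode
patient["age"] = patient["age"].fillna(patient["age"].mode()[0])
patient["sex"] = patient["sex"].fillna(patient["sex"].mode()[0])

# For infection_case, an explicit 'Unknown' label is clearer than the mode
patient["infection_case"] = patient["infection_case"].fillna("Unknown")
```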

Having addressed the missing values, we can proceed to check for duplicates.

Upon checking, no duplicate rows were found. Next, we will proceed to change the data types for the date columns.

Having concluded the initial data cleaning phase, we can now proceed to address questions that are relevant to our dataframe.

First, we can check the distribution of patients by age and sex.

We can see that the dominant category is female, but in this case, it is slightly skewed as we had to assign the mode of the gender to the missing values.

Considering this, we can check how many cases are related to the overseas inflow.

We can see that 840 cases are attributed to overseas inflow, while 703 are still to be determined, which indicates that the number could fluctuate considerably. In light of this, we can visualize the data to gain a clearer understanding of the significance of the figures.

The vast majority of infection cases are attributed to contact with infected individuals. This indicates that the virus spreads readily through close contact.

The other sources of infection, such as unknown sources, overseas inflow, and Itaewon clubs, play a relatively smaller role in the overall caseload.

Now that we have identified this, we can begin to investigate the patient's state timeline (isolated, released, deceased). Let's first identify and address any rows that have missing values in any of the following columns: symptom_onset_date, confirmed_date, released_date, deceased_date.

Having observed that all of the columns contain missing values, the best approach would be to address the 'deceased_date' column specifically.

Therefore, one approach to address this issue is to introduce a new column 'deceased' where you can replace the missing 'deceased_date' values with a specific placeholder (such as 'Not Deceased') and fill the non-missing 'deceased_date' values with 'Deceased'.
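This can be sketched in one step with np.where, again on a hypothetical sample:

```python
import numpy as np
import pandas as pd

patient = pd.DataFrame({"deceased_date": [np.nan, "2020-02-19", np.nan]})

# A missing deceased_date means the patient did not die
patient["deceased"] = np.where(
    patient["deceased_date"].isna(), "Not Deceased", "Deceased"
)
```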

We can see that the death rate was high in February. This could be because healthcare systems were not yet prepared to handle the situation effectively at the early stages of the outbreak. This could have led to a higher number of deaths in the beginning.

In addition, at the beginning of an outbreak, there is often a lack of information about the disease, its symptoms, and how to treat it. This could also contribute to a higher mortality rate.

Finally, we can see that the number of released patients was high in March. The reported number of released patients could have been influenced by the reporting practices. For example, if a large number of recoveries were reported at once in March, it could lead to a spike in the number of released patients.

With that noted, it is time to move to the Time dataframe.

3) Time DataFrame:

Initial plan:

First, let's ensure the accuracy of date and time formats.
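A minimal sketch of the conversion, assuming the Time frame has a 'date' column stored as strings:

```python
import pandas as pd

# Hypothetical sample; the real frame comes from Time.csv
time_df = pd.DataFrame({"date": ["2020-01-20", "2020-01-21"], "test": [1, 1]})

# Parse the date strings into proper datetime64 values
time_df["date"] = pd.to_datetime(time_df["date"])
```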

Since the format has been changed, we can check for missing values.

Since there are no missing values, which is a positive indicator, we can proceed to a short analysis of the Time dataframe.

We will start by examining the trend of COVID-19 tests over time.

We can see that the number of tests has been increasing over time, indicating an expansion in testing capacity during this period.

Additionally, there are noticeable jumps in the line, possibly due to changes in testing policy or capacity.

Now we can move to exploring the daily changes in confirmed, released, and deceased cases.

The disease had a significant number of confirmed cases in early 2020, with a peak in February. After the peak, the number of confirmed cases started to decrease.

The number of released cases has been steadily increasing, indicating effective recovery and management of the disease.

The number of deceased cases remained relatively stable, which could suggest that the disease’s fatality rate did not significantly increase during this period.

4) TimeAge DataFrame:

Initial plan:

First, we can check for the missing values.

Upon examining the dataset, we discovered that there are no missing values in the 'deceased' column.

Lastly, we standardized the date format and extracted age group information, converting the age ranges from '10s', '20s' to their corresponding integer values.
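The age conversion can be sketched as follows ('10s' becomes 10, '20s' becomes 20, and so on):

```python
import pandas as pd

time_age = pd.DataFrame({"age": ["0s", "10s", "20s"]})

# Strip the trailing 's' and cast the remainder to an integer
time_age["age"] = time_age["age"].str.rstrip("s").astype(int)
```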

After completing the initial data cleaning, we can analyze how the age distribution of confirmed cases has changed over time.

We can see that the number of confirmed cases for all age groups has increased over time.

Additionally, the 20s age group has the highest number of confirmed cases, followed by the 30s age group.

Lastly, the 0s age group has the lowest number of confirmed cases.

Now we can move to explore the number of deceased cases by age group.

We can see that the older age groups are more affected, which is a significant observation.

Which indicates that older individuals are at a higher risk of severe illness or death from this disease.

With that noted, we can move to the TimeGender dataframe.

5) TimeGender DataFrame:

Initial plan:

We can see that there are no missing values; therefore, we can verify the correctness of the date and gender formats.

Since the size of the dataset is not that large, we will retain the original labels 'male' and 'female' for clarity and consistency. Otherwise, we would simply replace them with the abbreviations 'M' and 'F' to save memory.

In any case, we can ensure the datetime format.

The chart shows that confirmed cases for both genders have been rising steadily, with a higher number of cases recorded among females than males.

This trend may stem from a combination of social and biological factors.

Therefore, we can move on to analyzing deceased cases between these two groups.

Given the higher number of confirmed cases among females, it might seem reasonable to expect that the number of deceased cases would also be higher for them.

However, this is not the case: males have a higher death rate.

Men are more likely to have health conditions that make COVID-19 worse, such as smoking, heart disease, and diabetes.

These conditions can make the virus stronger and cause more deaths among men.

With that knowledge, we can proceed to analyze the TimeProvince dataframe

6) TimeProvince DataFrame:

Initial plan:

We can start the analysis of the time_province dataframe by examining for missing values.

We can observe that there are no missing values, which is a positive indicator. Now, we can proceed to confirm that the date format is accurate and the province entries are valid.

We verified that the date column has a correct format and confirmed that the province entries are valid.

Therefore, we are going to explore the confirmed, released, and deceased cases by province over time.

Our analysis has consistently indicated that Daegu is the hotspot for the COVID-19 pandemic.

This is supported by the fact that it has the highest number of confirmed and released cases.

Additionally, the low death rate is a positive indicator.

Now, we can move to Region dataframe, for more extensive analysis.

7) Region DataFrame:

Initial plan:

First, as in any dataframe, we check for missing values.

Great, we can see that there are no missing values in the Region dataframe. Now, we can proceed to validate the formatting of lat and lon.

First we can start with the geographical distribution.

We can see that COVID-19 is spreading across all South Korea's provinces, with some provinces having more cases than others.

Moving forward, we can examine the distribution of the elderly population ratio across different provinces.

We can see that some of the provinces have a higher proportion of elderly people than others, and this variation is clearly illustrated by the box plots.

The length of each box represents the range of elderly population ratios within each province, indicating the spread of this proportion within a particular region.

With this information, we can now move on to examining how the weather dataframe can inform us about the impact of weather conditions on COVID-19 cases.

8) Weather DataFrame:

Initial plan:

As we identified previously, the weather data contains records from 2016, which is not relevant to our analysis.

Therefore, it would be more appropriate to merge the weather and time dataframes to ensure consistency in the dates.

Additionally, we merge the dataframes once more to combine the COVID-19 case data with the weather data, grouping them based on the provinces.

Subsequently, we examine the correlation between the average temperature and confirmed cases.
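The merge-then-correlate step can be sketched as follows; the daily values are hypothetical, and the column names ('avg_temp', 'confirmed') are assumptions about the merged frame:

```python
import pandas as pd

weather = pd.DataFrame({
    "date": pd.to_datetime(["2020-02-01", "2020-02-02", "2020-02-03"]),
    "avg_temp": [1.5, 3.0, -0.5],
})
cases = pd.DataFrame({
    "date": pd.to_datetime(["2020-02-01", "2020-02-02", "2020-02-03"]),
    "confirmed": [11, 12, 15],
})

# Inner merge keeps only the dates present in both frames
merged = weather.merge(cases, on="date", how="inner")

# Pearson correlation between temperature and confirmed cases
corr = merged["avg_temp"].corr(merged["confirmed"])
```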

A correlation coefficient of approximately 0.016 is very small and indicates a very weak positive correlation between average temperature and confirmed COVID-19 cases.

This means that there is a very slight tendency for confirmed COVID-19 cases to increase as average temperature increases.

So, one possible explanation is that as temperatures rise, people may spend more time outdoors, increasing the likelihood of transmission of the virus.

Additionally, warmer weather may lead to increased humidity, which can make it more difficult for respiratory droplets to evaporate, potentially increasing the transmission of the virus.

With this, we can check what is the search trend in the SearchTrend dataframe.

9) SearchTrend DataFrame:

Initial plan:

Because there are no missing values, we can immediately remove any dates before 2020, as the dataset should only contain dates from the beginning of 2020 onward.
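The filter can be sketched as follows, with hypothetical rows standing in for SearchTrend.csv:

```python
import pandas as pd

trend = pd.DataFrame({
    "date": pd.to_datetime(["2019-12-31", "2020-01-01", "2020-03-01"]),
    "coronavirus": [0.1, 5.2, 90.0],
})

# Keep only observations from 2020 onward
trend = trend[trend["date"].dt.year >= 2020].reset_index(drop=True)
```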

We take a similar merging approach as with the previous dataframe.

After successfully merging, we can examine the search trend over time.

Looking at search trends gives us important information about what people were interested in and knew about the Coronavirus pandemic.

The data shows that people started showing a lot of interest in “Coronavirus” in January 2020. This was probably because of the first news reports about the virus at the end of 2019.

The most interest was in March 2020, when the virus was spreading around the world and people were getting more worried.

On the other hand, search trends for other sicknesses like “Cold,” “Flu,” and “Pneumonia” stayed pretty much the same from February to June 2020.

This means that people’s interest in these sicknesses didn’t really change much during the start of the pandemic.

The high search trend for “Coronavirus” shows that a lot of people were trying to find information and keep up-to-date about the virus when it first started spreading.

With this, we move to SeoulFloating dataframe.

10) SeoulFloating DataFrame:

Initial plan:

Great, we can see that there are no missing values; the date structure has been handled, and the floating population column has the correct type as well.

We can quickly explore the fluctuation in the floating population in Seoul.

The spike around the end of February could be related to regional outbreaks of COVID-19.

For example, if there was a significant outbreak in the Seoul Metropolitan Region around this time, it could have led to a temporary increase in the floating population as people moved to or from the area. The same reasoning applies to other days. Additionally, the spike may simply be an outlier, which we should check.

Having identified outliers whose status as genuine or erroneous remains uncertain, we can apply a logarithmic transformation to the floating population data and visualize the resulting distribution.
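A minimal sketch of the transformation; the column name 'fp_num' is an assumption about the SeoulFloating frame:

```python
import numpy as np
import pandas as pd

floating = pd.DataFrame({"fp_num": [100, 1000, 10000, 1000000]})

# log1p compresses extreme values while handling zeros safely
floating["fp_log"] = np.log1p(floating["fp_num"])
```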

This technique can help reduce the impact of extreme values, whether they are genuine observations or outliers caused by data errors or irregularities.

The chart shows how Seoul's floating population changed over time. We used log transformation to make the data easier to understand by reducing the impact of extreme values.

Even with this transformation, there are still some noticeable outliers, especially around the end of February.

This means these points differ a lot from the rest, even when viewed on a logarithmic scale.

We can see that the floating population fluctuates over time, and this could be due to different factors like social events, holidays, or changes in travel habits.

With this in mind, we can proceed to analyze the types and impacts of government policies over time.

11) Policy DataFrame:

Initial plan:

For the final dataframe, we first address the missing values, then ensure the correct data formats for the 'start_date' and 'end_date' columns, and finally create dummy variables for the policy type column.

We can get rid of the missing values and proceed further with the cleaning.
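The cleaning steps can be sketched as follows; the sample rows are hypothetical and the 'type' column name is an assumption about Policy.csv:

```python
import pandas as pd

policy = pd.DataFrame({
    "policy_id": [1, 2, 3],
    "type": ["Alert", "Education", "Alert"],
    "start_date": ["2020-01-03", "2020-03-02", "2020-02-23"],
})

# Parse the date strings into datetime values
policy["start_date"] = pd.to_datetime(policy["start_date"])

# One-hot encode the policy type
policy = pd.concat([policy, pd.get_dummies(policy["type"], prefix="type")], axis=1)
```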

Now we can analyze the types and impacts of government policies over time.

These policies related to COVID-19 should aim to reduce the spread of the virus, protect public health, and support individuals and businesses affected by the pandemic.

Education policies could include measures to ensure the safety of students and teachers, such as social distancing and remote learning.

Alert policies could include measures to track and contain outbreaks of the virus.

Social policies could include measures to support vulnerable populations, such as the elderly and low-income families.

Administrative policies could include measures to ensure the continuity of government services and operations.

Transformation policies could include measures to promote innovation and adaptability in response to the pandemic.

Health policies could include measures to increase access to healthcare and medical supplies.

Therefore, we need to check gov_policy to get a better understanding.

In general, government policies related to COVID-19 should aim to reduce the spread of the virus, protect public health, and support individuals and businesses affected by the pandemic.

The policies listed in the chart could be analyzed to determine how they contribute to these goals.

For example, policies related to school openings could be evaluated based on their effectiveness in reducing the spread of the virus among students and teachers.

Policies related to infectious disease alert levels could be evaluated based on their effectiveness in tracking and containing outbreaks of the virus.

Policies related to social distancing campaigns could be evaluated based on their effectiveness in reducing the spread of the virus in public spaces.

But based on today's news, we can say that these policies helped.

Summary of COVID-19 Data Analysis Project in South Korea

The project involved the analysis of various dataframes related to COVID-19 in South Korea, focusing on different aspects of the pandemic. Here is a summarized overview of the findings:

Case DataFrame:

PatientInfo DataFrame: