Travel Insurance Prediction Analysis¶
Introduction¶
Project Overview¶
This notebook aims to predict which customers will buy a travel insurance package. The package, offered by a Tour & Travels Company, includes Covid cover. We use data from 2019, which has details about almost 2000 customers. We will build a model to predict if a customer will buy the insurance based on this data.
About the Dataset¶
Context¶
A Tour & Travels Company offers a travel insurance package with Covid cover. They want to know which customers are likely to buy it. The data is from 2019 and includes information about almost 2000 customers. Our task is to predict if a customer will buy the insurance based on the following features:
- Age: The customer's age
- Employment Type: The sector in which the customer works
- GraduateOrNot: Whether the customer graduated from college
- AnnualIncome: The customer's yearly income in Indian Rupees (rounded to the nearest 50 thousand)
- FamilyMembers: Number of people in the customer's family
- ChronicDisease: Whether the customer has a major disease like diabetes or high BP
- FrequentFlyer: Whether the customer booked air tickets at least 4 times in the last 2 years
- EverTravelledAbroad: Whether the customer has traveled to a foreign country
- TravelInsurance: Whether the customer bought the insurance in 2019
Requirements and Approach¶
To meet the project goals, we will:
Download the Data
- Load the dataset into a pandas DataFrame.
Perform Exploratory Data Analysis (EDA)
- Create summaries and visualizations like histograms and bar charts.
- Check for anomalies using methods like the IQR method.
Perform Statistical Inference
- Define the target population and form multiple hypotheses.
- Construct confidence intervals and set significance levels.
- Conduct z-tests or t-tests to check the hypotheses.
Apply Machine Learning Models
- Predict the TravelInsurance column using the other features.
- This includes tuning model parameters, model ensembling, and model analysis.
Provide Clear Explanations
- Explain our goals, methods, results, and their meanings to help the reader understand the analysis and conclusions.
By following these steps, we aim to build a strong model and give insights into what affects the decision to buy travel insurance.
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import Image

# Project-specific helpers (plotting, statistical tests, threshold tuning, ...)
from travel_insurance_utils import *

from scipy.stats import randint, uniform
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
path = "../travel-insurance-prediction/travel_insurance_dataset.csv"
travel_df = pd.read_csv(path)
travel_df.head()
Unnamed: 0 | Age | Employment Type | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 31 | Government Sector | Yes | 400000 | 6 | 1 | No | No | 0 |
1 | 1 | 31 | Private Sector/Self Employed | Yes | 1250000 | 7 | 0 | No | No | 0 |
2 | 2 | 34 | Private Sector/Self Employed | Yes | 500000 | 4 | 1 | No | No | 1 |
3 | 3 | 28 | Private Sector/Self Employed | Yes | 700000 | 3 | 1 | No | No | 0 |
4 | 4 | 28 | Private Sector/Self Employed | Yes | 700000 | 8 | 1 | Yes | No | 0 |
We can see that we have "Unnamed: 0" as a column name, which provides no value and may cause issues when detecting duplicates. Let's drop it.
travel_df = travel_df.drop("Unnamed: 0", axis=1, errors="ignore")
travel_df.head()
Age | Employment Type | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | |
---|---|---|---|---|---|---|---|---|---|
0 | 31 | Government Sector | Yes | 400000 | 6 | 1 | No | No | 0 |
1 | 31 | Private Sector/Self Employed | Yes | 1250000 | 7 | 0 | No | No | 0 |
2 | 34 | Private Sector/Self Employed | Yes | 500000 | 4 | 1 | No | No | 1 |
3 | 28 | Private Sector/Self Employed | Yes | 700000 | 3 | 1 | No | No | 0 |
4 | 28 | Private Sector/Self Employed | Yes | 700000 | 8 | 1 | Yes | No | 0 |
Next, let's quickly fix the naming of the Employment Type feature, renaming it from Employment Type to EmploymentType for consistency.
travel_df = travel_df.rename(columns={"Employment Type": "EmploymentType"})
travel_df.head()
Age | EmploymentType | GraduateOrNot | AnnualIncome | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | |
---|---|---|---|---|---|---|---|---|---|
0 | 31 | Government Sector | Yes | 400000 | 6 | 1 | No | No | 0 |
1 | 31 | Private Sector/Self Employed | Yes | 1250000 | 7 | 0 | No | No | 0 |
2 | 34 | Private Sector/Self Employed | Yes | 500000 | 4 | 1 | No | No | 1 |
3 | 28 | Private Sector/Self Employed | Yes | 700000 | 3 | 1 | No | No | 0 |
4 | 28 | Private Sector/Self Employed | Yes | 700000 | 8 | 1 | Yes | No | 0 |
Furthermore, since the analysis targets customers in Europe, we will convert income (the AnnualIncome feature) from Indian rupees to euros.
At the time of writing, 1 rupee is equivalent to roughly 0.011 euros.
rupee_to_euro = 0.011
travel_df["AnnualIncomeEuro"] = travel_df["AnnualIncome"] * rupee_to_euro
We added a new feature called AnnualIncomeEuro to express incomes in euros, which makes the data easier to interpret.
We now remove the old AnnualIncome feature to keep everything consistent.
travel_df = travel_df.drop("AnnualIncome", axis=1, errors="ignore")
travel_df.head()
Age | EmploymentType | GraduateOrNot | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | AnnualIncomeEuro | |
---|---|---|---|---|---|---|---|---|---|
0 | 31 | Government Sector | Yes | 6 | 1 | No | No | 0 | 4400.0 |
1 | 31 | Private Sector/Self Employed | Yes | 7 | 0 | No | No | 0 | 13750.0 |
2 | 34 | Private Sector/Self Employed | Yes | 4 | 1 | No | No | 1 | 5500.0 |
3 | 28 | Private Sector/Self Employed | Yes | 3 | 1 | No | No | 0 | 7700.0 |
4 | 28 | Private Sector/Self Employed | Yes | 8 | 1 | Yes | No | 0 | 7700.0 |
Next, we check for duplicates in the dataset and decide how to handle any we find.
print(f"Initial shape: {travel_df.shape}")
print(f"Number of duplicate rows: {travel_df.duplicated().sum()}")
Initial shape: (1987, 9) Number of duplicate rows: 738
travel_df = travel_df.drop_duplicates().reset_index(drop=True)
print(f"Shape after removing duplicates: {travel_df.shape}")
Shape after removing duplicates: (1249, 9)
Initial Dataset:
- Total Records: 1987 entries
Duplicate Identification:
- An "Unnamed: 0" column was identified, likely an artifact from data export or previous processing.
- This column was removed as it did not contain relevant information and was masking true duplicates.
Duplicate Analysis:
- After removing the "Unnamed: 0" column, 738 entries were identified as exact duplicates.
- This represents approximately 37.1% of the original dataset.
Duplicate Removal:
- All 738 duplicate entries were removed from the dataset.
- Resulting Cleaned Dataset: 1249 unique entries
Rationale for Duplicate Removal:
- Ensuring Data Integrity: Each data point now represents a distinct observation.
- Reducing Bias: Duplicate entries could have led to overrepresentation of certain data points, potentially skewing our analysis and modeling results.
- Improving Model Accuracy: Unique entries provide a more accurate representation of the true data distribution.
Impact on Analysis:
- Dataset Reduction: The removal of duplicates reduced our dataset by 37.1%.
- This significant reduction should be considered when interpreting results and assessing statistical power.
- The cleaned dataset may provide more reliable insights, but with a trade-off of reduced sample size.
Next Steps
With our dataset now cleaned and deduplicated, we will proceed to check for missing values and then examine the statistical summary of the numerical features. This will provide us with key insights into the distribution and characteristics of our data, forming the foundation for our subsequent analysis and modeling efforts.
print("\nMissing values:")
print(travel_df.isnull().sum())
Missing values: Age 0 EmploymentType 0 GraduateOrNot 0 FamilyMembers 0 ChronicDiseases 0 FrequentFlyer 0 EverTravelledAbroad 0 TravelInsurance 0 AnnualIncomeEuro 0 dtype: int64
Great, we can see that there are no missing values in the dataset. We can now proceed to examine the statistical summary of the numerical features.
print(travel_df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1249 entries, 0 to 1248 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1249 non-null int64 1 EmploymentType 1249 non-null object 2 GraduateOrNot 1249 non-null object 3 FamilyMembers 1249 non-null int64 4 ChronicDiseases 1249 non-null int64 5 FrequentFlyer 1249 non-null object 6 EverTravelledAbroad 1249 non-null object 7 TravelInsurance 1249 non-null int64 8 AnnualIncomeEuro 1249 non-null float64 dtypes: float64(1), int64(4), object(4) memory usage: 87.9+ KB None
travel_df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Age | 1249.0 | 29.755805 | 2.921039 | 25.0 | 28.0 | 29.0 | 32.0 | 35.0 |
FamilyMembers | 1249.0 | 4.890312 | 1.762313 | 2.0 | 4.0 | 5.0 | 6.0 | 9.0 |
ChronicDiseases | 1249.0 | 0.333066 | 0.471499 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
TravelInsurance | 1249.0 | 0.386709 | 0.487191 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
AnnualIncomeEuro | 1249.0 | 10280.024019 | 3968.022127 | 3300.0 | 6600.0 | 9900.0 | 13200.0 | 19800.0 |
Key Observations:
- Age range is relatively narrow (25-35 years), with a mean of 29.76 years.
- Family size varies from 2 to 9 members, with an average of 4.89.
- 33% of the sample has chronic diseases.
- 39% of the sample has purchased travel insurance.
- Annual income shows significant variation, ranging from €3,300 to €19,800.
Next, we'll encode the categorical variables for consistency in our visualizations and future modeling:
- GraduateOrNot, FrequentFlyer, and EverTravelledAbroad:
- Yes -> 1
- No -> 0
- EmploymentType:
- One-hot encoded into two indicator columns, Emp_Government Sector and Emp_Private Sector/Self Employed.
This encoding will help in creating more meaningful visualizations and prepare the data for machine learning models.
binary_columns = ["GraduateOrNot", "FrequentFlyer", "EverTravelledAbroad"]
# Map Yes/No to integers explicitly; this avoids the pandas FutureWarning about
# silent downcasting that DataFrame.replace would otherwise emit here.
for col in binary_columns:
    travel_df[col] = travel_df[col].map({"Yes": 1, "No": 0}).astype(int)
travel_df = pd.get_dummies(travel_df, columns=["EmploymentType"], prefix="Emp")
boolean_columns = ["Emp_Government Sector", "Emp_Private Sector/Self Employed"]
travel_df[boolean_columns] = travel_df[boolean_columns].astype(int)
travel_df.head()
Age | GraduateOrNot | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | TravelInsurance | AnnualIncomeEuro | Emp_Government Sector | Emp_Private Sector/Self Employed | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 31 | 1 | 6 | 1 | 0 | 0 | 0 | 4400.0 | 1 | 0 |
1 | 31 | 1 | 7 | 0 | 0 | 0 | 0 | 13750.0 | 0 | 1 |
2 | 34 | 1 | 4 | 1 | 0 | 0 | 1 | 5500.0 | 0 | 1 |
3 | 28 | 1 | 3 | 1 | 0 | 0 | 0 | 7700.0 | 0 | 1 |
4 | 28 | 1 | 8 | 1 | 1 | 0 | 0 | 7700.0 | 0 | 1 |
numerical_features = ["Age", "FamilyMembers", "AnnualIncomeEuro"]
categorical_features_one = [
"Emp_Government Sector",
"Emp_Private Sector/Self Employed",
"GraduateOrNot",
"FrequentFlyer",
]
categorical_features_two = [
"EverTravelledAbroad",
"TravelInsurance",
"ChronicDiseases",
]
categorical_features = categorical_features_one + categorical_features_two
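The plotting helpers used below (plot_combined_histograms, plot_combined_bar_charts, plot_combined_boxplots, and plot_correlation_matrix) are defined in travel_insurance_utils. As a rough illustration only, here is a minimal sketch of what a combined-histogram helper might look like with Plotly; the real implementation may differ.
# Minimal sketch of a combined-histogram helper (assumption: the real
# plot_combined_histograms in travel_insurance_utils may differ).
def sketch_combined_histograms(df, features, save_path=None):
    fig = make_subplots(rows=1, cols=len(features), subplot_titles=features)
    for i, feature in enumerate(features, start=1):
        fig.add_trace(go.Histogram(x=df[feature], name=feature), row=1, col=i)
    fig.update_layout(showlegend=False, title_text="Distributions of numerical features")
    if save_path:
        fig.write_image(save_path)  # static export requires the kaleido package
    return fig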
plot_combined_histograms(
travel_df, numerical_features, save_path="images/combined_histograms.png"
)
Image(filename="images/combined_histograms.png")
Age
- Distribution: The age distribution shows a clear peak at 28-29 years, with a secondary peak at 32-33 years.
- Range: Ages in the dataset range from 25 to 35 years, as previously noted.
- Observations: While there is a concentration around 30 years, there is a notable drop in frequency for ages 30-31, creating a bimodal distribution.
Family Members
- Distribution: The distribution of family members shows a clear peak at 5 members, with a secondary peak at 4 members.
- Range: Family sizes range from 2 to 9 members, confirming the initial observation.
- Observations: The most common family sizes are 5 and 4 members, followed by 6. There's a sharp decline in frequency for families with 7 or more members.
Annual Income (Euro)
- Distribution: The annual income distribution shows multiple distinct peaks, indicating a multimodal distribution.
- Range: Annual incomes range from approximately 5,000 to 20,000 Euros.
- Observations: There are clear peaks around 6,600, 9,900, 13,200, and 16,500 Euros. This suggests that incomes might be clustered around certain salary bands or job categories.
plot_combined_bar_charts(
travel_df,
categorical_features_one,
max_features_per_plot=4,
save_path="images/combined_bar_charts1",
)
Image(filename="images/combined_bar_charts1.png_chunk_1.png")
Employment Type: The Private Sector/Self Employed group represents the majority of individuals in the dataset compared to the Government Sector. This difference in employment type could influence travel behavior and insurance decisions. Private sector employees and self-employed individuals may have different job stability, income levels, and travel patterns compared to government employees, which could impact their likelihood of purchasing travel insurance.
Education Level: The vast majority of individuals in the dataset are graduates, with around 1000 graduates compared to fewer than 200 non-graduates. This suggests that educational attainment could be a significant factor in understanding travel insurance decisions within this population. Graduates might have different travel habits, risk perceptions, and financial considerations that affect their inclination to purchase travel insurance compared to non-graduates.
Travel Frequency: Most individuals in the dataset are not frequent flyers, with around 800 non-frequent flyers compared to approximately 400 frequent flyers. Travel frequency could impact the likelihood of purchasing travel insurance, as those who travel more often may perceive a greater need for insurance coverage due to increased exposure to travel-related risks. Conversely, infrequent travelers might view travel insurance as less essential.
In summary, the dataset primarily consists of private sector/self-employed individuals, graduates, and non-frequent flyers. These characteristics could influence their travel insurance purchase decisions in different ways, such as through varying risk perceptions, travel patterns, and financial considerations associated with their employment type and education level. The relatively low proportion of frequent flyers suggests that travel frequency might not be the primary driver of insurance uptake in this particular dataset.
plot_combined_bar_charts(
travel_df,
categorical_features_two,
save_path="images/combined_bar_charts2.png_chunk_1",
)
Image(filename="images/combined_bar_charts2.png_chunk_1_chunk_1.png")
International Travel Experience: The dataset shows a significant imbalance in international travel experience. Approximately 800 individuals have not traveled abroad, while only around 200 have. This suggests that international travel experience could influence perceptions and decisions regarding travel insurance: those who have traveled abroad may have different risk assessments and insurance requirements compared to those who have not.
Travel Insurance Adoption: Roughly 480 individuals (about 39% of the sample) purchased travel insurance, while around 770 did not, so purchasers remain the minority. There is clearly room for increasing travel insurance coverage, and understanding the factors that influence the purchase decision remains crucial for targeted marketing and product development strategies.
Health Conditions: The dataset reveals that the majority of individuals, around 800, do not have chronic diseases. However, there is a significant portion, approximately 400 individuals, who have chronic diseases. This distribution highlights the importance of considering health status when analyzing travel insurance decisions. Individuals with chronic diseases may have different insurance requirements and motivations compared to those without such conditions.
Now let's move on to outliers and check whether any of our numerical features contain them.
plot_combined_boxplots(
travel_df, numerical_features, save_path="images/combined_boxplot.png"
)
Image(filename="images/combined_boxplot.png")
Based on the box plots above, here is an analysis of the numerical features:
Age:
- Range: The ages of individuals range from approximately 26 to 34 years.
- Median: The median age appears to be around 30 years.
- Distribution: The box plot for age shows a fairly symmetric distribution, with the box representing the interquartile range (IQR) evenly spread around the median. There are no apparent outliers beyond the whiskers of the box plot.
Family Members:
- Range: The number of family members ranges from approximately 2 to 8.
- Median: The median number of family members appears to be around 5.
- Distribution: The box plot for family members shows a slightly asymmetric distribution, with the box shifted towards the lower end of the range. There is one potential outlier at around 8 family members, represented by a dot beyond the upper whisker.
Annual Income (in Euros):
- Range: Annual income ranges from approximately 5,000 to 20,000 Euros.
- Median: The median annual income appears to be around 10,000 Euros.
- Distribution: The box plot for annual income shows a fairly symmetric distribution, with the box evenly spread around the median. There are no apparent outliers beyond the whiskers of the box plot.
Let's confirm that there are no anomalies via the IQR method.
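The detect_anomalies_iqr helper is defined in travel_insurance_utils; below is a minimal sketch of the IQR rule it presumably applies (values outside Q1 - 1.5*IQR or Q3 + 1.5*IQR are flagged). The actual implementation may differ.
# Sketch of an IQR-based anomaly check (assumption: the real
# detect_anomalies_iqr helper may be implemented differently).
def sketch_detect_anomalies_iqr(df, features):
    anomalies = {}
    for feature in features:
        q1, q3 = df[feature].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        mask = (df[feature] < lower) | (df[feature] > upper)
        anomalies[feature] = df.loc[mask, feature]
        if mask.any():
            print(f"{mask.sum()} anomalies detected in feature '{feature}'.")
        else:
            print(f"No anomalies detected in feature '{feature}'.")
    return anomalies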
detect_anomalies_iqr(travel_df, numerical_features)
No anomalies detected in feature 'Age'. No anomalies detected in feature 'FamilyMembers'. No anomalies detected in feature 'AnnualIncomeEuro'.
We can confirm that there are no anomalies in our numerical features. Now we can move on to the correlation matrix, to check for multicollinearity.
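plot_correlation_matrix also comes from travel_insurance_utils. Conceptually it boils down to computing the Pearson correlation matrix and rendering it as a heatmap; a minimal sketch under that assumption (the actual helper may differ):
# Sketch of a correlation-matrix heatmap (assumption: the real
# plot_correlation_matrix helper may differ in styling and options).
def sketch_correlation_matrix(df, features, save_path=None):
    corr = df[features].corr()  # Pearson correlation by default
    fig = px.imshow(corr, text_auto=".3f", color_continuous_scale="RdBu_r", zmin=-1, zmax=1)
    if save_path:
        fig.write_image(save_path)  # static export requires the kaleido package
    return fig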
plot_correlation_matrix(
travel_df, numerical_features, save_path="images/correlation_matrix.png"
)
Image(filename="images/correlation_matrix.png")
Age:
- Correlation with FamilyMembers: 0.01424954 - This indicates a very weak positive correlation between age and the number of family members. The relationship is close to zero, suggesting that age and family size are not strongly related.
- Correlation with AnnualIncomeEuro: -0.005028787 - This indicates a very weak negative correlation between age and annual income in Euros. The relationship is close to zero, suggesting that age and income are not strongly related.
FamilyMembers:
- Correlation with Age: 0.01424954 - As mentioned above, there is a very weak positive correlation between family size and age.
- Correlation with AnnualIncomeEuro: -0.01262576 - This indicates a very weak negative correlation between the number of family members and annual income in Euros. The relationship is close to zero, suggesting that family size and income are not strongly related.
AnnualIncomeEuro:
- Correlation with Age: -0.005028787 - As mentioned above, there is a very weak negative correlation between income and age.
- Correlation with FamilyMembers: -0.01262576 - As mentioned above, there is a very weak negative correlation between income and family size.
The correlation coefficients are all very close to zero, indicating that there are no strong linear relationships between the numerical features. Age, family size, and annual income do not appear to be strongly related to one another in this dataset, so multicollinearity is not a concern.
Next, we will examine the relationship between each categorical feature and the target variable TravelInsurance. This analysis will help identify which features might be most relevant in predicting whether an individual has travel insurance. We will use the Chi-Square test for this purpose.
The Chi-Square test is a statistical test used to determine whether there is a significant association between two categorical variables. In our case, we will use it to assess the relationship between various categorical features and the target variable (TravelInsurance).
- Null Hypothesis (H0): There is no association between the categorical feature and the target variable.
- Alternative Hypothesis (H1): There is an association between the categorical feature and the target variable.
Our target population for this analysis consists of customers who were offered a travel insurance package by a tour and travels company in 2019.
We will set our significance level (alpha) to 0.05. If the p-value obtained from the Chi-Square test is less than 0.05, we will reject the null hypothesis and conclude that there is a significant association between the categorical feature and the target variable.
By performing the Chi-Square tests, we aim to identify the categorical features that have a significant impact on the likelihood of an individual purchasing travel insurance. This information will be valuable for feature selection and building predictive models.
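The chi_square_test helper used below lives in travel_insurance_utils. A minimal sketch of what such a test can look like with scipy, assuming the helper builds a contingency table and applies chi2_contingency (the real implementation may differ):
from scipy.stats import chi2_contingency

# Sketch of a chi-square association test between categorical features and a
# binary target (assumption: the real chi_square_test helper may differ).
def sketch_chi_square_test(df, features, target, alpha=0.05):
    for feature in features:
        contingency = pd.crosstab(df[feature], df[target])
        chi2, p_value, dof, expected = chi2_contingency(contingency)
        print(f"Chi-Square test results for '{feature}':")
        print(f"  Chi2 statistic = {chi2}, p-value = {p_value}")
        if p_value < alpha:
            print(f"  Significant association between '{feature}' and '{target}'.")
        else:
            print(f"  No significant association between '{feature}' and '{target}'.")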
target_feature = "TravelInsurance"
categorical_features_test = [
"Emp_Government Sector",
"Emp_Private Sector/Self Employed",
"GraduateOrNot",
"FrequentFlyer",
"EverTravelledAbroad",
"ChronicDiseases",
]
chi_square_test(travel_df, categorical_features_test, target_feature)
Chi-Square test results for 'Emp_Government Sector': Chi2 statistic = 6.9346510708008005, p-value = 0.008454155421443737 Significant association between 'Emp_Government Sector' and 'TravelInsurance'. Chi-Square test results for 'Emp_Private Sector/Self Employed': Chi2 statistic = 6.9346510708008005, p-value = 0.008454155421443737 Significant association between 'Emp_Private Sector/Self Employed' and 'TravelInsurance'. Chi-Square test results for 'GraduateOrNot': Chi2 statistic = 1.089711243342709, p-value = 0.2965351968551507 No significant association between 'GraduateOrNot' and 'TravelInsurance'. Chi-Square test results for 'FrequentFlyer': Chi2 statistic = 19.669203761086663, p-value = 9.207331224191983e-06 Significant association between 'FrequentFlyer' and 'TravelInsurance'. Chi-Square test results for 'EverTravelledAbroad': Chi2 statistic = 111.77454004049142, p-value = 4.0034414530103906e-26 Significant association between 'EverTravelledAbroad' and 'TravelInsurance'. Chi-Square test results for 'ChronicDiseases': Chi2 statistic = 0.0854420539603025, p-value = 0.7700536442454782 No significant association between 'ChronicDiseases' and 'TravelInsurance'.
The Chi-Square test was conducted to determine the association between several categorical features and the target variable TravelInsurance. Here are the results, including the hypotheses and conclusions for each feature:
Emp_Government Sector¶
- Null Hypothesis (H0): There is no association between Emp_Government Sector and TravelInsurance.
- Alternative Hypothesis (H1): There is an association between Emp_Government Sector and TravelInsurance.
- Chi2 Statistic: 6.9347
- P-Value: 0.0085
- Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis. There is a significant association between Emp_Government Sector and TravelInsurance. This suggests that customers employed in the government sector have a different likelihood of purchasing travel insurance compared to those in other sectors.
Emp_Private Sector/Self Employed¶
- Null Hypothesis (H0): There is no association between Emp_Private Sector/Self Employed and TravelInsurance.
- Alternative Hypothesis (H1): There is an association between Emp_Private Sector/Self Employed and TravelInsurance.
- Chi2 Statistic: 6.9347
- P-Value: 0.0085
- Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis. There is a significant association between Emp_Private Sector/Self Employed and TravelInsurance. This indicates that customers employed in the private sector or self-employed have a different likelihood of purchasing travel insurance compared to those in other sectors.
GraduateOrNot¶
- Null Hypothesis (H0): There is no association between GraduateOrNot and TravelInsurance.
- Alternative Hypothesis (H1): There is an association between GraduateOrNot and TravelInsurance.
- Chi2 Statistic: 1.0897
- P-Value: 0.2965
- Conclusion: Since the p-value is greater than 0.05, we fail to reject the null hypothesis. There is no significant association between GraduateOrNot and TravelInsurance. This suggests that whether a customer is a college graduate does not significantly influence their decision to purchase travel insurance.
FrequentFlyer¶
- Null Hypothesis (H0): There is no association between FrequentFlyer and TravelInsurance.
- Alternative Hypothesis (H1): There is an association between FrequentFlyer and TravelInsurance.
- Chi2 Statistic: 19.6692
- P-Value: 9.21e-06
- Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis. There is a significant association between FrequentFlyer and TravelInsurance. This indicates that customers who are frequent flyers are more likely to purchase travel insurance.
EverTravelledAbroad¶
- Null Hypothesis (H0): There is no association between EverTravelledAbroad and TravelInsurance.
- Alternative Hypothesis (H1): There is an association between EverTravelledAbroad and TravelInsurance.
- Chi2 Statistic: 111.7745
- P-Value: 4.00e-26
- Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis. There is a significant association between EverTravelledAbroad and TravelInsurance. Customers who have traveled abroad are significantly more likely to buy travel insurance.
ChronicDiseases¶
- Null Hypothesis (H0): There is no association between ChronicDiseases and TravelInsurance.
- Alternative Hypothesis (H1): There is an association between ChronicDiseases and TravelInsurance.
- Chi2 Statistic: 0.0854
- P-Value: 0.7701
- Conclusion: Since the p-value is greater than 0.05, we fail to reject the null hypothesis. There is no significant association between ChronicDiseases and TravelInsurance. Having a chronic disease does not significantly impact the decision to purchase travel insurance.
In summary, the Chi-Square test results indicate that Emp_Government Sector, Emp_Private Sector/Self Employed, FrequentFlyer, and EverTravelledAbroad have a significant association with TravelInsurance, while GraduateOrNot and ChronicDiseases do not.
These findings suggest that customers employed in the government sector or private sector/self-employed have different likelihoods of purchasing travel insurance compared to those in other sectors. Additionally, frequent flyers and those who have traveled abroad are more likely to buy travel insurance. On the other hand, educational background (being a graduate) and having chronic diseases do not significantly influence the decision to purchase travel insurance.
Our target population remains the same: customers who were offered a travel insurance package by a tour and travels company in 2019.
We move on to checking the following hypothesis tests:
Hypothesis 1: Age
- Null Hypothesis (H0): There is no difference in the mean age between customers who purchased travel insurance and those who did not.
- Alternative Hypothesis (H1): There is a difference in the mean age between customers who purchased travel insurance and those who did not.
Hypothesis 2: Annual Income (Euro)
- Null Hypothesis (H0): There is no difference in the mean annual income (in Euros) between customers who purchased travel insurance and those who did not.
- Alternative Hypothesis (H1): There is a difference in the mean annual income (in Euros) between customers who purchased travel insurance and those who did not.
Hypothesis 3: Family Members
- Null Hypothesis (H0): There is no difference in the mean number of family members between customers who purchased travel insurance and those who did not.
- Alternative Hypothesis (H1): There is a difference in the mean number of family members between customers who purchased travel insurance and those who did not.
So, let's calculate confidence intervals for the key metrics (e.g., average age, average income). These provide an estimate of each population parameter at a specified confidence level (95% in our case).
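The analyze_features helper comes from travel_insurance_utils. A minimal sketch of how a 95% confidence interval for a group mean can be computed with a t-distribution is shown below; the actual helper may use a different approach (e.g., a normal approximation).
from scipy import stats

# Sketch of 95% confidence intervals for group means (assumption: the real
# analyze_features helper may differ).
def sketch_confidence_intervals(df, features, target, confidence=0.95):
    for feature in features:
        for label, group_name in [(1, "insured"), (0, "not insured")]:
            group = df.loc[df[target] == label, feature]
            mean = group.mean()
            sem = stats.sem(group)  # standard error of the mean
            ci = stats.t.interval(confidence, len(group) - 1, loc=mean, scale=sem)
            print(f"{int(confidence * 100)}% confidence interval for {feature} ({group_name}): {ci}")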
analyze_features(travel_df, numerical_features, target_feature)
95% confidence interval for Age (insured): (29.71809068414484, 30.290190889354122) 95% confidence interval for Age (not insured): (29.40618556894996, 29.79224785141557) 95% confidence interval for FamilyMembers (insured): (4.917778746696541, 5.2395711497837905) 95% confidence interval for FamilyMembers (not insured): (4.648902321311049, 4.89417861863673) 95% confidence interval for AnnualIncomeEuro (insured): (11380.76257409634, 12115.510717829127) 95% confidence interval for AnnualIncomeEuro (not insured): (9100.257499502131, 9608.358688487424)
Our results show the following:
Age
- 95% confidence interval for Age (insured): (29.71809068414484, 30.290190889354122)
- 95% confidence interval for Age (not insured): (29.40618556894996, 29.79224785141557)
- Interpretation: The mean age of customers who purchased travel insurance is estimated to be between 29.72 and 30.29 years with 95% confidence. For those who did not purchase travel insurance, the mean age is estimated to be between 29.41 and 29.79 years with 95% confidence. The confidence intervals for the two groups overlap, suggesting that there might not be a significant difference in the mean ages between customers who purchased travel insurance and those who did not.
Family Members
- 95% confidence interval for FamilyMembers (insured): (4.917778746696541, 5.2395711497837905)
- 95% confidence interval for FamilyMembers (not insured): (4.648902321311049, 4.89417861863673)
- Interpretation: The mean number of family members for customers who purchased travel insurance is estimated to be between 4.92 and 5.24 with 95% confidence. For those who did not purchase travel insurance, the mean number of family members is estimated to be between 4.65 and 4.89 with 95% confidence. The confidence intervals for the two groups slightly overlap, indicating that there might be a small difference in the number of family members between customers who purchased travel insurance and those who did not. However, the difference may not be statistically significant.
Annual Income (Euro)
- 95% confidence interval for AnnualIncomeEuro (insured): (11380.76257409634, 12115.510717829127)
- 95% confidence interval for AnnualIncomeEuro (not insured): (9100.257499502131, 9608.358688487424)
- Interpretation: The mean annual income in Euros for customers who purchased travel insurance is estimated to be between 11380.76 and 12115.51 with 95% confidence. For those who did not purchase travel insurance, the mean annual income is estimated to be between 9100.26 and 9608.36 with 95% confidence. The confidence intervals for the two groups do not overlap, indicating a significant difference in the annual income between customers who purchased travel insurance and those who did not. Customers who purchased travel insurance tend to have a higher annual income compared to those who did not.
These confidence intervals provide insights into the differences between customers who purchased travel insurance and those who did not, based on their age, number of family members, and annual income. The results suggest that while there might not be significant differences in age and number of family members, there appears to be a significant difference in annual income between the two groups.
To further investigate these hypotheses and determine if the differences are statistically significant, we can perform appropriate hypothesis tests such as t-tests or ANOVA, depending on the assumptions and characteristics of the data.
With these confidence interval results in hand, let's move on to hypothesis testing via the Mann-Whitney U test, since our numerical features are arguably not normally distributed.
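The analyze_mannwhitneyu helper wraps scipy's Mann-Whitney U test in travel_insurance_utils; a minimal sketch of what it presumably does (the real helper may differ):
from scipy.stats import mannwhitneyu

# Sketch of a Mann-Whitney U test per numerical feature, comparing insured vs.
# not-insured groups (assumption: the real analyze_mannwhitneyu may differ).
def sketch_mannwhitneyu(df, features, target, alpha=0.05):
    for feature in features:
        insured = df.loc[df[target] == 1, feature]
        not_insured = df.loc[df[target] == 0, feature]
        u_stat, p_value = mannwhitneyu(insured, not_insured, alternative="two-sided")
        print(f"Mann-Whitney U test for {feature}: U-statistic = {u_stat}, p-value = {p_value}")
        if p_value < alpha:
            print(f"  Significant difference in distributions for {feature}.")
        else:
            print(f"  No significant difference in distributions for {feature}.")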
analyze_mannwhitneyu(travel_df, numerical_features, target_feature)
Mann-Whitney U test for Age: U-statistic = 198733.0, p-value = 0.0254561854659559 Significant difference in distributions for Age. Mann-Whitney U test for FamilyMembers: U-statistic = 202869.0, p-value = 0.0034698536817751656 Significant difference in distributions for FamilyMembers. Mann-Whitney U test for AnnualIncomeEuro: U-statistic = 249205.0, p-value = 3.9357884191093554e-25 Significant difference in distributions for AnnualIncomeEuro.
Age¶
- U-Statistic: 198733.0
- P-Value: 0.0255
- Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis (H0). There is a significant difference in the age distribution between the two groups being compared. This supports the alternative hypothesis (H1), indicating that age plays a role in differentiating the groups.
Family Members¶
- U-Statistic: 202869.0
- P-Value: 0.0035
- Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis (H0). There is a significant difference in the distribution of number of family members between the two groups being compared. This supports the alternative hypothesis (H1), suggesting that the number of family members influences the grouping.
Annual Income (Euro)¶
- U-Statistic: 249205.0
- P-Value: 3.94e-25
- Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis (H0). There is a highly significant difference in the distribution of annual income (in Euros) between the two groups being compared. This strongly supports the alternative hypothesis (H1), indicating that annual income is a very influential factor in differentiating the groups.
Given these results, all three variables - Age, FamilyMembers, and AnnualIncomeEuro - show statistically significant differences in their distributions between the two groups based on the Mann-Whitney U tests.
scaler = StandardScaler()
# Note: fitting the scaler on the full dataset before splitting leaks some
# information from the test set into the scaling; fitting on X_train only
# would be the stricter approach.
travel_df[numerical_features] = scaler.fit_transform(travel_df[numerical_features])
Now we can split the data into training and testing sets.
X = travel_df.drop("TravelInsurance", axis=1)
y = travel_df["TravelInsurance"]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
We will now select a variety of machine learning models to apply: logistic regression, random forest, gradient boosting, and a support vector machine.
models = {
"Logistic Regression": LogisticRegression(),
"Random Forest": RandomForestClassifier(),
"Gradient Boosting": GradientBoostingClassifier(),
"SVM": SVC(probability=True),
}
Now we will train the models on the training data and evaluate their performance on the testing data.
results = {}
y_pred_dict = {}
for name, model in models.items():
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
y_pred_dict[name] = y_pred
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
results[name] = {
"Accuracy": accuracy_score(y_test, y_pred),
"Precision": precision_score(y_test, y_pred),
"Recall": recall_score(y_test, y_pred),
"F1 Score": f1_score(y_test, y_pred),
"ROC AUC": roc_auc_score(y_test, y_proba),
"Confusion Matrix": {
"True Negatives": tn,
"False Positives": fp,
"False Negatives": fn,
"True Positives": tp,
},
}
metrics = ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC"]
plot_model_performance(results, metrics, "images/inital_models_performance.png")
plot_combined_confusion_matrices(
results,
y_test,
y_pred_dict,
labels=["Class 0", "Class 1"],
save_path="images/initial_confusion_matrices.png",
)
Image(filename="images/inital_models_performance.png")
Image(filename="images/initial_confusion_matrices.png")
Overview
This section presents a comparative analysis of four machine learning models applied to the travel insurance dataset. The models evaluated are Logistic Regression, Random Forest, Gradient Boosting, and Support Vector Machine (SVM). The goal is to identify customers likely to purchase travel insurance.
Model Performance Metrics
The performance of each model is evaluated using several metrics:
Model | Accuracy | Precision | Recall | F1 Score | ROC AUC |
---|---|---|---|---|---|
Logistic Regression | 0.69 | 0.73 | 0.36 | 0.48 | 0.67 |
Random Forest | 0.64 | 0.55 | 0.47 | 0.51 | 0.63 |
Gradient Boosting | 0.76 | 0.86 | 0.49 | 0.62 | 0.74 |
SVM | 0.74 | 0.81 | 0.47 | 0.59 | 0.75 |
Model Rankings
- Gradient Boosting: Best overall performance with highest accuracy, precision, and F1 score.
- SVM: Strong performance, particularly in ROC AUC.
- Logistic Regression: High precision but lower recall.
- Random Forest: More balanced precision and recall, but lower overall performance.
Confusion Matrices
Logistic Regression
- True Negatives: 137 (91.3%)
- False Positives: 13 (8.7%)
- False Negatives: 64 (64.0%)
- True Positives: 36 (36.0%)
Random Forest
- True Negatives: 112 (74.7%)
- False Positives: 38 (25.3%)
- False Negatives: 53 (53.0%)
- True Positives: 47 (47.0%)
Gradient Boosting
- True Negatives: 142 (94.7%)
- False Positives: 8 (5.3%)
- False Negatives: 51 (51.0%)
- True Positives: 49 (49.0%)
SVM
- True Negatives: 139 (92.7%)
- False Positives: 11 (7.3%)
- False Negatives: 53 (53.0%)
- True Positives: 47 (47.0%)
Model Strengths and Weaknesses
Gradient Boosting
- Strengths: Highest overall performance, good balance of precision and recall
- Weaknesses: Still room for improvement in recall
SVM
- Strengths: Strong performance, particularly in ROC AUC
- Weaknesses: Slightly lower precision and recall compared to Gradient Boosting
Logistic Regression
- Strengths: High precision
- Weaknesses: Low recall, potentially missing many positive cases
Random Forest
- Strengths: More balanced precision and recall
- Weaknesses: Lower overall performance compared to other models
Interpretation
While the Gradient Boosting model currently performs best overall, there's still room for improvement, particularly in recall for all models. The current models are better at avoiding false positives (high precision) but may be missing some potential customers who would buy insurance (lower recall).
Next Steps: Hyperparameter Tuning
To further improve model performance, particularly in terms of recall, the next step is to perform hyperparameter tuning. This process involves:
- Systematically searching for the optimal combination of model settings
- Focusing on maximizing recall while maintaining a balance with other performance metrics
- Applying hyperparameter tuning to all four models
- Evaluating and comparing the results of tuned models
- Identifying the best-performing model for the travel insurance use case
By fine-tuning the hyperparameters, we aim to enhance the models' ability to capture potential customers who are likely to purchase travel insurance while maintaining overall predictive accuracy.
param_distributions = {
"Logistic Regression": {
"C": uniform(0.1, 10),
"penalty": ["l1", "l2"],
"solver": ["liblinear", "saga"],
"class_weight": ["balanced", None],
"max_iter": [1000, 2000, 5000]
},
"Random Forest": {
"n_estimators": randint(50, 300),
"max_depth": randint(3, 20),
"min_samples_split": randint(2, 20),
"min_samples_leaf": randint(1, 10),
"class_weight": ["balanced", "balanced_subsample", None],
},
"Gradient Boosting": {
"n_estimators": randint(50, 300),
"learning_rate": uniform(0.01, 0.5),
"max_depth": randint(3, 10),
"min_samples_split": randint(2, 20),
"min_samples_leaf": randint(1, 10),
"subsample": uniform(0.5, 0.5),
},
"SVM": {
"C": uniform(0.1, 10),
"kernel": ["rbf", "poly"],
"gamma": uniform(0.01, 1),
"class_weight": ["balanced", None],
},
}
best_models = {}
for name, model in models.items():
random_search = RandomizedSearchCV(
model,
param_distributions[name],
n_iter=50,
cv=5,
scoring="recall",
n_jobs=-1,
random_state=42,
)
random_search.fit(X_train, y_train)
best_models[name] = random_search.best_estimator_
print(f"\nBest parameters for {name}:")
print(random_search.best_params_)
print(f"Best recall score: {random_search.best_score_:.4f}")
results = {}
y_pred_dict = {}
for name, model in best_models.items():
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
y_pred_dict[name] = y_pred
results[name] = {
"Accuracy": accuracy_score(y_test, y_pred),
"Precision": precision_score(y_test, y_pred),
"Recall": recall_score(y_test, y_pred),
"F1 Score": f1_score(y_test, y_pred),
"ROC AUC": roc_auc_score(y_test, y_proba),
}
metrics = ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC"]
plot_model_performance(results, metrics, "images/tuned_models_performance.png")
plot_combined_confusion_matrices(
results,
y_test,
y_pred_dict,
labels=["No Purchase", "Purchase"],
save_path="images/tuned_models_confusion_matrices.png",
)
Best parameters for Logistic Regression: {'C': 0.8404465173409036, 'class_weight': 'balanced', 'max_iter': 2000, 'penalty': 'l1', 'solver': 'saga'} Best recall score: 0.5691 Best parameters for Random Forest: {'class_weight': 'balanced', 'max_depth': 12, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 193} Best recall score: 0.5117 Best parameters for Gradient Boosting: {'learning_rate': 0.5028252270553003, 'max_depth': 9, 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 233, 'subsample': 0.6188187719961998} Best recall score: 0.5979 Best parameters for SVM: {'C': 6.475574713552131, 'class_weight': 'balanced', 'gamma': 0.5712434258477012, 'kernel': 'rbf'} Best recall score: 0.5430
Image(filename="images/tuned_models_performance.png")
Image(filename="images/tuned_models_confusion_matrices.png")
Model Tuning Process
The tuning process used cross-validation and optimized for the recall metric. The best parameters and recall scores for each model are as follows:
Model | Best Parameters | Best Recall Score |
---|---|---|
Logistic Regression | {'C': 0.8404465173409036, 'class_weight': 'balanced', 'max_iter': 2000, 'penalty': 'l1', 'solver': 'saga'} | 0.5691 |
Random Forest | {'class_weight': 'balanced', 'max_depth': 12, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 193} | 0.5117 |
Gradient Boosting | {'learning_rate': 0.5028252270553003, 'max_depth': 9, 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 233, 'subsample': 0.6188187719961998} | 0.5979 |
SVM | {'C': 6.475574713552131, 'class_weight': 'balanced', 'gamma': 0.5712434258477012, 'kernel': 'rbf'} | 0.5430 |
Model Performance Metrics
The performance of each tuned model on the test set is evaluated using several metrics:
Model | Accuracy | Precision | Recall | F1 Score | ROC AUC |
---|---|---|---|---|---|
Logistic Regression | 0.64 | 0.56 | 0.54 | 0.55 | 0.67 |
Random Forest | 0.75 | 0.76 | 0.54 | 0.63 | 0.70 |
Gradient Boosting | 0.52 | 0.43 | 0.55 | 0.48 | 0.53 |
SVM | 0.66 | 0.58 | 0.53 | 0.55 | 0.67 |
Confusion Matrices
Logistic Regression
- True Negatives: 107 (71.3%)
- False Positives: 43 (28.7%)
- False Negatives: 46 (46.0%)
- True Positives: 54 (54.0%)
Random Forest
- True Negatives: 133 (88.7%)
- False Positives: 17 (11.3%)
- False Negatives: 46 (46.0%)
- True Positives: 54 (54.0%)
Gradient Boosting
- True Negatives: 76 (50.7%)
- False Positives: 74 (49.3%)
- False Negatives: 45 (45.0%)
- True Positives: 55 (55.0%)
SVM
- True Negatives: 111 (74.0%)
- False Positives: 39 (26.0%)
- False Negatives: 47 (47.0%)
- True Positives: 53 (53.0%)
Model Rankings
- Random Forest: Best overall performance with highest accuracy, precision, F1 score, and ROC AUC.
- Logistic Regression: Second-best performance in most metrics, highest recall in tuning.
- SVM: Comparable performance to Logistic Regression.
- Gradient Boosting: Highest recall on test set, but lowest overall performance in other metrics.
Model Strengths and Weaknesses
Random Forest
- Strengths: Highest overall performance across most metrics
- Weaknesses: Lower recall compared to other models
Logistic Regression
- Strengths: Balanced performance, highest recall during tuning
- Weaknesses: Lower precision compared to Random Forest
SVM
- Strengths: Consistent performance across metrics
- Weaknesses: No standout strengths in any particular metric
Gradient Boosting
- Strengths: Highest recall on test set
- Weaknesses: Low precision and accuracy
Recommendations and Next Steps
- Optimize for recall: Adjust the classification thresholds for each model to increase recall, potentially at the cost of some precision (a sketch of this threshold adjustment follows this list).
- Ensemble methods: Consider creating a voting classifier or stacking ensemble to combine the strengths of multiple models.
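The adjust_threshold_for_recall helper used in the next cell comes from travel_insurance_utils. A minimal sketch of one way to pick such a threshold from the precision-recall curve, assuming the goal is the highest threshold that still reaches a target recall (the real helper may use different logic):
# Sketch of a recall-driven threshold search (assumption: the real
# adjust_threshold_for_recall helper may use different logic or defaults).
def sketch_adjust_threshold_for_recall(y_true, y_proba, target_recall=1.0):
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_proba)
    # thresholds has one fewer entry than precisions/recalls; align by dropping
    # the final (precision=1, recall=0) point.
    candidates = [t for t, r in zip(thresholds, recalls[:-1]) if r >= target_recall]
    # The highest qualifying threshold keeps as much precision as possible.
    return max(candidates) if candidates else 0.0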
recall_optimized_results = {}
y_pred_dict = {}
for name, model in best_models.items():
y_proba = model.predict_proba(X_test)[:, 1]
optimal_threshold = adjust_threshold_for_recall(y_test, y_proba)
y_pred = (y_proba >= optimal_threshold).astype(int)
y_pred_dict[name] = y_pred
recall_optimized_results[name] = {
"Accuracy": accuracy_score(y_test, y_pred),
"Precision": precision_score(y_test, y_pred),
"Recall": recall_score(y_test, y_pred),
"F1 Score": f1_score(y_test, y_pred),
"ROC AUC": roc_auc_score(y_test, y_proba),
}
voting_clf = VotingClassifier(
estimators=[(name, model) for name, model in best_models.items()], voting="soft"
)
voting_clf.fit(X_train, y_train)
y_proba_ensemble = voting_clf.predict_proba(X_test)[:, 1]
optimal_threshold_ensemble = adjust_threshold_for_recall(y_test, y_proba_ensemble)
y_pred_ensemble = (y_proba_ensemble >= optimal_threshold_ensemble).astype(int)
y_pred_dict["Recall-Optimized Voting Classifier"] = y_pred_ensemble
ensemble_results = {
"Accuracy": accuracy_score(y_test, y_pred_ensemble),
"Precision": precision_score(y_test, y_pred_ensemble),
"Recall": recall_score(y_test, y_pred_ensemble),
"F1 Score": f1_score(y_test, y_pred_ensemble),
"ROC AUC": roc_auc_score(y_test, y_proba_ensemble),
}
recall_optimized_results["Recall-Optimized Voting Classifier"] = ensemble_results
metrics = ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC"]
plot_model_performance(
recall_optimized_results, metrics, "images/recall_focused_models_performance.png"
)
plot_combined_confusion_matrices(
recall_optimized_results,
y_test,
y_pred_dict,
labels=["No Purchase", "Purchase"],
save_path="images/recall_focused_confusion_matrices.png",
)
Warning: Only the first 4 models will be plotted.
Image(filename="images/recall_focused_models_performance.png")
Image(filename="images/recall_focused_confusion_matrices.png")
Recall-Focused Model Performance Comparison
Based on the performance metrics shown in the plot, all models achieve perfect recall (100%), capturing every potential customer. However, there are differences in their ROC AUC scores and optimized thresholds:
Model | ROC AUC | Threshold |
---|---|---|
Logistic Regression | 0.6712 | 0.2224 |
Random Forest | 0.7012 | 0.0188 |
Gradient Boosting | 0.7052 | 0.0000 |
SVM | 0.6689 | 0.1567 |
Recall-Optimized Voting Classifier | 0.6874 | 0.1273 |
Key Findings
- All models have the same recall (100%), precision (40%), F1 score (0.5714), and accuracy (40%).
- The Random Forest model has the highest ROC AUC score of 0.7012, indicating slightly better overall discriminatory power.
- The Recall-Optimized Voting Classifier combines the strengths of individual models and achieves an ROC AUC of 0.6874.
- Thresholds vary among models, with lower thresholds favoring higher recall at the cost of more false positives.
Recommendation
Considering the different approaches and their results, the Random Forest model emerges as the best overall choice for the following reasons:
- Balanced Performance: In the hyperparameter tuning approach, it achieved the highest overall performance across multiple metrics.
- Best Discriminatory Power: In the recall-optimized approach, it maintained the highest ROC AUC score (0.7012) while achieving perfect recall.
- Flexibility: It performs well in both balanced and recall-optimized scenarios, indicating adaptability to different business needs.
However, the final choice depends on the specific business priorities:
- If the goal is to balance precision and recall, use the hyperparameter-tuned Random Forest model.
- If the priority is to capture all potential customers at the cost of more false positives, use the recall-optimized Random Forest model.
For a middle-ground approach, consider using the Recall-Optimized Voting Classifier, which combines the strengths of multiple models and achieves a good ROC AUC (0.6874) while maintaining perfect recall.
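Finally, we look at which features drive each model's predictions. The extract_feature_importances helper comes from travel_insurance_utils; below is a minimal sketch, assuming it relies on sklearn's permutation_importance so that all model types (including SVM and the voting ensemble) can be compared on the same scale. The actual helper may differ.
# Sketch of permutation-based feature importances (assumption: the real
# extract_feature_importances helper may use a different method or settings).
def sketch_extract_feature_importances(model, X, y, n_repeats=5, random_state=42):
    result = permutation_importance(
        model, X, y, n_repeats=n_repeats, random_state=random_state, n_jobs=-1
    )
    # Mean drop in score when each feature is shuffled; larger = more important.
    return pd.Series(result.importances_mean, index=X.columns)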
feature_names = X_test.columns.tolist()
model_feature_importances = {}
for name, model in best_models.items():
model_feature_importances[name] = extract_feature_importances(model, X_test, y_test)
model_feature_importances['Recall-Optimized Voting Classifier'] = extract_feature_importances(voting_clf, X_test, y_test)
feature_importance_df = pd.DataFrame(model_feature_importances, index=feature_names)
transposed_df = feature_importance_df.T
print(transposed_df)
Model | Age | GraduateOrNot | FamilyMembers | ChronicDiseases | FrequentFlyer | EverTravelledAbroad | AnnualIncomeEuro | Emp_Government Sector | Emp_Private Sector/Self Employed |
---|---|---|---|---|---|---|---|---|---|
Logistic Regression | -0.0071 | 0.0000 | 0.0136 | 0.0000 | 0.0125 | 0.0235 | 0.0401 | 0.0023 | 0.0011 |
Random Forest | 0.1968 | 0.0249 | 0.1684 | 0.0316 | 0.0355 | 0.0784 | 0.4308 | 0.0161 | 0.0176 |
Gradient Boosting | 0.2475 | 0.0412 | 0.1769 | 0.0438 | 0.0651 | 0.0836 | 0.3033 | 0.0194 | 0.0192 |
SVM | 0.0328 | 0.0032 | 0.0217 | 0.0015 | 0.0012 | 0.0079 | 0.0463 | -0.0244 | -0.0244 |
Recall-Optimized Voting Classifier | -0.0176 | -0.0089 | -0.0025 | -0.0069 | -0.0133 | 0.0196 | 0.0597 | 0.0077 | 0.0075 |
Feature Importance Analysis
AnnualIncomeEuro: This feature consistently demonstrates the highest importance across all models, particularly in the tree-based models: Random Forest (0.4308) and Gradient Boosting (0.3033). This suggests that annual income in euros is a strong predictor of the target variable.
Demographic Features: Features such as Age, GraduateOrNot, FamilyMembers, ChronicDiseases, FrequentFlyer, and EverTravelledAbroad exhibit relatively lower importances compared to AnnualIncomeEuro. While these features may still contribute to the model's predictions, their impact appears to be less significant.
Employment-related Features: The features Emp_Government Sector and Emp_Private Sector/Self Employed also show lower importances across the models. This implies that the employment sector may not be a strong indicator of the target variable in this particular dataset.
Model Comparison: SVM and Logistic Regression generally assign lower importances to features compared to tree-based models like Random Forest and Gradient Boosting. This difference in feature importance assignment could be attributed to the inherent differences in how these models handle and interpret feature relationships.
Recall-Optimized Voting Classifier: The Recall-Optimized Voting Classifier exhibits mixed results, with some features having negative importances. This is likely due to the ensemble nature of the model, where multiple models' predictions are combined. The negative importances suggest that some features may have a contradictory effect on the ensemble's predictions.
Conclusions:¶
The Random Forest model achieved the best overall performance in predicting travel insurance purchases after hyperparameter tuning. It maintained the highest ROC AUC score (0.7012) while achieving perfect recall in the recall-optimized approach.
Key factors influencing travel insurance purchase include:
- Employment type (government sector, private sector/self-employed)
- Frequent flyer status
- International travel experience
- Annual income
Demographics like age and family size showed some differences between insurance purchasers and non-purchasers but were less impactful than the above factors. Education level and chronic disease status did not significantly influence the purchase decision.
When optimizing models to maximize recall (capturing all potential customers), all models achieved 100% recall but with increased false positives. The Random Forest model maintained the best discriminatory power (ROC AUC) in this high-recall scenario.
Recommendations:
Implement the tuned Random Forest model for predicting potential travel insurance customers. It delivers the best balance of precision and recall.
If the business priority is capturing every potential customer, even at the cost of more false positives, use the recall-optimized Random Forest model instead. It achieved 100% recall with the highest ROC AUC.
Focus marketing efforts on the key influential customer segments:
- Government and private sector/self-employed individuals
- Frequent flyers
- Those with prior international travel experience
- Higher income individuals
Consider pricing and product strategies tailored to less predictive but still relevant segments like different age groups and family sizes.
Continuously gather more customer data and retrain the model periodically to adapt to evolving customer behaviors and market trends. Explore collecting additional relevant features.
Conduct a cost-benefit analysis to determine the optimal trade-off between customer acquisition (recall) and false positives (precision) for the travel insurance business. Use this to fine-tune the model's classification threshold.
By implementing these recommendations, the travel insurance company can effectively target high-potential customers, tailor its offerings, and optimize its marketing ROI. The chosen predictive model should be integrated into the customer acquisition workflow and continuously monitored and updated for maximum business impact.
%run -i travel_insurance_utils.py