%load_ext autoreload
%autoreload 2


import os
import sqlite3

import pandas as pd
from IPython.display import Image

from red_wine_quality_analysis.utils.red_wine_quality_utils import (
    get_columns,
    identify_outliers,
    log_transform_features,
    plot_box_chart,
    plot_correlation,
    plot_heatmap,
    plot_histograms,
    plot_model_predictions,
    remove_duplicates,
    test_correlation,
    train_linear_model,
)


# Locations of the raw CSV and of its SQLite mirror, relative to the working
# directory the notebook is executed from.
csv_path = os.path.join("..", "data", "winequality_red.csv")
db_path = os.path.join("..", "data", "wine_quality.db")

wine_df = pd.read_csv(csv_path)

# Mirror the raw frame into the "wine_quality" table, replacing any previous
# copy. The connection is closed in ``finally`` so the database handle is
# released even when the write fails; no cell in the visible notebook queries
# through ``conn`` afterwards.
conn = sqlite3.connect(db_path)
try:
    rows_written = wine_df.to_sql(
        "wine_quality", conn, if_exists="replace", index=False
    )
finally:
    conn.close()

# Last expression of the cell: echoes the row count returned by ``to_sql``.
rows_written

1599


# Preview the first rows of the raw data (``head`` defaults to five rows).
wine_df.head()


# Print column names, non-null counts and dtypes to check for missing values.
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


# Replace ``wine_df`` with the frame returned by the de-duplication helper;
# the helper prints how many duplicate rows it removed.
wine_df = remove_duplicates(wine_df)

Removed 240 duplicate rows


# Summary statistics, transposed so that each feature is one row.
wine_df.describe().T


# List the column names as they are before the rename below.
get_columns(wine_df)

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']


# Replace the spaces in the multi-word column names with underscores.
# The renamed frame is assigned back rather than using ``inplace=True``:
# ``wine_df`` was returned by ``remove_duplicates`` and may be derived from
# another frame, so rebinding avoids mutating that object in place.
wine_df = wine_df.rename(
    columns={
        "fixed acidity": "fixed_acidity",
        "volatile acidity": "volatile_acidity",
        "citric acid": "citric_acid",
        "residual sugar": "residual_sugar",
        "free sulfur dioxide": "free_sulfur_dioxide",
        "total sulfur dioxide": "total_sulfur_dioxide",
    },
)


# Draw the feature box plot, save it under ../images, then display the file.
boxplot_path = "../images/boxplot_features.png"
plot_box_chart(
    wine_df, "Feature", "Value", "Boxplot of Features", save_path=boxplot_path
)


Image(filename=boxplot_path)


# Outlier counts for the raw features: first per column, then the grand total.
outlier_info = identify_outliers(wine_df)
per_column_counts = outlier_info["outliers_per_column"]
print(per_column_counts)
total_count = outlier_info["total_outliers"]
print(f"Total number of outliers across all columns: {total_count}")

fixed_acidity            41
volatile_acidity         19
citric_acid               1
residual_sugar          126
chlorides                87
free_sulfur_dioxide      26
total_sulfur_dioxide     45
density                  35
pH                       28
sulphates                55
alcohol                  12
quality                  27
dtype: int64
Total number of outliers across all columns: 502


# Histograms of the raw features, four columns per saved figure. Each figure
# is written to disk by the helper and then displayed from the saved file.
acid_sugar_hist_path = "../images/histograms.png"
acid_sugar_columns = [
    "fixed_acidity",
    "volatile_acidity",
    "citric_acid",
    "residual_sugar",
]
plot_histograms(wine_df, acid_sugar_columns, save_path=acid_sugar_hist_path)


Image(filename=acid_sugar_hist_path)


sulfur_density_hist_path = "../images/histograms1.png"
sulfur_density_columns = [
    "chlorides",
    "free_sulfur_dioxide",
    "total_sulfur_dioxide",
    "density",
]
plot_histograms(
    wine_df, sulfur_density_columns, save_path=sulfur_density_hist_path
)


Image(filename=sulfur_density_hist_path)


ph_quality_hist_path = "../images/histograms2.png"
ph_quality_columns = ["pH", "sulphates", "alcohol", "quality"]
plot_histograms(wine_df, ph_quality_columns, save_path=ph_quality_hist_path)


Image(filename=ph_quality_hist_path)


# Features handed to the log-transform helper. density, pH and quality are
# not listed here.
# NOTE(review): citric_acid has a minimum of 0.0 in the describe() output, so
# the helper presumably applies log1p or an offset rather than a plain log —
# confirm in log_transform_features.
columns_to_transform = [
    "sulphates",
    "alcohol",
    "chlorides",
    "free_sulfur_dioxide",
    "total_sulfur_dioxide",
    "fixed_acidity",
    "volatile_acidity",
    "citric_acid",
    "residual_sugar",
]
# The result is bound to a new name; every later cell works on
# transformed_wine_df.
# NOTE(review): confirm the helper does not also modify wine_df in place.
transformed_wine_df = log_transform_features(wine_df, columns_to_transform)


# Repeat the outlier report on the transformed frame so the counts can be
# compared with the raw-feature report.
outlier_info = identify_outliers(transformed_wine_df)
per_column_counts = outlier_info["outliers_per_column"]
print(per_column_counts)
total_count = outlier_info["total_outliers"]
print(f"Total number of outliers across all columns: {total_count}")

fixed_acidity            12
volatile_acidity          8
citric_acid               0
residual_sugar          108
chlorides                87
free_sulfur_dioxide       0
total_sulfur_dioxide      0
density                  35
pH                       28
sulphates                48
alcohol                   7
quality                  27
dtype: int64
Total number of outliers across all columns: 360


# Pairwise correlation matrix of the transformed features (``corr`` is called
# with its defaults). Kept in ``corr_matrix`` because the heatmap cell reuses it.
corr_matrix = transformed_wine_df.corr()
print(corr_matrix)

                      fixed_acidity  volatile_acidity  citric_acid  \
fixed_acidity              1.000000         -0.259980     0.656309   
volatile_acidity          -0.259980          1.000000    -0.575063   
citric_acid                0.656309         -0.575063     1.000000   
residual_sugar             0.159338          0.020262     0.163785   
chlorides                  0.112359          0.067558     0.194126   
free_sulfur_dioxide       -0.161301          0.005780    -0.061352   
total_sulfur_dioxide      -0.106640          0.077575     0.027812   
density                    0.677643          0.031674     0.352629   
pH                        -0.708034          0.245293    -0.551118   
sulphates                  0.198090         -0.278336     0.331680   
alcohol                   -0.091192         -0.209158     0.095063   
quality                    0.109715         -0.397329     0.227422   

                      residual_sugar  chlorides  free_sulfur_dioxide  \
fixed_acidity               0.159338   0.112359            -0.161301   
volatile_acidity            0.020262   0.067558             0.005780   
citric_acid                 0.163785   0.194126            -0.061352   
residual_sugar              1.000000   0.032933             0.089687   
chlorides                   0.032933   1.000000            -0.006385   
free_sulfur_dioxide         0.089687  -0.006385             1.000000   
total_sulfur_dioxide        0.146414   0.062295             0.786095   
density                     0.380607   0.211604            -0.030833   
pH                         -0.093706  -0.275122             0.079773   
sulphates                  -0.002417   0.359596             0.056486   
alcohol                     0.089509  -0.238845            -0.093764   
quality                     0.020154  -0.137302            -0.047132   

                      total_sulfur_dioxide   density        pH  sulphates  \
fixed_acidity                    -0.106640  0.677643 -0.708034   0.198090   
volatile_acidity                  0.077575  0.031674  0.245293  -0.278336   
citric_acid                       0.027812  0.352629 -0.551118   0.331680   
residual_sugar                    0.146414  0.380607 -0.093706  -0.002417   
chlorides                         0.062295  0.211604 -0.275122   0.359596   
free_sulfur_dioxide               0.786095 -0.030833  0.079773   0.056486   
total_sulfur_dioxide              1.000000  0.109176 -0.030143   0.053566   
density                           0.109176  1.000000 -0.355617   0.152989   
pH                               -0.030143 -0.355617  1.000000  -0.196653   
sulphates                         0.053566  0.152989 -0.196653   1.000000   
alcohol                          -0.247685 -0.500142  0.213968   0.113559   
quality                          -0.165289 -0.184252 -0.055245   0.279517   

                       alcohol   quality  
fixed_acidity        -0.091192  0.109715  
volatile_acidity     -0.209158 -0.397329  
citric_acid           0.095063  0.227422  
residual_sugar        0.089509  0.020154  
chlorides            -0.238845 -0.137302  
free_sulfur_dioxide  -0.093764 -0.047132  
total_sulfur_dioxide -0.247685 -0.165289  
density              -0.500142 -0.184252  
pH                    0.213968 -0.055245  
sulphates             0.113559  0.279517  
alcohol               1.000000  0.481462  
quality               0.481462  1.000000


# Render the correlation matrix as a heatmap, save it, then display the file.
heatmap_path = "../images/correlation_heatmap.png"
plot_heatmap(corr_matrix, save_path=heatmap_path)


Image(filename=heatmap_path)


# Histograms of the transformed frame, using the same four-column grouping as
# the raw-feature histograms. Each figure is saved and then displayed.
acid_sugar_transformed_path = "../images/histograms_transformed.png"
acid_sugar_columns = [
    "fixed_acidity",
    "volatile_acidity",
    "citric_acid",
    "residual_sugar",
]
plot_histograms(
    transformed_wine_df,
    acid_sugar_columns,
    save_path=acid_sugar_transformed_path,
)


Image(filename=acid_sugar_transformed_path)


sulfur_density_transformed_path = "../images/histograms1_transformed.png"
sulfur_density_columns = [
    "chlorides",
    "free_sulfur_dioxide",
    "total_sulfur_dioxide",
    "density",
]
plot_histograms(
    transformed_wine_df,
    sulfur_density_columns,
    save_path=sulfur_density_transformed_path,
)


Image(filename=sulfur_density_transformed_path)


ph_quality_transformed_path = "../images/histograms2_transformed.png"
ph_quality_columns = ["pH", "sulphates", "alcohol", "quality"]
plot_histograms(
    transformed_wine_df,
    ph_quality_columns,
    save_path=ph_quality_transformed_path,
)


Image(filename=ph_quality_transformed_path)


# Test the fixed_acidity / citric_acid correlation on the transformed frame.
# The returned dict holds the coefficient, p-value, H0 decision and 95% CI.
test_correlation(transformed_wine_df, "fixed_acidity", "citric_acid")

{'Correlation': np.float64(0.6563086255295857),
 'P-Value': np.float64(3.131882291957192e-168),
 'Reject H0': np.True_,
 '95% CI': (np.float64(0.6181577261929445), np.float64(0.6944595248662269))}


# Plot fixed_acidity against citric_acid, save the figure, then display it.
acidity_citric_plot_path = "../images/correlation_plot.png"
plot_correlation(
    transformed_wine_df,
    "fixed_acidity",
    "citric_acid",
    save_path=acidity_citric_plot_path,
)


Image(filename=acidity_citric_plot_path)


# Test the quality / alcohol correlation on the transformed frame.
# The returned dict holds the coefficient, p-value, H0 decision and 95% CI.
test_correlation(transformed_wine_df, "quality", "alcohol")

{'Correlation': np.float64(0.4814619565501097),
 'P-Value': np.float64(8.792571112773603e-80),
 'Reject H0': np.True_,
 '95% CI': (np.float64(0.4371434215430752), np.float64(0.5257804915571442))}


# Plot quality against alcohol, save the figure, then display it.
quality_alcohol_plot_path = "../images/correlation_plot1.png"
plot_correlation(
    transformed_wine_df,
    "quality",
    "alcohol",
    save_path=quality_alcohol_plot_path,
)


Image(filename=quality_alcohol_plot_path)


# Test the volatile_acidity / quality correlation on the transformed frame.
# The returned dict holds the coefficient, p-value, H0 decision and 95% CI.
test_correlation(transformed_wine_df, "volatile_acidity", "quality")

{'Correlation': np.float64(-0.39732873875315033),
 'P-Value': np.float64(1.2718423787494175e-52),
 'Reject H0': np.True_,
 '95% CI': (np.float64(-0.4437310239159599), np.float64(-0.35092645359034075))}


# Plot volatile_acidity against quality, save the figure, then display it.
volatile_quality_plot_path = "../images/correlation_plot2.png"
plot_correlation(
    transformed_wine_df,
    "volatile_acidity",
    "quality",
    save_path=volatile_quality_plot_path,
)


Image(filename=volatile_quality_plot_path)


# Test the pH / citric_acid correlation on the transformed frame.
# The returned dict holds the coefficient, p-value, H0 decision and 95% CI.
test_correlation(transformed_wine_df, "pH", "citric_acid")

{'Correlation': np.float64(-0.5511177680998307),
 'P-Value': np.float64(8.277860767471543e-109),
 'Reject H0': np.True_,
 '95% CI': (np.float64(-0.5933105757530911), np.float64(-0.5089249604465703))}


# Plot pH against citric_acid, save the figure, then display it.
ph_citric_plot_path = "../images/correlation_plot3.png"
plot_correlation(
    transformed_wine_df,
    "pH",
    "citric_acid",
    save_path=ph_citric_plot_path,
)


Image(filename=ph_citric_plot_path)


# Fit the linear model with "quality" as the target on the transformed frame,
# then plot the predictions against the returned test values.
quality_plot_path = "../images/model_predictions.png"
quality_fit = train_linear_model(transformed_wine_df, "quality")
model_results, X_test, y_test, y_pred = quality_fit
plot_model_predictions(
    X_test,
    y_test,
    y_pred,
    "Predicted Quality vs Actual Quality",
    save_path=quality_plot_path,
)

Model summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                quality   R-squared:                       0.354
Model:                            OLS   Adj. R-squared:                  0.348
Method:                 Least Squares   F-statistic:                     53.65
Date:                Thu, 13 Feb 2025   Prob (F-statistic):           2.58e-94
Time:                        10:09:43   Log-Likelihood:                -1086.3
No. Observations:                1087   AIC:                             2197.
Df Residuals:                    1075   BIC:                             2256.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  -15.8513     28.148     -0.563      0.573     -71.082      39.380
fixed_acidity           -0.1069      0.331     -0.323      0.747      -0.756       0.543
volatile_acidity        -1.5706      0.238     -6.589      0.000      -2.038      -1.103
citric_acid             -0.2693      0.232     -1.160      0.246      -0.725       0.186
residual_sugar          -0.0229      0.103     -0.222      0.825      -0.226       0.180
chlorides               -2.5049      0.577     -4.343      0.000      -3.637      -1.373
free_sulfur_dioxide      0.1132      0.055      2.039      0.042       0.004       0.222
total_sulfur_dioxide    -0.1681      0.053     -3.175      0.002      -0.272      -0.064
density                 15.9937     28.720      0.557      0.578     -40.360      72.347
pH                      -0.7253      0.253     -2.863      0.004      -1.222      -0.228
sulphates                1.7184      0.260      6.611      0.000       1.208       2.228
alcohol                  3.5327      0.396      8.932      0.000       2.757       4.309
==============================================================================
Omnibus:                       17.971   Durbin-Watson:                   1.954
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               23.847
Skew:                          -0.194   Prob(JB):                     6.63e-06
Kurtosis:                       3.613   Cond. No.                     1.36e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.36e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Mean Squared Error: 0.4263608394154518
R-squared: 0.3980979574615455
Adjusted R-squared: 0.3927318174091967


# Display the saved quality prediction plot.
Image(filename="../images/model_predictions.png")


# Fit the same linear model with "alcohol" as the target. This overwrites
# model_results, X_test, y_test and y_pred from the quality model.
# NOTE(review): "alcohol" is one of the columns passed to the log transform,
# so the target and the reported errors are presumably on the transformed
# scale rather than in the original units — confirm before interpreting them.
alcohol_plot_path = "../images/model_predictions1.png"
alcohol_fit = train_linear_model(transformed_wine_df, "alcohol")
model_results, X_test, y_test, y_pred = alcohol_fit
plot_model_predictions(
    X_test,
    y_test,
    y_pred,
    "Predicted Alcohol vs Actual Alcohol",
    save_path=alcohol_plot_path,
)

Model summary:
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                alcohol   R-squared:                       0.719
Model:                            OLS   Adj. R-squared:                  0.717
Method:                 Least Squares   F-statistic:                     250.7
Date:                Thu, 13 Feb 2025   Prob (F-statistic):          1.63e-287
Time:                        10:09:43   Log-Likelihood:                 1738.0
No. Observations:                1087   AIC:                            -3452.
Df Residuals:                    1075   BIC:                            -3392.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   51.1606      1.397     36.614      0.000      48.419      53.902
fixed_acidity            0.4277      0.021     20.477      0.000       0.387       0.469
volatile_acidity         0.0676      0.018      3.759      0.000       0.032       0.103
citric_acid              0.0801      0.017      4.680      0.000       0.047       0.114
residual_sugar           0.1363      0.006     21.040      0.000       0.124       0.149
chlorides               -0.0770      0.043     -1.782      0.075      -0.162       0.008
free_sulfur_dioxide     -0.0020      0.004     -0.478      0.633      -0.010       0.006
total_sulfur_dioxide    -0.0104      0.004     -2.646      0.008      -0.018      -0.003
density                -51.2766      1.457    -35.205      0.000     -54.135     -48.419
pH                       0.3253      0.016     20.185      0.000       0.294       0.357
sulphates                0.1652      0.019      8.660      0.000       0.128       0.203
quality                  0.0196      0.002      8.932      0.000       0.015       0.024
==============================================================================
Omnibus:                       93.390   Durbin-Watson:                   1.965
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              148.154
Skew:                           0.624   Prob(JB):                     6.74e-33
Kurtosis:                       4.308   Cond. No.                     1.15e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.15e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Mean Squared Error: 0.002494729526941092
R-squared: 0.6849929074277996
Adjusted R-squared: 0.6821845232443922


# Display the saved alcohol prediction plot.
Image(filename="../images/model_predictions1.png")

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality
0	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.9978	3.51	0.56	9.4	5
1	7.8	0.88	0.00	2.6	0.098	25.0	67.0	0.9968	3.20	0.68	9.8	5
2	7.8	0.76	0.04	2.3	0.092	15.0	54.0	0.9970	3.26	0.65	9.8	5
3	11.2	0.28	0.56	1.9	0.075	17.0	60.0	0.9980	3.16	0.58	9.8	6
4	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.9978	3.51	0.56	9.4	5

	count	mean	std	min	25%	50%	75%	max
fixed acidity	1359.0	8.310596	1.736990	4.60000	7.1000	7.9000	9.20000	15.90000
volatile acidity	1359.0	0.529478	0.183031	0.12000	0.3900	0.5200	0.64000	1.58000
citric acid	1359.0	0.272333	0.195537	0.00000	0.0900	0.2600	0.43000	1.00000
residual sugar	1359.0	2.523400	1.352314	0.90000	1.9000	2.2000	2.60000	15.50000
chlorides	1359.0	0.088124	0.049377	0.01200	0.0700	0.0790	0.09100	0.61100
free sulfur dioxide	1359.0	15.893304	10.447270	1.00000	7.0000	14.0000	21.00000	72.00000
total sulfur dioxide	1359.0	46.825975	33.408946	6.00000	22.0000	38.0000	63.00000	289.00000
density	1359.0	0.996709	0.001869	0.99007	0.9956	0.9967	0.99782	1.00369
pH	1359.0	3.309787	0.155036	2.74000	3.2100	3.3100	3.40000	4.01000
sulphates	1359.0	0.658705	0.170667	0.33000	0.5500	0.6200	0.73000	2.00000
alcohol	1359.0	10.432315	1.082065	8.40000	9.5000	10.2000	11.10000	14.90000
quality	1359.0	5.623252	0.823578	3.00000	5.0000	6.0000	6.00000	8.00000

Red Wine Quality Analysis

Positive Correlations:

Negative Correlations:

Hypotheses for Statistical Testing:

Key Findings from Red Wine Quality Analysis

1. Key Physicochemical Properties

2. Distribution of Properties

3. Outliers

4. Influence on Wine Quality

5. Predictive Modeling of Wine Quality

6. Predictors of Alcohol Content

7. Issues Affecting Model Reliability

8. Suggestions for Model Improvement

Conclusion