Podcast Reviews Analysis¶

Project Overview¶

The goal of this project is to perform an exploratory data analysis (EDA) on a dataset of podcast reviews. The dataset contains roughly 2 million reviews of about 100,000 podcasts. The data is stored in a SQLite database with four tables: categories, podcasts, reviews, and runs.

Data Loading and Preprocessing¶

We will use SQLite and Pandas to load the data and perform initial data cleaning and manipulation. This will include handling missing values, converting data types if necessary, and creating any new variables that might be useful for our analysis.

EDA Questions¶

  1. Podcast Popularity and Ratings

    • Which podcasts have the highest number of reviews?
    • How does the number of reviews correlate with the average rating of a podcast?
    • Is there a correlation between the length of a review and the rating it gives?
  2. Review Analysis

    • What is the distribution of ratings across all reviews? Are there more positive or negative reviews?
    • Can we identify common sentiments in the review content? Are there certain words or phrases that appear more frequently in positive or negative reviews?
  3. Category Analysis

    • What are the most common categories for podcasts? Are there certain categories that have more podcasts than others?
    • How does the distribution of ratings vary across the top 10 categories?
  4. Trends Over Time

    • How have the number of reviews and average ratings changed over time?
    • Are there certain times of the year when more reviews are written or when ratings are particularly high or low?
  5. Author Analysis

    • Are there authors who review more frequently than others? Do they tend to review certain types of podcasts or give certain types of ratings?

Hypotheses¶

a. Null Hypothesis (H0): There is no correlation between the length of a review and the rating it gives.

b. Alternative Hypothesis (H1): There is a correlation between the length of a review and the rating it gives.

Dashboard Creation¶

We will present our results in a dashboard created with Looker. The dashboard will include at least three different types of charts and will be designed to clearly and effectively communicate our findings.

Conclusion and Recommendations¶

We will conclude with a summary of our findings and any recommendations for further analysis or actions based on our results.

In [1]:
%load_ext autoreload
%autoreload 2
In [40]:
import sqlite3 as sql
from pathlib import Path

import kaggle
import pandas as pd
from IPython.display import Image
from statsmodels import stats
from scipy.stats import f_oneway

from podcast_review_insights.utils.podcast_utilities import (
    add_length_column,
    analyze_dataframe,
    convert_arabic_numerals_to_english,
    create_bar_chart,
    create_box_plot,
    create_horizontal_bar_chart,
    create_scatter_plot,
    create_xyz_scatter_plot,
    load_data_from_sql,
    parallel_process,
    perform_sentiment_and_frequency_analysis,
    plot_histogram,
    plot_monthly_reviews_and_ratings,
    remove_punctuation,
    remove_stopwords,
    save_dataframe_to_sql,
    to_lowercase,
)

Data Loading and Preprocessing¶

In [3]:
try:
    root_dir = Path.cwd().parent

    data_dir = root_dir / "data"
    data_dir.mkdir(exist_ok=True)

    required_files = [
        "categories.json",
        "database.sqlite",
        "podcasts.json",
        "reviews.json",
    ]
    files_exist = all((data_dir / file).exists() for file in required_files)

    if not files_exist:
        print("Downloading dataset files...")
        kaggle.api.dataset_download_files(
            "thoughtvector/podcastreviews", path=str(data_dir), unzip=True
        )
        print("Download complete!")
    else:
        print("Files already exist in data directory")

except Exception as e:
    print(f"Error: {str(e)}")
Files already exist in data directory
In [4]:
db_path = Path.cwd().parent / "data" / "database.sqlite"
conn = sql.connect(db_path)

categories_df = load_data_from_sql("categories", conn)
podcasts_df = load_data_from_sql("podcasts", conn)
runs_df = load_data_from_sql("runs", conn)
reviews_df = load_data_from_sql("reviews", conn)
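The load_data_from_sql helper comes from podcast_utilities and its implementation is not shown in this notebook; conceptually it is a thin wrapper around pandas' SQL reader, roughly along these lines (a hedged sketch, not the project's actual code):

# Hypothetical sketch of the helper; the real implementation lives in
# podcast_review_insights.utils.podcast_utilities.
def load_data_from_sql(table_name: str, conn: sql.Connection) -> pd.DataFrame:
    """Read an entire SQLite table into a DataFrame."""
    return pd.read_sql_query(f"SELECT * FROM {table_name}", conn)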
In [5]:
reviews_df["created_at"] = pd.to_datetime(
    reviews_df["created_at"].apply(convert_arabic_numerals_to_english)
)
runs_df["run_at"] = pd.to_datetime(
    runs_df["run_at"].apply(convert_arabic_numerals_to_english)
)
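Some timestamps in the raw data contain Arabic-Indic digits, which pd.to_datetime cannot parse directly. The convert_arabic_numerals_to_english helper is not shown in the notebook; a minimal sketch of the idea (assuming a simple digit translation) is:

# Hypothetical sketch: map Arabic-Indic digits to ASCII digits so that
# pd.to_datetime can parse the timestamp strings.
ARABIC_TO_ENGLISH = str.maketrans("٠١٢٣٤٥٦٧٨٩", "0123456789")

def convert_arabic_numerals_to_english(value: str) -> str:
    return value.translate(ARABIC_TO_ENGLISH)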

Categories DataFrame¶

In [6]:
analyze_dataframe(categories_df, "Categories")
Categories table:

Columns: ['podcast_id', 'category']

Summary statistics:
                              podcast_id         category
count                             212372           212372
unique                            110024              110
top     fa8c359031380cd4f3b38358c8a75f92  society-culture
freq                                   8            18710

Number of duplicates: 0

Number of missing values:
podcast_id    0
category      0
dtype: int64

First few rows:
                         podcast_id              category
0  c61aa81c9b929a66f0c1db6cbe5d8548                  arts
1  c61aa81c9b929a66f0c1db6cbe5d8548  arts-performing-arts
2  c61aa81c9b929a66f0c1db6cbe5d8548                 music
3  ad4f2bf69c72b8db75978423c25f379e                  arts
4  ad4f2bf69c72b8db75978423c25f379e           arts-design

  • The table contains 212,372 rows and 2 columns: 'podcast_id' and 'category'.
  • There are 110,024 unique podcasts and 110 unique categories.
  • The most common category is 'society-culture', appearing 18,710 times.
  • The podcast with ID 'fa8c359031380cd4f3b38358c8a75f92' appears most frequently, with 8 entries.
  • There are no missing values or duplicate rows in the data.
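The summary above is produced by the analyze_dataframe helper, whose implementation is not shown. Judging from the printed output, it likely amounts to something like this sketch (standard pandas calls; the exact implementation may differ):

# Hypothetical sketch reproducing the printed summary: columns, describe(),
# duplicate count, missing-value counts, and the first few rows.
def analyze_dataframe(df: pd.DataFrame, name: str) -> None:
    print(f"{name} table:\n")
    print(f"Columns: {list(df.columns)}\n")
    print("Summary statistics:")
    print(df.describe())
    print(f"\nNumber of duplicates: {df.duplicated().sum()}")
    print("\nNumber of missing values:")
    print(df.isnull().sum())
    print("\nFirst few rows:")
    print(df.head())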

Podcasts DataFrame¶

In [7]:
analyze_dataframe(podcasts_df, "Podcasts")
Podcasts table:

Columns: ['podcast_id', 'itunes_id', 'slug', 'itunes_url', 'title']

Summary statistics:
          itunes_id
count  1.100240e+05
mean   1.309424e+09
std    3.575526e+08
min    7.332927e+07
25%    1.227235e+09
50%    1.467889e+09
75%    1.528936e+09
max    1.665764e+09

Number of duplicates: 0

Number of missing values:
podcast_id    0
itunes_id     0
slug          0
itunes_url    0
title         0
dtype: int64

First few rows:
                         podcast_id   itunes_id  \
0  a00018b54eb342567c94dacfb2a3e504  1313466221   
1  a00043d34e734b09246d17dc5d56f63c   158973461   
2  a0004b1ef445af9dc84dad1e7821b1e3   139076942   
3  a00071f9aaae9ac725c3a586701abf4d  1332508972   
4  a000a500f06555f81220c3eb641aded7  1544900779   

                                    slug  \
0                         scaling-global   
1  cornerstone-baptist-church-of-orlando   
2            mystery-dancing-in-the-dark   
3                      kts-money-matters   
4       word-on-the-street-w-dreak-swift   

                                          itunes_url  \
0  https://podcasts.apple.com/us/podcast/scaling-...   
1  https://podcasts.apple.com/us/podcast/cornerst...   
2  https://podcasts.apple.com/us/podcast/mystery-...   
3  https://podcasts.apple.com/us/podcast/kts-mone...   
4  https://podcasts.apple.com/us/podcast/word-on-...   

                                   title  
0                         Scaling Global  
1  Cornerstone Baptist Church of Orlando  
2           Mystery: Dancing in the Dark  
3                      KTs Money Matters  
4      Word on the Street w/ Dreak Swift  

  • The table contains 110,024 rows and 5 columns: 'podcast_id', 'itunes_id', 'slug', 'itunes_url', and 'title'.
  • There are no missing values or duplicate rows in the data.
In [8]:
podcasts_df = parallel_process(podcasts_df, n_processes=8)

podcasts_df = to_lowercase(podcasts_df, ["title"])
podcasts_df = remove_punctuation(podcasts_df, ["title"])
podcasts_df = remove_stopwords(podcasts_df, ["title"])

save_dataframe_to_sql(podcasts_df, conn, "podcasts_cleaned")
podcasts_cleaned_df = load_data_from_sql("podcasts_cleaned", conn)
  1. remove_non_english: Removed rows from podcasts_df where the "title" column contained non-English content.
  2. to_lowercase: Converted text in the "title" column of podcasts_df to lowercase.
  3. remove_punctuation: Removed punctuation from the "title" column in podcasts_df.
  4. remove_stopwords: Removed stopwords from the "title" column in podcasts_df.
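The to_lowercase, remove_punctuation, and remove_stopwords helpers listed above are defined in podcast_utilities and are not shown in the notebook. A rough sketch of what they might look like (a hedged sketch assuming NLTK's English stopword list, not the project's actual code):

# Hypothetical sketches of the cleaning helpers applied to text columns.
# Requires: nltk.download("stopwords")
import string

from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words("english"))
PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

def to_lowercase(df, columns):
    for col in columns:
        df[col] = df[col].str.lower()
    return df

def remove_punctuation(df, columns):
    for col in columns:
        df[col] = df[col].str.translate(PUNCTUATION_TABLE)
    return df

def remove_stopwords(df, columns):
    for col in columns:
        df[col] = df[col].apply(
            lambda text: " ".join(w for w in text.split() if w not in STOPWORDS)
        )
    return df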

Runs DataFrame¶

In [9]:
analyze_dataframe(runs_df, "Runs")
Runs table:

Columns: ['run_at', 'max_rowid', 'reviews_added']

Summary statistics:
                              run_at     max_rowid  reviews_added
count                             16  1.600000e+01   1.600000e+01
mean   2022-04-01 18:28:13.437499904  4.368318e+06   1.292206e+05
min              2021-05-10 02:53:00  3.266481e+06   0.000000e+00
25%              2021-08-25 17:59:03  3.382702e+06   1.281800e+04
50%              2022-06-14 05:59:54  4.167376e+06   1.908900e+04
75%    2022-09-13 17:05:22.249999872  5.330927e+06   3.799075e+04
max              2023-02-18 02:11:53  5.599789e+06   1.215223e+06
std                              NaN  9.678974e+05   3.194665e+05

Number of duplicates: 0

Number of missing values:
run_at           0
max_rowid        0
reviews_added    0
dtype: int64

First few rows:
               run_at  max_rowid  reviews_added
0 2021-05-10 02:53:00    3266481        1215223
1 2021-06-06 21:34:36    3300773          13139
2 2021-07-02 18:04:55    3329699          11561
3 2021-08-01 17:54:42    3360315          11855
4 2021-09-02 18:00:30    3390165          11714

  • The table contains 16 rows and 3 columns: 'run_at', 'max_rowid', and 'reviews_added'.
  • The 'max_rowid' column has a mean value of approximately 4,368,318, with a standard deviation of 967,897.
  • The minimum 'max_rowid' is 3,266,481 and the maximum is 5,599,789.
  • The 'reviews_added' column has a mean value of approximately 129,220, with a standard deviation of 319,466.
  • There are no missing values or duplicate rows in the data.

Reviews DataFrame¶

In [10]:
analyze_dataframe(reviews_df, "Reviews")
Reviews table:

Columns: ['podcast_id', 'title', 'content', 'rating', 'author_id', 'created_at']

Summary statistics:
             rating
count  2.067529e+06
mean   4.627081e+00
std    1.045882e+00
min    1.000000e+00
25%    5.000000e+00
50%    5.000000e+00
75%    5.000000e+00
max    5.000000e+00

Number of duplicates: 655

Number of missing values:
podcast_id    0
title         0
content       0
rating        0
author_id     0
created_at    0
dtype: int64

First few rows:
                         podcast_id  \
0  c61aa81c9b929a66f0c1db6cbe5d8548   
1  c61aa81c9b929a66f0c1db6cbe5d8548   
2  ad4f2bf69c72b8db75978423c25f379e   
3  ad4f2bf69c72b8db75978423c25f379e   
4  ad4f2bf69c72b8db75978423c25f379e   

                                              title  \
0                               really interesting!   
1  Must listen for anyone interested in the arts!!!   
2                                 nauseatingly left   
3                                   Diverse stories   
4                                              👍👍👍👍   

                                             content  rating        author_id  \
0  Thanks for providing these insights.  Really e...       5  F7E5A318989779D   
1  Super excited to see this podcast grow. So man...       5  F6BF5472689BD12   
2  I'm a liberal myself, but its pretty obvious a...       1  1AB95B8E6E1309E   
3  I find Tedx talks very inspirational but I oft...       5  11BB760AA5DEBD1   
4                I love this podcast, it is so good.       5  D86032C8E57D15A   

                 created_at  
0 2018-04-24 12:05:16-07:00  
1 2018-05-09 18:14:32-07:00  
2 2019-06-11 14:53:39-07:00  
3 2018-05-31 13:08:09-07:00  
4 2019-06-19 13:56:05-07:00  

  • The table contains 2,067,529 rows and 6 columns: 'podcast_id', 'title', 'content', 'rating', 'author_id', and 'created_at'.
  • The 'rating' column has a mean value of approximately 4.63, with a standard deviation of 1.05.
  • The minimum 'rating' is 1 and the maximum is 5.
  • There are 655 duplicate rows in the data.
  • There are no missing values in the data.
In [11]:
reviews_df = parallel_process(reviews_df, n_processes=8)

reviews_df = to_lowercase(reviews_df, ["title", "content"])
reviews_df = remove_punctuation(reviews_df, ["title", "content"])
reviews_df = remove_stopwords(reviews_df, ["title", "content"])
reviews_df = add_length_column(reviews_df, "content", "content_length")

save_dataframe_to_sql(reviews_df, conn, "reviews_cleaned")
reviews_cleaned_df = load_data_from_sql("reviews_cleaned", conn)
  1. remove_non_english: Removed rows from reviews_df where the "title" and "content" columns contained non-English content.
  2. to_lowercase: Converted text in the "title" and "content" columns of reviews_df to lowercase.
  3. remove_punctuation: Removed punctuation from the "title" and "content" columns in reviews_df.
  4. remove_stopwords: Removed stopwords from the "title" and "content" columns in reviews_df.
  5. add_length_column: Added a content_length column to reviews_df containing the length of each review's content.
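The remaining helpers (parallel_process, which drops non-English rows, and add_length_column) are likewise defined in podcast_utilities. A hedged sketch of the idea, assuming language detection with the langdetect package and a simple character count (details are assumptions, not the project's actual code):

# Hypothetical sketch: detect non-English rows in parallel and drop them, then
# add a column with the length of each text entry.
from multiprocessing import Pool

from langdetect import detect

def _is_english(text: str) -> bool:
    try:
        return detect(text) == "en"
    except Exception:  # very short or empty strings can fail detection
        return False

def parallel_process(df, n_processes=8, column="content"):
    with Pool(n_processes) as pool:
        mask = pool.map(_is_english, df[column].fillna("").tolist())
    return df.loc[mask].reset_index(drop=True)

def add_length_column(df, source_column, new_column):
    df[new_column] = df[source_column].str.len()
    return df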

Exploratory Data Analysis¶

Now that our data is cleaned, we can analyze podcast popularity. This includes investigating both the number of reviews received by each podcast and the potential correlation between review count and average rating.

In [22]:
query = """SELECT p.title, COUNT(r.content) as review_count, r.rating as avg_rating
FROM podcasts_cleaned as p
JOIN reviews_cleaned as r
ON p.podcast_id = r.podcast_id
GROUP BY p.title
ORDER BY review_count DESC
LIMIT 5"""

podcast_review_counts = pd.read_sql_query(query, conn)

create_bar_chart(
    podcast_review_counts,
    "title",
    "review_count",
    "Title",
    "Review count",
    "Top 5 Podcasts by Review Count",
    "#dcb0f2",
    "../images/top_5_podcasts_by_review_count.png"
)
In [23]:
Image(filename="../images/top_5_podcasts_by_review_count.png")
Out[23]:

The number of reviews a podcast receives can be an indicator of its popularity.

Among the top 5, "Crime Junkie" stands out with the most reviews, indicating its high popularity.

Next, we can explore the relationship between the number of reviews and the average rating of a podcast. Specifically, we will check if there is a correlation between these two factors.

In [24]:
sample_size = 50000

query = f""" 
SELECT r.podcast_id,
p.title,
COUNT(r.rating) as num_reviews,
AVG(r.rating) as avg_rating
FROM (SELECT * FROM reviews_cleaned ORDER BY RANDOM() LIMIT {sample_size}) as r
JOIN podcasts_cleaned as p ON r.podcast_id = p.podcast_id
GROUP BY r.podcast_id, p.title"""

podcast_avg_ratings = pd.read_sql_query(query, conn)

correlation = podcast_avg_ratings["num_reviews"].corr(podcast_avg_ratings["avg_rating"])

print(
    f"The correlation between the number of reviews and the average rating is {correlation}"
)

create_scatter_plot(
    podcast_avg_ratings,
    "num_reviews",
    "avg_rating",
    "Number of reviews",
    "Average rating",
    "Number of Reviews vs. Average Rating",
    "#dcb0f2",
    "rgb(128, 0, 128)",
    "../images/number_of_reviews_vs_average_rating.png"
)
The correlation between the number of reviews and the average rating is -0.08419362311496865
In [25]:
Image(filename="../images/number_of_reviews_vs_average_rating.png")
Out[25]:

The correlation coefficient (approximately -0.08) and the scatter plot indicate no clear relationship between the number of reviews and the average rating of a podcast.

With that in mind, we can dig deeper and examine whether the length of a review is associated with the rating it gives. To address this, we propose the following hypotheses:

a. Null Hypothesis (H0): There is no association between the length of a review and the rating it receives.

b. Alternative Hypothesis (H1): There is a statistically significant association between the length of a review and the rating it receives.

Since the rating is a categorical variable and the review length is continuous, we can employ Analysis of Variance (ANOVA). ANOVA is a statistical method used to determine whether there are significant differences between the means of three or more independent groups. In this context, the groups correspond to the different rating categories, and the continuous measure is the review length.

We also compute the p-value, which provides the probability of observing our data (or data more extreme) assuming the null hypothesis is true. If the p-value is less than our chosen significance level (commonly 0.05), we reject the null hypothesis in favor of the alternative hypothesis. In simpler terms, if the p-value is less than 0.05, we conclude that there is a statistically significant association between review length and rating.

In [31]:
sample_df = reviews_cleaned_df.sample(n=sample_size)

lengths = sample_df["content_length"]
ratings = sample_df["rating"]

create_box_plot(
    sample_df,
    "rating",
    "content_length",
    "Review Length vs Rating",
    "Rating",
    "Review Length",
    "#dcb0f2",
    "../images/review_length_vs_rating.png"
)
 
groups = [sample_df.loc[sample_df.rating == i, "content_length"] for i in range(1, 6)]

F, p = f_oneway(*groups)

print(f"F statistic: {F}")
print(f"p-value: {p}")
F statistic: 232.92963187877754
p-value: 1.5966844017614888e-198
In [32]:
Image(filename="../images/review_length_vs_rating.png")
Out[32]:

Our analysis indicates that the p-value is far below 0.05 (approximately 1.6e-198). This leads us to reject the null hypothesis (H0), which states there is no association between the length of a review and the rating it receives, and suggests a statistically significant association between these two variables.

The F-statistic measures the extent to which the means of each group differ from the overall mean of the dataset. A larger F-statistic indicates greater differences between the group means. In our case, the F-statistic is quite large (approximately 232.9), indicating a significant difference in review lengths across the different rating categories.

The box plot visualization displays the distribution of review lengths for each rating category. It appears that reviews with higher ratings tend to be longer, although there is considerable overlap and many outliers.

In conclusion, based on our ANOVA results and the box plot visualization, there seems to be a statistically significant association between review length and rating. However, the relationship is not straightforward due to considerable overlap between the different rating categories.

With these insights in mind, we can further explore the following questions:

  • What is the distribution of ratings across all reviews? Are there more positive or negative reviews?
  • Can we identify common sentiments in the review content? Are there certain words or phrases that appear more frequently in positive or negative reviews?
In [33]:
plot_histogram(
    reviews_cleaned_df,
    "rating",
    "Distribution of Reviews",
    "Rating",
    "Percentage",
    "#dcb0f2",
    "../images/rating_distribution.png"
)
In [34]:
Image(filename="../images/rating_distribution.png")
Out[34]:

The histogram reveals that the majority of reviews (86.66%) awarded a rating of 5, indicating a high level of satisfaction.

Ratings of 4, 3, 2, and 1 were considerably less common, each constituting between 2.22% and 5.56% of reviews.

This data shows that there are significantly more positive reviews than negative ones.

The distribution of ratings, coupled with the predominance of positive reviews, suggests an overall high level of listener satisfaction.

Moving forward, we can delve deeper into the data by performing a sentiment analysis of the review content. The following question could guide our exploration:

  • Can we identify common sentiments in the review content? Are there certain words or phrases that appear more frequently in positive or negative reviews?
In [52]:
common_words_dict = perform_sentiment_and_frequency_analysis(
    reviews_cleaned_df, "content"
)
Common words in negative reviews: [('’', 120508), ('podcast', 68627), ('like', 33461), ('listen', 29845), ('show', 23171), ('one', 20182), ('listening', 19311), ('get', 18945), ('episode', 18739), ('time', 16114)]
Common words in positive reviews: [('’', 795258), ('podcast', 712919), ('love', 426439), ('great', 364560), ('listen', 248064), ('show', 215540), ('like', 213910), ('listening', 198268), ('one', 169926), ('really', 165311)]

In Negative Reviews, the most frequent words include podcast, like, listen, show, one, listening, get, episode, and time. These words, while not inherently negative, often appear in contexts expressing dissatisfaction (e.g., “I don’t get why…”).

In contrast, Positive Reviews often include words like podcast, love, great, listen, show, like, listening, one, and really. Words like love and great are typically associated with positive sentiments, suggesting enjoyment or appreciation of the podcast.

When comparing Words More Frequent in Positive or Negative Reviews:

  • Love and great appear significantly more often in positive reviews, suggesting a positive sentiment.
  • Conversely, get and time rank among the most frequent words only in negative reviews, suggesting they are often used in expressions of dissatisfaction.
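The perform_sentiment_and_frequency_analysis helper is not shown in the notebook. Based on its output, it appears to split reviews into positive and negative groups by rating and count word frequencies in each group; a sketch under that assumption (the 4-5 / 1-2 rating split is a guess):

# Hypothetical sketch: split reviews by rating and count the most common words
# in each group. The rating thresholds are assumptions, not the project's code.
from collections import Counter

def perform_sentiment_and_frequency_analysis(df, text_column, top_n=10):
    positive = df.loc[df["rating"] >= 4, text_column]
    negative = df.loc[df["rating"] <= 2, text_column]

    def top_words(series):
        counter = Counter()
        for text in series.dropna():
            counter.update(str(text).split())
        return counter.most_common(top_n)

    common_words = {
        "negative": top_words(negative),
        "positive": top_words(positive),
    }
    print(f"Common words in negative reviews: {common_words['negative']}")
    print(f"Common words in positive reviews: {common_words['positive']}")
    return common_words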

Having analyzed the review content, let's transition to exploring Category Analysis:

  • What are the most common categories for podcasts?
  • Are there certain categories that have more podcasts than others?
In [35]:
query = """
SELECT 
    c.category, 
    COUNT(DISTINCT c.podcast_id) as num_podcasts
FROM 
    categories as c
JOIN
    podcasts_cleaned as p
ON
    c.podcast_id = p.podcast_id
GROUP BY 
    c.category
ORDER BY 
    num_podcasts DESC
LIMIT 10;
"""
most_common_categories = pd.read_sql_query(query, conn)

create_bar_chart(
    most_common_categories,
    "category",
    "num_podcasts",
    "Category",
    "Number of Podcasts",
    "Top 10 Most Common Podcast Categories",
    "#dcb0f2",
    "../images/top_10_most_common_podcast_categories.png"
)
In [36]:
Image(filename="../images/top_10_most_common_podcast_categories.png")
Out[36]:

The most common categories for podcasts, as revealed by the graph, are as follows:

  1. 'Society-Culture' is the most popular category, with 16,772 podcasts.
  2. The 'Education', 'Religion-Spirituality', 'Comedy', and 'Business' categories follow, each with around 10,000 podcasts.

The remaining categories, namely 'Arts', 'Sports', 'Christianity', 'Health-Fitness', and 'TV-Film', have fewer podcasts.

Therefore, it is clear that certain categories, specifically 'Society-Culture', 'Education', and 'Religion-Spirituality', have more podcasts than others.

Next, we can investigate how the distribution of ratings varies across the top 10 categories in terms of the number of reviews. This will help us understand whether certain categories tend to receive higher or lower ratings. Additionally, we can determine if the list of these podcasts remains the same when sorted by the number of reviews.

In [38]:
query = """
SELECT 
    c.category, 
    r.rating, 
    COUNT(*) as count
FROM 
    reviews_cleaned as r
JOIN 
    categories as c ON r.podcast_id = c.podcast_id
WHERE 
    c.category IN (
        SELECT category 
        FROM categories 
        GROUP BY category 
        ORDER BY COUNT(*) DESC 
        LIMIT 10
    )
GROUP BY 
    c.category, 
    r.rating;
"""
categories_distribution_by_rating = pd.read_sql_query(query, conn)

color_list = ["#ffa600", "#ff8ca3", "#ff9061", "#ff9cd7", "#dcb0f2"]

create_horizontal_bar_chart(
    categories_distribution_by_rating,
    "count",
    "category",
    "Number of Reviews",
    "Category",
    "Distribution of Ratings Across Top 10 Categories",
    color_list,
    "../images/distribution_of_ratings_across_top_10_categories.png"
)
In [39]:
Image(filename="../images/distribution_of_ratings_across_top_10_categories.png")
Out[39]:

The top categories are the same in both charts, whether ranked by number of podcasts or by number of reviews. Furthermore, in our Category Analysis we found that the society-culture category has received a significant number of reviews, most of them positive (rating 5), which suggests that podcasts in this category are generally well received. Categories like tv-film, sports, and religion-spirituality also have a substantial number of reviews, but they show a more varied distribution of ratings, indicating mixed audience reception. Lower ratings (1 and 2) are less frequent across all categories, suggesting either that listeners are less likely to leave negative reviews or that fewer podcasts receive low ratings.

Having understood the distribution of ratings across different categories, we can now examine how reviewing activity and ratings have evolved. This brings us to Trends Over Time, where we aim to answer the following question:

  • How have the number of reviews and average ratings changed over time?

Let’s explore this further.

In [42]:
query = """
    SELECT 
        DATE(created_at) as date, 
        COUNT(*) as num_reviews, 
        AVG(rating) as avg_rating
    FROM 
        reviews_cleaned
    GROUP BY 
        date
    ORDER BY 
        date;
    """
reviews_over_time = pd.read_sql_query(query, conn)

color_list = ["#ff9cd7", "#ffa600", "#dcb0f2"]

create_xyz_scatter_plot(
    reviews_over_time,
    "date",
    "num_reviews",
    "avg_rating",
    "Date",
    "Number of Reviews",
    "Number of Reviews Over Time",
    color_list,
    "../images/number_of_reviews_over_time.png"
)
In [43]:
Image(filename="../images/number_of_reviews_over_time.png")
Out[43]:
  • Number of Reviews: The number of reviews generally increased over time, with a noticeable peak around June 2020 followed by a decline. The volume of reviews has fluctuated over the years, but the overall trend is upward.

  • Average Rating: The color gradient of the plot points indicates the average rating, with darker shades representing higher ratings. The average rating appears to have risen until around 2018 and then declined slightly, as indicated by the lighter shades in later years.

Now, let's move to a more in-depth analysis and answer the following question:

  • Are there certain times of the year when more reviews are written or when ratings are particularly high or low?
In [44]:
query = """
SELECT 
    strftime('%m', created_at) as month, 
    COUNT(*) as num_reviews, 
    AVG(rating) as avg_rating
FROM 
    reviews_cleaned
GROUP BY 
    month
ORDER BY 
    month;
"""
monthly_reviews = pd.read_sql_query(query, conn)

plot_monthly_reviews_and_ratings(
    monthly_reviews,
    "month",
    "num_reviews",
    "avg_rating",
    "Month",
    "Number of Reviews",
    "Average Rating",
    "#dcb0f2",
    "rgb(128, 0, 128)",
    "../images/monthly_reviews_and_ratings.png"
)
In [45]:
Image(filename="../images/monthly_reviews_and_ratings.png")
Out[45]:

Number of Reviews: The number of reviews fluctuates throughout the year. The most reviews are written in January (164,907) and the fewest in December (136,436). This could suggest that more users are active and writing reviews at the beginning of the year, with activity decreasing towards the end of the year.

Average Rating: The average rating also varies slightly from month to month. The highest average rating occurs in February (approximately 4.66) and the lowest in December (approximately 4.61). This could indicate that users tend to give slightly higher ratings in February than in other months, although the difference is small and may not be meaningful.

Now, let's move on to the next question:

  • Are there authors who review more frequently than others?
In [36]:
query = """SELECT author_id, COUNT(*) as num_reviews
FROM reviews_cleaned
GROUP BY author_id
ORDER BY num_reviews DESC
LIMIT 5"""

active_authors = pd.read_sql_query(query, conn)

color_dict = {
    'D3307ADEFFA285C': '#dcb0f2',  
    '96A3F0358E7808D': '#ff9cd7',  
    '7F9EA8B1A449BFC': '#ff8ca3',  
    '4B25015E4D7A1DA': '#ff9061',  
    '791F376ACA3C08D' : '#ffa600'  
}


create_bar_chart(
    active_authors,
    "author_id",
    "num_reviews",
    "Author ID",
    "Number of Reviews",
    "Top 5 Most Active Authors",
    color_dict,
    "../images/top_5_most_active_authors.png"
)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[36], line 18
      7 active_authors = pd.read_sql_query(query, conn)
      9 color_dict = {
     10     'D3307ADEFFA285C': '#dcb0f2',  
     11     '96A3F0358E7808D': '#ff9cd7',  
   (...)
     14     '791F376ACA3C08D' : '#ffa600'  
     15 }
---> 18 create_bar_chart(
     19     active_authors,
     20     "author_id",
     21     "num_reviews",
     22     "Author ID",
     23     "Number of Reviews",
     24     "Top 5 Most Active Authors",
     25     color_dict,
     26     "../images/top_5_most_active_authors.png"
     27 )

File ~/podcast-review-insights/src/podcast_review_insights/utils/podcast_utilities.py:365, in create_bar_chart(df, x, y, x_name, y_name, title, color, save_path)
    363 fig.update_layout(xaxis_title=x_name, yaxis_title=y_name, title_x=0.5)
    364 if save_path:
--> 365     fig.write_image(save_path)
    366 else:
    367     fig.show()

File ~/podcast-review-insights/.venv/lib/python3.11/site-packages/plotly/basedatatypes.py:3827, in BaseFigure.write_image(self, *args, **kwargs)
   3767 """
   3768 Convert a figure to a static image and write it to a file or writeable
   3769 object
   (...)
   3823 None
   3824 """
   3825 import plotly.io as pio
-> 3827 return pio.write_image(self, *args, **kwargs)

File ~/podcast-review-insights/.venv/lib/python3.11/site-packages/plotly/io/_kaleido.py:266, in write_image(fig, file, format, scale, width, height, validate, engine)
    250             raise ValueError(
    251                 """
    252 Cannot infer image type from output path '{file}'.
   (...)
    260                 )
    261             )
    263     # Request image
    264     # -------------
    265     # Do this first so we don't create a file if image conversion fails
--> 266     img_data = to_image(
    267         fig,
    268         format=format,
    269         scale=scale,
    270         width=width,
    271         height=height,
    272         validate=validate,
    273         engine=engine,
    274     )
    276     # Open file
    277     # ---------
    278     if path is None:
    279         # We previously failed to make sense of `file` as a pathlib object.
    280         # Attempt to write to `file` as an open file descriptor.

File ~/podcast-review-insights/.venv/lib/python3.11/site-packages/plotly/io/_kaleido.py:132, in to_image(fig, format, width, height, scale, validate, engine)
    130     # Raise informative error message if Kaleido is not installed
    131     if scope is None:
--> 132         raise ValueError(
    133             """
    134 Image export using the "kaleido" engine requires the kaleido package,
    135 which can be installed using pip:
    136     $ pip install -U kaleido
    137 """
    138         )
    140     # Validate figure
    141     # ---------------
    142     fig_dict = validate_coerce_fig_to_dict(fig, validate)

ValueError: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido
In [37]:
Image(filename="../images/top_5_most_active_authors.png")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[37], line 1
----> 1 Image(filename="../images/top_5_most_active_authors.png")

NameError: name 'Image' is not defined

It appears that there are indeed authors who review more frequently than others.

The author with ID D3307ADEFFA285C has written the most reviews, with 612 in total, followed by 791F376ACA3C08D with 495 reviews. The other authors listed have written fewer.

Furthermore, we can check whether they tend to give certain types of ratings.

In [48]:
query = """SELECT author_id, AVG(rating) as avg_rating
FROM reviews_cleaned
WHERE author_id IN (SELECT author_id FROM reviews_cleaned GROUP BY author_id ORDER BY COUNT(*) DESC LIMIT 5)
GROUP BY author_id
ORDER BY avg_rating DESC;"""

author_ratings = pd.read_sql_query(query, conn)

create_bar_chart(
    author_ratings,
    "author_id",
    "avg_rating",
    "Author ID",
    "Average Rating",
    "Average Rating of Top 5 Authors",
    color_dict,
    "../images/average_rating_of_top_5_authors.png"
)
In [49]:
Image(filename="../images/average_rating_of_top_5_authors.png")
Out[49]:
  1. Four authors consistently give perfect scores: Authors with IDs D3307ADEFFA285C, 96A3F0358E7808D, 7F9EA8B1A449BFC, and 4B25015E4D7A1DA all have an average rating of 5. This suggests that they tend to give the highest possible rating in their reviews.

  2. One author has a slightly lower average rating: The author with ID 791F376ACA3C08D has an average rating of approximately 4.04. While this is slightly lower than the perfect score of 5, it's still a high rating, indicating that this author also tends to give high ratings, though not always the highest.

  3. Overall high ratings given: All five authors give high average ratings, suggesting that they generally have a positive view of the podcasts they review.

Lastly, we can check if they tend to review certain categories.

In [50]:
query = """
WITH TopAuthorsReviews AS (
    SELECT r.author_id, c.category, COUNT(*) as num_reviews
    FROM reviews_cleaned as r
    JOIN categories as c ON r.podcast_id = c.podcast_id
    WHERE r.author_id IN (SELECT author_id FROM reviews_cleaned GROUP BY author_id ORDER BY COUNT(*) DESC LIMIT 5)
    GROUP BY r.author_id, c.category
)

SELECT category, SUM(num_reviews) as total_reviews
FROM TopAuthorsReviews
GROUP BY category
ORDER BY total_reviews DESC
LIMIT 10;
"""

active_authors_categories = pd.read_sql_query(query, conn)

create_bar_chart(
    active_authors_categories,
    "category",
    "total_reviews",
    "Category",
    "Total Reviews",
    "Top 10 Categories Reviewed by Top 5 Authors",
    "#DCB0F2",
    "../images/top_10_categories_reviewed_by_top_5_authors.png"
)
In [51]:
Image(filename="../images/top_10_categories_reviewed_by_top_5_authors.png")
Out[51]:
  1. Business is the most reviewed category: The top 5 authors tend to review podcasts in the 'business' category more often than any other category.
  2. Comedy and Education are also popular: These categories have the second and third highest number of reviews respectively, indicating their popularity among the top 5 authors.
  3. Variety in categories: The top 5 authors review podcasts across a wide variety of categories, including 'society-culture', 'health-fitness', 'religion-spirituality', 'business-entrepreneurship', 'Christianity', 'business-careers', and 'tv-film'.
  4. Differences in review counts: There is a significant difference in the number of reviews between the top 2 categories ('business', 'comedy') and the rest, suggesting a strong preference for these categories among the top 5 authors.

Key Findings¶

Data Cleaning:¶

  • Non-English entries were removed, ensuring data focus.
  • Text was transformed to lowercase for consistent analysis.
  • Unnecessary punctuation and stop words were eliminated for clarity.
  • A "length" column was added to the reviews for further investigation.

Podcast Popularity:¶

  • Review count served as a popularity indicator, with "Crime Junkie" leading the pack.
  • Intriguingly, no clear correlation emerged between review count and average rating.

Review Length and Rating:¶

  • An ANOVA test was conducted to investigate the association between the length of a review and the rating it receives.
  • The null hypothesis (H0) stated there was no association between the length of a review and the rating it receives, while the alternative hypothesis (H1) proposed an association.
  • The p-value obtained was significantly less than 0.05, leading to the rejection of the null hypothesis. This suggests a statistically significant association between review length and rating.
  • The box plot visualization indicated that reviews with higher ratings tend to be longer, although there was considerable overlap and many outliers.

Distribution of Ratings:¶

  • The majority (86.66%) of reviews reflected high satisfaction with a rating of 5.
  • Ratings of 4, 3, 2, and 1 were significantly less frequent.

Sentiment Analysis:¶

  • In Negative Reviews, the most frequent words include podcast, like, listen, show, one, listening, get, episode, and time. These words, while not inherently negative, often appear in contexts expressing dissatisfaction.
  • In contrast, Positive Reviews often include words like podcast, love, great, listen, show, like, listening, one, and really. Words like love and great are typically associated with positive sentiments, suggesting enjoyment or appreciation of the podcast.

Category Analysis:¶

  • "Society-Culture" topped the charts with the most reviews, mostly positive.
  • Categories like "TV-Film" and "Sports" displayed a broader range of ratings.

Trends Over Time:¶

  • The number of reviews increased over time, peaking in June 2020.
  • The average rating followed a similar trajectory, peaking in 2018 before a slight decline.
  • Interestingly, monthly variations showed:
    • Highest number of reviews: January
    • Lowest number of reviews: December
    • Highest average rating: February
    • Lowest average rating: December

Author Analysis:¶

  • Review frequency varied significantly among authors, with some reviewing more frequently than others.
  • Rating behavior showcased interesting patterns:
    • Four authors consistently awarded perfect scores (5).
    • One author provided slightly lower ratings (average of 4.04).
  • All top 5 authors exhibited high average ratings, indicating a generally positive perspective.
  • Review category preferences existed, with "Business" being the most reviewed by the top 5 authors, followed by "Comedy" and "Education." Notably, they reviewed podcasts across diverse categories.

Suggestions for Improvement¶

Author Analysis:¶

Understanding high-frequency reviewers: I could investigate why some reviewers leave so many reviews, for example through a qualitative analysis of their reviews or even by surveying them to understand their motivations and preferences.

Further Exploration:¶

Text analysis with NLP: I could extract deeper meaning from review text using NLP techniques. I could analyze sentiment for individual reviews, identify recurring themes or topics, or even categorize reviews based on their content.
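For example, per-review sentiment scores could be computed with an off-the-shelf analyzer such as NLTK's VADER (illustrative only; not something the project currently uses):

# Hypothetical sketch: score each review with NLTK's VADER sentiment analyzer.
# VADER works best on raw text, so the original (uncleaned) review content
# would be a better input than the stopword-stripped version.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

reviews_df["sentiment"] = reviews_df["content"].apply(
    lambda text: sia.polarity_scores(str(text))["compound"]
)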

Predictive modeling: I could consider building models to predict podcast popularity or user ratings based on various factors. This could be valuable for both podcast creators and listeners.
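As a baseline, one could start from something like the following scikit-learn sketch (the feature choice and model are purely illustrative assumptions):

# Hypothetical sketch: predict a review's rating from its length with a simple
# linear model. A real model would add richer features (category, text, etc.).
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

features = reviews_cleaned_df[["content_length"]]
target = reviews_cleaned_df["rating"]

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

model = LinearRegression().fit(X_train, y_train)
print("MAE:", mean_absolute_error(y_test, model.predict(X_test)))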

Asking the right questions:¶

Beyond the above, I could move past basic descriptive statistics and dig deeper to uncover hidden patterns and relationships. Here are some key questions I could consider:

  • What factors influence podcast popularity?
  • What characteristics define highly-rated podcasts?
  • How do user sentiments and preferences differ across podcast categories?
  • Can I use the data to predict user ratings or identify emerging trends?