# us_do_deaths.py

# -*- coding: utf-8 -*-
"""US DO deaths

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1KERzGNBjSb7d69OQuE3uXXFPFa1nKc14
"""

import math
import re

# import matplotlib as mat
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


"""## Data Wrangling"""

# define a function to clean up the column names
def clean_column_name(name):
    """Normalise a raw column label into a short snake_case identifier.

    Steps, in order:
      1. Lower-case the label and turn spaces into underscores.
      2. Drop every character that is neither alphanumeric nor ``_``.
      3. If the label is one string written twice back to back
         (e.g. ``"yearyear"``), keep a single copy.
      4. Remove the tags ``_2017``, ``_cdc_wonder`` and ``_us`` wherever
         they stand as complete underscore-delimited pieces.

    Args:
        name: Raw column label.

    Returns:
        The cleaned column name.
    """
    name = name.lower().replace(" ", "_")
    name = "".join(ch for ch in name if ch.isalnum() or ch == "_")

    # A label can only equal two copies of a prefix when that prefix is
    # exactly half of it, so compare the two halves directly instead of
    # trying every split point.
    half, remainder = divmod(len(name), 2)
    if half and remainder == 0 and name[:half] == name[half:]:
        name = name[:half]

    # Remove each tag only when it ends at an underscore or at the end of
    # the label.  A plain str.replace() also matches the start of a longer
    # word and would turn "drug_use" into "druge".
    for tag in ("_2017", "_cdc_wonder", "_us"):
        name = re.sub(re.escape(tag) + r"(?=_|$)", "", name)

    return name
## ------------- End of function ---------------##

# Download the OWID copy of the CDC WONDER overdose table from GitHub.
overdose_deaths = pd.read_csv("https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/Drug%20overdose%20deaths%20in%20the%20US%20(CDC%20WONDER)/Drug%20overdose%20deaths%20in%20the%20US%20(CDC%20WONDER).csv")

# Store 'Year' as an integer column and discard the 'Entity' column.
overdose_deaths = overdose_deaths.astype({'Year': 'int'}).drop(columns='Entity')

# Run every column label through clean_column_name().
overdose_deaths = overdose_deaths.rename(columns=clean_column_name)

# Notebook-style inspection of the new labels; as a plain script this
# expression has no visible effect.
overdose_deaths.columns

# Report, per column, whether it contains any missing value.
print(overdose_deaths.isna().any())

"""## Begin Exploratory Data Analysis"""

# Summary statistics of the frame; the result is only displayed when this
# runs as a notebook cell.
overdose_deaths.describe()

# Express every column as a percentage of the same row's total overdose deaths.
# DataFrame.div(..., axis=0) lines the divisor up with the rows.  The plain `/`
# operator instead matches the Series' index against the frame's COLUMN labels,
# which share nothing here, so it produces an all-NaN frame.
# NOTE(review): 'year' is divided as well, so that column of the result is not
# a meaningful percentage.
percent_of_total = overdose_deaths.div(overdose_deaths['total_overdose_deaths'], axis=0) * 100

"""#### Calculate Quant"""

# Quartile table computed on the full frame (still includes 'year').
quartiles = overdose_deaths.quantile([0.25, 0.5, 0.75])

# The per-column statistics below leave the 'year' column out.
overdose_deaths_no_year = overdose_deaths.drop(columns=['year'])

# One Series per quartile, indexed by column name, plus the interquartile range.
q1, q2, q3 = (overdose_deaths_no_year.quantile(level) for level in (0.25, 0.5, 0.75))
iqr = q3 - q1

# Assemble the summary: one row per data column, one field per statistic.
quartile_df = pd.DataFrame(
    {
        label: stat.values
        for label, stat in (('q1', q1), ('q2', q2), ('q3', q3), ('iqr', iqr))
    },
    index=q1.index,
)
print(quartile_df)

"""#### Percent of Total Overdose Deaths"""

# Share of each row's total overdose deaths held by each column, in percent,
# rounded to two decimal places.
# The divisor is the 'total_overdose_deaths' column, aligned row by row
# (axis=0).  Dividing by the sum of the whole row instead would add the total
# to every category column, so no value could be read as a share of the total.
# NOTE(review): the '*_death_rates' columns look like rates rather than
# counts -- confirm that a share of total deaths is meaningful for them.
percent_of_total = (
    overdose_deaths_no_year
    .div(overdose_deaths_no_year['total_overdose_deaths'], axis=0)
    .mul(100)
    .round(2)
)

# Display label -> source column, in output order.
percent_labels = {
    '% of Total': 'total_overdose_deaths',
    '% Opioid Pain Relievers': 'opioid_pain_relievers',
    '% Benzodiazepines': 'benzodiazepines',
    '% Cocaine': 'cocaine',
    '% Heroin': 'heroin',
    '% Any Opioids': 'any_opioids',
    '% Synthetic Opioids other than Methadone': 'synthetic_opioids_other_than_methadone',
    '% Psychostimulants death rates': 'psychostimulants_death_rates',
    '% Antidepressants death rates': 'antidepressants_death_rates',
}

# Built from raw value arrays, so the result carries a fresh 0..n-1 index.
percent_df = pd.DataFrame({
    label: percent_of_total[column].values
    for label, column in percent_labels.items()
})
print(percent_df)

"""## Good Stuff: Data Visualizations"""

# Histogram of 'any_opioids' with a kernel density estimate drawn on top.
sns.histplot(data=overdose_deaths_no_year, x='any_opioids', kde=True)

# One 10-bin histogram per column, laid out side by side in a single row.
hist_columns = overdose_deaths_no_year.columns
fig, axs = plt.subplots(1, len(hist_columns), figsize=(20, 5))

for ax, column in zip(axs, hist_columns):
    ax.hist(overdose_deaths_no_year[column], bins=10)
    ax.set_title(column)
    ax.set_xlabel('Number of Deaths')
    ax.set_ylabel('Frequency')

plt.show()

# Plot every series against 'year' on its own panel, three panels per row.
# The row count follows the number of series, so the grid cannot run out of
# axes when the frame has more than 15 data columns.
series_columns = overdose_deaths.columns.drop('year')
n_cols = 3
n_rows = max(1, math.ceil(len(series_columns) / n_cols))
fig, axs = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(18.75, 6.25 * n_rows),  # 6.25 per row: 31.25 for five rows
    squeeze=False,  # always a 2-D array, even for a single row
)

# Flatten the axes grid so it can be walked in step with the columns.
axs = axs.flatten()

for ax, col in zip(axs, series_columns):
    ax.plot(overdose_deaths['year'], overdose_deaths[col], color='b')
    ax.set_xlabel('Year')
    ax.set_ylabel(col)
    ax.set_title(col)

# Remove the panels left over at the end of the grid.
for unused_ax in axs[len(series_columns):]:
    fig.delaxes(unused_ax)

# Adjust the spacing between the subplots.
fig.subplots_adjust(hspace=0.5, wspace=0.3)

# Give the scatter plot a figure of its own.  Without an explicit `ax`,
# seaborn draws on the current axes, which at this point belongs to the
# grid figure above.
scatter_fig, scatter_ax = plt.subplots()
sns.scatterplot(x="total_overdose_deaths", y="heroin", hue="year", data=overdose_deaths, ax=scatter_ax)

# Render the figures created since the previous plt.show(); no later call
# in this script would display them.
plt.show()