-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathus_do_deaths.py
155 lines (116 loc) · 4.93 KB
/
us_do_deaths.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# -*- coding: utf-8 -*-
"""US DO deaths
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1KERzGNBjSb7d69OQuE3uXXFPFa1nKc14
"""
import pandas as pd
import numpy as np
# import matplotlib as mat
import matplotlib.pyplot as plt
import seaborn as sns
import math
"""## Data Wrangling"""
# define a function to clean up the column names
def clean_column_name(name):
    """Turn a raw column label into a compact snake_case identifier.

    Steps, in order: lower-case the label, turn spaces into underscores,
    discard every character that is neither alphanumeric nor an underscore,
    collapse a label that consists of one string written twice back to back
    ("abcabc" -> "abc"), then delete the fragments "_2017", "_cdc_wonder"
    and "_us".

    The fragments are removed wherever they occur in the label, not only
    at its end.
    """
    normalised = name.lower().replace(" ", "_")
    kept = [ch for ch in normalised if ch == '_' or ch.isalnum()]
    cleaned = ''.join(kept)

    # Two halves can only be equal when they have the same length, so the
    # midpoint of an even-length label is the only split worth comparing.
    half, remainder = divmod(len(cleaned), 2)
    if half and not remainder and cleaned[:half] == cleaned[half:]:
        cleaned = cleaned[:half]

    for fragment in ("_2017", "_cdc_wonder", "_us"):
        cleaned = cleaned.replace(fragment, "")
    return cleaned
## ------------- End of function ---------------##
# Fetch the overdose table (CDC WONDER data, hosted in the owid-datasets repo).
overdose_deaths = pd.read_csv("https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/Drug%20overdose%20deaths%20in%20the%20US%20(CDC%20WONDER)/Drug%20overdose%20deaths%20in%20the%20US%20(CDC%20WONDER).csv")

# Store the year as an integer and discard the 'Entity' column.
overdose_deaths = overdose_deaths.astype({'Year': 'int'})
overdose_deaths = overdose_deaths.drop(columns='Entity')

# Map every raw header to its cleaned form, then apply the whole mapping.
new_columns = {raw: clean_column_name(raw) for raw in overdose_deaths.columns}
overdose_deaths = overdose_deaths.rename(columns=new_columns)

# Notebook leftover: evaluates the column index; nothing is displayed when
# this runs as a script.
overdose_deaths.columns

# Report, per column, whether any value is missing.
missing_by_column = overdose_deaths.isna().any()
print(missing_by_column)
"""## Begin Exploratory Data Analysis"""
# Notebook leftover: the summary is computed, but its result is discarded
# when this file runs as a script.
overdose_deaths.describe()

# Every statistic below concerns the death columns only, so set the 'year'
# column aside once, before anything is computed.
overdose_deaths_no_year = overdose_deaths.drop(columns=['year'])

"""#### Calculate Quant"""
# Quartiles of every remaining column in one pass; the result has one row
# per requested quantile and one column per data column.
quartiles = overdose_deaths_no_year.quantile([0.25, 0.5, 0.75])
q1 = quartiles.loc[0.25]
q2 = quartiles.loc[0.5]
q3 = quartiles.loc[0.75]
iqr = q3 - q1

# One row per data column, with its quartiles and inter-quartile range.
quartile_df = pd.DataFrame({
    'q1': q1.values,
    'q2': q2.values,
    'q3': q3.values,
    'iqr': iqr.values
}, index=q1.index)
print(quartile_df)

"""#### Percent of Total Overdose Deaths"""
# Express every column as a percentage of the same row's
# total_overdose_deaths value.
#
# Why .div(..., axis=0): a plain `frame / series` lines the Series' row
# labels up against the frame's *column* labels, which does not produce a
# row-wise ratio. axis=0 matches the divisor to the rows instead.
#
# Why total_overdose_deaths is the denominator: dividing by the sum of all
# columns would count the total together with its own sub-categories.
# As a consequence '% of Total' is 100 by construction.
#
# NOTE(review): columns named *_death_rates look like rates rather than
# counts; confirm that a share of total deaths is meaningful for them.
# NOTE(review): a row whose total is 0 would yield inf/NaN here.
percent_of_total = overdose_deaths_no_year.div(
    overdose_deaths_no_year['total_overdose_deaths'], axis=0
) * 100

# Round the percent values to two decimal places
percent_of_total = percent_of_total.round(2)

# Display label -> source column, in the order the table should be printed.
percent_labels = {
    '% of Total': 'total_overdose_deaths',
    '% Opioid Pain Relievers': 'opioid_pain_relievers',
    '% Benzodiazepines': 'benzodiazepines',
    '% Cocaine': 'cocaine',
    '% Heroin': 'heroin',
    '% Any Opioids': 'any_opioids',
    '% Synthetic Opioids other than Methadone': 'synthetic_opioids_other_than_methadone',
    '% Psychostimulants death rates': 'psychostimulants_death_rates',
    '% Antidepressants death rates': 'antidepressants_death_rates',
}

# .values drops the original index, so the table gets a fresh 0..n-1 index.
percent_df = pd.DataFrame({
    label: percent_of_total[column].values
    for label, column in percent_labels.items()
})
print(percent_df)
"""## Good Stuff: Data Visualizations"""
# Distribution of the 'any_opioids' column with a KDE overlay, drawn on a
# figure of its own.
plt.figure()
sns.histplot(data=overdose_deaths_no_year, x='any_opioids', kde=True)

# One 10-bin histogram per column, side by side on a single row.
hist_columns = overdose_deaths_no_year.columns
# squeeze=False keeps the axes in an array even when there is only one
# column, so the indexing below works for any column count >= 1.
fig, axs = plt.subplots(1, len(hist_columns), figsize=(20, 5), squeeze=False)
axs = axs.ravel()
for ax, col in zip(axs, hist_columns):
    ax.hist(overdose_deaths_no_year[col], bins=10)
    ax.set_title(col)
    ax.set_xlabel('Number of Deaths')
    ax.set_ylabel('Frequency')
plt.show()

# Time series of every column against 'year' on a 3-column grid.
# 'year' is excluded by name rather than by position.
series_columns = overdose_deaths.columns.drop('year')
n_grid_cols = 3
# At least 5 rows (the original layout); more rows are added when there are
# more than 15 series, so no series falls outside the grid.
n_grid_rows = max(5, math.ceil(len(series_columns) / n_grid_cols))
fig, axs = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(18.75, 6.25 * n_grid_rows),  # 6.25 per row -> 31.25 for 5 rows
)
# Flatten the axs array to make it easier to iterate over
axs = axs.flatten()
for ax, col in zip(axs, series_columns):
    ax.plot(overdose_deaths['year'], overdose_deaths[col], color='b')
    ax.set_xlabel('Year')
    ax.set_ylabel(col)
    ax.set_title(col)
# Remove any unused subplots
for unused_ax in axs[len(series_columns):]:
    fig.delaxes(unused_ax)
# Adjust the spacing between the subplots
fig.subplots_adjust(hspace=0.5, wspace=0.3)

# seaborn draws onto the current axes when no `ax` is passed; open a fresh
# figure so the scatter does not land on a panel of the grid above.
plt.figure()
sns.scatterplot(x="total_overdose_deaths", y="heroin", hue="year", data=overdose_deaths)

# Display the grid and the scatter plot when running as a script.
plt.show()