-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b8631a3
commit 04d36fd
Showing
18 changed files
with
1,195 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Created on Wed Oct 27 16:27:46 2021 | ||
@author: hantswilliams | ||
""" | ||
|
||
|
||
|
||
import pandas as pd | ||
import numpy as np | ||
|
||
diabetes = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/AHI_DataSci_507/main/Datasets/Diabetes/DB1_Diabetes/diabetic_data.csv') | ||
|
||
diabetes_small = diabetes.sample(100) | ||
|
||
|
||
## Data transformation step | ||
|
||
diabetes = diabetes.replace('?', np.NaN) | ||
|
||
|
||
|
||
### generate list of var names | ||
list(diabetes_small) | ||
|
||
|
||
# 1 factor of RACE has 5 levels: | ||
|
||
# Lets replace nan for diabetes.race with 'Other' | ||
|
||
diabetes['race'] = diabetes['race'].replace(np.NaN, 'Other') | ||
diabetes.race.value_counts() | ||
diabetes['race'].value_counts() | ||
len(diabetes['race'].value_counts() ) | ||
|
||
# 1 factor of AGE we have 10 levels | ||
diabetes.age.value_counts() | ||
diabetes['age'].value_counts() | ||
|
||
# 1 factor of PAYER_CODE we have 17 levels | ||
diabetes.payer_code.value_counts() | ||
len(diabetes.payer_code.value_counts()) | ||
|
||
# 1 factor of medical_specialty we have 72 levels | ||
diabetes.medical_specialty.value_counts() | ||
len(diabetes.medical_specialty.value_counts()) | ||
|
||
specialtycounts = pd.DataFrame(diabetes.medical_specialty.value_counts()) | ||
specialtycounts = specialtycounts.reset_index() | ||
|
||
|
||
#### Continuous values: | ||
|
||
timeinhospital = diabetes['time_in_hospital'] | ||
labprocedures = diabetes['num_lab_procedures'] | ||
|
||
|
||
|
||
|
||
# 1 way anova | ||
# 1 DV - time_in_hospital | ||
# 1 IV - Race | ||
|
||
# is there a difference between the "levels" of race | ||
# and time in hospital? | ||
|
||
|
||
### Checking assumptions.... | ||
|
||
# From regression or ANOVA framework | ||
import pandas as pd | ||
import scipy.stats as stats | ||
import statsmodels.formula.api as smf | ||
import matplotlib.pyplot as plt | ||
from scipy.stats import kurtosis, skew, bartlett | ||
|
||
|
||
# DV ~ C(IV) + C(IV) | ||
model = smf.ols("time_in_hospital ~ C(race)", data = diabetes).fit() | ||
stats.shapiro(model.resid) | ||
|
||
|
||
|
||
# Lets create a chart | ||
|
||
race1 = diabetes[diabetes['race'] == 'Caucasian'] | ||
race2 = diabetes[diabetes['race'] == 'AfricanAmerican'] | ||
race3 = diabetes[diabetes['race'] == 'Other'] | ||
race4 = diabetes[diabetes['race'] == 'Hispanic'] | ||
race5 = diabetes[diabetes['race'] == 'Asian'] | ||
|
||
plt.hist(race1['time_in_hospital']) | ||
plt.show() | ||
|
||
plt.hist(race2['time_in_hospital']) | ||
plt.show() | ||
|
||
plt.hist(race3['time_in_hospital']) | ||
plt.show() | ||
|
||
plt.hist(race4['time_in_hospital']) | ||
plt.show() | ||
|
||
plt.hist(race5['time_in_hospital']) | ||
plt.show() | ||
|
||
|
||
|
||
# kertosis | ||
print(kurtosis(race1['time_in_hospital'])) | ||
print(kurtosis(race2['time_in_hospital'])) | ||
print(kurtosis(race3['time_in_hospital'])) | ||
print(kurtosis(race4['time_in_hospital'])) | ||
print(kurtosis(race5['time_in_hospital'])) | ||
|
||
# skewness | ||
print('skew white: ', skew(race1['time_in_hospital'])) | ||
print('skew black: ', skew(race2['time_in_hospital'])) | ||
print(skew(race3['time_in_hospital'])) | ||
print(skew(race4['time_in_hospital'])) | ||
print(skew(race5['time_in_hospital'])) | ||
|
||
|
||
#### Homogeneity of Variance | ||
## barlett test | ||
|
||
|
||
stats.bartlett(race1['time_in_hospital'], | ||
race2['time_in_hospital'], | ||
race3['time_in_hospital'], | ||
race4['time_in_hospital'], | ||
race5['time_in_hospital'] | ||
) | ||
|
||
|
||
|
||
|
||
|
||
|
||
## Template | ||
stats.f_oneway(race1['time_in_hospital'], | ||
race2['time_in_hospital'], | ||
race3['time_in_hospital'], | ||
race4['time_in_hospital'], | ||
race5['time_in_hospital']) | ||
|
||
## Post-hoc analysis for significant differences between groups | ||
# TUKEY HONESTLY SIGNIFICANT DIFFERENCE (HSD) | ||
import statsmodels.stats.multicomp as mc | ||
comp = mc.MultiComparison(diabetes['time_in_hospital'], diabetes['race']) | ||
post_hoc_res = comp.tukeyhsd() | ||
tukey1way = pd.DataFrame(post_hoc_res.summary()) | ||
|
||
|
||
|
||
race1['time_in_hospital'].describe() | ||
race2['time_in_hospital'].describe() | ||
race5['time_in_hospital'].describe() | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#### Good walkthrough: | ||
# https://www.pythonfordatascience.org/anova-python/ | ||
|
||
## Template | ||
import scipy.stats as stats | ||
stats.f_oneway(df['dataColumn'][df['categoryColumn'] == 'category1'], | ||
df['dataColumn'][df['categoryColumn'] == 'category2'], | ||
df['dataColumn'][df['categoryColumn'] == 'category3']) | ||
|
||
## Post-hoc analysis for significant differences between groups | ||
# TUKEY HONESTLY SIGNIFICANT DIFFERENCE (HSD) | ||
import statsmodels.stats.multicomp as mc | ||
comp = mc.MultiComparison(df['dataColumn'], df['categoryColumn']) | ||
post_hoc_res = comp.tukeyhsd() | ||
post_hoc_res.summary() | ||
|
||
|
||
## Example data: | ||
|
||
df = pd.read_csv("https://raw.githubusercontent.com/researchpy/Data-sets/master/difficile.csv") | ||
df.drop('person', axis= 1, inplace= True) | ||
df['dose'].replace({1: 'placebo', 2: 'low', 3: 'high'}, inplace= True) # Recoding value from numeric to string | ||
df.info() | ||
|
||
import pandas as pd | ||
import researchpy as rp | ||
|
||
import scipy.stats as stats | ||
|
||
stats.f_oneway(df['libido'][df['dose'] == 'high'], | ||
df['libido'][df['dose'] == 'low'], | ||
df['libido'][df['dose'] == 'placebo']) | ||
|
||
|
||
|
||
|
||
## Non-parametric | ||
from scipy import stats | ||
# Kruskal-Wallis | ||
# Kruskal-Wallis | ||
x = [1, 1, 1] | ||
y = [2, 2, 2] | ||
z = [2, 2] | ||
stats.kruskal(x, y, z) | ||
# expected output: KruskalResult(statistic=7.0, pvalue=0.0301973834223185) | ||
|
||
# Friedman test | ||
# Friedman test | ||
stats.friedmanchisquare(group1, group2, group3) | ||
# expected output: (statistic=13.3514, pvalue=0.00126)) | ||
|
||
# posthoc tuskey equivalent: howell test | ||
|
||
import pingouin as pg | ||
df = pg.read_dataset('penguins') | ||
pg.pairwise_gameshowell(data=df, dv='body_mass_g', | ||
between='species').round(3) | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Analysis for variance (1-way ANOVA) in R | ||
|
||
res.aov <- aov(numericalColumn ~ grouperColumn, data = yourDataframe) | ||
summary(res.aov) | ||
|
||
## Tukey HSD post-hoc analysis | ||
TukeyHSD(res.aov) | ||
|
||
|
||
|
||
|
||
# Non-parametric: when no homogeneity (equal variance): | ||
oneway.test(weight ~ group, data = my_data) | ||
# Posthoc for above: | ||
pairwise.t.test(my_data$weight, my_data$group, p.adjust.method = "BH", pool.sd = FALSE) | ||
|
||
|
||
# Non-parametric: | ||
kruskal.test(DV ~ IVfactor, data = clean_nomiss) | ||
|
||
|
||
# Non-parametric / tuskey equivalent / | ||
library(userfriendlyscience) | ||
games.howell(df$var1, df$var2) | ||
|
||
## |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Two-way - no interaction prediction | ||
two.way <- aov(DV ~ IV1 + IV2, data = yourdataframe) | ||
|
||
# Two-way - with interaction prediction | ||
two.way <- aov(DV ~ IV1 * IV2, data = yourdataframe) | ||
|
||
# Two-way - with interaction prediction | ||
two.way <- aov(DV ~ IV1 + IV2 + IV1:IV2, data = yourdataframe) | ||
|
||
|
||
|
||
|
||
# Requires ‘car’ package | ||
# Two-way - no interaction prediction | ||
two.way <- aov(DV ~ IV1 + IV2, data = yourdataframe) | ||
Anova(two.way, type=“III”) | ||
Anova(two.way, type=“I”) | ||
|
||
# Two-way - with interaction prediction | ||
two.way <- aov(DV ~ IV1 * IV2, data = yourdataframe) | ||
Anova(two.way, type=“III”) | ||
Anova(two.way, type=“I”) | ||
|
||
# Two-way - with interaction prediction | ||
two.way <- aov(DV ~ IV1 + IV2 + IV1:IV2, data = yourdataframe) | ||
Anova(two.way, type=“III”) | ||
Anova(two.way, type=“I”) | ||
|
||
|
||
|
||
# TUKEY | ||
TukeyHSD(two.way) | ||
TukeyHSD(two.way, which=“IV1”) | ||
TukeyHSD(two.way, which=“IV2”) | ||
TukeyHSD(two.way, which=“IV1:IV2”) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import statsmodels.api as sm | ||
from statsmodels.formula.api import ols | ||
|
||
# https://www.statsmodels.org/dev/anova.html | ||
# https://www.statsmodels.org/dev/generated/statsmodels.stats.anova.anova_lm.html#statsmodels.stats.anova.anova_lm | ||
|
||
#perform two-way ANOVA | ||
model = ols('DV ~ C(IV) + C(IV) + C(IV):C(IV)', data=df).fit() | ||
sm.stats.anova_lm(model, typ=2) | ||
|
||
|
||
# Example | ||
|
||
|
||
import pandas as pd | ||
import seaborn as sns | ||
# load data file | ||
d = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/twowayanova.txt", sep="\t") | ||
# reshape the d dataframe suitable for statsmodels package | ||
# you do not need to reshape if your data is already in stacked format. Compare d and d_melt tables for detail | ||
# understanding | ||
d_melt = pd.melt(d, id_vars=['Genotype'], value_vars=['1_year', '2_year', '3_year']) | ||
# replace column names | ||
d_melt.columns = ['Genotype', 'years', 'value'] | ||
d_melt.head() | ||
|
||
|
||
|
||
|
||
# generate a boxplot to see the data distribution by genotypes and years. Using boxplot, we can easily detect the | ||
# differences between different groups | ||
sns.boxplot(x="Genotype", y="value", hue="years", data=d_melt, palette="Set3") | ||
|
||
import statsmodels.api as sm | ||
from statsmodels.formula.api import ols | ||
model = ols('value ~ C(Genotype) + C(years) + C(Genotype):C(years)', data=d_melt).fit() | ||
anova_table = sm.stats.anova_lm(model, typ=2) | ||
anova_table | ||
|
||
# ANOVA table using bioinfokit v1.0.3 or later (it uses wrapper script for anova_lm) | ||
from bioinfokit.analys import stat | ||
res = stat() | ||
res.anova_stat(df=d_melt, res_var='value', anova_model='value~C(Genotype)+C(years)+C(Genotype):C(years)') | ||
res.anova_summary | ||
|
||
from statsmodels.graphics.factorplots import interaction_plot | ||
import matplotlib.pyplot as plt | ||
fig = interaction_plot(x=d_melt['Genotype'], trace=d_melt['years'], response=d_melt['value'], | ||
colors=['#4c061d','#d17a22', '#b4c292']) | ||
plt.show() | ||
|
||
|
||
|
||
|
||
# we will use bioinfokit (v1.0.3 or later) for performing tukey HSD test | ||
# check documentation here https://github.com/reneshbedre/bioinfokit | ||
from bioinfokit.analys import stat | ||
# perform multiple pairwise comparison (Tukey HSD) | ||
# unequal sample size data, tukey_hsd uses Tukey-Kramer test | ||
res = stat() | ||
# for main effect Genotype | ||
res.tukey_hsd(df=d_melt, res_var='value', xfac_var='Genotype', anova_model='value~C(Genotype)+C(years)+C(Genotype):C(years)') | ||
res.tukey_summary | ||
|
||
|
||
|
||
|
||
##### Testing assumptions in our 2-way: | ||
##### Testing assumptions in our 2-way: | ||
##### Testing assumptions in our 2-way: | ||
|
||
|
||
# QQ-plot | ||
import statsmodels.api as sm | ||
import matplotlib.pyplot as plt | ||
# res.anova_std_residuals are standardized residuals obtained from two-way ANOVA (check above) | ||
sm.qqplot(res.anova_std_residuals, line='45') | ||
plt.xlabel("Theoretical Quantiles") | ||
plt.ylabel("Standardized Residuals") | ||
plt.show() | ||
|
||
# histogram | ||
plt.hist(res.anova_model_out.resid, bins='auto', histtype='bar', ec='k') | ||
plt.xlabel("Residuals") | ||
plt.ylabel('Frequency') | ||
plt.show() | ||
|
||
# Shapiro-Wilk test | ||
import scipy.stats as stats | ||
w, pvalue = stats.shapiro(res.anova_model_out.resid) | ||
print(w, pvalue) | ||
|
||
# if you have a stacked table, you can use bioinfokit v1.0.3 or later for the Levene's test | ||
from bioinfokit.analys import stat | ||
res = stat() | ||
res.levene(df=d_melt, res_var='value', xfac_var=['Genotype', 'years']) | ||
res.levene_summary | ||
|
||
#interpretation: As the p value (0.09) is non-significant, we fail to reject the null | ||
# hypothesis and conclude that treatments have equal variance |
Oops, something went wrong.