Merge

DanielJanowicz · Oct 12, 2022 · 04d36fd · 04d36fd
1 parent b8631a3
commit 04d36fd
Show file tree

Hide file tree

Showing 18 changed files with 1,195 additions and 0 deletions.
diff --git a/inferential/anovas/1way/practice.py b/inferential/anovas/1way/practice.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Oct 27 16:27:46 2021
+
+@author: hantswilliams
+"""
+
+
+
+import pandas as pd
+import numpy as np 
+
+diabetes = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/AHI_DataSci_507/main/Datasets/Diabetes/DB1_Diabetes/diabetic_data.csv')
+
+diabetes_small = diabetes.sample(100)
+
+
+## Data transformation step 
+
+diabetes = diabetes.replace('?', np.NaN)
+
+
+
+### generate list of var names 
+list(diabetes_small)
+
+
+# 1 factor of RACE has 5 levels: 
+
+# Lets replace nan for diabetes.race with 'Other'
+
+diabetes['race'] = diabetes['race'].replace(np.NaN, 'Other')    
+diabetes.race.value_counts()
+diabetes['race'].value_counts() 
+len(diabetes['race'].value_counts() )
+
+# 1 factor of AGE we have 10 levels 
+diabetes.age.value_counts() 
+diabetes['age'].value_counts() 
+
+# 1 factor of PAYER_CODE we have 17 levels 
+diabetes.payer_code.value_counts()
+len(diabetes.payer_code.value_counts())
+
+# 1 factor of medical_specialty we have 72 levels 
+diabetes.medical_specialty.value_counts()
+len(diabetes.medical_specialty.value_counts())
+
+specialtycounts = pd.DataFrame(diabetes.medical_specialty.value_counts())      
+specialtycounts = specialtycounts.reset_index()
+
+
+#### Continuous values: 
+
+timeinhospital = diabetes['time_in_hospital']
+labprocedures = diabetes['num_lab_procedures'] 
+
+
+
+
+# 1 way anova 
+# 1 DV - time_in_hospital
+# 1 IV - Race 
+
+# is there a difference between the "levels" of race 
+# and time in hospital? 
+
+
+### Checking assumptions....
+
+# From regression or ANOVA framework
+import pandas as pd
+import scipy.stats as stats
+import statsmodels.formula.api as smf
+import matplotlib.pyplot as plt
+from scipy.stats import kurtosis, skew, bartlett
+
+
+# DV ~ C(IV) + C(IV)
+model = smf.ols("time_in_hospital ~ C(race)", data = diabetes).fit()
+stats.shapiro(model.resid)
+
+
+
+# Lets create a chart 
+
+race1 = diabetes[diabetes['race'] == 'Caucasian']
+race2 = diabetes[diabetes['race'] == 'AfricanAmerican']
+race3 = diabetes[diabetes['race'] == 'Other']
+race4 = diabetes[diabetes['race'] == 'Hispanic']
+race5 = diabetes[diabetes['race'] == 'Asian']
+
+plt.hist(race1['time_in_hospital'])
+plt.show()
+
+plt.hist(race2['time_in_hospital'])
+plt.show()
+
+plt.hist(race3['time_in_hospital'])
+plt.show()
+
+plt.hist(race4['time_in_hospital'])
+plt.show()
+
+plt.hist(race5['time_in_hospital'])
+plt.show()
+
+
+
+# kertosis 
+print(kurtosis(race1['time_in_hospital']))
+print(kurtosis(race2['time_in_hospital']))
+print(kurtosis(race3['time_in_hospital']))
+print(kurtosis(race4['time_in_hospital']))
+print(kurtosis(race5['time_in_hospital']))
+
+# skewness 
+print('skew white: ', skew(race1['time_in_hospital']))
+print('skew black: ', skew(race2['time_in_hospital']))
+print(skew(race3['time_in_hospital']))
+print(skew(race4['time_in_hospital']))
+print(skew(race5['time_in_hospital']))
+
+
+#### Homogeneity of Variance 
+## barlett test 
+
+
+stats.bartlett(race1['time_in_hospital'],
+               race2['time_in_hospital'],
+               race3['time_in_hospital'],
+               race4['time_in_hospital'],
+               race5['time_in_hospital']
+               )
+
+
+
+
+
+
+## Template 
+stats.f_oneway(race1['time_in_hospital'],
+               race2['time_in_hospital'],
+               race3['time_in_hospital'],
+               race4['time_in_hospital'],
+               race5['time_in_hospital'])
+
+## Post-hoc analysis for significant differences between groups
+# TUKEY HONESTLY SIGNIFICANT DIFFERENCE (HSD)
+import statsmodels.stats.multicomp as mc
+comp = mc.MultiComparison(diabetes['time_in_hospital'], diabetes['race'])
+post_hoc_res = comp.tukeyhsd()
+tukey1way = pd.DataFrame(post_hoc_res.summary())
+
+
+
+race1['time_in_hospital'].describe()
+race2['time_in_hospital'].describe()
+race5['time_in_hospital'].describe()
+
+
+
+
diff --git a/inferential/anovas/1way/pythonCode.py b/inferential/anovas/1way/pythonCode.py
@@ -0,0 +1,60 @@
+#### Good walkthrough: 
+# https://www.pythonfordatascience.org/anova-python/ 
+
+## Template 
+import scipy.stats as stats
+stats.f_oneway(df['dataColumn'][df['categoryColumn'] == 'category1'],
+               df['dataColumn'][df['categoryColumn'] == 'category2'],
+               df['dataColumn'][df['categoryColumn'] == 'category3'])
+
+## Post-hoc analysis for significant differences between groups
+# TUKEY HONESTLY SIGNIFICANT DIFFERENCE (HSD)
+import statsmodels.stats.multicomp as mc
+comp = mc.MultiComparison(df['dataColumn'], df['categoryColumn'])
+post_hoc_res = comp.tukeyhsd()
+post_hoc_res.summary()
+
+
+## Example data:
+
+df = pd.read_csv("https://raw.githubusercontent.com/researchpy/Data-sets/master/difficile.csv")
+df.drop('person', axis= 1, inplace= True)
+df['dose'].replace({1: 'placebo', 2: 'low', 3: 'high'}, inplace= True) # Recoding value from numeric to string
+df.info()
+
+import pandas as pd
+import researchpy as rp
+
+import scipy.stats as stats
+
+stats.f_oneway(df['libido'][df['dose'] == 'high'],
+               df['libido'][df['dose'] == 'low'],
+               df['libido'][df['dose'] == 'placebo'])
+
+
+
+
+## Non-parametric 
+from scipy import stats
+# Kruskal-Wallis
+# Kruskal-Wallis
+x = [1, 1, 1]
+y = [2, 2, 2]
+z = [2, 2]
+stats.kruskal(x, y, z)
+# expected output: KruskalResult(statistic=7.0, pvalue=0.0301973834223185)
+
+# Friedman test
+# Friedman test
+stats.friedmanchisquare(group1, group2, group3)
+# expected output: (statistic=13.3514, pvalue=0.00126))
+
+# posthoc tuskey equivalent: howell test
+
+import pingouin as pg
+df = pg.read_dataset('penguins')
+pg.pairwise_gameshowell(data=df, dv='body_mass_g',
+                         between='species').round(3)
+
+
+
diff --git a/inferential/anovas/1way/rCode.r b/inferential/anovas/1way/rCode.r
@@ -0,0 +1,26 @@
+# Analysis for variance (1-way ANOVA) in R 
+
+res.aov <- aov(numericalColumn ~ grouperColumn, data = yourDataframe)
+summary(res.aov)
+
+## Tukey HSD post-hoc analysis 
+TukeyHSD(res.aov)
+
+
+
+
+# Non-parametric: when no homogeneity (equal variance): 
+oneway.test(weight ~ group, data = my_data)
+# Posthoc for above: 
+pairwise.t.test(my_data$weight, my_data$group, p.adjust.method = "BH", pool.sd = FALSE)
+
+
+# Non-parametric: 
+kruskal.test(DV ~ IVfactor, data = clean_nomiss)
+
+
+# Non-parametric / tuskey equivalent / 
+library(userfriendlyscience)
+games.howell(df$var1, df$var2) 
+
+## 
diff --git a/inferential/anovas/2way/Rcode.r b/inferential/anovas/2way/Rcode.r
@@ -0,0 +1,36 @@
+# Two-way - no interaction prediction 
+two.way <- aov(DV ~ IV1 + IV2, data = yourdataframe)
+
+# Two-way - with interaction prediction 
+two.way <- aov(DV ~ IV1 * IV2, data = yourdataframe)
+
+# Two-way - with interaction prediction 
+two.way <- aov(DV ~ IV1 + IV2 + IV1:IV2, data = yourdataframe)
+
+
+
+
+# Requires ‘car’ package
+# Two-way - no interaction prediction 
+two.way <- aov(DV ~ IV1 + IV2, data = yourdataframe) 
+Anova(two.way, type=“III”)
+Anova(two.way, type=“I”)
+
+# Two-way - with interaction prediction 
+two.way <- aov(DV ~ IV1 * IV2, data = yourdataframe)
+Anova(two.way, type=“III”)
+Anova(two.way, type=“I”)
+
+# Two-way - with interaction prediction 
+two.way <- aov(DV ~ IV1 + IV2 + IV1:IV2, data = yourdataframe)
+Anova(two.way, type=“III”)
+Anova(two.way, type=“I”)
+
+
+
+# TUKEY 
+TukeyHSD(two.way)
+TukeyHSD(two.way, which=“IV1”)
+TukeyHSD(two.way, which=“IV2”)
+TukeyHSD(two.way, which=“IV1:IV2”)
+
diff --git a/inferential/anovas/2way/pythonCode.py b/inferential/anovas/2way/pythonCode.py
@@ -0,0 +1,100 @@
+import statsmodels.api as sm
+from statsmodels.formula.api import ols
+
+# https://www.statsmodels.org/dev/anova.html
+# https://www.statsmodels.org/dev/generated/statsmodels.stats.anova.anova_lm.html#statsmodels.stats.anova.anova_lm
+
+#perform two-way ANOVA
+model = ols('DV ~ C(IV) + C(IV) + C(IV):C(IV)', data=df).fit()
+sm.stats.anova_lm(model, typ=2)
+
+
+# Example
+
+
+import pandas as pd
+import seaborn as sns
+# load data file
+d = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/twowayanova.txt", sep="\t")
+# reshape the d dataframe suitable for statsmodels package 
+# you do not need to reshape if your data is already in stacked format. Compare d and d_melt tables for detail 
+# understanding 
+d_melt = pd.melt(d, id_vars=['Genotype'], value_vars=['1_year', '2_year', '3_year'])
+# replace column names
+d_melt.columns = ['Genotype', 'years', 'value']
+d_melt.head()
+
+
+
+
+# generate a boxplot to see the data distribution by genotypes and years. Using boxplot, we can easily detect the 
+# differences between different groups
+sns.boxplot(x="Genotype", y="value", hue="years", data=d_melt, palette="Set3") 
+
+import statsmodels.api as sm
+from statsmodels.formula.api import ols
+model = ols('value ~ C(Genotype) + C(years) + C(Genotype):C(years)', data=d_melt).fit()
+anova_table = sm.stats.anova_lm(model, typ=2)
+anova_table
+
+# ANOVA table using bioinfokit v1.0.3 or later (it uses wrapper script for anova_lm)
+from bioinfokit.analys import stat
+res = stat()
+res.anova_stat(df=d_melt, res_var='value', anova_model='value~C(Genotype)+C(years)+C(Genotype):C(years)')
+res.anova_summary
+
+from statsmodels.graphics.factorplots import interaction_plot
+import matplotlib.pyplot as plt
+fig = interaction_plot(x=d_melt['Genotype'], trace=d_melt['years'], response=d_melt['value'], 
+    colors=['#4c061d','#d17a22', '#b4c292'])
+plt.show()
+
+
+
+
+# we will use bioinfokit (v1.0.3 or later) for performing tukey HSD test
+# check documentation here https://github.com/reneshbedre/bioinfokit
+from bioinfokit.analys import stat
+# perform multiple pairwise comparison (Tukey HSD)
+# unequal sample size data, tukey_hsd uses Tukey-Kramer test
+res = stat()
+# for main effect Genotype
+res.tukey_hsd(df=d_melt, res_var='value', xfac_var='Genotype', anova_model='value~C(Genotype)+C(years)+C(Genotype):C(years)')
+res.tukey_summary
+
+
+
+
+##### Testing assumptions in our 2-way: 
+##### Testing assumptions in our 2-way: 
+##### Testing assumptions in our 2-way: 
+
+
+# QQ-plot
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+# res.anova_std_residuals are standardized residuals obtained from two-way ANOVA (check above)
+sm.qqplot(res.anova_std_residuals, line='45')
+plt.xlabel("Theoretical Quantiles")
+plt.ylabel("Standardized Residuals")
+plt.show()
+
+# histogram
+plt.hist(res.anova_model_out.resid, bins='auto', histtype='bar', ec='k') 
+plt.xlabel("Residuals")
+plt.ylabel('Frequency')
+plt.show()
+
+# Shapiro-Wilk test
+import scipy.stats as stats
+w, pvalue = stats.shapiro(res.anova_model_out.resid)
+print(w, pvalue)
+
+# if you have  a stacked table, you can use bioinfokit v1.0.3 or later for the Levene's test
+from bioinfokit.analys import stat 
+res = stat()
+res.levene(df=d_melt, res_var='value', xfac_var=['Genotype', 'years'])
+res.levene_summary
+
+#interpretation: As the p value (0.09) is non-significant, we fail to reject the null 
+# hypothesis and conclude that treatments have equal variance