Skip to content

Commit

Permalink
Merge
Browse files Browse the repository at this point in the history
  • Loading branch information
DanielJanowicz committed Oct 12, 2022
1 parent b8631a3 commit 04d36fd
Show file tree
Hide file tree
Showing 18 changed files with 1,195 additions and 0 deletions.
164 changes: 164 additions & 0 deletions inferential/anovas/1way/practice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 27 16:27:46 2021
@author: hantswilliams
"""



import pandas as pd
import numpy as np

diabetes = pd.read_csv('https://raw.githubusercontent.com/hantswilliams/AHI_DataSci_507/main/Datasets/Diabetes/DB1_Diabetes/diabetic_data.csv')

diabetes_small = diabetes.sample(100)


## Data transformation step

diabetes = diabetes.replace('?', np.NaN)



### generate list of var names
list(diabetes_small)


# 1 factor of RACE has 5 levels:

# Lets replace nan for diabetes.race with 'Other'

diabetes['race'] = diabetes['race'].replace(np.NaN, 'Other')
diabetes.race.value_counts()
diabetes['race'].value_counts()
len(diabetes['race'].value_counts() )

# 1 factor of AGE we have 10 levels
diabetes.age.value_counts()
diabetes['age'].value_counts()

# 1 factor of PAYER_CODE we have 17 levels
diabetes.payer_code.value_counts()
len(diabetes.payer_code.value_counts())

# 1 factor of medical_specialty we have 72 levels
diabetes.medical_specialty.value_counts()
len(diabetes.medical_specialty.value_counts())

specialtycounts = pd.DataFrame(diabetes.medical_specialty.value_counts())
specialtycounts = specialtycounts.reset_index()


#### Continuous values:

timeinhospital = diabetes['time_in_hospital']
labprocedures = diabetes['num_lab_procedures']




# 1 way anova
# 1 DV - time_in_hospital
# 1 IV - Race

# is there a difference between the "levels" of race
# and time in hospital?


### Checking assumptions....

# From regression or ANOVA framework
import pandas as pd
import scipy.stats as stats
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from scipy.stats import kurtosis, skew, bartlett


# DV ~ C(IV) + C(IV)
model = smf.ols("time_in_hospital ~ C(race)", data = diabetes).fit()
stats.shapiro(model.resid)



# Lets create a chart

race1 = diabetes[diabetes['race'] == 'Caucasian']
race2 = diabetes[diabetes['race'] == 'AfricanAmerican']
race3 = diabetes[diabetes['race'] == 'Other']
race4 = diabetes[diabetes['race'] == 'Hispanic']
race5 = diabetes[diabetes['race'] == 'Asian']

plt.hist(race1['time_in_hospital'])
plt.show()

plt.hist(race2['time_in_hospital'])
plt.show()

plt.hist(race3['time_in_hospital'])
plt.show()

plt.hist(race4['time_in_hospital'])
plt.show()

plt.hist(race5['time_in_hospital'])
plt.show()



# kertosis
print(kurtosis(race1['time_in_hospital']))
print(kurtosis(race2['time_in_hospital']))
print(kurtosis(race3['time_in_hospital']))
print(kurtosis(race4['time_in_hospital']))
print(kurtosis(race5['time_in_hospital']))

# skewness
print('skew white: ', skew(race1['time_in_hospital']))
print('skew black: ', skew(race2['time_in_hospital']))
print(skew(race3['time_in_hospital']))
print(skew(race4['time_in_hospital']))
print(skew(race5['time_in_hospital']))


#### Homogeneity of Variance
## barlett test


stats.bartlett(race1['time_in_hospital'],
race2['time_in_hospital'],
race3['time_in_hospital'],
race4['time_in_hospital'],
race5['time_in_hospital']
)






## Template
stats.f_oneway(race1['time_in_hospital'],
race2['time_in_hospital'],
race3['time_in_hospital'],
race4['time_in_hospital'],
race5['time_in_hospital'])

## Post-hoc analysis for significant differences between groups
# TUKEY HONESTLY SIGNIFICANT DIFFERENCE (HSD)
import statsmodels.stats.multicomp as mc
comp = mc.MultiComparison(diabetes['time_in_hospital'], diabetes['race'])
post_hoc_res = comp.tukeyhsd()
tukey1way = pd.DataFrame(post_hoc_res.summary())



race1['time_in_hospital'].describe()
race2['time_in_hospital'].describe()
race5['time_in_hospital'].describe()




60 changes: 60 additions & 0 deletions inferential/anovas/1way/pythonCode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#### Good walkthrough:
# https://www.pythonfordatascience.org/anova-python/

## Template
import scipy.stats as stats
stats.f_oneway(df['dataColumn'][df['categoryColumn'] == 'category1'],
df['dataColumn'][df['categoryColumn'] == 'category2'],
df['dataColumn'][df['categoryColumn'] == 'category3'])

## Post-hoc analysis for significant differences between groups
# TUKEY HONESTLY SIGNIFICANT DIFFERENCE (HSD)
import statsmodels.stats.multicomp as mc
comp = mc.MultiComparison(df['dataColumn'], df['categoryColumn'])
post_hoc_res = comp.tukeyhsd()
post_hoc_res.summary()


## Example data:

df = pd.read_csv("https://raw.githubusercontent.com/researchpy/Data-sets/master/difficile.csv")
df.drop('person', axis= 1, inplace= True)
df['dose'].replace({1: 'placebo', 2: 'low', 3: 'high'}, inplace= True) # Recoding value from numeric to string
df.info()

import pandas as pd
import researchpy as rp

import scipy.stats as stats

stats.f_oneway(df['libido'][df['dose'] == 'high'],
df['libido'][df['dose'] == 'low'],
df['libido'][df['dose'] == 'placebo'])




## Non-parametric
from scipy import stats
# Kruskal-Wallis
# Kruskal-Wallis
x = [1, 1, 1]
y = [2, 2, 2]
z = [2, 2]
stats.kruskal(x, y, z)
# expected output: KruskalResult(statistic=7.0, pvalue=0.0301973834223185)

# Friedman test
# Friedman test
stats.friedmanchisquare(group1, group2, group3)
# expected output: (statistic=13.3514, pvalue=0.00126))

# posthoc tuskey equivalent: howell test

import pingouin as pg
df = pg.read_dataset('penguins')
pg.pairwise_gameshowell(data=df, dv='body_mass_g',
between='species').round(3)



26 changes: 26 additions & 0 deletions inferential/anovas/1way/rCode.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Analysis for variance (1-way ANOVA) in R

res.aov <- aov(numericalColumn ~ grouperColumn, data = yourDataframe)
summary(res.aov)

## Tukey HSD post-hoc analysis
TukeyHSD(res.aov)




# Non-parametric: when no homogeneity (equal variance):
oneway.test(weight ~ group, data = my_data)
# Posthoc for above:
pairwise.t.test(my_data$weight, my_data$group, p.adjust.method = "BH", pool.sd = FALSE)


# Non-parametric:
kruskal.test(DV ~ IVfactor, data = clean_nomiss)


# Non-parametric / tuskey equivalent /
library(userfriendlyscience)
games.howell(df$var1, df$var2)

##
36 changes: 36 additions & 0 deletions inferential/anovas/2way/Rcode.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Two-way - no interaction prediction
two.way <- aov(DV ~ IV1 + IV2, data = yourdataframe)

# Two-way - with interaction prediction
two.way <- aov(DV ~ IV1 * IV2, data = yourdataframe)

# Two-way - with interaction prediction
two.way <- aov(DV ~ IV1 + IV2 + IV1:IV2, data = yourdataframe)




# Requires ‘car’ package
# Two-way - no interaction prediction
two.way <- aov(DV ~ IV1 + IV2, data = yourdataframe)
Anova(two.way, type=III”)
Anova(two.way, type=I”)

# Two-way - with interaction prediction
two.way <- aov(DV ~ IV1 * IV2, data = yourdataframe)
Anova(two.way, type=III”)
Anova(two.way, type=I”)

# Two-way - with interaction prediction
two.way <- aov(DV ~ IV1 + IV2 + IV1:IV2, data = yourdataframe)
Anova(two.way, type=III”)
Anova(two.way, type=I”)



# TUKEY
TukeyHSD(two.way)
TukeyHSD(two.way, which=IV1”)
TukeyHSD(two.way, which=IV2”)
TukeyHSD(two.way, which=IV1:IV2”)

100 changes: 100 additions & 0 deletions inferential/anovas/2way/pythonCode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import statsmodels.api as sm
from statsmodels.formula.api import ols

# https://www.statsmodels.org/dev/anova.html
# https://www.statsmodels.org/dev/generated/statsmodels.stats.anova.anova_lm.html#statsmodels.stats.anova.anova_lm

#perform two-way ANOVA
model = ols('DV ~ C(IV) + C(IV) + C(IV):C(IV)', data=df).fit()
sm.stats.anova_lm(model, typ=2)


# Example


import pandas as pd
import seaborn as sns
# load data file
d = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/twowayanova.txt", sep="\t")
# reshape the d dataframe suitable for statsmodels package
# you do not need to reshape if your data is already in stacked format. Compare d and d_melt tables for detail
# understanding
d_melt = pd.melt(d, id_vars=['Genotype'], value_vars=['1_year', '2_year', '3_year'])
# replace column names
d_melt.columns = ['Genotype', 'years', 'value']
d_melt.head()




# generate a boxplot to see the data distribution by genotypes and years. Using boxplot, we can easily detect the
# differences between different groups
sns.boxplot(x="Genotype", y="value", hue="years", data=d_melt, palette="Set3")

import statsmodels.api as sm
from statsmodels.formula.api import ols
model = ols('value ~ C(Genotype) + C(years) + C(Genotype):C(years)', data=d_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

# ANOVA table using bioinfokit v1.0.3 or later (it uses wrapper script for anova_lm)
from bioinfokit.analys import stat
res = stat()
res.anova_stat(df=d_melt, res_var='value', anova_model='value~C(Genotype)+C(years)+C(Genotype):C(years)')
res.anova_summary

from statsmodels.graphics.factorplots import interaction_plot
import matplotlib.pyplot as plt
fig = interaction_plot(x=d_melt['Genotype'], trace=d_melt['years'], response=d_melt['value'],
colors=['#4c061d','#d17a22', '#b4c292'])
plt.show()




# we will use bioinfokit (v1.0.3 or later) for performing tukey HSD test
# check documentation here https://github.com/reneshbedre/bioinfokit
from bioinfokit.analys import stat
# perform multiple pairwise comparison (Tukey HSD)
# unequal sample size data, tukey_hsd uses Tukey-Kramer test
res = stat()
# for main effect Genotype
res.tukey_hsd(df=d_melt, res_var='value', xfac_var='Genotype', anova_model='value~C(Genotype)+C(years)+C(Genotype):C(years)')
res.tukey_summary




##### Testing assumptions in our 2-way:
##### Testing assumptions in our 2-way:
##### Testing assumptions in our 2-way:


# QQ-plot
import statsmodels.api as sm
import matplotlib.pyplot as plt
# res.anova_std_residuals are standardized residuals obtained from two-way ANOVA (check above)
sm.qqplot(res.anova_std_residuals, line='45')
plt.xlabel("Theoretical Quantiles")
plt.ylabel("Standardized Residuals")
plt.show()

# histogram
plt.hist(res.anova_model_out.resid, bins='auto', histtype='bar', ec='k')
plt.xlabel("Residuals")
plt.ylabel('Frequency')
plt.show()

# Shapiro-Wilk test
import scipy.stats as stats
w, pvalue = stats.shapiro(res.anova_model_out.resid)
print(w, pvalue)

# if you have a stacked table, you can use bioinfokit v1.0.3 or later for the Levene's test
from bioinfokit.analys import stat
res = stat()
res.levene(df=d_melt, res_var='value', xfac_var=['Genotype', 'years'])
res.levene_summary

#interpretation: As the p value (0.09) is non-significant, we fail to reject the null
# hypothesis and conclude that treatments have equal variance
Loading

0 comments on commit 04d36fd

Please sign in to comment.