.Rhistory

'poultry',
'beef_intake',
'lamb_intake',
'pork_intake',
'cheese_intake',
'salt_added_food',
'varition_in_diet',
'Alc_intake_freq',
'seated_box_height'
)
#categorical cols
cat_cols = c('birth_month')
####################################################################################################
# PREPARING MULTI-MORBID DATA AND SCALING
####################################################################################################
matched_inds = frequency_matching(merged_data[,c("age","Sex")],
as.numeric(as.character(merged_data$no_chronic)) >1,times=1,seed=1)
matched_inds_data=merged_data[as.numeric(matched_inds$selection),]
multi_morbid_ordinal_factors_HW_mod_controls=matched_inds_data[as.numeric(as.character(matched_inds_data$no_chronic)) < 2,]
# We noticed that the columns "smoker" and "current_smoker" are the same (they differ for only 5 individuals), so we're going to remove one
print(sum(multi_morbid_ordinal_factors_HW_mod_controls$smoker == multi_morbid_ordinal_factors_HW_mod_controls$current_smoker))
print(nrow(multi_morbid_ordinal_factors_HW_mod_controls))
# We're going to remove one of them
multi_morbid_ordinal_factors_HW_mod_controls$smoker=NULL
#Birth_month is a dumb variable
multi_morbid_ordinal_factors_HW_mod_controls$birth_month=NULL
multi_morbid_ordinal_factors_HW_mod_controls$hip_waist_ratio=multi_morbid_ordinal_factors_HW_mod_controls$hip_circum/multi_morbid_ordinal_factors_HW_mod_controls$waist_circum
multi_morbid_ordinal_factors_HW_mod_controls$height_sitting=NULL
multi_morbid_ordinal_factors_HW_mod_controls$sitting_height=NULL
multi_morbid_ordinal_factors_HW_mod_controls[,c("height_sitting","sitting_height","waist_circum","hip_circum","whole_body_water_mass",
"whole_body_fat_mass","Height","Weight")] <- list(NULL)
multi_morbid_ordinal_factors_HW_mod_controls[,"seated_box_height"] <- list(NULL)
#binary cols
binary_col_ids = which(unlist(sapply(multi_morbid_ordinal_factors_HW_mod_controls, function(x) length(levels(factor(x)))==2)))
multi_morbid_ordinal_factors_HW_mod_controls[,binary_col_ids]=lapply(multi_morbid_ordinal_factors_HW_mod_controls[,binary_col_ids],as.factor)
cat_col_ids = which(colnames(multi_morbid_ordinal_factors_HW_mod_controls) %in% cat_cols)
ord_col_ids = which(colnames(multi_morbid_ordinal_factors_HW_mod_controls) %in% ord_cols)
multi_morbid_ordinal_factors_HW_mod_controls[,cat_col_ids] = factor(multi_morbid_ordinal_factors_HW_mod_controls[,cat_col_ids])
multi_morbid_ordinal_factors_HW_mod_controls[,ord_col_ids] = lapply(multi_morbid_ordinal_factors_HW_mod_controls[,ord_col_ids], function(x) factor(as.integer(x), ordered = TRUE))
# Draw the 30% subsample (stratified by Sex and age) BEFORE splitting by sex:
# the files saved below are named "*_subset", so the male/female tables must be
# built from the subsampled data rather than from the full set of matched controls.
set.seed(1)
multi_morbid_ordinal_factors_HW_mod_controls <- multi_morbid_ordinal_factors_HW_mod_controls %>%
  group_by(Sex, age) %>%
  sample_frac(0.3) %>%
  as.data.frame()
multi_morbid_ordinal_factors_HW_mod_controls_male <-
  multi_morbid_ordinal_factors_HW_mod_controls[multi_morbid_ordinal_factors_HW_mod_controls$Sex == "Male", ]
multi_morbid_ordinal_factors_HW_mod_controls_female <-
  multi_morbid_ordinal_factors_HW_mod_controls[multi_morbid_ordinal_factors_HW_mod_controls$Sex == "Female", ]
#scale numeric features
# Columns that are NOT scaled: positions 1 and 11 (presumably eid and no_chronic --
# TODO confirm against the column order) plus the binary, categorical and ordinal columns.
unscaled_col_ids <- c(1, 11, binary_col_ids, cat_col_ids, ord_col_ids)
# Each sex is standardised separately, using its own column means and standard deviations.
multi_morbid_ordinal_factors_HW_mod_controls_male[, -unscaled_col_ids] <-
  as.data.frame(scale(multi_morbid_ordinal_factors_HW_mod_controls_male[, -unscaled_col_ids]))
multi_morbid_ordinal_factors_HW_mod_controls_female[, -unscaled_col_ids] <-
  as.data.frame(scale(multi_morbid_ordinal_factors_HW_mod_controls_female[, -unscaled_col_ids]))
# Sex is constant within each table, so it is dropped before saving.
multi_morbid_ordinal_factors_HW_mod_controls_male$Sex <- NULL
multi_morbid_ordinal_factors_HW_mod_controls_female$Sex <- NULL
saveRDS(multi_morbid_ordinal_factors_HW_mod_controls_male,
        "../data/processed_V5_males/multi_morbid_ordinal_factors_HW_mod_controls_male_subset.rds")
saveRDS(multi_morbid_ordinal_factors_HW_mod_controls_female,
        "../data/processed_V5_females/multi_morbid_ordinal_factors_HW_mod_controls_female_subset.rds")
# Attach the requested packages, installing any that are missing first.
#
# Args:
#   ...: package names given as character strings (or character vectors).
# Returns:
#   NULL, invisibly. Stops with an error when a package still cannot be
#   attached after the installation attempt.
using <- function(...) {
  libs <- unlist(list(...))
  # require() returns FALSE instead of erroring when a package is unavailable,
  # which is what lets us find out what has to be installed.
  loaded <- vapply(libs, require, logical(1), character.only = TRUE)
  need <- libs[!loaded]
  if (length(need) > 0) {
    install.packages(need)
    installed <- vapply(need, require, logical(1), character.only = TRUE)
    # Fail loudly here rather than with an obscure "could not find function" later on.
    if (!all(installed)) {
      stop("Could not load package(s): ",
           paste(need[!installed], collapse = ", "), call. = FALSE)
    }
  }
  invisible(NULL)
}
using("magrittr","dplyr","KODAMA")
# library(cluster,lib.loc ="/home/jheller/anaconda3/lib/R/library")
# library(dplyr,lib.loc ="/home/jheller/anaconda3/lib/R/library")
################################################################################
# WORKING DIRECTORY AND SOURCING FUNCTIONS
################################################################################
# file_path<-dirname(rstudioapi::getActiveDocumentContext()$path)
# setwd(file_path)
setwd("C:/Users/JOE/Documents/Imperial College 2018-2019/Translational Data Science/Barracudas")
################################################################################
# LOADING DATA
################################################################################
full_data=read.csv("../data/processed/UKBcompleteFeb19.csv")
deborah_data=read.csv("../data/processed/CVD_diag_by_doc_236.csv",sep=";")
colnames(deborah_data)[1]="eid"
CVD_ICD10_codes=read.table("../data/processed/CVD_icd10codes.txt")[,1]
deborah_data2=read.csv("../data/processed/CVD_10035_236.csv")
colnames(deborah_data2)[ncol(deborah_data2)-1]="eid"
print(table(deborah_data2[,2],useNA="ifany"))
print(table(deborah_data2[,3],useNA="ifany"))
#These NAs are just from people that never had CVD, so we'll code as 0
print(length(deborah_data2[deborah_data2$first_admidate=="","first_admidate"]))
deborah_data2[is.na(deborah_data2)] <- 0
print(table(deborah_data2[,2],useNA="ifany"))
print(table(deborah_data2[,3],useNA="ifany"))
################################################################################
# DEALING WITH ICD10 CODE
################################################################################
#These are the ICD10 codes of all the diseases in the dataset
ICD10_code_names=colnames(deborah_data2)[2:76]
#Let's check how many of them are in the list of CVD related ICD10 codes
print(ICD10_code_names%in%CVD_ICD10_codes)
print(ICD10_code_names[!ICD10_code_names%in%CVD_ICD10_codes]) #All but one!
#I500 : corresponds to forms of heart failure. Note that I50 is in list so we'll keep it in still
print(unique(substr(ICD10_code_names, start = 1, stop = 3)))
#Here are the disease types that correspond to the codes
#I22: MI
#I61 : Intracerebral haemorrhage
#I21 : MI
#I25: Chronic ischaemic heart diseases
#I50: Heart failure
#I63: Cerebral infarction
#G46: Vascular syndromes of brain in cerebrovascular diseases
#I60: Subarachnoid haemorrhage
#I24: Other acute ischaemic heart diseases
#I20 : Angina pectoris
#I62 : Other nontraumatic intracranial haemorrhage
#I23: Certain current complications following acute myocardial infarction
#I69 : Sequelae of cerebrovascular disease
#I73:Other peripheral vascular diseases
#I64:Stroke, not specified as haemorrhage or infarction
#We now try to make meaningful groups
# ICD10 categories (first three characters of the code) that make up each disease group
CAD_codes <- c("I22", "I21", "I23", "I25", "I20", "I24")
stroke_codes <- c("I63", "I64", "I69", "G46")
heart_failure_codes <- c("I50")
intracranial_haemorrhage_codes <- c("I61", "I60", "I62")
peripheral_vascular_codes <- c("I73")
# First three characters of every column name of deborah_data2, computed once
# instead of once per disease group.
icd10_prefixes <- substr(colnames(deborah_data2), start = 1, stop = 3)
# Returns, for each row of deborah_data2, 1 if at least one of the columns whose
# name starts with one of `codes` sums to >= 1, and 0 otherwise.
has_any_code <- function(codes) {
  # drop = FALSE keeps a data frame even when a single column matches (e.g. one
  # I73 column); without it the selection collapses to a vector and rowSums() fails.
  code_columns <- deborah_data2[, icd10_prefixes %in% codes, drop = FALSE]
  ifelse(rowSums(code_columns) >= 1, 1, 0)
}
CAD_data <- has_any_code(CAD_codes)
stroke_data <- has_any_code(stroke_codes)
heart_failure_data <- has_any_code(heart_failure_codes)
intracranial_haemorrhage_data <- has_any_code(intracranial_haemorrhage_codes)
peripheral_vascular_data <- has_any_code(peripheral_vascular_codes)
# One indicator column per disease group, keyed by eid for the merge that follows
supp_data <- data.frame(CAD_data, stroke_data, heart_failure_data, intracranial_haemorrhage_data,
                        peripheral_vascular_data, eid = deborah_data2$eid)
merged_data=merge(full_data,deborah_data,by="eid")
merged_data=merge(merged_data,supp_data,by="eid")
merged_data$mi=NULL
merged_data$angina=NULL
merged_data$stroke=NULL
merged_data$htn=NULL
#Show how much HES data corresponds to self-reported data
merged_data$CAD=ifelse(merged_data$angina_diag+merged_data$heartattack_diag+merged_data$CAD_data>=1,1,0)
merged_data$stroke=ifelse(merged_data$stroke_diag+merged_data$stroke_data>=1,1,0)
merged_data$heart_failure=merged_data$heart_failure_data
merged_data$intracranial_haemorrhage=merged_data$intracranial_haemorrhage_data
merged_data$peripheral_vascular=merged_data$peripheral_vascular_data
merged_data$heartattack_diag=NULL
merged_data$angina_diag=NULL
merged_data$stroke_diag=NULL
merged_data$CAD_data=NULL
merged_data$heart_failure_data=NULL
merged_data$stroke_data=NULL
merged_data$intracranial_haemorrhage_data=NULL
merged_data$peripheral_vascular_data=NULL
merged_data$htn=merged_data$hBP_diag
merged_data$hBP_diag=NULL
merged_data$htn=as.numeric(as.character(merged_data$htn))
table(merged_data$htn)
table(merged_data$intracranial_haemorrhage)
table(merged_data$peripheral_vascular)
UKB_age=readRDS("../data/processed/UKB_age.rds")
colnames(UKB_age)=c("eid","age")
merged_data=merge(merged_data,UKB_age,by="eid")
merged_data$birth_year=NULL
################################################################################
# PRE-PROCESSING ON FULL DATA
################################################################################
#define obese as BMI >= 40 (the threshold is inclusive)
merged_data$obese <- ifelse(merged_data$BMI >= 40, 1, 0)
#define outcome cols
# NOTE(review): the 'angina' column is removed from merged_data earlier in this
# script, so that entry matches no column; 'stroke' is not listed and therefore
# does not count towards no_chronic -- confirm that both are intended.
outcomes <- c('diabetes', 'CAD', 'angina', 'obese', 'htn', "heart_failure",
              "intracranial_haemorrhage", "peripheral_vascular")
# Exact column-name match (equivalent to the anchored '^name$' regular expression)
outcome_cols <- which(colnames(merged_data) %in% outcomes)
#col of chronic diseases: number of outcomes present for each individual
# rowSums() replaces apply(..., 1, sum); drop = FALSE keeps a data frame
# whatever the number of matched columns.
merged_data$no_chronic <- rowSums(merged_data[, outcome_cols, drop = FALSE])
#change gender levels and remove gender that is not used anymore
merged_data$Sex <- factor(ifelse(merged_data$gender == 0, 'Female', 'Male'))
merged_data$gender <- NULL
#re-organize columns: identifiers and outcomes first, everything else after
merged_data <- merged_data %>%
  dplyr::select(eid, CAD, stroke, obese, diabetes, htn, dvt_asthma_copd_atopy,
                heart_failure, intracranial_haemorrhage, peripheral_vascular, no_chronic,
                self_reported_surgery, previous_surgery, pacemaker, everything())
merged_data[, 'no_chronic'] <- as.factor(merged_data[, 'no_chronic'])
#We'll transform all the ordinal columns so that they can reasonably be considered continuous
ord_cols = c('self_reported_surgery',
'freq_climb_stairs_4wks',
'freq_walked_for_pleasure_4wks',
'Duration_pleasure_walks',
'smokers_in_house',
'oily_fish_intake',
'non_oily_fish_intake',
'processed_meat',
'poultry',
'beef_intake',
'lamb_intake',
'pork_intake',
'cheese_intake',
'salt_added_food',
'varition_in_diet',
'Alc_intake_freq',
'seated_box_height'
)
#categorical cols
cat_cols = c('birth_month')
####################################################################################################
# PREPARING MULTI-MORBID DATA AND SCALING
####################################################################################################
matched_inds = frequency_matching(merged_data[,c("age","Sex")],
as.numeric(as.character(merged_data$no_chronic)) >1,times=1,seed=1)
matched_inds_data=merged_data[as.numeric(matched_inds$selection),]
multi_morbid_ordinal_factors_HW_mod_controls=matched_inds_data[as.numeric(as.character(matched_inds_data$no_chronic)) < 2,]
# We noticed that the columns "smoker" and "current_smoker" are the same (they differ for only 5 individuals), so we're going to remove one
print(sum(multi_morbid_ordinal_factors_HW_mod_controls$smoker == multi_morbid_ordinal_factors_HW_mod_controls$current_smoker))
print(nrow(multi_morbid_ordinal_factors_HW_mod_controls))
# We're going to remove one of them
multi_morbid_ordinal_factors_HW_mod_controls$smoker=NULL
#Birth_month is a dumb variable
multi_morbid_ordinal_factors_HW_mod_controls$birth_month=NULL
multi_morbid_ordinal_factors_HW_mod_controls$hip_waist_ratio=multi_morbid_ordinal_factors_HW_mod_controls$hip_circum/multi_morbid_ordinal_factors_HW_mod_controls$waist_circum
multi_morbid_ordinal_factors_HW_mod_controls$height_sitting=NULL
multi_morbid_ordinal_factors_HW_mod_controls$sitting_height=NULL
multi_morbid_ordinal_factors_HW_mod_controls[,c("height_sitting","sitting_height","waist_circum","hip_circum","whole_body_water_mass",
"whole_body_fat_mass","Height","Weight")] <- list(NULL)
multi_morbid_ordinal_factors_HW_mod_controls[,"seated_box_height"] <- list(NULL)
#binary cols
binary_col_ids = which(unlist(sapply(multi_morbid_ordinal_factors_HW_mod_controls, function(x) length(levels(factor(x)))==2)))
multi_morbid_ordinal_factors_HW_mod_controls[,binary_col_ids]=lapply(multi_morbid_ordinal_factors_HW_mod_controls[,binary_col_ids],as.factor)
cat_col_ids = which(colnames(multi_morbid_ordinal_factors_HW_mod_controls) %in% cat_cols)
ord_col_ids = which(colnames(multi_morbid_ordinal_factors_HW_mod_controls) %in% ord_cols)
multi_morbid_ordinal_factors_HW_mod_controls[,cat_col_ids] = factor(multi_morbid_ordinal_factors_HW_mod_controls[,cat_col_ids])
multi_morbid_ordinal_factors_HW_mod_controls[,ord_col_ids] = lapply(multi_morbid_ordinal_factors_HW_mod_controls[,ord_col_ids], function(x) factor(as.integer(x), ordered = TRUE))
set.seed(1)
multi_morbid_ordinal_factors_HW_mod_controls <- multi_morbid_ordinal_factors_HW_mod_controls %>%
group_by(Sex,age) %>%
sample_frac(0.3) %>% as.data.frame()
multi_morbid_ordinal_factors_HW_mod_controls_male=multi_morbid_ordinal_factors_HW_mod_controls[multi_morbid_ordinal_factors_HW_mod_controls$Sex=="Male",]
multi_morbid_ordinal_factors_HW_mod_controls_female=multi_morbid_ordinal_factors_HW_mod_controls[multi_morbid_ordinal_factors_HW_mod_controls$Sex=="Female",]
#scale numeric features
multi_morbid_ordinal_factors_HW_mod_controls_male[,-c(1,11,binary_col_ids,cat_col_ids,ord_col_ids)] =
as.data.frame(scale(multi_morbid_ordinal_factors_HW_mod_controls_male[,-c(1,11,binary_col_ids,cat_col_ids,ord_col_ids)]))
multi_morbid_ordinal_factors_HW_mod_controls_female[,-c(1,11,binary_col_ids,cat_col_ids,ord_col_ids)] =
as.data.frame(scale(multi_morbid_ordinal_factors_HW_mod_controls_female[,-c(1,11,binary_col_ids,cat_col_ids,ord_col_ids)]))
multi_morbid_ordinal_factors_HW_mod_controls_male$Sex=NULL
multi_morbid_ordinal_factors_HW_mod_controls_female$Sex=NULL
saveRDS(multi_morbid_ordinal_factors_HW_mod_controls_male,"../data/processed_V5_males/multi_morbid_ordinal_factors_HW_mod_controls_male_subset.rds")
saveRDS(multi_morbid_ordinal_factors_HW_mod_controls_female,"../data/processed_V5_females/multi_morbid_ordinal_factors_HW_mod_controls_female_subset.rds")
dim(multi_morbid_ordinal_factors_HW_mod_controls_male)
dim(multi_morbid_ordinal_factors_HW_mod_controls_female)
# Replace every match of `pattern` by `replacement` in a text file, in place.
#
# Args:
#   filepath: path of the file to rewrite.
#   pattern: pattern handed to gsub(); a regular expression unless fixed = TRUE.
#   replacement: replacement text handed to gsub().
#   fixed: if TRUE, `pattern` is matched as a literal string (default FALSE,
#     which is the previous behaviour).
# Returns:
#   NULL, invisibly; the file is overwritten as a side effect.
file_find_replace <- function(filepath, pattern, replacement, fixed = FALSE) {
  file_contents <- readLines(filepath)
  updated_contents <- gsub(pattern = pattern, replacement = replacement,
                           x = file_contents, fixed = fixed)
  # writeLines() terminates the last line with a newline; cat(..., sep = "\n")
  # only separates the lines, which left the file with an incomplete final line.
  writeLines(updated_contents, con = filepath)
  invisible(NULL)
}
# Rewrite the version / dataset identifiers in every R script of the V4 females folder.
# (The earlier assignment of my_dir to the V5 males folder was overwritten
# immediately and had no effect on the result, so it is not repeated here.)
my_dir <- "C:/Users/JOE/Documents/Imperial College 2018-2019/Translational Data Science/Barracudas/code/scripts_joel_HPC_V4_females"
setwd(my_dir)
# Apply the function to each of the R scripts in the directory.
# "\\.[rR]$" matches the .r / .R extension only; the previous "(r|R)$" matched
# every file whose name merely ends with the letter r or R.
my_r_scripts <- list.files(path = my_dir, pattern = "\\.[rR]$")
# old text = new text, applied to each script in this order
replacements <- c(
  "processed_V5" = "processed_V4",
  "results_joel_HPC_V5" = "results_joel_HPC_V4",
  "var_groupings_V5" = "var_groupings_V4",
  "HW_mod_controls" = "HW_mod_no_obesity"
)
for (r_script in my_r_scripts) {
  for (old_text in names(replacements)) {
    file_find_replace(r_script, old_text, replacements[[old_text]])
  }
}
setwd("C:/Users/JOE/Documents/Imperial College 2018-2019/Translational Data Science/Barracudas")
multi_morbid=readRDS("../data/processed_V4_females/multi_morbid_ordinal_continuous_HW_mod_no_obesity_female.rds")
# Replace "female" by "male" in every R script of the V4 males folder.
# The replacement used to be run twice in a row; a single pass gives the same
# files, because no "female" is left after the first one.
my_dir <- "C:/Users/JOE/Documents/Imperial College 2018-2019/Translational Data Science/Barracudas/code/scripts_joel_HPC_V4_males"
setwd(my_dir)
# Apply the function to each of the R scripts in the directory.
# "\\.[rR]$" matches the .r / .R extension only; the previous "(r|R)$" matched
# every file whose name merely ends with the letter r or R.
my_r_scripts <- list.files(path = my_dir, pattern = "\\.[rR]$")
for (r_script in my_r_scripts) {
  file_find_replace(r_script, "female", "male")
}
multi_morbid=readRDS("../data/processed_V4_males/multi_morbid_ordinal_continuous_HW_mod_no_obesity_male.rds")
setwd("C:/Users/JOE/Documents/Imperial College 2018-2019/Translational Data Science/Barracudas")
multi_morbid=readRDS("../data/processed_V4_males/multi_morbid_ordinal_continuous_HW_mod_no_obesity_male.rds")
library(glmnet)
?glmnet
file_path<-dirname(rstudioapi::getActiveDocumentContext()$path)
file_path
# Attach the requested packages, installing any that are missing first.
#
# Args:
#   ...: package names given as character strings (or character vectors).
# Returns:
#   NULL, invisibly. Stops with an error when a package still cannot be
#   attached after the installation attempt.
using <- function(...) {
  libs <- unlist(list(...))
  # require() returns FALSE instead of erroring when a package is unavailable,
  # which is what lets us find out what has to be installed.
  loaded <- vapply(libs, require, logical(1), character.only = TRUE)
  need <- libs[!loaded]
  if (length(need) > 0) {
    install.packages(need)
    installed <- vapply(need, require, logical(1), character.only = TRUE)
    # Fail loudly here rather than with an obscure "could not find function" later on.
    if (!all(installed)) {
      stop("Could not load package(s): ",
           paste(need[!installed], collapse = ", "), call. = FALSE)
    }
  }
  invisible(NULL)
}
using("ggplot2","dplyr","tidyr","scales","shadowtext")
################################################################################
# WORKING DIRECTORY AND SOURCING FUNCTIONS
################################################################################
setwd("C:/Users/JOE/Documents/Imperial College 2018-2019/Translational Data Science/Barracudas")
source("C:/Users/JOE/Documents/R_utility_and_self_implementations/clustering_utility.R")
source("C:/Users/JOE/Documents/R_utility_and_self_implementations/colors_themes_utility.R")
################################################################################
# LOADING LIBRARIES
################################################################################
# Attach the requested packages, installing any that are missing first.
#
# Args:
#   ...: package names given as character strings (or character vectors).
# Returns:
#   NULL, invisibly. Stops with an error when a package still cannot be
#   attached after the installation attempt.
using <- function(...) {
  libs <- unlist(list(...))
  # require() returns FALSE instead of erroring when a package is unavailable,
  # which is what lets us find out what has to be installed.
  loaded <- vapply(libs, require, logical(1), character.only = TRUE)
  need <- libs[!loaded]
  if (length(need) > 0) {
    install.packages(need)
    installed <- vapply(need, require, logical(1), character.only = TRUE)
    # Fail loudly here rather than with an obscure "could not find function" later on.
    if (!all(installed)) {
      stop("Could not load package(s): ",
           paste(need[!installed], collapse = ", "), call. = FALSE)
    }
  }
  invisible(NULL)
}
using("ggplot2","grid","gridExtra")
FAMD_multi_morbid_res=readRDS("../real_results_from_HPC/results_used_for_final_figures/main_figure_2/FAMD_kmeans_ordinal_continuous_var_importance_stab_df_morbid.rds")
FAMD_kmeans_ordinal_factors_var_importance_stab_df=
readRDS("../real_results_from_HPC/results_used_for_final_figures/main_figure_2/FAMD_kmeans_ordinal_factors_var_importance_stab_df_morbid.rds")
FAMD_kmeans_ordinal_factors_var_importance_stab_df
kamila_var_importance_stab_df_morbid=
readRDS("../real_results_from_HPC/results_used_for_final_figures/main_figure_2/kamila_var_importance_stab_df_morbid.rds")
grouping_names
source("../data/processed_V3_females/var_groupings_V3.R")
variable_importance_stability_plot=make_variable_importance_stability_plot(kamila_var_importance_stab_df_morbid,
grouping_names=grouping_names, color_scale=NULL,custom_theme=theme_jh,
threshold=50)
variable_importance_stability_plot
# Plot the importance of each variable as one point per variable.
#
# Args:
#   data: data frame with the columns var_name and var_importance. Its first
#     column is matched against grouping_names (presumably var_name -- TODO
#     confirm). A Type column is needed when grouping_names is NULL.
#   grouping_names: optional named list of character vectors; each element lists
#     the variables of one group. When given, only the listed variables are
#     plotted, ordered and coloured by group.
#   color_scale: optional colour values handed to scale_color_manual().
#   custom_theme: ggplot2 theme object added to the plot.
#   threshold: in the grouped plot, variables whose importance is above this
#     value get a text label.
#   annotation_line: y-intercept of the horizontal line of the grouped plot.
# Returns:
#   A ggplot object.
make_variable_importance_plot <- function(data, grouping_names = NULL, color_scale = NULL,
                                          custom_theme = NULL, threshold = 5,
                                          annotation_line = 5) {
  if (is.null(grouping_names)) {
    # Keep the variables in the order of the rows; unique() guards against the
    # "duplicated factor level" error when a name occurs more than once.
    data$var_name <- factor(data$var_name, levels = unique(as.character(data$var_name)))
    var_importance_plot <- ggplot(data) +
      geom_point(aes(x = var_name, y = var_importance, color = Type), size = 2) +
      custom_theme +
      theme(axis.text.x = element_text(angle = 60, vjust = 0.8)) +
      ggtitle("Variable importance for cluster classification")
    if (!is.null(color_scale)) {
      var_importance_plot <- var_importance_plot + scale_color_manual(values = color_scale)
    }
  } else {
    # Row indices of the variables of each group; [[1]] also works on tibbles.
    groupings <- lapply(grouping_names, function(x) which(data[[1]] %in% x))
    data <- data[unlist(groupings), ]
    # Label each kept row with its group. The counts come from the rows actually
    # matched, so a listed variable that is absent from `data` no longer causes
    # a length mismatch.
    data$group <- rep(names(grouping_names), lengths(groupings))
    data$var_name <- factor(data$var_name, levels = unique(as.character(data$var_name)))
    data$significant <- ifelse(data[["var_importance"]] > threshold, 1, 0)
    var_importance_plot <- ggplot(data) +
      geom_point(aes(x = var_name, y = var_importance, color = group), size = 2) +
      custom_theme +
      theme(axis.text.x = element_text(angle = 60, vjust = 0.8)) +
      ggtitle("Significance of differences between clusters across variables")
    if (!is.null(color_scale)) {
      var_importance_plot <- var_importance_plot + scale_color_manual(values = color_scale)
    }
    # Label only the variables above the threshold and draw the reference line.
    var_importance_plot <- var_importance_plot +
      geom_label_repel(data = subset(data, data$significant == 1),
                       aes(x = var_name, y = var_importance, label = var_name)) +
      geom_hline(yintercept = annotation_line)
  }
  var_importance_plot
}
variable_importance_stability_plot=make_variable_importance_stability_plot(FAMD_kmeans_ordinal_factors_var_importance_stab_df,
grouping_names=grouping_names, color_scale=NULL,custom_theme=theme_jh,
threshold=50)
variable_importance_stability_plot
variable_importance_stability_plot=make_variable_importance_stability_plot(FAMD_kmeans_ordinal_factors_var_importance_stab_df,
grouping_names=grouping_names, color_scale=NULL,custom_theme=theme_jh,
threshold=100,annotation_line=0)
# Plot the median importance of each variable with an error bar from LB to UB.
#
# Args:
#   data: data frame with the columns var_name, median, LB and UB. Its first
#     column is matched against grouping_names (presumably var_name -- TODO
#     confirm). A Type column is needed when grouping_names is NULL.
#   grouping_names: optional named list of character vectors; each element lists
#     the variables of one group. When given, only the listed variables are
#     plotted, ordered and coloured by group.
#   color_scale: optional colour values handed to scale_color_manual().
#   custom_theme: ggplot2 theme object added to the plot.
#   threshold: in the grouped plot, variables whose median is above this value
#     get a text label.
#   annotation_line: y-intercept of the horizontal line of the grouped plot.
# Returns:
#   A ggplot object.
make_variable_importance_stability_plot <- function(data, grouping_names = NULL,
                                                    color_scale = NULL, custom_theme = NULL,
                                                    threshold = 5, annotation_line = 5) {
  if (is.null(grouping_names)) {
    # Keep the variables in the order of the rows; unique() guards against the
    # "duplicated factor level" error when a name occurs more than once.
    data$var_name <- factor(data$var_name, levels = unique(as.character(data$var_name)))
    var_importance_stability_plot <- ggplot(data) +
      geom_point(aes(x = var_name, y = median, color = Type), size = 2) +
      custom_theme +
      theme(axis.text.x = element_text(angle = 60, vjust = 0.8)) +
      geom_errorbar(aes(ymin = LB, ymax = UB, x = var_name, color = Type), width = .2,
                    position = position_dodge(.9)) +
      ggtitle("Variable importance for cluster classification")
    if (!is.null(color_scale)) {
      var_importance_stability_plot <- var_importance_stability_plot +
        scale_color_manual(values = color_scale)
    }
  } else {
    # Row indices of the variables of each group; [[1]] also works on tibbles.
    groupings <- lapply(grouping_names, function(x) which(data[[1]] %in% x))
    data <- data[unlist(groupings), ]
    # Label each kept row with its group. The counts come from the rows actually
    # matched, so a listed variable that is absent from `data` no longer causes
    # a length mismatch.
    data$group <- rep(names(grouping_names), lengths(groupings))
    data$var_name <- factor(data$var_name, levels = unique(as.character(data$var_name)))
    data$significant <- ifelse(data[["median"]] > threshold, 1, 0)
    var_importance_stability_plot <- ggplot(data) +
      geom_point(aes(x = var_name, y = median, color = group), size = 2) +
      custom_theme +
      theme(axis.text.x = element_text(angle = 60, vjust = 0.8)) +
      geom_errorbar(aes(ymin = LB, ymax = UB, x = var_name, color = group), width = .2,
                    position = position_dodge(.9)) +
      ggtitle("Significance of differences between clusters across variables")
    if (!is.null(color_scale)) {
      var_importance_stability_plot <- var_importance_stability_plot +
        scale_color_manual(values = color_scale)
    }
    # Label only the variables above the threshold and draw the reference line.
    var_importance_stability_plot <- var_importance_stability_plot +
      geom_label_repel(data = subset(data, data$significant == 1),
                       aes(x = var_name, y = median, label = var_name)) +
      geom_hline(yintercept = annotation_line)
  }
  var_importance_stability_plot
}
# Build the stability plot for the FAMD + k-means (ordinal factors) results.
variable_importance_stability_plot <- make_variable_importance_stability_plot(
  FAMD_kmeans_ordinal_factors_var_importance_stab_df,
  grouping_names = grouping_names, color_scale = NULL, custom_theme = theme_jh,
  threshold = 100, annotation_line = 0
)
# x11() only opens a graphics device and its arguments configure that device,
# so the plot must not be passed to it: open the device, then print the plot.
x11()
print(variable_importance_stability_plot)
grouping_names=list(Disease=Disease,Demographics=Demographics,BMI_related=BMI_related,
Activity=Activity,Vital_signs=Vital_signs,Tobacco=Tobacco,
Alcohol=Alcohol,Dietary=Dietary,Med_surg_hx=Med_surg_hx)
variable_importance_stability_plot=make_variable_importance_stability_plot(kamila_var_importance_stab_df_morbid,
grouping_names=grouping_names, color_scale=NULL,custom_theme=theme_jh,
threshold=50)
# Build the stability plot for the FAMD + k-means (ordinal factors) results.
variable_importance_stability_plot <- make_variable_importance_stability_plot(
  FAMD_kmeans_ordinal_factors_var_importance_stab_df,
  grouping_names = grouping_names, color_scale = NULL, custom_theme = theme_jh,
  threshold = 100, annotation_line = 0
)
# x11() only opens a graphics device and its arguments configure that device,
# so the plot must not be passed to it: open the device, then print the plot.
x11()
print(variable_importance_stability_plot)
source("C:/Users/JOE/Documents/R_utility_and_self_implementations/clustering_utility.R")
source("../data/processed_V3_females/var_groupings_V3.R")
class(FAMD_kmeans_ordinal_factors_var_importance_stab_df)
data=FAMD_kmeans_ordinal_factors_var_importance_stab_df
grouping_names=grouping_names
color_scale=NULL
custom_theme=theme_jh
threshold=100
annotation_line=0
groupings=lapply(grouping_names,function(x) {which(data[,1]%in%x)})
data=data[unlist(groupings),]
data$group=unlist(lapply(names(grouping_names),function(x) {rep(x,length(grouping_names[[x]]))}))
data$var_name=factor(data$var_name,levels=data$var_name)
data$significant=ifelse(data[,"median"]>threshold,1,0)
var_importance_stability_plot=ggplot(data) + geom_point(aes(x=var_name,y=median,color=group),size=2) +
custom_theme + theme(axis.text.x = element_text(angle=60,vjust = 0.8)) +
geom_errorbar(aes(ymin=LB, ymax=UB,x=var_name,color=group), width=.2,
position=position_dodge(.9)) +
ggtitle("Significance of differences between clusters across variables")
var_importance_stability_plot
if (!is.null(color_scale)) {
var_importance_stability_plot= var_importance_stability_plot + scale_color_manual(values=color_scale)
}
var_importance_stability_plot= var_importance_stability_plot +
geom_label_repel(data=subset(data, data$significant==1),aes(x=var_name,y=median,label=var_name)) +
geom_hline(yintercept = annotation_line)
var_importance_stability_plot
variable_importance_stability_plot=make_variable_importance_stability_plot(FAMD_kmeans_ordinal_factors_var_importance_stab_df,
grouping_names=grouping_names, color_scale=NULL,custom_theme=theme_jh,
threshold=100,annotation_line=0)
x11()
print(variable_importance_stability_plot)
# Plot the median importance of each variable with an error bar from LB to UB.
# In this version the x-axis labels of the grouped plot are rotated by 90 degrees.
#
# Args:
#   data: data frame with the columns var_name, median, LB and UB. Its first
#     column is matched against grouping_names (presumably var_name -- TODO
#     confirm). A Type column is needed when grouping_names is NULL.
#   grouping_names: optional named list of character vectors; each element lists
#     the variables of one group. When given, only the listed variables are
#     plotted, ordered and coloured by group.
#   color_scale: optional colour values handed to scale_color_manual().
#   custom_theme: ggplot2 theme object added to the plot.
#   threshold: in the grouped plot, variables whose median is above this value
#     get a text label.
#   annotation_line: y-intercept of the horizontal line of the grouped plot.
# Returns:
#   A ggplot object.
make_variable_importance_stability_plot <- function(data, grouping_names = NULL,
                                                    color_scale = NULL, custom_theme = NULL,
                                                    threshold = 5, annotation_line = 5) {
  if (is.null(grouping_names)) {
    # Keep the variables in the order of the rows; unique() guards against the
    # "duplicated factor level" error when a name occurs more than once.
    data$var_name <- factor(data$var_name, levels = unique(as.character(data$var_name)))
    var_importance_stability_plot <- ggplot(data) +
      geom_point(aes(x = var_name, y = median, color = Type), size = 2) +
      custom_theme +
      theme(axis.text.x = element_text(angle = 60, vjust = 0.8)) +
      geom_errorbar(aes(ymin = LB, ymax = UB, x = var_name, color = Type), width = .2,
                    position = position_dodge(.9)) +
      ggtitle("Variable importance for cluster classification")
    if (!is.null(color_scale)) {
      var_importance_stability_plot <- var_importance_stability_plot +
        scale_color_manual(values = color_scale)
    }
  } else {
    # Row indices of the variables of each group; [[1]] also works on tibbles.
    groupings <- lapply(grouping_names, function(x) which(data[[1]] %in% x))
    data <- data[unlist(groupings), ]
    # Label each kept row with its group. The counts come from the rows actually
    # matched, so a listed variable that is absent from `data` no longer causes
    # a length mismatch.
    data$group <- rep(names(grouping_names), lengths(groupings))
    data$var_name <- factor(data$var_name, levels = unique(as.character(data$var_name)))
    data$significant <- ifelse(data[["median"]] > threshold, 1, 0)
    var_importance_stability_plot <- ggplot(data) +
      geom_point(aes(x = var_name, y = median, color = group), size = 2) +
      custom_theme +
      theme(axis.text.x = element_text(angle = 90, vjust = 0.8)) +
      geom_errorbar(aes(ymin = LB, ymax = UB, x = var_name, color = group), width = .2,
                    position = position_dodge(.9)) +
      ggtitle("Significance of differences between clusters across variables")
    if (!is.null(color_scale)) {
      var_importance_stability_plot <- var_importance_stability_plot +
        scale_color_manual(values = color_scale)
    }
    # Label only the variables above the threshold and draw the reference line.
    var_importance_stability_plot <- var_importance_stability_plot +
      geom_label_repel(data = subset(data, data$significant == 1),
                       aes(x = var_name, y = median, label = var_name)) +
      geom_hline(yintercept = annotation_line)
  }
  var_importance_stability_plot
}
variable_importance_stability_plot=make_variable_importance_stability_plot(FAMD_kmeans_ordinal_factors_var_importance_stab_df,
grouping_names=grouping_names, color_scale=NULL,custom_theme=theme_jh,
threshold=100,annotation_line=0)
x11()
print(variable_importance_stability_plot)
variable_importance_stability_plot=make_variable_importance_stability_plot(FAMD_kmeans_ordinal_factors_var_importance_stab_df,
grouping_names=grouping_names, color_scale=NULL,custom_theme=theme_jh,
threshold=100,annotation_line=0)
x11()
print(variable_importance_stability_plot)
variable_importance_stability_plot_1=make_variable_importance_stability_plot(FAMD_kmeans_ordinal_factors_var_importance_stab_df,
grouping_names=grouping_names, color_scale=NULL,custom_theme=theme_jh,
threshold=100,annotation_line=0)
variable_importance_stability_plot_2=make_variable_importance_stability_plot(gower_diana_multi_morbid_var_importance_stab_df_morbid,
grouping_names=grouping_names, color_scale=NULL,custom_theme=theme_jh,
threshold=50)
variable_importance_stability_plot_3=make_variable_importance_stability_plot(kamila_var_importance_stab_df_morbid,
grouping_names=grouping_names, color_scale=NULL,custom_theme=theme_jh,
threshold=50)
test=grid.arrange(arrangeGrob(grobs=plots, ncol=1, nrow=3))
plots=list(variable_importance_stability_plot_1,variable_importance_stability_plot_2,variable_importance_stability_plot_3)
test=grid.arrange(arrangeGrob(grobs=plots, ncol=1, nrow=3))
length(plots)
test=grid.arrange(arrangeGrob(grobs=plots, ncol=2, nrow=3))
test=grid.arrange(arrangeGrob(grobs=plots, ncol=2, nrow=3))
test=grid.arrange(grobs=plots, ncol=2, nrow=3)
?grid.arrange
plots
test=grid.arrange(grobs=plots, ncol=1, nrow=3)
test=arrangeGrob(grobs=plots, ncol=10, nrow=3)
grid.draw(test)
test=arrangeGrob(grobs=plots, ncol=1, nrow=3)
grid.draw(test)