CLUSTPROFILE

/*********************************************************
Usecase 3 - R Script for Employee Profiling - TESTED OK
Author: Snehotosh Banerjee
Date: 20/01/2017

ORE Script Name:
	CLUSTPROFILE
	
Packages:
	cluster # for gower similarity and pam
	Rtsne   # for t-SNE plot for Visualization
	dplyr	# Data wrangling (listed here, but the script body never loads it)
	ggplot2 # Visualization
	
Algorithm:
	PAM (Partitioning Around Medoids)

Datastore:
ds_cluster_store
	
Output Tables:
	AA_UC3_PNG_TBL
  AA_UC3_PRED_OUTPUT_TBL
*********************************************************/

BEGIN
-- Re-register the embedded R script CLUSTPROFILE: drop the stored version,
-- then create it again from the R function text below.
-- NOTE(review): rqScriptDrop presumably raises when the script is not yet
-- registered (first deployment) -- confirm and guard if required.
-- The R source lives inside a SQL string literal: single quotes must be
-- doubled and ampersands are avoided on purpose.
sys.rqScriptDrop('CLUSTPROFILE');
sys.rqScriptCreate('CLUSTPROFILE',
        'function(dat,isDendo = ''N'',isStand = ''T'',ds.name){
                  ##----------------------------------------------------
                  ## Employee profiling with PAM (Partitioning Around
                  ## Medoids) on Gower dissimilarities, plus a t-SNE plot.
                  ##
                  ## Arguments:
                  ##   dat      input table; THE FIRST COLUMN SHOULD BE THE
                  ##            PRIMARY KEY, it is excluded from the distance
                  ##   isDendo  "Y" additionally plots an agnes dendrogram
                  ##   isStand  standardisation flag forwarded to daisy()
                  ##   ds.name  name of the ORE datastore receiving results
                  ##
                  ## Returns a data.frame holding the key column and
                  ## CLUS_ASSIGN ("Cluster<k>").
                  ## Objects saved in the datastore: clus_summary_<k>,
                  ## pam_fit, opti_cluster, gower_dist, clust_info,
                  ## clus_represent.
                  ##----------------------------------------------------
                  library(cluster) # daisy (gower), pam, agnes
                  library(Rtsne)   # t-SNE embedding for the plot
                  library(ggplot2) # t-SNE scatter plot

                  # Reading the table into a local data.frame
                  ore.ds <- ore.pull(dat)

                  # ore.pull cannot preserve factors: turn every character
                  # column into a factor. List-style indexing keeps this
                  # correct when there is exactly one such column.
                  char_cols <- names(ore.ds)[vapply(ore.ds, is.character, logical(1))]
                  if (length(char_cols) > 0) {
                    ore.ds[char_cols] <- lapply(ore.ds[char_cols], as.factor)
                  }

                  # Key column: EMPID when present (historic behaviour),
                  # otherwise the first column as stated in the contract.
                  key_col <- if ("EMPID" %in% names(ore.ds)) "EMPID" else names(ore.ds)[1]

                  # Accept "T", "TRUE", "Y", "YES" (any case) or a logical
                  # and hand daisy() a proper logical.
                  # NOTE(review): daisy() appears to ignore stand for the
                  # gower metric -- confirm against the cluster docs.
                  stand_flag <- toupper(as.character(isStand)[1]) %in% c("T", "TRUE", "Y", "YES")

                  #------------------------------------------------------------------------
                  # Removing records with NA
                  #------------------------------------------------------------------------
                  ore.ds <- ore.ds[complete.cases(ore.ds), , drop = FALSE]
                  n_obs <- nrow(ore.ds)

                  # Validate before touching the datastore so that an
                  # unusable input does not wipe earlier results.
                  if (ncol(ore.ds) < 2) {
                    stop("CLUSTPROFILE needs a key column plus at least one feature column", call. = FALSE)
                  }
                  if (n_obs < 3) {
                    stop("CLUSTPROFILE needs at least 3 complete records to cluster", call. = FALSE)
                  }

                  #------------------------------------------------------------------------
                  # Deleting the Data store
                  #------------------------------------------------------------------------
                  if (nrow(ore.datastore(name = ds.name)) > 0) {
                    ore.delete(name = ds.name)
                  }

                  #------------------------------------------------------------------------
                  # PAM Clustering
                  #------------------------------------------------------------------------
                  # Feature columns = everything but the first column;
                  # drop = FALSE keeps a data.frame with a single feature.
                  feature_df <- ore.ds[, -1, drop = FALSE]

                  # Distance method
                  gower_dist <- daisy(feature_df, metric = "gower", type = list(), stand = stand_flag)

                  if (isDendo == "Y") {
                    agnes.clust <- agnes(gower_dist)
                    plot(agnes.clust)
                  }

                  # Average silhouette width for k = 2..10, capped at
                  # n_obs - 1 because pam() cannot fit k >= n. Slot 1 stays
                  # NA since the silhouette is undefined for one cluster.
                  max_k <- min(10, n_obs - 1)
                  sil_width <- rep(NA_real_, max_k)
                  for (k in 2:max_k) {
                    sil_width[k] <- pam(gower_dist, diss = TRUE, k = k)$silinfo$avg.width
                  }

                  # Plot silhouette width (higher is better)
                  plot(seq_len(max_k), sil_width, xlab = "Number of clusters", ylab = "Silhouette Width")
                  lines(seq_len(max_k), sil_width)

                  # Optimum number of clusters; which.max skips NA and always
                  # yields a single k, the smallest one when widths tie.
                  opti_cluster <- which.max(sil_width)

                  # Fitting the Cluster Model
                  pam_fit <- pam(gower_dist, diss = TRUE, k = opti_cluster)

                  #------------------------------------------------------------------------
                  # Visualization (t-SNE)
                  #------------------------------------------------------------------------
                  # Rtsne requires n - 1 >= 3 * perplexity. Keep the default
                  # of 30 whenever the data allow it, shrink it otherwise and
                  # skip the plot when the data are too small for any value.
                  tsne_perplexity <- min(30, floor((n_obs - 1) / 3))
                  if (tsne_perplexity >= 1) {
                    tsne_obj <- Rtsne(gower_dist, is_distance = TRUE, perplexity = tsne_perplexity)
                    tsne_data <- setNames(data.frame(tsne_obj$Y), c("X", "Y"))
                    tsne_data$cluster <- factor(pam_fit$clustering)
                    print(ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color = cluster)))
                  }

                  #------------------------------#
                  # Cluster Interpretation
                  # Via Descriptive Statistics
                  #------------------------------#
                  pam_results <- ore.ds[, setdiff(names(ore.ds), key_col), drop = FALSE]
                  pam_results$CLUSTER <- pam_fit$clustering

                  for (i in seq_len(opti_cluster)) {
                    # Storing in Datastore the clusterwise summary; the
                    # object name clus_summary_<i> is the datastore key.
                    labelling <- paste0("clus_summary_", i)
                    assign(labelling, summary(pam_results[pam_results$CLUSTER == i, , drop = FALSE]))
                    ore.save(list = c(labelling), name = ds.name, append = TRUE)
                  }

                  # 1.Cluster Number with the record
                  ore.ds$CLUS_ASSIGN <- paste0("Cluster", pam_fit[["clustering"]])

                  # 2.Cluster Info
                  clust_info <- data.frame(pam_fit$clusinfo)

                  #---------------------------------------#
                  # 4.Boxplot for each numerical variable
                  #---------------------------------------#
                  # Work with column names so a single numeric feature keeps
                  # its name; backticks protect non-syntactic names.
                  num_col_names <- names(feature_df)[vapply(feature_df, is.numeric, logical(1))]

                  for (cols in num_col_names) {
                    form <- paste0("`", cols, "` ~ CLUS_ASSIGN")
                    boxplot(formula = as.formula(form), data = ore.ds, main = toupper(paste("Employee -", cols)),
                            font.main = 3, cex.main = 1.2, xlab = "Cluster Number", ylab = cols, font.lab = 3,
                            col = c("red", "lightgreen", "green", "grey", "cyan"), range = 0)
                  }

                  #----------------------------------#
                  # 5.Medoid records of each cluster
                  #----------------------------------#
                  # id.med holds the integer positions of the medoids, which
                  # avoids any lookup of labels through row names.
                  clus_represent <- ore.ds[pam_fit$id.med, ]

                  #------------------------#
                  # 6.SAVING in Datastore
                  #------------------------#
                  ore.save(pam_fit, name = ds.name, append = TRUE)
                  ore.save(opti_cluster, name = ds.name, append = TRUE)
                  ore.save(gower_dist, name = ds.name, append = TRUE)
                  ore.save(clust_info, name = ds.name, append = TRUE)
                  ore.save(clus_represent, name = ds.name, append = TRUE)

                  # Return clustering assignment Dataframe
                  ore.ds[, c(key_col, "CLUS_ASSIGN")]
        }');
END;