-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCLUSTPROFILE
173 lines (134 loc) · 7.46 KB
/
CLUSTPROFILE
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
/*********************************************************
Usecase 3 - R Script for Employee Profiling - TESTED OK
Author: Snehotosh Banerjee
Date: 20/01/2017
ORE Script Name:
CLUSTPROFILE
Packages:
cluster # for gower similarity and pam
Rtsne # for t-SNE plot for Visualization
dplyr # Data wrangling
ggplot2 # Visualization
Algorithm:
PAM (Partition Around Medoids)
Datastore:
ds_cluster_store
Output Tables:
AA_UC3_PNG_TBL
AA_UC3_PRED_OUTPUT_TBL
*********************************************************/
BEGIN
  -- Drop the previously registered version, then register the current code
  -- of the embedded R script under the same name.
  sys.rqScriptDrop('CLUSTPROFILE');
  sys.rqScriptCreate('CLUSTPROFILE',
 'function(dat,isDendo = ''N'',isStand = ''T'',ds.name){
  ##------------------------------------------------------------------------
  ## Profiling by clustering: PAM (Partitioning Around Medoids) on Gower
  ## distances, visualised with t-SNE.
  ##
  ## Arguments:
  ##   dat     - input data, pulled into memory with ore.pull.
  ##             THE FIRST COLUMN SHOULD BE THE PRIMARY KEY.
  ##   isDendo - "Y" additionally plots an agnes hierarchical clustering.
  ##   isStand - standardisation flag handed to daisy as a logical.
  ##             TRUE, "T", "TRUE", "Y" or "YES" (any case) switch it on,
  ##             every other value switches it off.
  ##   ds.name - name of the ORE datastore that receives the results.
  ##
  ## Side effects: deletes and refills the datastore ds.name, draws the
  ##   silhouette plot, the t-SNE scatter plot and one boxplot per numeric
  ##   column.
  ## Returns: data.frame holding the key column and CLUS_ASSIGN.
  ##------------------------------------------------------------------------
  library(cluster)  # for gower similarity and pam
  library(Rtsne)    # for t-SNE plot
  library(ggplot2)  # Visualization
  # Reading the Table into a local data.frame
  ore.ds <- ore.pull(dat)
  # Converting Character to Factor
  # Note: ore.pull cannot preserve factors
  char_cols <- names(ore.ds)[vapply(ore.ds, is.character, logical(1))]
  if (length(char_cols) > 0) {
    # List-style indexing keeps a data.frame even when only one column
    # matches, so lapply always iterates over columns.
    ore.ds[char_cols] <- lapply(ore.ds[char_cols], as.factor)
  }
  # Key column: EMPID when present (historical behaviour), otherwise the
  # first column as required by the contract above.
  key_col <- if ("EMPID" %in% names(ore.ds)) "EMPID" else names(ore.ds)[1]
  #------------------------------------------------------------------------
  # Deleting the Data store
  #------------------------------------------------------------------------
  if (nrow(ore.datastore(name = ds.name)) > 0) {
    ore.delete(name = ds.name)
  }
  #------------------------------------------------------------------------
  # PAM Clustering
  #------------------------------------------------------------------------
  # Removing records with NA
  ore.ds <- ore.ds[complete.cases(ore.ds), ]
  n_obs <- nrow(ore.ds)
  if (n_obs < 3) {
    stop("At least 3 complete records are required for clustering", call. = FALSE)
  }
  # Turn the (possibly character) flag into a real logical for daisy
  stand_flag <- toupper(as.character(isStand)[1]) %in% c("T", "TRUE", "Y", "YES")
  # Distance method; drop = FALSE keeps a data.frame with a single feature
  gower_dist <- daisy(ore.ds[, -1, drop = FALSE], metric = "gower",
                      type = list(), stand = stand_flag)
  if (isDendo == "Y") {
    agnes.clust <- agnes(gower_dist)
    plot(agnes.clust)
  }
  # Calculate silhouette width for many k using PAM.
  # pam needs k < number of observations, so the upper bound of 10 is
  # lowered for small inputs. Index 1 stays NA (no silhouette for k = 1).
  max_k <- min(10, n_obs - 1)
  sil_width <- rep(NA_real_, max_k)
  for (k in 2:max_k) {
    sil_width[k] <- pam(gower_dist, diss = TRUE, k = k)$silinfo$avg.width
  }
  # Plot silhouette width (higher is better)
  plot(seq_len(max_k), sil_width, xlab = "Number of clusters", ylab = "Silhouette Width")
  lines(seq_len(max_k), sil_width)
  # Finding optimum no. of Clusters.
  # which.max skips NA and returns exactly one index, also on ties.
  opti_cluster <- which.max(sil_width)
  # Fitting the Cluster Model
  pam_fit <- pam(gower_dist, diss = TRUE, k = opti_cluster)
  # Visualization
  # The Rtsne default perplexity is 30; it is reduced only when the data
  # set is too small for that value (Rtsne requires 3 * perplexity <= n - 1).
  tsne_perplexity <- min(30, max(1, floor((n_obs - 1) / 3)))
  tsne_obj <- Rtsne(gower_dist, is_distance = TRUE, perplexity = tsne_perplexity)
  tsne_data <- setNames(data.frame(tsne_obj$Y), c("X", "Y"))
  tsne_data$cluster <- factor(pam_fit$clustering)
  # t-SNE Based
  print(ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color = cluster)))
  #------------------------------#
  # Cluster Interpretation
  # Via Descriptive Statistics
  #------------------------------#
  pam_results <- ore.ds[, names(ore.ds) != key_col, drop = FALSE]
  pam_results$CLUSTER <- pam_fit$clustering
  for (i in seq_len(opti_cluster)) {
    # Storing in Datastore the clusterwise summary as clus_summary_<i>
    labelling <- paste0("clus_summary_", i)
    assign(labelling, summary(subset(pam_results, CLUSTER == i)))
    ore.save(list = labelling, name = ds.name, append = TRUE)
  }
  # 1.Cluster Number with the record
  ore.ds$CLUS_ASSIGN <- paste0("Cluster", pam_fit[["clustering"]])
  # 2.Cluster Info
  clust_info <- data.frame(pam_fit$clusinfo)
  #---------------------------------------#
  # 4.Boxplot for each numerical variable
  #---------------------------------------#
  # Numeric columns are selected by name (first column and CLUS_ASSIGN
  # excluded) so a single numeric column keeps its real name.
  feature_cols <- setdiff(names(ore.ds), c(names(ore.ds)[1], "CLUS_ASSIGN"))
  num_col_names <- feature_cols[vapply(ore.ds[feature_cols], is.numeric, logical(1))]
  # Plotting Boxplot for each column against Cluster Number
  for (cols in num_col_names) {
    form <- paste(cols, "~CLUS_ASSIGN")
    boxplot(formula = as.formula(form), data = ore.ds, main = toupper(paste("Employee -", cols)),
            font.main = 3, cex.main = 1.2, xlab = "Cluster Number", ylab = cols, font.lab = 3,
            col = c("red", "lightgreen", "green", "grey", "cyan"), range = 0)
  }
  #----------------------------------#
  # 5.Medoid records of each cluster
  #----------------------------------#
  # id.med holds the positional indices of the medoid observations
  clus_represent <- ore.ds[pam_fit$id.med, ]
  #------------------------#
  # 6.SAVING in Datastore
  #------------------------#
  ore.save(pam_fit, name = ds.name, append = TRUE)
  ore.save(opti_cluster, name = ds.name, append = TRUE)
  ore.save(gower_dist, name = ds.name, append = TRUE)
  ore.save(clust_info, name = ds.name, append = TRUE)
  ore.save(clus_represent, name = ds.name, append = TRUE)
  # Return clustering assignment Dataframe
  ore.ds[, c(key_col, "CLUS_ASSIGN")]
}');
END;