-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
171 lines (135 loc) · 5.22 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
## Created May 2014
## The purpose of this R script is to create a tidy data set by merging files
## from two sources: a train set and test set. These files contain a large
## number of variables but only mean and std variables will be used to create
## the tidy data set.
### load libraries
library(data.table)
### read the feature-name table and the activity id/name lookup table
features <- read.table(
  "./UCI HAR Dataset/features.txt",
  header = FALSE,
  col.names = c("id", "var_name")
)
activity_desc <- read.table(
  "./UCI HAR Dataset/activity_labels.txt",
  header = FALSE,
  col.names = c("id", "activity_name")
)
### positions of the features whose name contains the literal text "mean()"
### or "std()"; fixed matching stops the parentheses being treated as regex
mean.v <- which(grepl("mean()", features$var_name, fixed = TRUE))
std.v <- which(grepl("std()", features$var_name, fixed = TRUE))
### load subject file for train set
subject_train <- read.table(
  "./UCI HAR Dataset/train/subject_train.txt",
  header = FALSE,
  col.names = c("subject_id")
)
### load activity ids for train set
y_train.df <- read.table(
  "./UCI HAR Dataset/train/y_train.txt",
  header = FALSE,
  col.names = c("activity_id")
)
### add activity description
### NOTE: the label is looked up with match() instead of merge(). merge()
### returns its result sorted by the key column, so the labels would no
### longer be in the same row order as subject_train and x_train.df, and the
### cbind() below would attach the wrong activity to each row.
y_train.df$activity_name <- activity_desc$activity_name[
  match(y_train.df$activity_id, activity_desc$id)
]
### fail early if an activity id has no entry in the lookup table
stopifnot(!anyNA(y_train.df$activity_name))
### load measurement data for train set, naming columns after the features
x_train.df <- read.table(
  "./UCI HAR Dataset/train/X_train.txt",
  header = FALSE,
  col.names = features$var_name
)
### keep only the mean() and std() columns (drop = FALSE keeps a data frame
### even if a single column is selected)
x_train.df <- x_train.df[, c(mean.v, std.v), drop = FALSE]
### prepare train set with subject and activity names as leading columns
x_train.df <- cbind(subject_train, y_train.df["activity_name"], x_train.df)
### load subject file for test set
subject_test <- read.table(
  "./UCI HAR Dataset/test/subject_test.txt",
  header = FALSE,
  col.names = c("subject_id")
)
### load activity ids for test set
y_test.df <- read.table(
  "./UCI HAR Dataset/test/y_test.txt",
  header = FALSE,
  col.names = c("activity_id")
)
### add activity description
### NOTE: the label is looked up with match() instead of merge(). merge()
### returns its result sorted by the key column, so the labels would no
### longer be in the same row order as subject_test and x_test.df, and the
### cbind() below would attach the wrong activity to each row.
y_test.df$activity_name <- activity_desc$activity_name[
  match(y_test.df$activity_id, activity_desc$id)
]
### fail early if an activity id has no entry in the lookup table
stopifnot(!anyNA(y_test.df$activity_name))
### load measurement data for test set, naming columns after the features
x_test.df <- read.table(
  "./UCI HAR Dataset/test/X_test.txt",
  header = FALSE,
  col.names = features$var_name
)
### keep only the mean() and std() columns (drop = FALSE keeps a data frame
### even if a single column is selected)
x_test.df <- x_test.df[, c(mean.v, std.v), drop = FALSE]
### prepare test set with subject and activity names as leading columns
x_test.df <- cbind(subject_test, y_test.df["activity_name"], x_test.df)
### report column count, then row count, for the train set
ncol(x_train.df)
nrow(x_train.df)
### same two figures for the test set
ncol(x_test.df)
nrow(x_test.df)
### stack the train rows on top of the test rows and convert to a data table
samsung <- data.table(
  rbind(x_train.df, x_test.df)
)
### row count of the combined table
nrow(samsung)
### create vector with the current column names of the combined table
col <- colnames(samsung)
### Clean the names with vectorised replacements. Each call works on the
### whole vector at once, so no index loop (and no repeated grep over every
### name for every index) is needed; the result per name is the same.
### collapse "...X", "...Y" and "...Z" into ".X", ".Y" and ".Z"
col <- gsub("...X", ".X", col, fixed = TRUE)
col <- gsub("...Y", ".Y", col, fixed = TRUE)
col <- gsub("...Z", ".Z", col, fixed = TRUE)
### replace a leading "t" with "time." and a leading "f" with "freq."
### ("^" anchors the match to the first character only)
col <- sub("^t", "time.", col)
col <- sub("^f", "freq.", col)
### fix "BodyBody" and remove any remaining ".."
col <- gsub("BodyBody", "Body", col, fixed = TRUE)
col <- gsub("..", "", col, fixed = TRUE)
### assign new column names to data table (by reference)
setnames(samsung, colnames(samsung), col)
### average every measurement column within each subject/activity pair,
### ignoring missing values
samsung_mean <- samsung[
  ,
  lapply(.SD, mean, na.rm = TRUE),
  by = .(subject_id, activity_name)
]
### sort the summary rows by subject, then by activity
samsung_mean <- samsung_mean[order(subject_id, activity_name)]
### write the tidy summary as a tab-separated file with a header row and
### no row names
write.table(
  x = samsung_mean,
  file = "./samsung_mean.txt",
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)