-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleaning_data.R
81 lines (56 loc) · 3.19 KB
/
cleaning_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# -----------------
# Read the data into R
# Change the names of the variables to follow our naming standards.
# Split the GeoLocation column into latitude and longitude; the new latitude and longitude columns should be numeric.
# Replace any missing values in latitude and longitude with zeros.
# Remove meteorites less than 1000g in weight from the data.
# Order the data by the year of discovery.
# -----------------
# -----------------
# Include assertive programming to make sure that:
# The data has the variable names we expect ("id", "name", "mass (g)", "fall", "year", "GeoLocation").
# Latitude and longitude are valid values. (Latitude between -90 and 90, longitude between -180 and 180).
# -----------------
library(tidyverse)
library(janitor)
library(assertr)
# Load the raw meteorite landings data from the project's data folder.
meteorite_landings <- read_csv("data/meteorite_landings.csv")

# Assertive check: the raw column names must be exactly the expected ones,
# in this order. The check is its own statement, so the data is only
# inspected here and not modified.
verify(
  meteorite_landings,
  identical(
    names(meteorite_landings),
    c("id", "name", "mass (g)", "fall", "year", "GeoLocation")
  )
)

# Bring the column names in line with our naming standards using
# janitor::clean_names(); later steps refer to the cleaned names
# (mass_g, geo_location).
meteorite_landings <- clean_names(meteorite_landings)
# Turn the single geo_location column into numeric latitude and longitude.

# Step 1: drop the first and last character of geo_location (the
# surrounding parentheses) so only the comma-separated pair is left.
meteorite_landings_no_parentheses <- mutate(
  meteorite_landings,
  geo_location = str_sub(geo_location, start = 2, end = -2)
)

# Step 2: break the text apart at the comma, then convert both new
# columns from character to numeric.
meteorite_landings_sep <- meteorite_landings_no_parentheses %>%
  separate(
    col = geo_location,
    into = c("latitude", "longitude"),
    sep = ","
  ) %>%
  mutate(across(c(latitude, longitude), as.numeric))
# Assertive check: latitude and longitude must be valid values
# (latitude between -90 and 90, longitude between -180 and 180).
#
# within_bounds() includes both end points by default, so legitimate
# extremes such as a latitude of exactly 90 pass the check. It also lets
# NA through by default, so coordinates that are simply missing are not
# reported as out-of-range values.
#
# NOTE(review): an earlier run found an observation with longitude > 180.
# If that row is still present at this point, assert() stops the script
# with an error listing the offending row -- confirm whether the row should
# be corrected/dropped before this check or the check moved after filtering.
meteorite_landings_sep %>%
  assert(within_bounds(-90, 90), latitude) %>%
  assert(within_bounds(-180, 180), longitude)
# Replace any missing values in latitude and longitude with zeros.

# Count the missing values in every column before the replacement.
# The columns are selected explicitly with everything(): calling across()
# without a column selection is deprecated in recent dplyr versions.
meteorite_landings_sep %>%
  summarise(across(everything(), ~ sum(is.na(.x))))

# coalesce() keeps existing values and fills only the NAs with 0.
meteorite_landings_sep_no_NA <- meteorite_landings_sep %>%
  mutate(
    latitude = coalesce(latitude, 0),
    longitude = coalesce(longitude, 0)
  )

# Count again: latitude and longitude should now show zero missing values.
meteorite_landings_sep_no_NA %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
# Remove meteorites less than 1000g in weight from the data, then order the
# data by the year of discovery.
# The comparison is >= so that a meteorite weighing exactly 1000g is kept:
# only those strictly lighter than 1000g are to be removed.
# Note that filter() also drops rows where mass_g is NA, because the
# condition is not TRUE for them.
meteorite_landings_clean_data <- meteorite_landings_sep_no_NA %>%
  filter(mass_g >= 1000) %>%
  arrange(year)

# Write the cleaned data into the data folder. The destination is given once,
# as the file argument; the `path` argument of write_csv() is deprecated.
write_csv(
  meteorite_landings_clean_data,
  "data/meteorite_landings_clean_data.csv"
)