-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathconstruct_locality_freqs.R
152 lines (131 loc) · 6.55 KB
/
construct_locality_freqs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# args: (1) data_file (output from previous step, final_species_data.csv). (2) ecoregions (a file of ecoregion names and ids, attributes_and_data/eco_id_with_names.csv) (3) output1 (for biomes) (4) output2 (for ecoregions)
# GLOBAL ARGUMENTS
args <- commandArgs(TRUE)
for(i in 1:length(args)){
eval(parse(text = args[[i]]))
}
get_biomes = function (data){
all_biomes = data$biome
all_biomes = gsub("[[:space:]]", "", all_biomes)
splitter = unlist(strsplit(all_biomes,","))
odd_ind = seq(1,length(unlist(strsplit(splitter,"_"))),2)
even_ind = seq(2,length(unlist(strsplit(splitter,"_"))),2)
biomes = unlist(strsplit(splitter,"_"))[odd_ind]
occurrences = unlist(strsplit(splitter,"_"))[even_ind]
return (list(biomes,occurrences))
}
get_ecoregions = function (data){
all_ecoregions = data$eco_id
all_ecoregions = gsub("[[:space:]]", "", all_ecoregions)
splitter = unlist(strsplit(all_ecoregions,","))
odd_ind = seq(1,length(unlist(strsplit(splitter,"_"))),2)
even_ind = seq(2,length(unlist(strsplit(splitter,"_"))),2)
ecoregions = unlist(strsplit(splitter,"_"))[odd_ind]
occurrences = unlist(strsplit(splitter,"_"))[even_ind]
return(list(ecoregions,occurrences))
}
data = read.csv(data_file,stringsAsFactors=F, na.strings = c(NA,""))
ecoregions_data = read.csv(ecoregions,stringsAsFactors=F)
factors = 23
eco_data = data.frame(matrix(nrow = nrow(ecoregions_data), ncol = factors))
names(eco_data) = c("eco_id","name","occurrences","sp","dp","pp","na","annual","herb_p","woody","unclass_herb","unclass_per","no_lifeform",
"mixed","unresolved","conflict","basal_m","higher_m","angiosperm","basal_d","rosids","asterids","no_tax")
eco_data$eco_id = ecoregions_data$eco_id
eco_data$name = ecoregions_data$eco_name
biome_data = data.frame(matrix(nrow = 14, ncol = factors))
names(biome_data) = c("biome","name","occurrences","sp","dp","pp","na","annual","herb_p","woody","unclass_herb","unclass_per","no_lifeform",
"mixed","unresolved","conflict","basal_m","higher_m","angiosperm","basal_d","rosids","asterids","no_tax")
biome_data$biome = seq(1,14,1)
# initialize data frames as zeros
eco_data[,3:ncol(eco_data)] = 0
biome_data[,3:ncol(biome_data)] = 0
# update count in table
update_count = function(row,original_data_row,curr_occ){
row$occurrences = row$occurrences + curr_occ
row$sp = row$sp + 1
if (is.na(original_data_row$ploidy)){ row$na = row$na + 1 }
else if (original_data_row$ploidy==0){ row$dp = row$dp + 1 }
else if (original_data_row$ploidy==1){ row$pp = row$pp + 1 }
if (is.na(original_data_row$lifeform)) { row$no_lifeform = row$no_lifeform + 1 }
else if (original_data_row$lifeform == "Annual") { row$annual = row$annual + 1 }
else if (original_data_row$lifeform == "Perennial herb") { row$herb_p = row$herb_p + 1 }
else if (original_data_row$lifeform == "Woody") { row$woody = row$woody + 1 }
else if (original_data_row$lifeform == "Unclassified herb") { row$unclass_herb = row$unclass_herb + 1 }
else if (original_data_row$lifeform == "Unclassified perennial") { row$unclass_per = row$unclass_per + 1 }
else if (original_data_row$lifeform == "mixed") { row$mixed = row$mixed + 1 }
else if (original_data_row$lifeform == "unresolved") { row$unresolved = row$unresolved + 1 }
else if (original_data_row$lifeform =="Conflict") { row$conflict = row$conflict + 1 }
if (is.na(original_data_row$Wood_major_group)) { row$no_tax = row$no_tax + 1 }
else if (original_data_row$Wood_major_group =="Basal monocots (non-commelinid monocots)") { row$basal_m = row$basal_m + 1 }
else if (original_data_row$Wood_major_group =="Higher monocots (commelinids)") { row$higher_m = row$higher_m + 1 }
else if (original_data_row$Wood_major_group =="Basal angiosperms") { row$angiosperm = row$angiosperm + 1 }
else if (original_data_row$Wood_major_group =="Basal dicots (non-asterid +non-rosid dicots)") { row$basal_d = row$basal_d + 1 }
else if (original_data_row$Wood_major_group =="Dicots - core rosids") { row$rosids = row$rosids + 1 }
else if (original_data_row$Wood_major_group =="Dicots - core asterids") { row$asterids = row$asterids + 1 }
return (row)
}
for (i in 1:nrow(data)){
print(i)
res = get_biomes(data[i,]) # get species' biomes
biomes = res[[1]]; occurrences = res[[2]]
remove_ind = NULL
if ("98" %in% biomes){ # remove "98" from biomes
remove_ind = which(biomes=="98")
biomes = biomes[-remove_ind]
occurrences = occurrences[-remove_ind]
}
if ("99" %in% biomes){ # remove "99" from biomes
remove_ind = which(biomes=="99")
biomes = biomes[-remove_ind]
occurrences = occurrences[-remove_ind]
}
# do not consider species with less than 5 occurrences
remove_ind = NULL
if (length(which(as.numeric(occurrences)<5))>0){
remove_ind = which(as.numeric(occurrences)<5)
occurrences = occurrences[-remove_ind]
biomes = biomes[-remove_ind]
}
data$biomes[i] = length(biomes)
for (b in 1:length(biomes)){ # iterate over all biomes
row_ind = which(biome_data$biome==biomes[b])
updated_row = update_count(biome_data[row_ind,],data[i,],as.numeric(occurrences[b]))
biome_data[row_ind,] = updated_row
}
remove_ind = NULL
if (is.na(data$eco_id[i])){ # possible that there are biomes without ecoregions
next
}
res = get_ecoregions(data[i,]) # get species' ecoregions
ecoregions = res[[1]]; occurrences = res[[2]]
if ("-9998" %in% ecoregions){ # remove "-9998" from biomes
remove_ind = which(ecoregions=="-9998")
ecoregions = ecoregions[-remove_ind]
occurrences = occurrences[-remove_ind]
}
if ("-9999" %in% ecoregions){ # remove "-9999" from biomes
remove_ind = which(ecoregions=="-9999")
ecoregions = ecoregions[-remove_ind]
occurrences = occurrences[-remove_ind]
}
# do not consider species with less than 5 occurrences
remove_ind = NULL
if (length(which(as.numeric(occurrences)<5))>0){
remove_ind = which(as.numeric(occurrences)<5)
occurrences = occurrences[-remove_ind]
ecoregions = ecoregions[-remove_ind]
}
data$ecoregions[i] = length(ecoregions)
for (e in 1:length(ecoregions)){
row_ind = which(eco_data$eco_id==ecoregions[e])
updated_row = update_count(eco_data[row_ind,],data[i,],as.numeric(occurrences[e]))
eco_data[row_ind,] = updated_row
}
}
# add biome number to each ecoregion
biome_data$pp_perc = biome_data$pp/biome_data$sp*100 # pp_perc in each ecoregion
biome_data$pp_perc_resolved = biome_data$pp/(biome_data$pp + biome_data$dp)*100
eco_data$pp_perc_sp = eco_data$pp/eco_data$sp*100 # pp_perc in each ecoregion
eco_data$pp_perc_resolved = eco_data$pp/(eco_data$pp + eco_data$dp)*100
write.csv(file = output1,biome_data,row.names=F)
write.csv(file = output2,eco_data,row.names=F)