-
Notifications
You must be signed in to change notification settings - Fork 3
/
mxParallel.R
234 lines (213 loc) · 11.6 KB
/
mxParallel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#MxParallel
# Prepares data for species distribution modeling in Maxent, and runs
# distribution models in parallel using Maxent.
# Arguments:
## Data input, cleaning and output (mandatory):
### occ.file(string): Full path to species occurrence file.
### Occurrence file must be comma separated and must contain fields
### id, species, lat, lon (lowercase, order not important)
### env.dir(string): Path to directory that contains environmental layers
### env.files(string): File names of environmental layers to be used in distribution models.
### Must contain extension, e.g. bio_1.tif.
### wd(string): Path to directory where output files will be saved
### dist(numeric): Distance (in meters) within which records are considered duplicates.
###                Only one record is kept within dist. Default is 1000.
##
## Background generation options:
### bkg.aoi(string): Keyword that defines where background will be sampled from.
### extent (default): background will be sampled from raster extent
### regions: background will be species specific, and it will correspond
### to the polygons of a shapefile that completely contain the
### species records.
### ch: background will be species specific and it will correspond to the
### convex hull of the species' records.
### bkg.type(string): Keyword that defines how the background will be sampled from bkg.aoi.
### random (default): background will be sampled randomly from bkg.aoi
### samples: get background samples from a file.
### Optional arguments:
### n.bkg(numeric): number of background samples.
### Used when bkg.type="random". Default is 10000.
### sample.bkg(string): Path to comma separated file containing background samples.
### Must include the fields lat and lon (lowercase, order doesn't matter).
### Used only when bkg.type="samples"
### regions(SpatialPolygons): SpatialPolygons object with the regions that will be used to
### define species background.
### Used only when bkg.aoi="regions"
### field(string): field (column name) that defines the regions.
### Used only when bkg.aoi="regions"
### buffer(numeric): Buffer in meters to be applied to convex polygons.
### Used only when bkg.aoi="ch".
## Evaluation and regularization options:
### optimize.lambda(logical):Optimize regularization value? Default FALSE.
### lambda(numeric): Regularization multiplier. Default is 1.
### do.eval(logical): Do model evaluation? Default TRUE.
### folds(numeric): Number of folds for k-fold partitioning used in evaluation
###                 and regularization optimization. Default is 5.
## Modeling options:
### mxnt.args(vector): character vector containing arguments to pass to dismo's
###                    maxent function.
### n.cpu: Number of cores to use for parallel processing
## Post-processing options:
### do.threshold(logical): Threshold distribution models?
### raw.threshold(vector): numeric or character vector. If numeric, this will
### specify the percentiles (from 0 to 100) at which
### models should be thresholded according to the
### "probability of occurrence" at training sites.
### If character, this should be any combination of the
### following keywords: min, 10p, ess, mss.
### do.cut(logical): Select distribution patches with evidence of occurrence
### from thresholded distribution models?
#
#
# Example:
#
# mxParallel(occ.file="D:/Projects/acuaticas/experiment/species_db.csv",
# env.dir="C:/ws2",
# env.files=c(paste0("bio_",1:19,".tif"),"slope_deg.tif","tri.tif","twi.tif"),
# wd="~/tmp3",
# dist=1000,
# bkg.aoi = "extent",
# bkg.type="random",
# n.bkg = 10000,
# sample.bkg = NULL,
# optimize.lambda=TRUE,
# folds=10,
# do.eval=TRUE,
# n.cpu=4,
# mxnt.args=c("autofeature=FALSE","linear=TRUE","quadratic=TRUE","product=FALSE","hinge=TRUE","threshold=FALSE",
# "extrapolate=FALSE","doclamp=TRUE","addsamplestobackground=TRUE"),
# do.threshold=TRUE,
# raw.threshold=c(0,10,20,30),
# do.cut=TRUE)
# Prepare occurrence/environmental data and fit Maxent species distribution
# models in parallel (one species per worker). Argument semantics are
# described in the file header. Side effects: writes a log file, fitted
# models (*_mx.RData), predictions (*_mx.tif), evaluation/occurrence tables
# (*.csv) and optional thresholded/cut rasters into wd.
mxParallel<-function(occ.file,env.dir,env.files,wd=getwd(),dist=1000,bkg.aoi = "extent",
                     bkg.type="random", n.bkg = 10000, sample.bkg = NULL,buffer=0,
                     optimize.lambda=FALSE, lambda = 1, folds=5, do.eval=TRUE, n.cpu,
                     mxnt.args, do.threshold=FALSE, raw.threshold, do.cut=FALSE){
  # Redirect console output to a log file; restore on exit.
  sink(paste0(wd,"/log.txt"))
  on.exit(sink())
  # Load project helper functions from the repository (requires network access).
  library(devtools)
  source_url("https://raw.githubusercontent.com/LBAB-Humboldt/parallelMaxent/master/preModelingFunctions.R")
  source_url("https://raw.githubusercontent.com/LBAB-Humboldt/parallelMaxent/master/evaluationFunctions.R")
  source_url("https://raw.githubusercontent.com/LBAB-Humboldt/parallelMaxent/master/postModelingFunctions.R")
  LoadLibraries()
  cat(paste(Sys.time(), "Functions and libraries loaded\n"))
  # Load occurrence records (expects columns id, species, lat, lon).
  occs <- LoadOccs(occ.file)
  current.spp <- length(unique(occs$species))
  current.recs <- nrow(occs)
  cat(paste(Sys.time(), "Loaded",current.recs,"records corresponding to",
            current.spp, "species from file", occ.file,"\n"))
  # stack() is exported by raster; use `::` rather than the internal `:::`.
  env.vars <- raster::stack(paste0(env.dir,"/",env.files))
  cat(paste(Sys.time(), "Loaded environmental layers", paste(as.character(env.files), collapse=","), "from directory", env.dir,"\n"))
  # Fall back to geographic WGS84 when the rasters carry no CRS.
  if(is.na(projection(env.vars)) || projection(env.vars)=="NA"){
    cat(paste(Sys.time(), "WARNING: Undefined environmental variables projection\n"))
    cat(paste(Sys.time(), "WARNING: Setting projection to geographic\n"))
    projection(env.vars)<-"+proj=longlat +ellps=WGS84 +datum=WGS84"
  }
  ## Remove records within radius defined by variable "dist" (per species).
  occs <- ddply(occs,.(species),IdNeighbors,dist=dist)
  current.spp <- length(unique(occs$species))
  current.recs <- nrow(occs)
  cat(paste(Sys.time(), "After removing points within",dist, "meters of each other, ",
            current.recs, "records corresponding to", current.spp, "species remain\n"))
  # Extract covariate data for presences (and background if bkg.aoi="extent").
  occs.covs <- extract(env.vars, cbind(occs$lon,occs$lat))
  # Drop records with no covariate data.
  # NOTE(review): `any` keeps records that are NA in *some* layers; if
  # complete covariate data is required, this should be `all` — confirm.
  nna.rows <- which(apply(!is.na(occs.covs), 1, any))
  occs.covs <- occs.covs[nna.rows, ]
  occs <- occs[nna.rows, ]
  # For the "extent" AOI the background is shared by all species, so sample
  # it once here; otherwise each worker builds species-specific background.
  if (bkg.aoi == "extent"){
    train.bkg <- GenerateBkg(n.bkg, env.vars, bkg.type, sample.bkg)
    test.bkg <- GenerateBkg(n.bkg, env.vars, bkg.type, sample.bkg)
    cat(paste(Sys.time(), "Background generated for raster extent using",bkg.type, "sampling \n"))
  }
  ## Keep only species with at least 10 unique records.
  sp.list <- FilterSpeciesByRecords(occs, 10)
  if(length(sp.list)==0){
    return()
  }
  current.spp <- length(sp.list)
  cat(paste(Sys.time(), "After removing species with less than 10 unique records",
            current.spp, "species remain \n"))
  cat(paste(Sys.time(), "Began parallel loop using", n.cpu, "cores \n"))
  sink()
  # Begin modeling loop: one snowfall task per species.
  sfInit(parallel=TRUE,cpus=n.cpu)  # Initialize nodes
  sfExportAll()                     # Export all local vars to the nodes
  sfClusterSetupRNG()
  sfClusterApplyLB(seq_along(sp.list),function(i){
    tmp.dir <- tempdir()
    # Each worker appends to the shared log file.
    sink(paste0(wd,"/log.txt"), append=TRUE)
    on.exit(sink())
    LoadLibraries()
    # Get species data.
    sp.name <- sp.list[i]
    sp.idx <- which(occs$species == sp.list[i])
    sp.occs <- occs[sp.idx, ]
    # Generate species-specific background when bkg.aoi != "extent"
    # (train.bkg/test.bkg only exist when sampled from the full extent above).
    # NOTE(review): `regions` and `field` are not formal arguments; they are
    # resolved from the calling environment via sfExportAll — verify they are
    # defined when bkg.aoi="regions".
    if(!exists("train.bkg") && !exists("test.bkg")){
      train.bkg <- GenerateSpBkg(sp.occs, n.bkg, env.vars, bkg.type, bkg.aoi,
                                 regions, field, sample.bkg, buffer)
      test.bkg <- GenerateSpBkg(sp.occs, n.bkg, env.vars, bkg.type, bkg.aoi,
                                regions, field, sample.bkg, buffer)
      cat(paste(Sys.time(), "Background generated for species", sp.name,
                "using area defined by", bkg.aoi, "and", bkg.type, "sampling \n"))
    }
    # Determine the regularization multiplier (betamultiplier).
    if (optimize.lambda){
      optim.lambda <- OptimizeLambda(folds, occs.covs[sp.idx, ], train.bkg, test.bkg,
                                     mxnt.args, wd, sp.list[i], path=tmp.dir)
      mxnt.args <- c(mxnt.args, paste0("betamultiplier=", optim.lambda["best.lambda"]))
      cat(paste(Sys.time(), "Performed regularization optimization for", sp.name, "\n"))
    } else {
      # BUG FIX: the previous exists("lambda") test was always TRUE (lambda is
      # a formal argument with a default), so betamultiplier was hard-coded to
      # 1 and any user-supplied lambda was silently ignored. Use lambda directly.
      mxnt.args <- c(mxnt.args, paste0("betamultiplier=", lambda))
      cat(paste(Sys.time(), "Used regularization value of", lambda, "for", sp.name, "\n"))
    }
    # Do model evaluation via k-fold partitioning.
    if(do.eval){
      sp.eval <- EvaluatePOModel(folds, occs.covs[sp.idx, ], train.bkg, test.bkg, mxnt.args, path=tmp.dir)
      write.csv(sp.eval, paste0(wd, "/", sp.list[i],"_evaluation_mx.csv"), row.names=FALSE)
      cat(paste(Sys.time(), "Performed model evaluation for", sp.name, "\n"))
    }
    # Fit the final model on presences + training background.
    mxnt.obj <- maxent(x=rbind(occs.covs[sp.idx, ], train.bkg),
                       p=c(rep(1,length(sp.idx)),rep(0,nrow(train.bkg))),
                       removeDuplicates=FALSE, args=mxnt.args, path=tmp.dir)
    save(mxnt.obj, file=paste0(wd, "/", sp.list[i], "_mx.RData"))
    cat(paste(Sys.time(), "Generated maxent distribution model for", sp.name, "\n"))
    map <- predict(mxnt.obj, env.vars ,args=c("outputformat=logistic"))
    writeRaster(map, paste0(wd, "/", sp.list[i], "_mx.tif"), format="GTiff",
                overwrite=TRUE, NAflag=-9999)
    cat(paste(Sys.time(), "Generated prediction of maxent distribution model for", sp.name, "\n"))
    write.csv(occs[sp.idx, ], paste0(wd, "/", sp.list[i], "_mx.csv"), row.names=FALSE)
    # Post-processing: threshold & cut.
    if(do.threshold){
      # lapply (not sapply): a list of RasterLayers is wanted, never a matrix.
      thres.maps <- lapply(raw.threshold, FUN=Threshold2, mxnt.obj=mxnt.obj,
                           map=map)
      for(j in seq_along(raw.threshold)){
        writeRaster(thres.maps[[j]],filename=paste0(wd, "/", sp.name,"_", raw.threshold[j], "_mx.tif"),
                    format="GTiff",overwrite=TRUE, NAflag=-9999)
      }
      cat(paste(Sys.time(), "Generated thresholded prediction of maxent distribution model
                using thresholds ", paste(raw.threshold,collapse=", "), "for", sp.name, "\n"))
      if(do.cut){
        cut.maps <- lapply(thres.maps, FUN=CutModel2, sp.points=cbind(sp.occs$lon,sp.occs$lat))
        for(j in seq_along(raw.threshold)){
          writeRaster(cut.maps[[j]],filename=paste0(wd, "/", sp.name,"_",raw.threshold[j], "_cut_mx.tif"),
                      format="GTiff",overwrite=TRUE, NAflag=-9999)
        }
        cat(paste(Sys.time(), "Cut thresholded prediction(s) of maxent distribution model for", sp.name, "\n"))
      }
    }
    # Remove raster temporary files older than 2 hours.
    removeTmpFiles(2)
  })
  sfStop()
}