forked from gregoryross/watertoolbox-protoms
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalc_localgci2.py
758 lines (672 loc) · 33.3 KB
/
calc_localgci2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
# Written by Gregory A. Ross and Matteo Aldeghi
import numpy as np
import glob
import pickle
import sys, os, re
from collections import OrderedDict
import matplotlib
if 'DISPLAY' not in os.environ:
matplotlib.use('agg')
import matplotlib.pyplot as plt
protomshome = os.environ["PROTOMSHOME"]
sys.path.append(protomshome + "/tools")
import simulationobjects
import calc_gci
from simulationobjects import ResultsFile
import argparse
# ============================================================================================================
# Parse Options
# ============================================================================================================
def ParseOptions():
parser = argparse.ArgumentParser(description="Program to plot GCMC titration within discrete sites.")
mode = parser.add_mutually_exclusive_group(required=True)
parser.add_argument('-d', '--directories', nargs="+",
help="the directories containing the GCMC simulation data, default=None", default=None)
parser.add_argument('-f', '--pdbfiles', help="the input PDB-files, default=all.pdb", default="all.pdb")
mode.add_argument('-b', '--boxes', nargs="+",
help="the pdb files of the box(es) within which you wish to analysis the water content,default=None",
default=None)
mode.add_argument('-c', '--clusters',
help="a pdb file of clusters file that will be used to determine the center of the hydration spheres, default=None",
default=None)
parser.add_argument('-re', '--resultsfiles', nargs="+", help="the input results, default=results",
default="results")
parser.add_argument('-r', '--residue', help="the name of the residue to extract, default='wat'", default="wa1")
parser.add_argument('-a', '--atom', help="the name of the atom to extract, default='o00'", default="o00")
parser.add_argument('-s', '--skip', help="the number of initial snapshots that will be discarded when fitting",
type=int, default=0)
parser.add_argument('-o', '--out',
help="the name of the pickle data package that will save the processed PDB data and ANNs, default='data.pickle'",
default="data.pickle")
parser.add_argument('-i', '--input',
help="the name of the pickle data package that contains the pre-processed and saved ANNss, default= None",
default=None)
parser.add_argument('--hydr',
help="Hydration free energy for the water model used (kcal/mol). Default = -6.3 kcal/mol",
type=float, default=-6.3)
parser.add_argument('--calc', action='store',
help="Choose whether to calculate GCI for individual sites/boxes ('ind'), for multiple sites/boxes togethr ('mult'), or both ('all'). Default is 'ind'.",
default='ind', choices=['ind', 'mult', 'all'])
parser.add_argument('--radius', help="Radius of the hydration sites in Angstroms, default=1.4", type=float,
default=1.4)
parser.add_argument('--maxsites', type=int,
help="Maximum number of hydration sites to consider; the highest occupied sites will be considered first. Default is all.",
default=9999)
parser.add_argument('--bootstraps',
help="The number of bootstrap samples performed on the free energy estimates, default=None",
type=int, default=None)
parser.add_argument('--steps', nargs="+",
help="the number of steps to be fitted for each titration box. The input is a list of integers.",
default=None, type=int)
parser.add_argument('--fit_options',
help="additional options to be passed to the artificial neural network as quoted string",
action=ParseFitDict, type=str, default=ParseFitDict.default)
args = parser.parse_args()
# Hydration free energy needs to be negative
if args.hydr > 0.0:
raise InputError('--hydr', 'The input hydration free energy must be negative')
# Define the mode: boxes or clusters
if args.boxes is not None:
args.mode = "boxes"
elif args.clusters is not None:
args.mode = "clusters"
return args
# =============================================================================================================
# MAIN: Read in the GCMC results data from multiple GCMC output folders and calculating the mean number of
# "on" waters, after skipping a specified number of frames.
# ============================================================================================================
def main(args):
# -----
# BOXES
# -----
if args.mode == "boxes":
# Parse boxes
# -----------
boxes = [read_box(b) for b in args.boxes]
args.num_inputs = len(boxes)
# Define the steps
# ----------------
if args.steps is not None:
if args.num_inputs != len(args.steps):
raise InputError('--steps', 'Number of supplied steps does not equal number of supplied boxes.')
else:
print('WARNING: --steps not defined: a single step per box is assumed.')
args.steps = [1] * args.num_inputs
# Check if pickled data is provided
# ---------------------------------
if args.input is None:
# get Adams and N waters
# ----------------------
(B, N, volumes) = read_gcmc(args.num_inputs, args.directories, args.resultsfiles, args.pdbfiles,
args.residue, args.atom, args.skip, args.mode, structures=boxes)
# Save data to file
# -----------------
data = {"Adams": B, "N": N, "volumes": volumes}
pickle.dump(data, open(args.out, "wb"))
else:
data = pickle.load(open(args.input, "rb"))
B = data["Adams"]
N = data["N"]
volumes = data['volumes']
# Fit the ANN and carry out GCI
# -----------------------------
if args.calc in ('ind', 'all'):
ANNs = gci_individual_sites(num_inputs=args.num_inputs, steps=args.steps, fit_options=args.fit_options,
dg_hydr=args.hydr, mode=args.mode, B=B, N=N, volumes=volumes,
bootstraps=args.bootstraps, centers=None)
# Make Figure
# -----------
make_figure(B=B, N=N, ANNs=ANNs, num_inputs=args.num_inputs,
bootstraps=args.bootstraps, outfn='local_gci.pdf')
if args.calc in ('mult', 'all'):
print('WARNING: Option to analyse multiple boxes not available; only possible via --clusters')
# --------
# CLUSTERS
# --------
elif args.mode == "clusters":
# Parse cluster file
# ------------------
sites_centers = read_cluster(args.clusters, args.maxsites)
args.num_inputs = len(sites_centers)
# Define the steps
# ----------------
if args.steps is not None:
if args.num_inputs != len(args.steps):
raise InputError('--steps', 'Number of supplied steps does not equal number of supplied boxes.')
else:
print('WARNING: --steps not defined: 1 step for each site is assumed.')
args.steps = [1] * args.num_inputs
# Check if B,N data is provided
# -----------------------------
if args.input is None:
# get Adams and N waters
# ----------------------
(B, N, volumes) = read_gcmc(args.num_inputs, args.directories, args.resultsfiles, args.pdbfiles,
args.residue, args.atom, args.skip, args.mode,
structures=sites_centers, radius=args.radius)
# Save data to file
# -----------------
data = {"Adams": B, "N": N, "volumes": volumes}
pickle.dump(data, open(args.out, "wb"))
else:
data = pickle.load(open(args.input, "rb"))
B = data["Adams"]
N = data["N"]
volumes = data['volumes']
# Fit the ANN and carry out GCI
# -----------------------------
if args.calc in ('ind', 'all'):
ANNs = gci_individual_sites(num_inputs=args.num_inputs, steps=args.steps, fit_options=args.fit_options,
dg_hydr=args.hydr, B=B, N=N, mode=args.mode, volumes=volumes,
centers=sites_centers, bootstraps=args.bootstraps)
# Make Figure
# -----------
make_figure(B=B, N=N, ANNs=ANNs, num_inputs=args.num_inputs,
bootstraps=args.bootstraps, outfn='local_gci.pdf')
if args.calc in ('mult', 'all'):
gci_multiple_sites(num_inputs=args.num_inputs, steps=args.steps, fit_options=args.fit_options,
dg_hydr=args.hydr, B=B, N=N, centers=sites_centers,
radius=args.radius, bootstraps=args.bootstraps)
# Make Figure
# -----------
# TODO: return ANN from gci_cluster so to use it in make_figure
# make_figure(B=B, N=N, ANNs=ANNs, num_inputs=args.num_inputs,
# bootstraps=args.bootstraps, outfn='network_local_gci.pdf')
else:
raise InputError('--boxes, --clusters', 'Either --boxes or --clusters must be defined')
print "\nType enter to quit\n>"
raw_input()
# ============================================================================================================
# Set of functions to analyse free energy for boxes or spheres
# ============================================================================================================
def read_box(gcmcbox):
"""
Reads the parameters of a ProtoMS formatted box
Parameters
----------
gcmcbox: str
the filename of the ProtoMS box, a PDB file
Returns
-------
box: dict
dictionary containing the dimensions, origin, and length of the box
"""
gcmcbox = simulationobjects.PDBFile(filename=gcmcbox)
# Try to find box information in the header
box = simulationobjects.find_box(gcmcbox)
if box is None:
# if that fails, take the extent of the PDB structure
box = gcmcbox.getBox()
if "origin" not in box:
box["origin"] = box["center"] - box["len"] / 2.0
return box
def number_in_box(pdbfiles, molname, atomname, box, skip):
"""
Function to count the number of molecules in a PDB file that are within a specified volume.
"""
box_max = box["origin"] + box["len"]
box_min = box["origin"]
residue = molname.lower()
atom = atomname.lower()
found = 0.0
# for pdb in pdbfiles.pdbs :
for k in range(skip, len(pdbfiles.pdbs)):
pdb = pdbfiles.pdbs[k]
for i, res in pdb.residues.iteritems():
if res.name.lower() == molname.lower():
for atom in res.atoms:
if atom.name.strip().lower() == atomname.lower():
xyz = atom.coords
if np.all(xyz < (box_max)) and np.all(xyz > (box_min)):
found += 1.0
return found / len(range(skip, len(pdbfiles.pdbs)))
def number_in_sphere(pdbfiles, molname, atomname, skip, sites, radius):
"""
Function to count the number of molecules in a PDB file that are
found within a distance (radius) from specified sites.
Parameters
----------
pdbfiles : PDBSet object
Objects containng the PDB trajectories.
molname : string
Name of the molecules containing the atoms of which the occurrence
will be counted.
atomname : string
Name of the atoms of which the occurrence will be counted.
skip : int
Number of equilibration frames to skip when analysis the trajectory.
sites : ndarray
Array of x,y,z coordinates for the centers if the hydration sites.
radius : float
Radii of the spheres that define the hydration sites.
Returns
-------
count : float
Average number of atoms found in the selected volume made up by
all the spherical hydration sites.
"""
found = 0.0
for k in range(skip, len(pdbfiles.pdbs)):
pdb = pdbfiles.pdbs[k]
for i, res in pdb.residues.iteritems():
if res.name.lower() == molname.lower():
for atom in res.atoms:
if atom.name.strip().lower() == atomname.lower():
xyz = atom.coords
counted = False # to avoid double counting of waters if distance between sites is < radius*2
for site in sites:
if np.linalg.norm(site - xyz) <= radius and counted == False:
found = found + 1.0
counted = True
count = found / len(range(skip, len(pdbfiles.pdbs)))
return count
def read_cluster(clusfile, max_sites=9999):
"""
Function to read the locations of the water oxygen atoms in a PDB file
containing clustered hydration sites. The sites are grouped by their
residue numbers.
Parameters
----------
clusfile : string
Name of PDB file containing the clustered hydration sites.
Returns
-------
site_centers : dict
Dictionary containing arrays with the x,y,z coordinates of
oxygen atoms for all groups of hydration sites.
"""
lines = [l for l in open(clusfile, 'r').readlines() if len(l.split()) == 9 and l.split()[2] == 'O00']
sites_centers = OrderedDict()
for i, l in enumerate(lines):
if i == max_sites:
break
tag = str(l.split()[4]) # define groups of sites based on their residue number
x = float(l.split()[5])
y = float(l.split()[6])
z = float(l.split()[7])
if tag not in sites_centers:
sites_centers[tag] = []
sites_centers[tag].append(np.array([x, y, z]))
return sites_centers
def volume(centers, r=1.4):
"""
Function to calculate the volume of multiple and possibly intersecting
spheres given their centers. It works only for intersecting pairs of
spheres with same radii.
http://mathworld.wolfram.com/Sphere-SphereIntersection.html
Parameters
----------
centers : list of lists
Array containing a list of x,y,z coordinates.
r : radii of the spheres in angstrom.
Returns
-------
V : float
Volume of the (intersecting) spheres in cubic angstroms.
"""
# Volume of the spheres
spheres = (4. / 3. * np.pi * (r ** 3.)) * len(centers)
# Volume of (pairwise) intersections
intersections = 0.0
for idx, i in enumerate(centers):
for jdx, j in enumerate(centers):
if jdx > idx:
d = sum((centers[i][0] - centers[j][0]) ** 2) ** 0.5
# No overlap:
if d >= (r * 2.):
intersections += 0
# Complete overlap - though this should not happen in our case
elif d == 0.0:
intersections += 4. / 3. * np.pi * (r ** 3.)
# Partial overlap
else:
intersections += (np.pi * (2. * r - d) ** 2 * (d ** 2 + 4 * d * r)) / (12. * d)
V = spheres - intersections
return V
# ============================================================================================================
# Read pdb files and extract N and B
# ============================================================================================================
def read_gcmc(num_inputs, directories, rfilename, afilename, residue, atom, skip, mode, structures, radius=1.4):
"""
Reads PDB files and extract N and Adams.
"""
if mode not in ["boxes", "clusters"]:
print "Error in read_gcmc. Mode must be either 'boxes' or 'clusters'"
return
B = []
N = {}
volumes = []
ANNs = {}
for b in range(num_inputs):
N[b] = []
ANNs[b] = []
print "\nREADING GCMC DATA:"
for dirs in directories:
folders = glob.glob(dirs)
if len(folders) == 0:
print "\nError. No folder(s) matching '%s'. Exiting program.\n" % directories
sys.exit()
for folder in folders:
results = ResultsFile()
resultsfiles = glob.glob(folder + "/" + rfilename + "*")
pdbfilenames = glob.glob(folder + "/" + afilename + "*")
if len(resultsfiles) > 1: # It is assumed the results are from ProtoMS 2.
results.read([folder, rfilename])
pdbfiles = simulationobjects.PDBSet()
for filename in pdbfilenames:
pdb = simulationobjects.PDBFile(filename=filename)
pdbfiles.pdbs.append(pdb)
elif len(resultsfiles) == 1: # It is assumed the results are from ProtoMS 3.
results.read(folder + "/" + rfilename)
pdbfiles = simulationobjects.PDBSet()
pdbfiles.read(pdbfilenames[0])
else:
print "\nError. No results file matching %s. Exiting program\n" % folder + "/" + rfilename
sys.exit()
adams = np.array([snap.bvalue for snap in results.snapshots])
B.append(adams[0])
for b in range(num_inputs):
if mode == 'boxes':
N[b].append(number_in_box(pdbfiles, residue, atom, structures[b], skip))
v = structures[b]['len'][0] * structures[b]['len'][1] * structures[b]['len'][2]
volumes.append(v)
elif mode == 'clusters':
N[b].append(
number_in_sphere(pdbfiles, residue, atom, skip, structures[structures.keys()[b]], radius))
volumes.append(4.0 / 3.0 * np.pi * radius ** 3)
print "...file %s has been read and processed." % pdbfilenames[0]
B = np.array(B)
return B, N, volumes
# ============================================================================================================
# ANN fit and GCI functions
# ============================================================================================================
def gci_individual_sites(num_inputs, steps, fit_options, dg_hydr, B, N, mode, volumes, centers=None, bootstraps=None):
"""
Calculates the binding free energy for all sites/boxes.
centers is needed if mode='clusters'
Parameters
----------
num_inputs : int
Description ...
steps : array of int
Description ...
fit_options : dict
Description ...
dg_hydr : float
Description ...
B : array
Description ...
N : array
Description ...
mode : str
Description ...
volumes : array
Description ...
centers : array, optional
Description ...
bootstraps : int
Description ...
Returns
-------
ANNs : array
Description ...
"""
print "\nINDIVIDUAL FREE ENERGIES:"
print "(kcal/mol)"
if bootstraps is not None:
print "Values and error bars calculated using %i bootstrap samples of the titration data." % bootstraps
print "'Site' 'Nwat' 'Ideal gas transfer energy' 'Binding energy' 'Standard deviation'"
else:
print "'Site' 'Nwat' 'Ideal gas transfer energy' 'Binding energy'"
ANNs = {}
for b in range(num_inputs):
x = np.array(B)
y = np.array(N[b])
N_range = np.arange(np.round(y.min()), np.round(y.max()) + 1)
if mode == 'clusters':
Nwat = len(centers[centers.keys()[b]]) # Nwat = N of water sites
elif mode == 'boxes':
Nwat = 1 # Nwat not properly defined
if bootstraps == None:
ANNs[b], models = calc_gci.fit_ensemble(x=x, y=y, size=steps[b], verbose=False,
pin_min=fit_options["pin_min"],
pin_max=fit_options["pin_max"],
cost=fit_options["cost"],
c=fit_options["c"],
randstarts=fit_options["randstarts"],
repeats=fit_options["repeats"],
iterations=fit_options["iterations"])
dG_single = calc_gci.insertion_pmf(N_range, ANNs[b], volumes[b])
print " %4.2f %4.2f %16.2f %18.2f" % (b + 1, Nwat, dG_single[1], dG_single[1] - dg_hydr * Nwat)
else:
indices = range(x.size)
gci_dGs = np.zeros(bootstraps)
ANNs[b] = []
for boot in range(bootstraps):
sample_inds = np.random.choice(indices, size=x.size)
x_sample = x[sample_inds]
y_sample = y[sample_inds]
ANNs_boot, models = calc_gci.fit_ensemble(x=x_sample, y=y_sample, size=steps[b], verbose=False,
pin_min=fit_options["pin_min"],
pin_max=fit_options["pin_max"],
cost=fit_options["cost"],
c=fit_options["c"],
randstarts=fit_options["randstarts"],
repeats=fit_options["repeats"],
iterations=fit_options["iterations"])
ANNs[b].append(ANNs_boot)
gci_dGs[boot] = calc_gci.insertion_pmf(N_range, ANNs_boot, volume=volumes[b])[-1]
print " %4.2f %4.2f %16.2f %18.2f %18.2f" % (
b + 1, Nwat, gci_dGs.mean(), gci_dGs.mean() - dg_hydr * Nwat, gci_dGs.std())
return ANNs
def gci_multiple_sites(num_inputs, steps, fit_options, dg_hydr, B, N, centers, radius=1.4, bootstraps=None):
"""
The binding free energy of all the sites is calculated.
The binding free energy calculated is relative to the lowest number of inserted molecules
For the standard state volume correction, the entire volume encompassed by the clusters is considered
Parameters
----------
num_inputs : int
Description ...
steps : array of int
Description ...
fit_options : dict
Description ...
dg_hydr : float
Description ...
B : array
Description ...
N : array
Description ...
centers : array, optional
Description ...
radius : float
Description ...
bootstraps : int
Description ...
Returns
-------
"""
print '\nWHOLE CLUSTER FREE ENERGY:'
print "(kcal/mol)"
print "Relative free energies (in kcal/mol) for all the clusters in the combined volume:"
print 'The free energy is difference between the minimum number (N min) to the maximum number (N max)'
print 'A correction removes the contribution from overlapping volumes\n'
# Counting the total volume, total number of inserted molecules, and maximum number of steps for the ANN
total_volume = volume(centers=centers, r=radius)
total_mols = []
total_steps = 0
# Adding up the counts in all the clusters
for b in range(num_inputs):
total_mols.append(np.array(N[b]))
total_steps += steps[b]
# Defining the vectors and variables of the titration data over all the clusters
x = np.array(B)
y = np.sum(np.array(total_mols), axis=0)
# Taking the minimum number of molecules and maximum to calculate the relative free energy between them
n_max = np.round(y.max())
n_min = np.round(y.min())
N_range = np.array((n_min, n_max))
# Now calculate the free energy of using the volumes of all the clusters
if bootstraps is None:
print "'N min' 'N max' 'Ideal gas transfer energy' 'Binding energy"
ANN_total, models = calc_gci.fit_ensemble(x=x, y=y, size=total_steps, verbose=False,
pin_min=fit_options["pin_min"],
pin_max=fit_options["pin_max"],
cost=fit_options["cost"],
c=fit_options["c"],
randstarts=fit_options["randstarts"],
repeats=fit_options["repeats"],
iterations=fit_options["iterations"])
dG_single = calc_gci.insertion_pmf(N_range, ANN_total, total_volume)
print " %5.2f %5.2f %16.2f %18.2f" % (
n_min, n_max, dG_single[1], dG_single[1] - dg_hydr * (n_max - n_min))
else:
print "'N min' 'N max' 'Ideal gas transfer energy' 'Binding energy 'Standard deviation'"
indices = range(x.size)
gci_dGs = np.zeros(bootstraps)
for boot in range(bootstraps):
sample_inds = np.random.choice(indices, size=x.size)
x_sample = x[sample_inds]
y_sample = y[sample_inds]
ANNs_boot, models = calc_gci.fit_ensemble(x=x_sample, y=y_sample, size=total_steps, verbose=False,
pin_min=fit_options["pin_min"],
pin_max=fit_options["pin_max"],
cost=fit_options["cost"],
c=fit_options["c"],
randstarts=fit_options["randstarts"],
repeats=fit_options["repeats"],
iterations=fit_options["iterations"])
gci_dGs[boot] = calc_gci.insertion_pmf(N_range, ANNs_boot, volume=total_volume)[-1]
print " %5.2f %5.2f %16.2f %18.2f %18.2f" % (
n_min, n_max, gci_dGs.mean(), gci_dGs.mean() - dg_hydr * (n_max - n_min), gci_dGs.std())
# TODO: return ANN
# ============================================================================================================
# Figures/Plotting
# ============================================================================================================
def make_figure(B, N, ANNs, num_inputs, bootstraps, outfn='local_gci.pdf'):
# TODO: make it usable also for the results of gci_cluster()
fig, axes = plt.subplots(len(ANNs), sharex=True, figsize=(8.0, 5.0*num_inputs))
for b in range(num_inputs):
axes[b].scatter(B, N[b], color="black")
if len(ANNs) != 0:
if bootstraps is None:
ANNs[b].x = np.linspace(start=B.min(), stop=B.max(), num=100)
ANNs[b].forward()
axes[b].plot(ANNs[b].x, ANNs[b].predicted, color="red", linewidth=3)
else:
plot_FitPercentiles2(ANNs[b][0].x, ANNs[b][0].y, ANNs[b], ax=axes[b])
axes[b].set_title("Site/Box %i" % (b + 1))
axes[b].set_xlabel("Adams value")
axes[b].set_ylabel("Occupancy within volume")
plt.tight_layout()
fig.savefig(outfn)
def plot_FitPercentiles2(B, N, gcmc_models, resolution=None, level1=None, level1_colour=None, level2=None,
level2_colour=None, median_colour=None, smoothness=None, ax=None):
"""
Tool to plot a collection of fitted ANNs at two confidence limits.
Parameters
----------
B : numpy array
vector of the explanatory variable, the B/Adams value
N : numpy array
vector of the response variable, the average number of waters for a given B
gcmc_models : list of Slp objects
the fitted ANNs that will be plotted
resolution : float
the number of data points that will be plotted, i.e. resolution of plotted lines
level1 : float
the confidence level (eg top 50% as a fraction) that will be plotted
level2 : float
the confidence level that will be plotted
level1_color : string
the color of the level1 confidence region
level2_color : string
the color of the level2 confidence region
median_colour : string
colour that the median of all the fits will be plotted in
smoothness : float
the standard deviation over x that will be used to smooth over y
ax : matplotlib Axes, optional
Axes object to draw the plot onto, otherwise uses the current Axes.
Return
------
matplotlib.pyplot object
plot object
"""
# Setting the defaults:
if resolution == None: resolution = 50 # Number of points with which the lines will be drawn.
if level1 == None: level1 = 0.50 # Default is also shade the region that contains 50% of the data
if level2 == None: level2 = 0.90 # Default is NOT to shade a region at a lower percentile level.
if level1_colour == None: level1_colour = "orange" # Default is also shade the region that contains 50% of the data
if level2_colour == None: level2_colour = "gray" # Default is to also shade the region that contains 95% of the data
if median_colour == None: median_colour = "red" # Colour of the median
if smoothness == None: smoothness = 0.05 # Degree to which the lines will be smoothed (for aesthetic purposes only).
if ax is None:
ax = plt.gca()
# Generating the predictions of the different gcmc_models:
yvals = np.zeros((resolution, len(gcmc_models))) # Will hold all the predictions from the different models
x = np.linspace(start=B.min(), stop=B.max(), num=resolution) # The x values that will be plotted.
for i in range(len(gcmc_models)):
gcmc_models[i].x = x
gcmc_models[i].forward()
yvals[:, i] = gcmc_models[i].predicted
# Generating and smoothing the confidence intervals.
y_median, y_low, y_high = calc_gci.percentile_intervals(yvals, level1)
y_median, y_lowest, y_highest = calc_gci.percentile_intervals(yvals, level2)
smooth_median = calc_gci.gaussian_smooth(x, y_median,
sigma=smoothness * 3) # The median is made smoother than the error bars.
smooth_low = calc_gci.gaussian_smooth(x, y_low, sigma=smoothness)
smooth_high = calc_gci.gaussian_smooth(x, y_high, sigma=smoothness)
smooth_lowest = calc_gci.gaussian_smooth(x, y_lowest, sigma=smoothness)
smooth_highest = calc_gci.gaussian_smooth(x, y_highest, sigma=smoothness)
# Plotting:
space = 0.07 # The amount of white space to the side of the x and y axis as a fraction of the ranges.
ax.fill_between(x, smooth_lowest, smooth_highest, facecolor=level2_colour, linewidth=0, alpha=0.3,
interpolate=True)
ax.fill_between(x, smooth_low, smooth_high, facecolor="white", linewidth=0, interpolate=True)
ax.fill_between(x, smooth_low, smooth_high, facecolor=level1_colour, linewidth=0, alpha=0.4, interpolate=True)
ax.set_xlim(B.min() - space * np.ptp(B), B.max() + space * np.ptp(B))
ax.set_ylim(N.min() - space * np.ptp(N), N.max() + space * np.ptp(N))
ax.scatter(B, N, color="black", s=40)
ax.plot(x, smooth_median, color=median_colour, linewidth=3)
return ax
# ============================================================================================================
# Classes
# ============================================================================================================
class Error(Exception):
"""Base class for exceptions."""
pass
class InputError(Error):
"""Exception raised for errors in the input
Attributes:
expression -- input expression in which the error occurred
message -- explanation of the error
"""
def __init__(self, expression, message):
self.expression = expression
self.message = message
class ParseFitDict(argparse.Action):
"""Specifies the default options for artificial neural network, and reads in values from the command line"""
default = {"monotonic": True, "repeats": 10, "randstarts": 1000, "iterations": 100, "grad_tol_low": -3,
"grad_tol_high": 1, "pin_min": 0.0, "pin_max": None, "cost": "msd", "c": 2.0, "verbose": False}
def __init__(self, option_strings, *args, **kwargs):
self.fit_dict = ParseFitDict.default
super(ParseFitDict, self).__init__(option_strings=option_strings, *args, **kwargs)
def __call__(self, parser, namespace, values, option_string=None):
if values is not None:
options = values.split()
for i in range(len(options)):
if options[i] in self.fit_dict:
if type(self.fit_dict[options[i]]) == str:
self.fit_dict[options[i]] = str(options[i + 1])
if type(self.fit_dict[options[i]]) == float or self.fit_dict[options[i]] is None:
self.fit_dict[options[i]] = float(options[i + 1])
if type(self.fit_dict[options[i]]) == int:
self.fit_dict[options[i]] = int(options[i + 1])
if type(self.fit_dict[options[i]]) == str:
self.fit_dict[options[i]] = str(options[i + 1])
setattr(namespace, self.dest, self.fit_dict)
if __name__ == "__main__":
args = ParseOptions()
main(args)