# Nea_to_CEU.toml
# Working directory for simulation output, trained networks, plots, etc.
dir = "Nea_to_CEU"
# Reference population, i.e. the donor of introgressed haplotypes.
# Must be one of the populations defined below (see [pop]).
ref_pop = "Nea"
# Minor allele frequency (MAF) threshold.
# Both the training data and the empirical data are filtered to exclude
# sites with MAF < maf_threshold.
maf_threshold = 0.05
[sim]
# Length of the genomic region to be simulated, in base pairs.
sequence_length = 100_000 # 100 kb
# For selection simulations (sweep or AI), simulations are conditioned on
# the final allele frequency of the selected mutation in the target
# population being greater than this value.
min_allele_frequency = 0.01
[sim.tranche]
# Each key is a tranche label and each value is the list of simulation
# modelspecs that make up that tranche. The network is trained to classify
# data as coming from one of these tranches.
# Only two tranches are supported.
"not AI" = [
    "HomSap/HomininComposite_4G20/Neutral/slim",
    "HomSap/HomininComposite_4G20/Sweep/CEU",
    # Skipped for now: too computationally intensive to do many
    # replicates for training.
    #"HomSap/HomininComposite_4G20/DFE",
]
AI = ["HomSap/HomininComposite_4G20/AI/Nea_to_CEU"]
[sim.extra]
# Additional modelspecs against which the trained model will be evaluated.
# None are enabled here; the DFE example below is commented out.
#DFE = ["HomSap/HomininComposite_4G20/DFE"]
[vcf]
# The data to which the trained network will be applied: path to the
# (indexed) VCF or BCF file(s).
#file = "/path/to/vcf/all_chromosomes_in_one_file.vcf.gz"
# VCFs are often split by chromosome. In that case, list the chromosomes
# here; the ${chr} string in the filename is replaced by each chromosome
# in turn when applying the trained network. Chromosomes may be given as
# integers or strings. Only diploid autosomes are supported.
chr = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
    12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
]
file = "1000g.Nea.${chr}.bcf"
# Whether genotypes are treated as phased (true) or unphased (false).
# Note: mixed phasing is not supported, and the '|' or '/' phase indicators
# in the VCF genotype field are ignored.
phased = true
[pop]
# For each population, specify the IDs of its individuals in the VCF, either
# as a list of IDs or as the name of a file containing the IDs (one per line).
# The population names must match those used for the simulations.
# Order matters: the order in which populations are given here is the order
# used in the genotype matrices. It is recommended that the donor and
# recipient populations be adjacent in the genotype matrices.
Nea = ["AltaiNeandertal", "Vindija33.19"]
CEU = "CEU.indlist"
YRI = "YRI.indlist"
[train]
# Resize haplotypes in each genotype matrix using this many bins.
num_rows = 256
# Number of epochs to train for.
epochs = 3
# Number of genotype matrices processed before TensorFlow updates gradients.
batch_size = 64
# Which model to use. Only "cnn" is supported for now, which is a generic
# CNN. Its hyperparameters can be tuned under [train.cnn] below.
model = "cnn"
[train.af_filter]
# Applied after simulating but before training: simulations from the given
# modelspec are retained only if the beneficial-allele frequency in the
# specified population is greater than the AF threshold given here.
modelspec = "HomSap/HomininComposite_4G20/AI/Nea_to_CEU"
pop = "CEU"
AF = 0.05
[train.cnn]
# Hyperparameters of the network architecture.
n_conv = 7 # number of convolution layers
n_conv_filt = 16 # number of convolution filters in each layer
# Convolution filter size.
filt_size_x = 4 # spanning across 'sites'
filt_size_y = 4 # spanning across haplotypes/individuals
# Dense (fully connected) layers may follow the convolution layers.
# We generally found that adding dense layers gave no improvement in the
# accuracy of the network, and sometimes decreased it.
n_dense = 0 # number of dense layers following the convolution layers
dense_size = 0 # size of each dense layer
[calibrate.weights]
# When calibrating the CNN output probabilities, training simulations are
# used in the following ratios to fit the calibrator.
# No category is downsampled: all simulations are taken for the category
# with the lowest weight, and the categories with higher weights are
# upsampled (sampling with replacement).
"HomSap/HomininComposite_4G20/Neutral/slim" = 1
"HomSap/HomininComposite_4G20/Sweep/CEU" = 1
"HomSap/HomininComposite_4G20/AI/Nea_to_CEU" = 1
[apply]
# Application to empirical data.
# Number of empirical genotype matrices the CNN evaluates in one batch.
batch_size = 256
# The trained model is applied to windows of the VCF that have the same size
# as sequence_length in [sim]. Windows are chosen by moving along a
# chromosome in increments of the `step` size defined here. For
# non-overlapping windows, set this to the same value as sequence_length.
step = 20_000 # 20 kb
# Exclude sites where the proportion of missing genotypes exceeds this value.
max_missing_genotypes = 0.1
# Exclude windows with fewer segregating sites than this.
min_seg_sites = 20