# examples/Nea_to_CEU.toml

# Working directory for simulation output, trained networks, plots, etc.
dir = "Nea_to_CEU"
# Reference population, i.e. the donor of introgressed haplotypes.
# Must be one of the populations defined below.
ref_pop = "Nea"

# Minor allele frequency (MAF) threshold.
# Both the training data and the empirical data are filtered to exclude
# sites with MAF < maf_threshold.
maf_threshold = 0.05

[sim]
# Length of the genomic region to be simulated, in base pairs.
sequence_length = 100_000  # 100 kb
# For selection simulations (sweep or AI), simulations are conditioned on
# the final allele frequency of the selected mutation in the target
# population being greater than this value.
min_allele_frequency = 0.01

[sim.tranche]
# Each key is a tranche label and each value is the list of simulation
# modelspecs that make up that tranche. The network is trained to classify
# data as coming from one of these tranches.
# Only two tranches are supported.
"not AI" = [
    "HomSap/HomininComposite_4G20/Neutral/slim",
    "HomSap/HomininComposite_4G20/Sweep/CEU",

    # Skipped for now: too computationally intensive to run many
    # replicates for training. :-(
    #"HomSap/HomininComposite_4G20/DFE",
]

AI = ["HomSap/HomininComposite_4G20/AI/Nea_to_CEU"]

[sim.extra]
# Additional modelspecs against which the trained model will be evaluated.
# None are enabled here; the entry below is a commented-out example.
#DFE = ["HomSap/HomininComposite_4G20/DFE"]

[vcf]
# Path to the (indexed) VCF or BCF file(s) holding the data to which the
# trained network will be applied.

# Example for a single file containing all chromosomes:
#file = "/path/to/vcf/all_chromosomes_in_one_file.vcf.gz"

# VCFs are often split by chromosome. In that case, list the chromosomes
# here and use the ${chr} placeholder in the filename; the placeholder is
# replaced by the chromosome identifier when applying the trained network.
# Chromosomes may be specified as integers or strings.
# Only diploid autosomes are supported.
chr = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
    12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
]
file = "1000g.Nea.${chr}.bcf"
# Whether genotypes should be treated as phased or unphased.
# Note: mixed phasing is not supported, and the '|' or '/' phase indicators
# in the VCF genotype field are ignored.
phased = true

[pop]
# For each population, specify the IDs of its individuals in the VCF, either
# as a list of IDs or as the name of a file containing the IDs (one per line).
# The population names must match those used for the simulations.
# The order of the populations given here is used for their ordering in the
# genotype matrices. It is recommended that the donor and recipient
# populations be adjacent in the genotype matrices!
Nea = ["AltaiNeandertal", "Vindija33.19"]
CEU = "CEU.indlist"
YRI = "YRI.indlist"

[train]
# Resize haplotypes in each genotype matrix using this many bins.
num_rows = 256
# Train for this many epochs.
epochs = 3
# The number of genotype matrices that will be processed before TensorFlow
# updates gradients.
batch_size = 64
# Which model to use. Only "cnn" is supported for now, which is a generic CNN.
# Its hyperparameters can be tuned below under [train.cnn].
model = "cnn"

[train.af_filter]
# Before training (but after simulating), simulations from the given
# modelspec are filtered to retain only those in which the beneficial-allele
# frequency in the specified population is greater than the AF threshold
# given here.
modelspec = "HomSap/HomininComposite_4G20/AI/Nea_to_CEU"
pop = "CEU"
AF = 0.05

[train.cnn]
# Hyperparameters of the network architecture.
n_conv = 7  # number of convolution layers
n_conv_filt = 16  # number of convolution filters in each layer
# Convolution filter size.
filt_size_x = 4  # spanning across 'sites'
filt_size_y = 4  # spanning across haplotypes/individuals
# Dense (fully connected) layers may follow the convolution layers.
# We generally found that adding dense layers gave no improvement in,
# and sometimes decreased, the accuracy of the network.
n_dense = 0  # number of dense layers following the convolution layers
dense_size = 0  # size of each dense layer

[calibrate.weights]
# When calibrating the CNN output probabilities, training simulations are
# used in the following ratios to fit the calibrator.
# Rather than downsampling the categories with lower weights, we take all
# simulations for the category with the lowest weight, and upsample
# those with higher weights (sampling with replacement).
"HomSap/HomininComposite_4G20/Neutral/slim" = 1
"HomSap/HomininComposite_4G20/Sweep/CEU" = 1
"HomSap/HomininComposite_4G20/AI/Nea_to_CEU" = 1

[apply]
# Application to empirical data.
# The CNN will evaluate this many empirical genotype matrices in one batch.
batch_size = 256
# The trained model is applied to windows of the VCF that have the same size
# as the sequence_length set in [sim]. Windows are chosen by moving along a
# chromosome by the `step` size defined here. For non-overlapping windows,
# set this to the same value as the sequence_length.
step = 20_000  # 20 kb
# Exclude sites with more genotypes missing than this proportion.
max_missing_genotypes = 0.1
# Exclude windows with fewer segregating sites than this.
min_seg_sites = 20