-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathallel_class.py
94 lines (87 loc) · 2.87 KB
/
allel_class.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon May 8 14:24:53 2017
Class for scikit-allel
Example:
Chr1_0 = Chr(callset, "Chr1_0")
Chr1_0.loadpos()
Chr1_0.positions # positions
Chr1_0.vtbl # variant table
Chr1_0.genotypes # genotypes
Chr1_0.allelecounts # allelecounts
Chr1_0.popsubset(["PNG", "Haiti", "Kenya", "Mali"])
Chr1_0.subsample["PNG"] # meta just in this pop
Chr1_0.subsample_genotype["PNG"] # genotypes just in this pop
Chr1_0.ldfilter(Chr1_0.genotypes, size=100, step=50, thresh=.1, n_iter=1)
Chr1_0.ldthin
Chr1_0.corr
Chr1_0.gtNOsingletons
@author: [email protected]
"""
import numpy as np
import allel
import h5py
class Chr(object):
def __init__(self, pop, calls):
"""
name: chromosome to query
calls: load data
callset_fn = FOO.h5
callset = h5py.File(callset_fn, mode='r')
Chr1 = Chr("Wb_Chr1_0", callset)
"""
self.calls = h5py.File(calls, mode='r') # requires premade h5
self.pop = pop
self.chrm = self.calls['variants/CHROM']
def geno(self, name, meta):
"""
pos: positions in the chromosome
gt: genotype array
sm: attached meta data
"""
# get chrom
chrom = self.chrm
chrom_mask = np.where(chrom[:] == name)
p = self.calls['variants/POS']
pos = p[:][chrom_mask]
self.pos = allel.SortedIndex(pos)
# get genotype data
geno = allel.GenotypeArray(self.calls['calldata/GT'])
gt = geno[:][chrom_mask]
# get population data
if self.pop == 'All':
self.gt = gt
self.sm = meta
else:
samples = meta.ix[meta.Population.isin(self.pop)].index.tolist()
self.gt = gt[:, samples]
self.sm = meta.ix[samples]
def miss(self, genotypes, positions, prctmiss):
"""
"""
misscount = genotypes.count_missing(axis=1)
# .20 will remove sites with > 20% missing data
missarray = misscount <= (genotypes.n_samples * prctmiss)
# remove missing
self.gt = genotypes.compress(missarray, axis=0)
# adjust positions
self.pos = positions[missarray]
def seg(self, genotypes, positions):
"""
"""
var_seg = genotypes.count_alleles().is_segregating()
self.pos = positions[var_seg]
self.gt = genotypes.compress(var_seg)
def mac(self, genotypes, positions, mac):
"""
mac: minor allele count
"""
# identify singletons
allelecounts = genotypes.count_alleles()
fltsingle = (allelecounts.max_allele() == 1) &\
(allelecounts[:, :2].min(axis=1) > mac) # biallelic only
# remove singletons
self.gt = genotypes.compress(fltsingle, axis=0)
# adjust positions
self.pos = positions[fltsingle]