Skip to content

Commit

Permalink
correct quality <-> error functions and adding some description in fu…
Browse files Browse the repository at this point in the history
…nctions
  • Loading branch information
MoiColl committed Jul 19, 2022
1 parent 650e4cb commit d869194
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 22 deletions.
139 changes: 126 additions & 13 deletions notebook/simGL.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,10 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"id": "a3c58dad-95fa-4fe1-8971-521842ea4182",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The rpy2.ipython extension is already loaded. To reload it, use:\n",
" %reload_ext rpy2.ipython\n"
]
}
],
"outputs": [],
"source": [
"import time\n",
"import numpy as np\n",
Expand All @@ -48,10 +39,28 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"id": "966418dd-9400-405c-8983-a4714ad51704",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──\n",
"\n",
"R[write to console]: ✔ tibble 3.1.7 ✔ dplyr 1.0.9\n",
"✔ tidyr 1.2.0 ✔ stringr 1.4.0\n",
"✔ readr 2.1.2 ✔ forcats 0.5.1\n",
"✔ purrr 0.3.4 \n",
"\n",
"R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n",
"✖ dplyr::filter() masks stats::filter()\n",
"✖ dplyr::lag() masks stats::lag()\n",
"\n"
]
}
],
"source": [
"%%R\n",
"\n",
Expand Down Expand Up @@ -6819,6 +6828,110 @@
"id": "dca5ddb8-285c-4677-887e-a0fb3e8f17d6",
"metadata": {},
"outputs": [],
"source": [
"-10 * log(0.000001) = 60"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7846e2d4-dc88-46fd-9f15-dff97b13f9ba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"60.0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"-10*np.log10(0.000001)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be8f646b-ea5d-4dc5-b84e-7b46390979ad",
"metadata": {},
"outputs": [],
"source": [
"-10*np.log10(0.000001)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1d8b345c-771b-4bd8-b3ce-e6f4bc9e180f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1e-06"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.power(10, -(60/10))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "94eb0a1c-fa6c-4970-bf09-59a8cd017d8c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"60.0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"-10*np.log10(0.000001)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e8b5e154-b1b3-4a7c-af06-e74b1f1648b8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.0024787521766663585"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.exp(-60/10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a17f675b-2961-4365-aa28-7a77f955e6ee",
"metadata": {},
"outputs": [],
"source": []
}
],
Expand Down
26 changes: 17 additions & 9 deletions simGL/simGL.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from scipy.stats import binom

def e2q(e):
return -10*np.log(e)
return -10*np.log10(e)

def q2e(q):
return np.exp(-q/10)
return np.power(10, -(q/10))

def incorporate_monomorphic(gm, pos, start, end):
'''
Expand Down Expand Up @@ -46,7 +46,7 @@ def refalt(ref, alt, n_sit):
if ref is None and alt is None:
ref = np.full(n_sit, "A")
alt = np.full(n_sit, "C")
return ref, alt
return ref, alt

def depth_per_haplotype(rng, mean_depth, std_depth, n_hap):
if isinstance(mean_depth, np.ndarray):
Expand All @@ -66,7 +66,7 @@ def refalt_int_encoding(gm, ref, alt):
refalt_int[refalt_str == "T"] = 3
return refalt_int[gm.reshape(-1), np.repeat(np.arange(gm.shape[0]), gm.shape[1])].reshape(gm.shape)

def linked_depth(rng, DPh, read_length, sites_n):
def linked_depth(rng, DPh, read_length, n_sit):
'''
Simulates reads in a contiguous genomic region to compute the depth per position.
Expand All @@ -78,7 +78,7 @@ def linked_depth(rng, DPh, read_length, sites_n):
Numpy array with the depth per haplotype
read_length : `int`
Read length in base pair units
sites_n : `int`
n_sit : `int`
number of sites that depth has to be simulated for
Returns
Expand All @@ -87,10 +87,10 @@ def linked_depth(rng, DPh, read_length, sites_n):
Depth per site per haplotype
'''
DP = []
read_n = ((DPh*sites_n)/read_length).astype("int")
read_n = ((DPh*n_sit)/read_length).astype("int")
for r in read_n:
dp = np.zeros((sites_n,), dtype=int)
for p in rng.integers(low=0, high=sites_n-read_length+1, size=r):
dp = np.zeros((n_sit,), dtype=int)
for p in rng.integers(low=0, high=n_sit-read_length+1, size=r):
dp[p:p+read_length] += 1
DP.append(dp.tolist())
return np.array(DP).T
Expand Down Expand Up @@ -150,7 +150,7 @@ def sim_allelereadcounts(gm, mean_depth, e, ploidy, seed = None, std_depth = Non
(haplotypic samples, )) and the order must be the same as the second dimension of `gm`.
ploidy : `int`
Number of haplotypic chromosomes per individual.
Number of haplotypic chromosomes per individual. It is recommended to read the Notes about ploidy.
ref : `numpy.ndarray`, optional
Reference alleles list per site. The size of the array must be (sites, ) and the order has to
Expand Down Expand Up @@ -181,6 +181,14 @@ def sim_allelereadcounts(gm, mean_depth, e, ploidy, seed = None, std_depth = Non
must be 15.
- If monomorphic sites are included, the `alt` values corresponding to those sites are not taken into account,
but they must be still indicated.
- Regarding ploidy, if the error parameter is specified as a constant for all individuals, the user can specify
the desired ploidy of the simulated organisms.
If a different error rate per haplotype is input and the user wants to compute Genotype Likelihoods (GL) for
organisms with ploidy > 1, ploidy should be set to 1 for this function; when the later function
`allelereadcounts_to_GL()` is used, the desired ploidy can then be specified. This is because the error values
must be input again to compute GL, and if ploidy > 1 is specified for this function, the dimensions of `arc`
will be smaller than the dimensions of `e`. Nonetheless, if the user wishes to obtain the output `arc` at
a certain ploidy, the `ploidy_sum(arc, ploidy)` function can be used.
'''
#Checks
assert check_gm(gm)
Expand Down

0 comments on commit d869194

Please sign in to comment.