diff --git a/notebook/simGL.ipynb b/notebook/simGL.ipynb index f7ebc20..35b4457 100644 --- a/notebook/simGL.ipynb +++ b/notebook/simGL.ipynb @@ -18,19 +18,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "a3c58dad-95fa-4fe1-8971-521842ea4182", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The rpy2.ipython extension is already loaded. To reload it, use:\n", - " %reload_ext rpy2.ipython\n" - ] - } - ], + "outputs": [], "source": [ "import time\n", "import numpy as np\n", @@ -48,10 +39,28 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "966418dd-9400-405c-8983-a4714ad51704", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──\n", + "\n", + "R[write to console]: ✔ tibble 3.1.7 ✔ dplyr 1.0.9\n", + "✔ tidyr 1.2.0 ✔ stringr 1.4.0\n", + "✔ readr 2.1.2 ✔ forcats 0.5.1\n", + "✔ purrr 0.3.4 \n", + "\n", + "R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n", + "✖ dplyr::filter() masks stats::filter()\n", + "✖ dplyr::lag() masks stats::lag()\n", + "\n" + ] + } + ], "source": [ "%%R\n", "\n", @@ -6819,6 +6828,110 @@ "id": "dca5ddb8-285c-4677-887e-a0fb3e8f17d6", "metadata": {}, "outputs": [], + "source": [ + "-10 * log(0.000001) = 60" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7846e2d4-dc88-46fd-9f15-dff97b13f9ba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "60.0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "-10*np.log10(0.000001)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be8f646b-ea5d-4dc5-b84e-7b46390979ad", + "metadata": {}, + "outputs": [], + "source": [ + "-10*np.log10(0.000001)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1d8b345c-771b-4bd8-b3ce-e6f4bc9e180f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1e-06" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.power(10, -(60/10))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "94eb0a1c-fa6c-4970-bf09-59a8cd017d8c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "60.0" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "-10*np.log10(0.000001)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e8b5e154-b1b3-4a7c-af06-e74b1f1648b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0024787521766663585" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.exp(-60/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a17f675b-2961-4365-aa28-7a77f955e6ee", + "metadata": {}, + "outputs": [], "source": [] } ], diff --git a/simGL/simGL.py b/simGL/simGL.py index 7c5108c..056d112 100644 --- a/simGL/simGL.py +++ b/simGL/simGL.py @@ -4,10 +4,10 @@ from scipy.stats import binom def e2q(e): - return -10*np.log(e) + return -10*np.log10(e) def q2e(q): - return np.exp(-q/10) + return np.power(10, -(q/10)) def incorporate_monomorphic(gm, pos, start, end): ''' @@ -46,7 +46,7 @@ def refalt(ref, alt, n_sit): if ref is None and alt is None: ref = np.full(n_sit, "A") alt = np.full(n_sit, "C") - return ref, alt + return ref, alt def depth_per_haplotype(rng, mean_depth, std_depth, n_hap): if isinstance(mean_depth, np.ndarray): @@ -66,7 +66,7 @@ def refalt_int_encoding(gm, ref, alt): refalt_int[refalt_str == "T"] = 3 return refalt_int[gm.reshape(-1), np.repeat(np.arange(gm.shape[0]), gm.shape[1])].reshape(gm.shape) -def linked_depth(rng, DPh, read_length, sites_n): +def linked_depth(rng, DPh, read_length, n_sit): ''' Simulates reads in a contiguous genomic region to compute the depth per position. @@ -78,7 +78,7 @@ def linked_depth(rng, DPh, read_length, sites_n): Numpy array with the depth per haplotype read_length : `int` Read length in base pair units - sites_n : `int` + n_sit : `int` number of sites that depth has to be simulated for Returns @@ -87,10 +87,10 @@ def linked_depth(rng, DPh, read_length, sites_n): Depth per site per haplotype ''' DP = [] - read_n = ((DPh*sites_n)/read_length).astype("int") + read_n = ((DPh*n_sit)/read_length).astype("int") for r in read_n: - dp = np.zeros((sites_n,), dtype=int) - for p in rng.integers(low=0, high=sites_n-read_length+1, size=r): + dp = np.zeros((n_sit,), dtype=int) + for p in rng.integers(low=0, high=n_sit-read_length+1, size=r): dp[p:p+read_length] += 1 DP.append(dp.tolist()) return np.array(DP).T @@ -150,7 +150,7 @@ def sim_allelereadcounts(gm, mean_depth, e, ploidy, seed = None, std_depth = Non (haplotypic samples, )) and the order must be the same as the second dimention of `gm`. ploidy : `int` - Number of haplotypic chromosomes per individual. + Number of haplotypic chromosomes per individual. It is recomended to read Notes about ploidy. ref : `numpy.ndarray`, optional Reference alleles list per site. The size of the array must be (sites, ) and the order has to @@ -181,6 +181,14 @@ def sim_allelereadcounts(gm, mean_depth, e, ploidy, seed = None, std_depth = Non must be 15. - If monomorphic sites are included, the `alt` values corresponding to those sites are not taken into account, but they must be still indicated. + - Regarding ploidy, if the error parameter is specified as a constant for all individuals, the user can specify + the desired ploidy of the organisms simulated. + If different error rate per haplotype is inputed and the user wants to compute Genotype Likelihoods (GL) for + organisms with ploidy > 1, ploidy should be equal to 1 for this function, and when the later function + `allelereadcounts_to_GL()` is used, then, the desired ploidy can be specified. This is because the error values + must be inputed again to compute GL and if ploidy > 1 is specified for this function, the dimentions of `arc` + will be smaller than the dimentions of `e`. Nonetheless, if the user desires to obtain the output `arc` in + a certain ploidy, one can use `ploidy_sum(arc, ploidy)` fucntion. ''' #Checks assert check_gm(gm)