Skip to content

Commit

Permalink
correct quality <-> error functions and adding some description in fu…
Browse files Browse the repository at this point in the history
…nctions
  • Loading branch information
MoiColl committed Jul 19, 2022
1 parent 650e4cb commit d869194
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 22 deletions.
139 changes: 126 additions & 13 deletions notebook/simGL.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,10 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"id": "a3c58dad-95fa-4fe1-8971-521842ea4182",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The rpy2.ipython extension is already loaded. To reload it, use:\n",
" %reload_ext rpy2.ipython\n"
]
}
],
"outputs": [],
"source": [
"import time\n",
"import numpy as np\n",
Expand All @@ -48,10 +39,28 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"id": "966418dd-9400-405c-8983-a4714ad51704",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"R[write to console]: ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──\n",
"\n",
"R[write to console]: ✔ tibble 3.1.7 ✔ dplyr 1.0.9\n",
"✔ tidyr 1.2.0 ✔ stringr 1.4.0\n",
"✔ readr 2.1.2 ✔ forcats 0.5.1\n",
"✔ purrr 0.3.4 \n",
"\n",
"R[write to console]: ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n",
"✖ dplyr::filter() masks stats::filter()\n",
"✖ dplyr::lag() masks stats::lag()\n",
"\n"
]
}
],
"source": [
"%%R\n",
"\n",
Expand Down Expand Up @@ -6819,6 +6828,110 @@
"id": "dca5ddb8-285c-4677-887e-a0fb3e8f17d6",
"metadata": {},
"outputs": [],
"source": [
"-10 * log(0.000001) = 60"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7846e2d4-dc88-46fd-9f15-dff97b13f9ba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"60.0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"-10*np.log10(0.000001)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be8f646b-ea5d-4dc5-b84e-7b46390979ad",
"metadata": {},
"outputs": [],
"source": [
"-10*np.log10(0.000001)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1d8b345c-771b-4bd8-b3ce-e6f4bc9e180f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1e-06"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.power(10, -(60/10))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "94eb0a1c-fa6c-4970-bf09-59a8cd017d8c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"60.0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"-10*np.log10(0.000001)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e8b5e154-b1b3-4a7c-af06-e74b1f1648b8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.0024787521766663585"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.exp(-60/10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a17f675b-2961-4365-aa28-7a77f955e6ee",
"metadata": {},
"outputs": [],
"source": []
}
],
Expand Down
26 changes: 17 additions & 9 deletions simGL/simGL.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from scipy.stats import binom

def e2q(e):
return -10*np.log(e)
return -10*np.log10(e)

def q2e(q):
return np.exp(-q/10)
return np.power(10, -(q/10))

def incorporate_monomorphic(gm, pos, start, end):
'''
Expand Down Expand Up @@ -46,7 +46,7 @@ def refalt(ref, alt, n_sit):
if ref is None and alt is None:
ref = np.full(n_sit, "A")
alt = np.full(n_sit, "C")
return ref, alt
return ref, alt

def depth_per_haplotype(rng, mean_depth, std_depth, n_hap):
if isinstance(mean_depth, np.ndarray):
Expand All @@ -66,7 +66,7 @@ def refalt_int_encoding(gm, ref, alt):
refalt_int[refalt_str == "T"] = 3
return refalt_int[gm.reshape(-1), np.repeat(np.arange(gm.shape[0]), gm.shape[1])].reshape(gm.shape)

def linked_depth(rng, DPh, read_length, sites_n):
def linked_depth(rng, DPh, read_length, n_sit):
'''
Simulates reads in a contiguous genomic region to compute the depth per position.
Expand All @@ -78,7 +78,7 @@ def linked_depth(rng, DPh, read_length, sites_n):
Numpy array with the depth per haplotype
read_length : `int`
Read length in base pair units
sites_n : `int`
n_sit : `int`
number of sites that depth has to be simulated for
Returns
Expand All @@ -87,10 +87,10 @@ def linked_depth(rng, DPh, read_length, sites_n):
Depth per site per haplotype
'''
DP = []
read_n = ((DPh*sites_n)/read_length).astype("int")
read_n = ((DPh*n_sit)/read_length).astype("int")
for r in read_n:
dp = np.zeros((sites_n,), dtype=int)
for p in rng.integers(low=0, high=sites_n-read_length+1, size=r):
dp = np.zeros((n_sit,), dtype=int)
for p in rng.integers(low=0, high=n_sit-read_length+1, size=r):
dp[p:p+read_length] += 1
DP.append(dp.tolist())
return np.array(DP).T
Expand Down Expand Up @@ -150,7 +150,7 @@ def sim_allelereadcounts(gm, mean_depth, e, ploidy, seed = None, std_depth = Non
(haplotypic samples, )) and the order must be the same as the second dimension of `gm`.
ploidy : `int`
Number of haplotypic chromosomes per individual.
Number of haplotypic chromosomes per individual. It is recommended to read the Notes about ploidy.
ref : `numpy.ndarray`, optional
Reference alleles list per site. The size of the array must be (sites, ) and the order has to
Expand Down Expand Up @@ -181,6 +181,14 @@ def sim_allelereadcounts(gm, mean_depth, e, ploidy, seed = None, std_depth = Non
must be 15.
- If monomorphic sites are included, the `alt` values corresponding to those sites are not taken into account,
but they must be still indicated.
- Regarding ploidy, if the error parameter is specified as a constant for all individuals, the user can specify
the desired ploidy of the simulated organisms.
If a different error rate per haplotype is input and the user wants to compute Genotype Likelihoods (GL) for
organisms with ploidy > 1, ploidy should be set to 1 for this function; when the later function
`allelereadcounts_to_GL()` is used, the desired ploidy can then be specified. This is because the error values
must be input again to compute GL, and if ploidy > 1 is specified for this function, the dimensions of `arc`
will be smaller than the dimensions of `e`. Nonetheless, if the user wishes to obtain the output `arc` at
a certain ploidy, the `ploidy_sum(arc, ploidy)` function can be used.
'''
#Checks
assert check_gm(gm)
Expand Down

0 comments on commit d869194

Please sign in to comment.