Skip to content

Commit

Permalink
assemblies code
Browse files Browse the repository at this point in the history
  • Loading branch information
aniafijarczyk committed May 20, 2020
1 parent 90a5922 commit cd4fa9a
Show file tree
Hide file tree
Showing 12 changed files with 984,110 additions and 0 deletions.
195,449 changes: 195,449 additions & 0 deletions 07_draft_assemblies/Jean-Talon.contigs.fasta

Large diffs are not rendered by default.

3,356 changes: 3,356 additions & 0 deletions 07_draft_assemblies/Jean-Talon.contigs.fasta.coords

Large diffs are not rendered by default.

246,904 changes: 246,904 additions & 0 deletions 07_draft_assemblies/Jean-Talon.unitigs.fasta

Large diffs are not rendered by default.

4,464 changes: 4,464 additions & 0 deletions 07_draft_assemblies/Jean-Talon.unitigs.fasta.coords

Large diffs are not rendered by default.

192,857 changes: 192,857 additions & 0 deletions 07_draft_assemblies/Jean-Talon_reordered.fasta

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions 07_draft_assemblies/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Code for analysing translocation using draft assemblies


Jean-Talon.contigs.fasta # canu draft assembly contigs from reads >8kb
Jean-Talon.contigs.fasta.coords # mummer output from alignment of the draft assembly against the wtdbg2 polished assembly
Jean-Talon.unitigs.fasta # canu draft assembly unitigs from reads >8kb
Jean-Talon.unitigs.fasta.coords # mummer output from alignment of the draft assembly against the wtdbg2 polished assembly
Jean-Talon_reordered.fasta # wtdbg2 polished assembly
S288c.genome.fa # Genome of the S.cerevisiae strain S288C produced by Yue et al. Nat. Genet. 2017.
# Downloaded from https://yjx1217.github.io/Yeast_PacBio_2016/data/
S288c.genome.fa.coords # mummer output from alignment of the draft assembly against the wtdbg2 polished assembly
assemblies.ipynb # main script
barcode11.cns.fa # wtdbg2 draft assembly from reads >8kb
barcode11.cns.fa.coords # mummer output from alignment of the draft assembly against the wtdbg2 polished assembly
tig00000035.fasta # canu unitig corresponding to the uncertain translocation junction
221,656 changes: 221,656 additions & 0 deletions 07_draft_assemblies/S288c.genome.fa

Large diffs are not rendered by default.

1,766 changes: 1,766 additions & 0 deletions 07_draft_assemblies/S288c.genome.fa.coords

Large diffs are not rendered by default.

135 changes: 135 additions & 0 deletions 07_draft_assemblies/assemblies.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# author: Mathieu Hénault\n",
"from Bio import SeqIO\n",
"from Bio import SeqRecord as SeqRecord\n",
"import pandas as pd\n",
"import numpy as np\n",
"import itertools\n",
"from matplotlib import pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analysis of the alignments of draft assemblies using mummer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"DELTA = {}\n",
"ASSEMBLIES = {}\n",
"tig_order = {}\n",
"cum_len = {}\n",
"for s in ['Jean-Talon_reordered.fasta','S288c.genome.fa','barcode11.cns.fa', 'Jean-Talon.unitigs.fasta', 'Jean-Talon.contigs.fasta']:\n",
" ASSEMBLIES[s] = {}\n",
" tig_order[s] = []\n",
" cl = []\n",
" with open(f'/Volumes/MacintoshHD/Dropbox/Jean_Talon/assemblies/{s}') as fi:\n",
" for seq in SeqIO.parse(fi, 'fasta'):\n",
" ASSEMBLIES[s][seq.id] = seq\n",
" tig_order[s].append(seq.id)\n",
" cl.append(len(seq.seq))\n",
" \n",
" cl = pd.Series([0]+list(np.cumsum(cl)[:-1]), index=tig_order[s])\n",
" cum_len[s] = cl\n",
"\n",
" if s!='Jean-Talon_reordered.fasta':\n",
" delta = pd.read_csv(f'/Volumes/MacintoshHD/Dropbox/Jean_Talon/mummer/{s}.coords', sep='\\t', skiprows=range(4), index_col=None, header=None)\n",
" delta = delta.loc[(delta[4]>=1e4) & (delta[5]>=1e4)].astype({1:int,2:int,2:int,2:int}).sort_values(by=4)\n",
" \n",
" delta['s1'] = delta[0]+cum_len['Jean-Talon_reordered.fasta'].loc[delta[7].values].values\n",
" delta['e1'] = delta[1]+cum_len['Jean-Talon_reordered.fasta'].loc[delta[7].values].values\n",
" delta['s2'] = delta[2]+cum_len[s].loc[delta[8].values].values\n",
" delta['e2'] = delta[3]+cum_len[s].loc[delta[8].values].values\n",
" \n",
" DELTA[s] = delta "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure(figsize=[32,8])\n",
"gs = plt.GridSpec(ncols=4, nrows=1, wspace=0.3, left=0.03, bottom=0.08, right=0.96, top=0.9)\n",
"\n",
"assembly_alias = {'Jean-Talon_reordered.fasta': 'Jean-Talon wtdbg2 polished',\n",
" 'S288c.genome.fa': 'S288C',\n",
" 'barcode11.cns.fa': 'Jean-Talon wtdbg2 draft (>8kb reads)', \n",
" 'Jean-Talon.unitigs.fasta': 'Jean-Talon Canu draft unitigs (>8kb reads)', \n",
" 'Jean-Talon.contigs.fasta': 'Jean-Talon Canu draft contigs (>8kb reads)'}\n",
"coord_translocation = np.mean(np.array([1.4e5, 1.47e5])+cum_len['Jean-Talon_reordered.fasta'].loc['ctg6_pilon'])\n",
"\n",
"for ax_idx, s in enumerate(['S288c.genome.fa', 'barcode11.cns.fa', 'Jean-Talon.contigs.fasta', 'Jean-Talon.unitigs.fasta']):\n",
" ax = fig.add_subplot(gs[ax_idx])\n",
" delta = DELTA[s]\n",
" for i in delta.index:\n",
" s1, e1, s2, e2 = delta.loc[i, ['s1','e1','s2','e2']]\n",
" c = str(1-delta.loc[i,6]/100)\n",
" ax.plot([s1,e1], [s2,e2], color=c, lw=1)\n",
" for i in cum_len[s]:\n",
" ax.axhline(i, lw=0.5, ls=':', color='k')\n",
" for i in cum_len['Jean-Talon_reordered.fasta']:\n",
" ax.axvline(i, lw=0.5, ls=':', color='k')\n",
" ax.margins(0)\n",
" ax.axvline(coord_translocation, color='red', ls='-', lw=3, alpha=0.3)\n",
" \n",
" assembly_size = ax.axis()\n",
" ax.set_xticks(np.arange(0, assembly_size[1], 1e6))\n",
" ax.set_xticklabels(np.arange(0, assembly_size[1]*1e-6, 1).astype(int))\n",
" ax.set_yticks(np.arange(0, assembly_size[3], 1e6))\n",
" ax.set_yticklabels(np.arange(0, assembly_size[3]*1e-6, 1).astype(int))\n",
" \n",
" #if s=='S288c.genome.fa':\n",
" for tig, df in delta.groupby(8):\n",
" if len(ASSEMBLIES[s][tig].seq)>2e5:\n",
" ax.text(assembly_size[1]*1.02, np.mean([df['s2'].min(), df['e2'].max()]), s=tig, va='center', ha='left', size=7)\n",
" \n",
" for tig, df in delta.groupby(7):\n",
" if len(ASSEMBLIES['Jean-Talon_reordered.fasta'][tig].seq)>2e5:\n",
" ax.text(np.mean([df['s1'].min(), df['e1'].max()]), assembly_size[3]*1.02, s=tig, va='bottom', ha='center', rotation=90, size=7)\n",
" \n",
" ax.set_xlabel(assembly_alias['Jean-Talon_reordered.fasta'], size=14)\n",
" ax.set_ylabel(assembly_alias[s], size=14)\n",
" \n",
"plt.savefig('/Volumes/MacintoshHD/Dropbox/Jean_Talon/fig/FigS7.svg')\n",
"#plt.show()\n",
"plt.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit cd4fa9a

Please sign in to comment.