
Commit

Replaced crude_db_harmonisation.sh with python file. Removed unnecessary print statements
Vedanth-Ramji committed Mar 14, 2024
1 parent 8cd9947 commit d0be645
Showing 24 changed files with 11,759 additions and 132,838 deletions.
7 changes: 7 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,7 @@
{
    "python.testing.pytestArgs": [
        "tests"
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true
}
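
For reference, this settings file points VS Code's pytest integration at the tests/ directory. A minimal sketch of a test module that such a configuration would discover (the file name and assertion below are hypothetical, not part of this commit):

# tests/test_discovery.py (hypothetical example, for illustration only)
def test_sanity():
    # pytest collects test_* functions from modules under tests/
    assert 2 + 2 == 4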
15,990 changes: 8,132 additions & 7,858 deletions argnorm/data/megares_ARO_mapping.tsv

Large diffs are not rendered by default.

6,908 changes: 3,446 additions & 3,462 deletions argnorm/data/resfinder_fg_ARO_mapping.tsv

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions argnorm/normalizers.py
@@ -57,7 +57,6 @@ def run(self, input_file : str):
            original_annot[self._input_gene_col].str.lower()
        )
        aro_table = self.get_aro_mapping_table()
-       print(aro_table)
        aro_table.set_index(self.preprocess_ref_genes(
            aro_table[ORIGINAL_ID_COL].str.lower()
        ), inplace=True)
@@ -119,12 +118,10 @@ def get_aro_mapping_table(self):
        gene_identifier = 'Original ID'
        manual_curation_fname = 'sarg_manual_curation.tsv'
        manual_curation = pd.read_csv(get_data_path(manual_curation_fname, True), sep='\t')
-       print(manual_curation)
        aro_nan_indices = [(list(df[gene_identifier]).index(manual_curation.loc[i, gene_identifier])) for i in range(manual_curation.shape[0])]

        for i in range(len(aro_nan_indices)):
            df.loc[aro_nan_indices[i], 'ARO'] = manual_curation.loc[i, 'ARO']
-           print(df.loc[aro_nan_indices[i]])
            df.loc[aro_nan_indices[i], 'Gene Name in CARD'] = manual_curation.loc[i, 'Gene Name in CARD']

        df[TARGET_ARO_COL] = df[TARGET_ARO_COL].map(lambda a: f'ARO:{int(float(a)) if is_number(a) == True else a}')
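
For context, the hunk above back-fills missing ARO accessions in the SARG mapping table from sarg_manual_curation.tsv, matching rows on 'Original ID'. A minimal sketch of the same idea expressed as a pandas merge (the file paths are illustrative and this is not argNorm's actual implementation, which resolves its data files via get_data_path):

import pandas as pd

# Illustrative paths; argNorm resolves the real files internally
df = pd.read_csv('sarg_ARO_mapping.tsv', sep='\t')
curation = pd.read_csv('sarg_manual_curation.tsv', sep='\t')

# Fill missing 'ARO' and 'Gene Name in CARD' values from the curation table,
# keyed on 'Original ID'
merged = df.merge(curation, on='Original ID', how='left', suffixes=('', '_curated'))
for col in ['ARO', 'Gene Name in CARD']:
    merged[col] = merged[col].fillna(merged[col + '_curated'])
    merged = merged.drop(columns=col + '_curated')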
7 changes: 0 additions & 7 deletions db_harmonisation/clean_ncbi.py

This file was deleted.

170 changes: 170 additions & 0 deletions db_harmonisation/crude_db_harmonisation.py
@@ -0,0 +1,170 @@
from jug import TaskGenerator, barrier
import shutil
import subprocess
import requests
import os

@TaskGenerator
def create_out_dirs():
    os.makedirs('dbs', exist_ok=True)
    os.makedirs('mapping', exist_ok=True)

@TaskGenerator
def get_resfinder_db():
    from glob import glob
    subprocess.check_call(
        ['git', 'clone', 'https://bitbucket.org/genomicepidemiology/resfinder_db'])

    with open('dbs/resfinder.fna', 'w') as f:
        for file in glob('resfinder_db/*.fsa'):
            with open(file) as f2:
                f.write(f2.read())
    return 'dbs/resfinder.fna'

@TaskGenerator
def get_resfinderfg_db():
    url = 'https://raw.githubusercontent.com/RemiGSC/ResFinder_FG_Construction/606b4768433079d55f5b179219e080a45bf59dfc/output/RFG_db/ResFinder_FG.faa'
    with open('dbs/resfinder_fg.faa', 'w') as f:
        f.write(requests.get(url).text)
    return 'dbs/resfinder_fg.faa'


@TaskGenerator
def get_ncbi_db():
    subprocess.check_call(
        ['wget', 'https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/AMRProt'])
    subprocess.check_call(
        ['mv', 'AMRProt', 'dbs/ncbi_amr.faa'])
    return 'dbs/ncbi_amr.faa'

@TaskGenerator
def get_sarg_db():
    subprocess.check_call(
        ['wget', 'https://smile.hku.hk/ARGs/dataset/indexingdownload/Short_subdatabase_V3.2.1.zip'])
    subprocess.check_call(
        ['mv', 'Short_subdatabase_V3.2.1.zip', 'sarg.zip'])
    subprocess.check_call(
        ['unzip', 'sarg.zip'])

    shutil.copy('Short_subdatabase/4.SARG_v3.2_20220917_Short_subdatabase.fasta', 'dbs/sarg.faa')
    return 'dbs/sarg.faa'

@TaskGenerator
def get_deeparg_db():
    subprocess.check_call(
        ['git', 'clone', 'https://bitbucket.org/gusphdproj/deeparg-largerepo/'])
    shutil.copy('deeparg-largerepo/database/v2/features.fasta', 'dbs/deeparg.faa')
    return 'dbs/deeparg.faa'

@TaskGenerator
def get_card_db():
    subprocess.check_call(
        ['wget', '-c', '-O', 'dbs/card.tar.bz2', 'https://card.mcmaster.ca/latest/data'])
    subprocess.check_call(
        ['tar', '-xvf', 'dbs/card.tar.bz2', '-C', 'dbs'])
    return 'dbs/card.json'

@TaskGenerator
def get_argannot_db():
    url = 'https://raw.githubusercontent.com/tseemann/abricate/master/db/argannot/sequences'
    with open('dbs/argannot.fna', 'w') as f:
        f.write(requests.get(url).text)
    return 'dbs/argannot.fna'

@TaskGenerator
def get_megares_db():
    url = 'https://www.meglab.org/downloads/megares_v3.00/megares_database_v3.00.fasta'
    with open('dbs/megares.fna', 'w') as f:
        f.write(requests.get(url).text)
    return 'dbs/megares.fna'

@TaskGenerator
def load_card_db(card_json):
    subprocess.check_call(
        ['rgi', 'load', '-i', card_json])

@TaskGenerator
def rgi_on_resfinder(resfinder_fna):
    subprocess.check_call(
        ['rgi', 'main', '-i', resfinder_fna, '-o', 'mapping/resfinder_rgi', '-t', 'contig', '-a', 'BLAST', '--clean', '--include_loose']
    )

@TaskGenerator
def rgi_on_ncbi(ncbi_amr_faa):
    from Bio import SeqIO
    from Bio.Seq import Seq

    with open(ncbi_amr_faa) as original, open('./dbs/ncbi_amr_corrected.faa', 'w') as corrected:
        for record in SeqIO.parse(ncbi_amr_faa, 'fasta'):
            record.seq = Seq(str(record.seq).replace("*", ""))
            SeqIO.write(record, corrected, 'fasta')

    subprocess.check_call(
        ['rgi', 'main', '-i', './dbs/ncbi_amr_corrected.faa', '-o', 'mapping/ncbi_rgi', '-t', 'protein', '-a', 'BLAST', '--clean', '--include_loose']
    )

@TaskGenerator
def rgi_on_sarg(sarg_faa):
    subprocess.check_call(
        ['rgi', 'main', '-i', sarg_faa, '-o', 'mapping/sarg_rgi', '-t', 'protein', '-a', 'BLAST', '--clean', '--include_loose']
    )

@TaskGenerator
def rgi_on_resfinderfg(resfinder_fg_faa):
    subprocess.check_call(
        ['rgi', 'main', '-i', resfinder_fg_faa, '-o', 'mapping/resfinder_fg_rgi', '-t', 'protein', '-a', 'BLAST', '--clean', '--include_loose']
    )

@TaskGenerator
def rgi_on_deeparg(deeparg_faa):
    subprocess.check_call(
        ['rgi', 'main', '-i', deeparg_faa, '-o', 'mapping/deeparg_rgi', '-t', 'protein', '-a', 'BLAST', '--clean', '--include_loose']
    )

@TaskGenerator
def rgi_on_argannot(argnnot_fna):
    subprocess.check_call(
        ['rgi', 'main', '-i', argnnot_fna, '-o', 'mapping/argannot_rgi', '-t', 'contig', '-a', 'BLAST', '--clean', '--include_loose']
    )

@TaskGenerator
def rgi_on_megares(megares_fna):
    subprocess.check_call(
        ['rgi', 'main', '-i', megares_fna, '-o', 'mapping/megares_rgi', '-t', 'contig', '-a', 'BLAST', '--clean', '--include_loose']
    )

@TaskGenerator
def reconcile_dbs():
    from .get_mapping_table import get_aro_for_hits

    get_aro_for_hits('mapping/resfinder_rgi.txt').to_csv('mapping/resfinder_ARO_mapping.tsv', sep='\t')
    get_aro_for_hits('mapping/ncbi_rgi.txt').to_csv('mapping/ncbi_ARO_mapping.tsv', sep='\t')
    get_aro_for_hits('mapping/sarg_rgi.txt').to_csv('mapping/sarg_ARO_mapping.tsv', sep='\t')
    get_aro_for_hits('mapping/resfinder_fg_rgi.txt').to_csv('mapping/resfinder_fg_ARO_mapping.tsv', sep='\t')
    get_aro_for_hits('mapping/deeparg_rgi.txt').to_csv('mapping/deeparg_ARO_mapping.tsv', sep='\t')
    get_aro_for_hits('mapping/argannot_rgi.txt').to_csv('mapping/argannot_ARO_mapping.tsv', sep='\t')
    get_aro_for_hits('mapping/megares_rgi.txt').to_csv('mapping/megares_ARO_mapping.tsv', sep='\t')

create_out_dirs()
databases = [
    get_resfinder_db(),
    get_resfinderfg_db(),
    get_megares_db(),
    get_ncbi_db(),
    get_sarg_db(),
    get_deeparg_db(),
    get_argannot_db()
]
card_json = get_card_db()
load_card_db(card_json)
rgi_on_databases = [
    rgi_on_resfinder('dbs/resfinder.fna'),
    rgi_on_ncbi('dbs/ncbi_amr.faa'),
    rgi_on_sarg('dbs/sarg.faa'),
    rgi_on_resfinderfg('dbs/resfinder_fg.faa'),
    rgi_on_deeparg('dbs/deeparg.faa'),
    rgi_on_argannot('dbs/argannot.fna'),
    rgi_on_megares('dbs/megares.fna')
]
reconcile_dbs()
barrier()
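
Once the jug tasks above complete (this is a jugfile, typically driven with jug's execute subcommand), each reconciled table is written under mapping/. A minimal sketch of inspecting one of the outputs with pandas, assuming the pipeline has already produced mapping/resfinder_ARO_mapping.tsv (the exact columns depend on get_aro_for_hits and are not specified by this commit):

import pandas as pd

# Assumes reconcile_dbs() has already written this file
resfinder_map = pd.read_csv('mapping/resfinder_ARO_mapping.tsv', sep='\t')
print(resfinder_map.head())            # quick sanity check of the harmonised mapping
print(resfinder_map.columns.tolist())  # column names come from get_aro_for_hits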
71 changes: 0 additions & 71 deletions db_harmonisation/crude_db_harmonisation.sh

This file was deleted.
