diff --git a/NOTES b/NOTES new file mode 100644 index 0000000..0d2acf6 --- /dev/null +++ b/NOTES @@ -0,0 +1,4 @@ +We used a script to convert FASTA to FASTQ, downloaded from here: +https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/fasta-to-fastq/fasta_to_fastq.pl + + diff --git a/fasta_to_fastq.pl b/fasta_to_fastq.pl new file mode 100644 index 0000000..36f2aa2 --- /dev/null +++ b/fasta_to_fastq.pl @@ -0,0 +1,47 @@ +#Copyright (c) 2010 LUQMAN HAKIM BIN ABDUL HADI (csilhah@nus.edu.sg) +# +#Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +#(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +#merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +#furnished to do so, subject to the following conditions: + +#The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +#OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +#LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +#IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#!/usr/bin/perl
+# Convert the FASTA file named in $ARGV[0] to FASTQ on STDOUT. FASTA has no
+# base qualities, so each base is given the placeholder quality character 'I'.
+# Usage: perl fasta_to_fastq.pl input.fasta > output.fastq
+use strict;
+use warnings;
+
+my $file = $ARGV[0];
+defined $file or die "Usage: $0 <input.fasta>\n";
+open my $fasta, '<', $file or die "Cannot open '$file': $!\n";
+
+my ($header, $sequence);
+while (my $line = <$fasta>) {
+    $line =~ s/[\r\n]+\z//;    # strip LF and CRLF line endings alike
+    if ($line =~ /^>(.+)/) {
+        print_record($header, $sequence) if defined $header;    # flush previous record
+        ($header, $sequence) = ($1, '');
+    }
+    elsif (defined $header) {    # text before the first header belongs to no record
+        $sequence .= $line;
+    }
+}
+close $fasta;
+print_record($header, $sequence) if defined $header;    # nothing to emit for empty input
+
+# Print one four-line FASTQ record: '@' + name, the bases, a '+' separator,
+# and one 'I' per base so the quality line always matches the sequence length.
+sub print_record {
+    my ($name, $bases) = @_;
+    print '@', $name, "\n", $bases, "\n", "+\n", 'I' x length($bases), "\n";
+    return;
+}
+
diff --git a/needle.sh b/needle.sh index 1cf3ba6..ccebd43 100755 --- a/needle.sh +++ b/needle.sh @@ -4,6 +4,8 @@ source $(dirname $0)/argparse.bash || exit 1 argparse "$@" < ${SAMPLE}.cat.unmapped.fastq +else + samtools view -f 0x4 -bh $BAM | samtools bam2fq - >${SAMPLE}.unmapped.fastq -#samtools view -bh $BAM NC_007605 | samtools fastq - > ${SAMPLE}.NC_007605.fastq -#rm -fr ${SAMPLE}.NC_007605.fastq -#cat ${SAMPLE}.unmapped.fastq ${SAMPLE}.NC_007605.fastq>${SAMPLE}.cat.unmapped.fastq -#rm -fr ${SAMPLE}.unmapped.fastq -UNMAPPED=${SAMPLE}.unmapped.fastq +samtools view -bh $BAM NC_007605 | samtools fastq - > ${SAMPLE}.NC_007605.fastq +rm -fr ${SAMPLE}.NC_007605.fastq +cat ${SAMPLE}.unmapped.fastq ${SAMPLE}.NC_007605.fastq>${SAMPLE}.cat.unmapped.fastq +rm -fr ${SAMPLE}.unmapped.fastq + +fi + + +UNMAPPED=${SAMPLE}.cat.unmapped.fastq +wc -l $UNMAPPED +exit 1 bwa mem -a ${DB}/viral.vipr/NONFLU_All.fastq $UNMAPPED | samtools view -S -b -F 4 - | samtools sort - >${SAMPLE}.virus.bam bwa mem -a ${DB}/fungi/fungi.ncbi.february.3.2018.fasta $UNMAPPED | samtools view -S -b -F 4 - | samtools sort - >${SAMPLE}.fungi.bam diff --git a/toy.example.fasta b/toy.example.fasta new file mode 100644 index 0000000..59f98c7 --- /dev/null +++ b/toy.example.fasta @@ -0,0 +1,14 @@ +>1 
+TCGCACCGGGGATCCTAGGCTTATTATATTGCGCTTTGTATTGAAGTCTGTTCTGTGTGGACTGACCTCAGCACAAGTGTTATGGGGCAGATCATAACAATGTTTGAGGCCCTGCCTCACATTATCGATGAGGTCATCAACATTGTTATAATAGTGCTTATAATAATAACAAGCATAAAGGCTGTGTACAACTTTGCTACCTGTGGCATCATTGCATTGATCAGCTTCTGCTTCTTGGCTGGAAGGTCTTGTGGCTTGTATGGTGTCTCTGGCTCTGACATTTACAAGGGACTCTACCAGTTCCAGTCCGTAGAGTTCAACATGTCACAATTGAATTTAACAATGCCCAATGCGTGCTCAGCCAACAATTCCCACCATTACATCAGCATGGGAAAATCTGGCCTGGAACTAACCTTTACAAATGACTCCATCATTCAACACAACTTCTGCAACCTAACTGATGGGTTCAAGAAAAAAACCTTTGATCATACACTTATGAGCATAGTGTCAAGCCTGCACCTGAGCATTAGAGGAAATACCATCTACAAAGCTGTGTCCTGTGACTTCAACAATGGGATTACAATCCAGTACAACCTAACCTTCTCTGATGCACAAGGTGCCATCAATCAATGTGGAACCTTCAGAGGTAGAGTTTTAGATATGTTTAGAACAGCTTTTGGGGGGAAATACATGAGGTCTGGCTATGGTTGGAAAGACTCCAATGGGAAGACAACCTGGTGCAGTCAAACCAACTATCAATACCTAATCATACAGAACAGGACATGGGAAAATCACTGTGAGTATGCCGGTCCTTTTGGTCTCTCAAGAATTCTTTTTGCTCAGGAGAAAACAAAGTTTCTCACTAGAAGATTGGCAGGGACTTTTACCTGGACATTGTCGGATTCTTCGGGAACTGAAACCCCAGGTGGGTATTGTCTGACAAGGTGGATGCTCATAGCTGCTGATCTCAAGTGTTTCGGGAACACAGCAGTTGCCAAATGCAACATCAACCATGATGAAGAATTTTGTGACATGTTGAGGTTAATTGACTATAACAAAGCCGCTCTAAAGAAATTCAAAGAAGACGTAGAGTCTGCCCTTCACTTGTTCAAAACAACTG +>2 +CCGGGGATCCTAGGCTTATTATATTGCGCTTTGTATTGAAGTCTGTTCTGTGTGGACTGACCTCAGCACAAGTGTTATGGGGCAGATCATAACAATGTTTGAGGCCCTGCCTCACATTATCGATGAGGTCATCAACATTGTTATAATAGTGCTTATAATAATAACAAGCATAAAGGCTGTGTACAACTTTGCTACCTGTGGCATCATTGCATTGATCAGCTTCTGCTTCTTGGCTGGAAGGTCTTGTGGCTTGTATGGTGTCTCTGGCTCTGACATTTACAAGGGACTCTACCAGTTCCAGTCCGTAGAGTTCAACATGTCACAATTGAATTTAACAATGCCCAATGCGTGCTCAGCCAACAATTCCCACCATTACATCAGCATGGGAAAATCTGGCCTGGAACTAACCTTTACAAATGAC +>3 
+TTTACAAGGGACTCTACCAGTTCCAGTCCGTAGAGTTCAACATGTCACAATTGAATTTAACAATGCCCAATGCGTGCTCAGCCAACAATTCCCACCATTACATCAGCATGGGAAAATCTGGCCTGGAACTAACCTTTACAAATGACTCCATCATTCAACACAACTTCTGCAACCTAACTGATGGGTTCAAGAAAAAAACCTTTGATCATACACTTATGAGCATAGTGTCAAGCCTGCACCTGAGCATTAGAGGAAATACCATCTACAAAGCTGTGTCCTGTGACTTCAACAATGGGATTACAATCCAGTACAACCTAACCTTCTCTGATGCACAAGGTGCCATCAATCAATGTGGAACCTTCAGAGGTAGAGTTTTAGATATGTTTAGAACAGCTTTTGGGGGGAAATACATGAGGTCTGGCTATGGTTGGAAAGACTCCAATGGGAAGACAACCTGGTGCAGTCAAACCAACTATCAATACCTAATCATACAGAACAGGACATGGGAAAATCACTGTGAGTATGCCGGTCCTTTTGGTCTCTCAAGAATTCTTTTTGCTCAGGAGAAAACAAAGTTTCTCACTAGAAGATTGGCAGGGACTTTTACCTGGACATTGTCGGATTCTTCGGGAACTGAAACC +>4 +AACCGGCGCCAGTGTGCTGGGACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAAACCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACAC +>5 
+CATACCTAATCAAAACCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCACATACCTAATCAAATCCGTACACCCCGCTCCTCCCTTTCTGGTAATTTTACTTTTATTTTTTTCATTTTTTTATTTTTTCATTTTTTCATTTTTTCATTTTCTCATTTTCTCTTTTTTAATCACTCTAGACGGATTTCCTCTTCTGGTAATTTTCTTTTTCTCATTTTCTCATTTTCTCATTTTCTCTTTTCTAATCACTCGAAACGGATTTCCTCTTTCGGTAATTTTGCTTCTTTGTTTTTTACTTTAATTTCTTTTTCCTCTTCTCCTCTTTCGCTTTCTCCTTTCACTCTCATCCATTCTCATTCCTCAATTTATACTTTTTTTGGCAGTATTCTTGACTGTTTTTCACCATTTCCCCTTACCCGCACTTCCATCACATTTTCTTTGTCAATCCACCTTCTTGCCATGGCCAATGCTTGGTCCTGCTGGGCCTGGTCCTTGCATAGGATACACACGTTACCCGCACAATTATCACCCATTTTCAACTTCTTCCAACTTTTGCTCCTCTTGCCATGAAACTTCATTTCAAATTACTTCACTTCATCACTTCCTTAGTTACCCTGCTTTATTATAAATCACGTGCTTCCCATCTCCCGCTTCCCATCTCCCGCTTCCCATCACCTGCTTCTCTTCACCTGCTTCTCTTCACTTTCATTTTTCATCCCTTCTCATTCTACCTTTCTTTCCCTCTCTCCCTCTGAATATAAGAATTCTCCACTGTTTTTCACAATTACTACTTACCCGCACTTAAACTACACTTCTCCATGTTAATCAACCT +>6 +TAACCCTAACCCTAACCCTGACCCTAACCCTAACCCTAACCCTAACCCTAACCAGTACACGCGTACACGTACAAGCACCCGTACCCCCAGTATACCTGGACACCCGTACTCAGTTATCCTTTTTATTAGTGTACCCGCCTCTTGCACGCATGCCACAGTTCTTCAGCAGAAGAACACGCACAATGCTCTTTGATAAACGTGCGGACATGAAAAAAAGGGAAAAACGCAGCTACGTGTGCTGTCGTTGGTTTCACAGCGTCAAGCCGCGTCGGTGTACCAAAGAGGAGGTGACCCATCGAGTACTCGCACCCTCTAGCTCTCCTTTTCTGCCTCGTATTATACACGTTGATCGGAAAACAGGGTAGGCACTAGCCACCGATAATCTTCAATCGTACATCTGTCTGCGTAAGCGCGTGCCCCGGATGGAGGGCATGGAACTGCATCGACCGCCCACGGCGATCGCCGATCAGCCAGCGATGTGACTGCAACGCTGTTTGTTTCCACAACGAGGGCTGAAGGCTTTCTGATAGATTGTGCGCTATAGAACAAGGAGGGAGAGCCCACCCCTTTTTATGCGAAAACTCCTCACCCAAAGCAAGGAGGGCGGCGGGTGGGAAGCGGAAAGCCAACGCCCACGCGGACGCAATTAGCACCGACCGAAAACGAGCAGTGAGAAAAAGGGAAGTCTCTCAGACTGGGAAGAGATGAGCCGAGGAGATAAATGCACCAGATCCGAGGTACCGCGGCACAAGAGGAGCCGGGTGATATTTTTTGTTGTTTTCAGTGTTTCCTCGTGAGACGGCAAAACACGAGGCAGAAAAGGTG +>7 
+CTTCAGCAGAAGAACACGCACAATGCTCTTTGATAAACGTGCGGACATGAAAAAAAGGGAAAAACGCAGCTACGTGTGCTGTCGTTGGTTTCACAGCGTCAAGCCGCGTCGGTGTACCAAAGAGGAGGTGACCCATCGAGTACTCGCACCCTCTAGCTCTCCTTTTCTGCCTCGTATTATACACGTTGATCGGAAAACAGGGTAGGCACTAGCCACCGATAATCTTCAATCGTACATCTGTCTGCGTAAGCGCGTGCCCCGGATGGAGGGCATGGAACTGCATCGACCGCCCACGGCGATCGCCGATCAGCCAGCGATGTGACTGCAACGCTGTTTGTTTCCACAACGAGGGCTGAAGGCTTTCTGATAGATTGTGCGCTATAGAACAAGGAGGGAGAGCCCACCCCTTTTTATGCGAAAACTCCTCACCCAAAGCAAGGAGGGCGGCGGGTGGGAAGCGGAAAGCCAACGCCCACGCGGACGCAATTAGCACCGACCGAAAACGAGCAGTGAGAAAAAGGGAAGTCTCTCAGACTGGGAAGAGATGAGCCGAGGAGATAAATGCACCAGATCCGAGGTACCGCGGCACAAGAGGAGCCGGGTGATATTTTTTGTTGTTTTCAGTGTTTCCTCGTGAGACGGCAAAACACGAGGCAGAAAAGGTGCAAGAGATCCAGGTGGCTGGCGAAGAGGAGGAACATGAGAAGAGAGACAGTCAACATTGGCGGGGAGTCGAACTTTGTGCAGCTCATGTGTGCAGGTGCAGGTCGATGGATAGAAGGCTAAGAGGCGATAGGACAGGGTCCCTTCACACCACAAGCGTGAGTGATGGAGTTATATGCGCATGGTCGAATAGGTATGCACATGTACGGCAGACAGGAAAGTAGAAGAGAGGAATTCGGAGTTGTGGAGAACGGGAAGTCGATGGGGCAGCAGCAGCAGTCAGAGCAGCAGACGAAATGCTACACGGAACGGCTTCACGGAGAGAGCATATCAGAGAAGCAGGGGAGCTGAGAAGTGCAGTCGATGTGTCACGCTTTGAAGTGTGTGACAT