-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
3 changed files
with
361 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
Copyright (c) 2014 Alistair Ward | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
parseSequenceIndex | ||
================== | ||
|
||
Parse a 1000G sequence index and produce files for use with gkno | ||
Parse a 1000G sequence index and produce files for use with gkno. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,341 @@ | ||
#!/bin/bash

# Positional arguments:
#   $1 TYPE      - analysis type: 's', 'p' or 'b' (validated further down).
#   $2 INDEX     - path to the sequence index file.
#   $3 MERGE     - 'True' or 'False': whether the sample merge files are written.
#   $4 FASTQ_DIR - optional directory holding the fastq files; when empty the
#                  script later falls back to "${PWD}/fastq".
TYPE=$1
INDEX=$2
MERGE=$3
FASTQ_DIR=$4
#FASTQ_DIR="$(pwd)"/"$(basename $4)"
|
||
## FUNCTIONS | ||
|
||
# Begin the alignment json file for one read type.
#
# Arguments:
#   $1 - read type. 'paired' adds the "--fastq2" argument; any other value
#        omits it (the script only passes 'single' or 'paired').
# Outputs:
#   Creates or truncates "<type>_end_reads.json" in the current directory and
#   writes the opening brace, the "arguments" list and the opening of the
#   "values" list. The json is deliberately left unterminated so that entries
#   can be appended afterwards.
write_initial_align_file() {
  local type=$1
  local filename="${type}_end_reads.json"

  # One redirect for the whole group: the file is truncated once and every
  # line below lands in it.
  {
    echo '{'
    echo ' "arguments" : ['
    echo ' "--fastq",'
    if [[ $type == 'paired' ]]; then
      echo ' "--fastq2",'
    fi
    echo ' "--read-archive",'
    echo ' "--sample-name",'
    echo ' "--center-name",'
    echo ' "--sequencing-technology",'
    echo ' "--read-group-id",'
    echo ' "--lane",'
    echo ' "--platform"'
    echo ' ],'
    echo ' "values" : ['
  } > "$filename"
}
|
||
# Append one run's values to the alignment json file.
#
# Globals:
#   FASTQ_DIR (read) - directory prefixed to the fastq file names.
# Arguments:
#   $1    - read type ('single' or 'paired'); selects "<type>_end_reads.json"
#           and the output suffix.
#   $2    - sample
#   $3    - centre
#   $4    - sequencing technology
#   $5    - read group
#   $6    - library
#   $7    - lane
#   $8    - 1-based position of this entry; 1 means "first entry", any other
#           value makes the entry start with a separating comma.
#   $9    - first fastq file name
#   ${10} - second fastq file name (only written for 'paired')
# Outputs:
#   Appends a json array to the file, WITHOUT a trailing newline, so that the
#   next entry (or the closing bracket) decides what follows it.
#
# NOTE(review): the header written by write_initial_align_file lists
# "--read-group-id", "--lane", "--platform" as the last three arguments while
# the last three values written here are read group, library and lane -
# confirm that this pairing is intended.
write_data_to_align_file() {
  local type=$1
  local filename="${type}_end_reads.json"
  local sample=$2
  local centre=$3
  local sequencing_technology=$4
  local read_group=$5
  local library=$6
  local lane=$7
  local id=$8
  local fastq=$9
  local fastq2=${10}

  # Define the output filename: everything before the first '.' of the fastq
  # name; for paired reads the last two characters (e.g. "_1") are dropped.
  local output=${fastq%%.*}
  if [[ $type == 'paired' ]]; then
    output=${output%?}
    output=${output%?}
  fi
  output="$output.$type.mkb"

  # printf '%s' writes the values literally; backslashes inside a value are
  # not interpreted as escape sequences.
  {
    if [[ $id == 1 ]]; then
      printf ' [\n'
    else
      printf ',\n [\n'
    fi
    printf ' "%s/%s",\n' "$FASTQ_DIR" "$fastq"
    if [[ $type == 'paired' ]]; then
      printf ' "%s/%s",\n' "$FASTQ_DIR" "$fastq2"
    fi
    printf ' "%s",\n' "$output"
    # These values are wrapped in escaped double quotes inside the json
    # string; printf repeats the format once per argument.
    printf ' "\\"%s\\"",\n' \
      "$sample" "$centre" "$sequencing_technology" "$read_group" "$library"
    printf ' "\\"%s\\""\n' "$lane"
    printf ' ]'
  } >> "$filename"
}
|
||
# Finish writing an align json file.
#
# Arguments:
#   $1 - full name of the json file to close (e.g. 'single_end_reads.json').
# Outputs:
#   Appends a newline (the last entry is written without one), the bracket
#   closing the "values" list and the closing brace.
write_end_align_file() {
  # 'local' keeps the name from leaking into (and clobbering) global scope.
  local filename=$1

  {
    echo
    echo ' ]'
    echo '}'
  } >> "$filename"
}
|
||
# Begin the sample merge json file.
#
# Outputs:
#   Creates or truncates "sample.merge.json" in the current directory and
#   writes the opening brace, the "arguments" list ("--bam-list", "--out") and
#   the opening of the "values" list.
write_initial_merge_file() {
  local filename="sample.merge.json"

  {
    echo '{'
    echo ' "arguments" : ['
    echo ' "--bam-list",'
    echo ' "--out"'
    echo ' ],'
    echo ' "values" : ['
  } > "$filename"
}
|
||
# Append one sample's entry to the sample merge json file.
#
# Arguments:
#   $1 - sample name
#   $2 - 1-based position of this sample
#   $3 - total number of samples; the entry whose position equals this value
#        is written without a trailing comma.
# Outputs:
#   Appends to "sample.merge.json" an array holding the path
#   "${PWD}/<sample>.1000G.bam.list" and the name "<sample>.merged.bam".
write_merge_file() {
  local sample=$1
  local id=$2
  local total_number=$3
  local filename="sample.merge.json"

  # The right-hand side is quoted so it is compared as a string, not matched
  # as a glob pattern.
  local closing=' ],'
  if [[ $id == "$total_number" ]]; then
    closing=' ]'
  fi

  {
    printf ' [\n'
    printf ' "%s/%s.1000G.bam.list",\n' "$PWD" "$sample"
    printf ' "%s.merged.bam"\n' "$sample"
    printf '%s\n' "$closing"
  } >> "$filename"
}
|
||
# Finish writing the sample merge json file.
#
# Outputs:
#   Appends to "sample.merge.json" the bracket closing the "values" list and
#   the closing brace.
write_end_merge_file() {
  local filename="sample.merge.json"

  {
    echo " ]"
    echo "}"
  } >> "$filename"
}
|
||
# Check that the fastq file(s) exist.
#
# Globals:
#   FASTQ_DIR (read) - directory the file names are looked up in.
# Arguments:
#   $1 - first fastq file name (always checked)
#   $2 - second fastq file name (checked only when non-empty)
# Behaviour:
#   Returns normally when every requested file is a regular file. On the
#   first missing file it reports the path on stderr, removes both align json
#   files from the current directory and terminates the script with status 1.
check_fastq() {
  local fastq1=$1
  local fastq2=$2
  local -a required=("$FASTQ_DIR/$fastq1")
  local path

  if [[ $fastq2 != '' ]]; then
    required+=("$FASTQ_DIR/$fastq2")
  fi

  for path in "${required[@]}"; do
    if [[ ! -f $path ]]; then
      echo "Missing fastq file: $path" 1>&2
      # A partially written json file is of no use; remove both.
      rm -f ./single_end_reads.json
      rm -f ./paired_end_reads.json
      exit 1
    fi
  done
}
|
||
## END OF FUNCTIONS | ||
|
||
|
||
# Usage line shared by the argument checks below. The fourth argument is
# optional: when it is empty FASTQ_DIR falls back to "${PWD}/fastq".
USAGE_LINE="Usage: 1000G_script.sh <analysis type> <sequence index file> <merge samples> [fastq directory]"

# Check that the requested analysis is for single end reads, paired end reads
# or both.
if [[ $TYPE != "s" && $TYPE != "p" && $TYPE != "b" ]]; then
  {
    printf '%s\n' "$USAGE_LINE"
    printf '\tanalysis type:\n'
    printf "\t\t's' - single end reads only,\n"
    printf "\t\t'p' - paired end reads only,\n"
    printf "\t\t'b' - both single and paired end read.\n"
  } 1>&2
  exit 1
fi

# Check that the index file exists.
if [[ ! -f $INDEX ]]; then
  {
    printf '%s\n' "$USAGE_LINE"
    printf '\tIndex file does not exist.\n'
  } 1>&2
  exit 1
fi

# Check that merge is set to exactly 'True' or 'False' (case sensitive).
if [[ $MERGE != 'True' && $MERGE != 'False' ]]; then
  {
    printf '%s\n' "$USAGE_LINE"
    printf "\tThe merge samples option must be set to 'True' or 'False'\n"
    printf '\tCurrent value is: %s\n' "$MERGE"
  } 1>&2
  exit 1
fi

# Check that no files of the form *.1000G.bam.list exist: the parsing loop
# appends to these files, so leftovers would end up in the new lists.
# 'compgen -G' succeeds only when the glob matches at least one path.
if [[ $MERGE == 'True' ]] && compgen -G '*.1000G.bam.list' > /dev/null; then
  {
    echo "ERROR"
    echo
    echo "When executed, there can exist no files of the form *.1000G.bam.list"
    echo "in the current directory. Please move or remove those files from"
    echo "this directory before proceeding."
  } 1>&2
  exit 1
fi

# Set the directory for the fastq files.
if [[ $FASTQ_DIR == '' ]]; then
  FASTQ_DIR="${PWD}/fastq"
fi
|
||
# Define some variables.
# NO_SINGLE_READS / NO_PAIRED_READS hold the 1-based position of the NEXT
# entry to be written to the respective json file (they are passed as the
# entry id and incremented after each write), so the value 1 means that
# nothing has been written yet.
NO_SINGLE_READS=1
NO_PAIRED_READS=1
# NOTE(review): SINGLE_COUNT and PAIRED_COUNT are not referenced anywhere else
# in the visible script; kept in case something external relies on them.
SINGLE_COUNT=0
PAIRED_COUNT=0
# Sample name of every processed run (duplicates included).
SAMPLE_LIST=()

# Open json files for the single and paired end reads as necessary.
if [[ $TYPE != 'p' ]]; then
  write_initial_align_file 'single'
fi

if [[ $TYPE != 's' ]]; then
  write_initial_align_file 'paired'
fi
|
||
# Parse the sequence index file: one tab-separated record per line.
# 'IFS= read -r' keeps each line verbatim (no backslash processing, no
# trimming of leading tabs that would shift the field numbers); the
# '|| [[ -n $line ]]' part also processes a final line without a newline.
while IFS= read -r line || [[ -n $line ]]; do
  # A blank line carries no record.
  [[ -n $line ]] || continue

  # Fields 1 and 20 hold '/'-separated paths; the 4th path component is used
  # as the fastq file name.
  FASTQ=$(cut -f 1 <<< "$line" | cut -d '/' -f 4)
  FASTQ2=$(cut -f 20 <<< "$line" | cut -d '/' -f 4)
  SAMPLE=$(cut -f 10 <<< "$line")
  CENTRE=$(cut -f 6 <<< "$line")
  READ_GROUP=$(cut -f 3 <<< "$line")
  LIBRARY=$(cut -f 15 <<< "$line")
  LANE=$(cut -f 14 <<< "$line")
  TECH=$(cut -f 13 <<< "$line")

  # A record is treated as paired exactly when field 20 is non-empty.
  LIBRARY_TYPE=$(cut -f 20 <<< "$line")
  if [[ $LIBRARY_TYPE == '' ]]; then
    LIBRARY_TYPE='SINGLE'
  else
    LIBRARY_TYPE='PAIRED'
  fi

  # Handle the single end reads.
  if [[ $LIBRARY_TYPE == 'SINGLE' && $TYPE != 'p' ]]; then
    write_data_to_align_file 'single' "$SAMPLE" "$CENTRE" "$TECH" "$READ_GROUP" "$LIBRARY" "$LANE" "$NO_SINGLE_READS" "$FASTQ"
    if [[ $MERGE == 'True' ]]; then
      # Write the name of the aligned BAM file to the sample's bam list.
      # The prefix is the fastq name up to its first '.', exactly as
      # write_data_to_align_file derives it for single end reads. Stripping
      # two further characters is only done for paired names, where it
      # removes the "_1" suffix.
      OUTPUT=${FASTQ%%.*}
      printf '%s\n' "${PWD}/$OUTPUT.single_sorted.bam" >> "$SAMPLE.1000G.bam.list"
    fi
    NO_SINGLE_READS=$((NO_SINGLE_READS + 1))

    # Check that the fastq file exists (terminates the script otherwise).
    check_fastq "$FASTQ"

    # Add the sample to the list of processed samples.
    SAMPLE_LIST+=("$SAMPLE")
  fi

  # Handle the paired end reads. Only handle paired end reads if the name
  # ends with 'xxx_1.fastq.gz' to avoid double inclusion.
  if [[ $LIBRARY_TYPE == 'PAIRED' && $TYPE != 's' ]]; then
    WHICH_READ=${FASTQ%%.*}
    if [[ $WHICH_READ == *_1 ]]; then
      write_data_to_align_file 'paired' "$SAMPLE" "$CENTRE" "$TECH" "$READ_GROUP" "$LIBRARY" "$LANE" "$NO_PAIRED_READS" "$FASTQ" "$FASTQ2"
      if [[ $MERGE == 'True' ]]; then
        # Write the name of the aligned BAM file to the sample's bam list;
        # the trailing "_1" is removed from the prefix.
        OUTPUT=${WHICH_READ%?}
        OUTPUT=${OUTPUT%?}
        printf '%s\n' "${PWD}/$OUTPUT.paired_sorted.bam" >> "$SAMPLE.1000G.bam.list"
      fi
      NO_PAIRED_READS=$((NO_PAIRED_READS + 1))

      # Check that the fastq files exist (terminates the script otherwise).
      check_fastq "$FASTQ" "$FASTQ2"

      # Add the sample to the list of processed samples.
      SAMPLE_LIST+=("$SAMPLE")
    fi
  fi
done < "$INDEX"
|
||
# Find all of the unique samples in the sample list.
# The names are sorted and de-duplicated in a pipeline and collected in an
# array. This avoids a fixed-name temporary file in the current directory and
# the parsing of 'wc -l' output, whose leading padding on some platforms made
# the count come out empty.
UNIQUE_SAMPLES=()
if (( ${#SAMPLE_LIST[@]} > 0 )); then
  while IFS= read -r sample; do
    # Empty names are ignored.
    if [[ -n $sample ]]; then
      UNIQUE_SAMPLES+=("$sample")
    fi
  done < <(printf '%s\n' "${SAMPLE_LIST[@]}" | sort -u)
fi

SAMPLE_ID=1
NO_SAMPLES=${#UNIQUE_SAMPLES[@]}
if [[ $MERGE == "True" ]]; then
  write_initial_merge_file
  for sample in "${UNIQUE_SAMPLES[@]}"; do
    # The entry whose id equals NO_SAMPLES is written without a trailing comma.
    write_merge_file "$sample" "$SAMPLE_ID" "$NO_SAMPLES"
    SAMPLE_ID=$((SAMPLE_ID + 1))
  done
  write_end_merge_file
fi
|
||
# Finish writing the json files, or delete them if nothing was added.
# NO_SINGLE_READS / NO_PAIRED_READS start at 1 and are incremented after each
# entry, so a value of 1 (not 0) means the corresponding file got no entries.
case "$TYPE" in
  p)
    rm -f ./single_end_reads.json
    write_end_align_file 'paired_end_reads.json'
    ;;
  s)
    rm -f ./paired_end_reads.json
    write_end_align_file 'single_end_reads.json'
    ;;
  b)
    # Each file is either removed (no entries) or properly closed; leaving it
    # unterminated would produce invalid json.
    if (( NO_SINGLE_READS <= 1 )); then
      rm -f ./single_end_reads.json
    else
      write_end_align_file 'single_end_reads.json'
    fi
    if (( NO_PAIRED_READS <= 1 )); then
      rm -f ./paired_end_reads.json
    else
      write_end_align_file 'paired_end_reads.json'
    fi
    ;;
esac