Skip to content

Commit

Permalink
Version 1 - April 2014
Browse files Browse the repository at this point in the history
1. Added script to repo.
  • Loading branch information
AlistairNWard committed Apr 28, 2014
1 parent b3dc252 commit acf03d7
Show file tree
Hide file tree
Showing 3 changed files with 361 additions and 1 deletion.
19 changes: 19 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Copyright (c) 2014 Alistair Ward

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
parseSequenceIndex
==================

Parse a 1000G sequence index and produce files for use with gkno
Parse a 1000G sequence index and produce files for use with gkno.
341 changes: 341 additions & 0 deletions sequence-index-1000G.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,341 @@
#!/bin/bash

# Positional command line arguments.
#   $1 - analysis type; validated further down against 's', 'p' and 'b'.
#   $2 - path to the sequence index file.
#   $3 - merge flag; validated further down against 'True' and 'False'.
#   $4 - directory holding the fastq files; when empty, the script later
#        falls back to "${PWD}/fastq".
TYPE="$1"
INDEX="$2"
MERGE="$3"
FASTQ_DIR="$4"
#FASTQ_DIR="$(pwd)"/"$(basename $4)"

## FUNCTIONS

# Create (or truncate) "<type>_end_reads.json" and write its opening section:
# the list of argument names followed by the start of the "values" array.
# Arguments:
#   $1 - read type; the value 'paired' adds the "--fastq2" argument name.
# Outputs:
#   Overwrites "<type>_end_reads.json" in the current directory. The file is
#   left open-ended; records and the closing brackets are appended later.
write_initial_align_file() {
  local read_type=$1
  local json_file="${read_type}_end_reads.json"

  {
    printf '%s\n' \
      '{' \
      ' "arguments" : [' \
      ' "--fastq",'
    # Only paired-end runs carry a second fastq file.
    if [[ $read_type == 'paired' ]]; then
      printf '%s\n' ' "--fastq2",'
    fi
    printf '%s\n' \
      ' "--read-archive",' \
      ' "--sample-name",' \
      ' "--center-name",' \
      ' "--sequencing-technology",' \
      ' "--read-group-id",' \
      ' "--lane",' \
      ' "--platform"' \
      ' ],' \
      ' "values" : ['
  } > "$json_file"
}

# Append one record (one inner array of "values") to "<type>_end_reads.json".
# Globals:
#   FASTQ_DIR (read) - directory prefixed to both fastq file names.
# Arguments:
#   $1  - read type; 'paired' adds the second fastq and drops the last two
#         characters of the output stem. The type is also used in the json
#         file name and in the output name "<stem>.<type>.mkb".
#   $2  - sample
#   $3  - centre
#   $4  - sequencing technology
#   $5  - read group
#   $6  - library
#   $7  - lane
#   $8  - record id; 1 marks the first record, any other value makes the
#         record start with a separating comma.
#   $9  - first fastq file name
#   $10 - second fastq file name (only used for 'paired')
# Outputs:
#   Appends to the json file. The record deliberately ends without a trailing
#   newline so that either ",\n [" or the closing text can follow it.
#
# Values are written with printf '%s' and JSON-escaped (backslash and double
# quote), so data containing sequences such as "\n" or "\t" is no longer
# expanded into control characters.
#
# NOTE(review): the header written by write_initial_align_file ends with the
# argument names "--read-group-id", "--lane", "--platform", while the last
# three values written here are read group, library and lane. Confirm that
# this pairing is what the consumer expects.
write_data_to_align_file() {
  local filename="$1_end_reads.json"
  local sample=$2
  local centre=$3
  local sequencing_technology=$4
  local read_group=$5
  local library=$6
  local lane=$7
  local id=$8
  local fastq=$9
  local fastq2=${10}

  # Define the output filename: everything before the first '.' of the fastq
  # name, minus the two-character read suffix for paired reads.
  local output=${fastq%%.*}
  if [[ $1 == 'paired' ]]; then
    output=${output%?}
    output=${output%?}
  fi
  output="$output.$1.mkb"

  # Collect the record's values in output order.
  local -a fields=("$FASTQ_DIR/$fastq")
  if [[ $1 == 'paired' ]]; then
    fields+=("$FASTQ_DIR/$fastq2")
  fi
  fields+=("$output")

  # The descriptive values are wrapped in literal double quotes; those quotes
  # become \" once the field is JSON-escaped below.
  local value
  for value in "$sample" "$centre" "$sequencing_technology" \
    "$read_group" "$library" "$lane"; do
    fields+=("\"$value\"")
  done

  local field
  local separator=''
  {
    if [[ $id == 1 ]]; then
      printf ' [\n'
    else
      printf ',\n [\n'
    fi
    for field in "${fields[@]}"; do
      # JSON string escaping: backslashes first, then double quotes.
      field=${field//\\/\\\\}
      field=${field//\"/\\\"}
      printf '%s "%s"' "$separator" "$field"
      separator=$',\n'
    done
    printf '\n ]'
  } >> "$filename"
}

# Finish writing an align json file.
# Arguments:
#   $1 - name of the json file to close.
# Outputs:
#   Appends a newline (terminating the last record, which is written without
#   one), then the closing " ]" of the "values" array and the final "}".
write_end_align_file() {
  # 'local' keeps the name from leaking into the caller's global scope.
  local filename=$1
  printf '\n ]\n}\n' >> "$filename"
}

# Create (or truncate) "sample.merge.json" and write its opening section: the
# two argument names and the start of the "values" array. The file is left
# open-ended for the per-sample entries that are appended afterwards.
write_initial_merge_file() {
  local merge_json='sample.merge.json'

  printf '%s\n' \
    '{' \
    ' "arguments" : [' \
    ' "--bam-list",' \
    ' "--out"' \
    ' ],' \
    ' "values" : [' > "$merge_json"
}

# Append one sample's entry to "sample.merge.json".
# Arguments:
#   $1 - sample name; used for "<sample>.1000G.bam.list" (prefixed with the
#        current working directory) and "<sample>.merged.bam".
#   $2 - position of this entry.
#   $3 - total number of entries; when it matches $2 the entry is closed
#        without a trailing comma.
write_merge_file() {
  local sample_name=$1
  local position=$2
  local entry_count=$3
  local merge_json='sample.merge.json'

  # Every entry but the last is followed by a comma.
  local closing=' ],'
  if [[ $position == $entry_count ]]; then
    closing=' ]'
  fi

  printf '%s\n' \
    ' [' \
    " \"${PWD}/${sample_name}.1000G.bam.list\"," \
    " \"${sample_name}.merged.bam\"" \
    "$closing" >> "$merge_json"
}

# Close "sample.merge.json": append the end of the "values" array and the
# final closing brace.
write_end_merge_file() {
  local merge_json='sample.merge.json'

  printf '%s\n' ' ]' '}' >> "$merge_json"
}

# Check that the fastq file(s) exist.
# Globals:
#   FASTQ_DIR (read) - directory the file names are looked up in.
# Arguments:
#   $1 - first fastq file name (always checked).
#   $2 - second fastq file name (checked only when non-empty).
# On the first missing file: reports it on stderr, removes both align json
# files from the current directory and exits the script with status 1.
check_fastq() {
  local first_name=$1
  local second_name=$2
  local absent=''

  # Find the first file that is not a regular file, if any.
  if [[ ! -f "$FASTQ_DIR/$first_name" ]]; then
    absent="$FASTQ_DIR/$first_name"
  elif [[ -n $second_name && ! -f "$FASTQ_DIR/$second_name" ]]; then
    absent="$FASTQ_DIR/$second_name"
  fi

  if [[ -n $absent ]]; then
    echo "Missing fastq file: $absent" 1>&2
    rm -f ./single_end_reads.json ./paired_end_reads.json
    exit 1
  fi
}

## END OF FUNCTIONS


# Check that the requested analysis is for single-end, paired end reads or both.
case "$TYPE" in
  s | p | b) ;;
  *)
    echo -e "Usage: 1000G_script.sh <analysis type> <sequence index file> <merge samples>" 1>&2
    echo -e "\tanalysis type:" 1>&2
    echo -e "\t\t's' - single end reads only," 1>&2
    echo -e "\t\t'p' - paired end reads only," 1>&2
    echo -e "\t\t'b' - both single and paired end read." 1>&2
    exit 1
    ;;
esac

# Check that the index file exists.
if [[ ! -f $INDEX ]]
then
  echo -e "Usage: 1000G_script.sh <analysis type> <sequence index file> <merge samples>" 1>&2
  echo -e "\tIndex file does not exist." 1>&2
  exit 1
fi

# Check that merge is set to 'True' or 'False' (the comparison is case
# sensitive).
if [[ $MERGE != 'True' && $MERGE != 'False' ]]
then
  echo -e "Usage: 1000G_script.sh <analysis type> <sequence index file> <merge samples>" 1>&2
  echo -e "\tThe merge samples option must be set to 'True' or 'False'" 1>&2
  # printf keeps backslashes in the user-supplied value literal.
  printf '\tCurrent value is: %s\n' "$MERGE" 1>&2
  exit 1
fi

# Check that no files of the form *.1000G.bam.list exist. The per-sample lists
# are appended to while parsing, so leftovers would end up in the new lists.
# compgen -G succeeds only when the glob matches at least one path.
if [[ $MERGE == 'True' ]] && compgen -G '*.1000G.bam.list' > /dev/null
then
  # Diagnostics go to stderr, like every other error in this script.
  {
    echo "ERROR"
    echo
    echo "When executed, there can exist no files of the form *.1000G.bam.list"
    echo "in the current directory. Please move or remove those files from"
    echo "this directory before proceeding."
  } 1>&2
  exit 1
fi

# Set the directory for the fastq files: the optional fourth command line
# argument, or "fastq" under the current directory when it was not given.
FASTQ_DIR=${FASTQ_DIR:-${PWD}/fastq}

# Record counters. They start at 1 because the current value is passed to
# write_data_to_align_file as the id of the next record, and an id of 1 marks
# the first record in a file.
NO_SINGLE_READS=1
NO_PAIRED_READS=1
# Not referenced anywhere else in the visible script.
SINGLE_COUNT=0
PAIRED_COUNT=0
# Start with no processed samples.
unset SAMPLE_LIST

# Open json files for the single and paired end reads as necessary: 'p' needs
# only the paired file, 's' only the single file, anything else needs both.
case "$TYPE" in
  p)
    write_initial_align_file 'paired'
    ;;
  s)
    write_initial_align_file 'single'
    ;;
  *)
    write_initial_align_file 'single'
    write_initial_align_file 'paired'
    ;;
esac

# Parse the sequence index file, one tab separated record per line.
#
# 'IFS= read -r' keeps the line exactly as it is in the file: a plain 'read'
# strips leading tabs (shifting every field when the first one is empty) and
# interprets backslashes. The '|| [[ -n $line ]]' part also processes a final
# line that has no trailing newline.
while IFS= read -r line || [[ -n $line ]]
do
  # Field 1 holds the fastq path; its fourth '/'-separated component is used
  # as the file name.
  FASTQ=$(printf '%s\n' "$line" | cut -f 1 | cut -d '/' -f 4)
  SAMPLE=$(printf '%s\n' "$line" | cut -f 10)
  CENTRE=$(printf '%s\n' "$line" | cut -f 6)
  READ_GROUP=$(printf '%s\n' "$line" | cut -f 3)
  LIBRARY=$(printf '%s\n' "$line" | cut -f 15)
  LANE=$(printf '%s\n' "$line" | cut -f 14)
  TECH=$(printf '%s\n' "$line" | cut -f 13)

  # Field 20 holds the path of the mate fastq. An empty field means the
  # record is treated as single-end.
  LIBRARY_TYPE=$(printf '%s\n' "$line" | cut -f 20)
  FASTQ2=$(printf '%s\n' "$LIBRARY_TYPE" | cut -d '/' -f 4)
  if [[ $LIBRARY_TYPE == '' ]]
  then
    LIBRARY_TYPE='SINGLE'
  else
    LIBRARY_TYPE='PAIRED'
  fi

  # Handle the single end reads.
  if [[ $LIBRARY_TYPE == 'SINGLE' && $TYPE != 'p' ]]
  then
    write_data_to_align_file 'single' "$SAMPLE" "$CENTRE" "$TECH" "$READ_GROUP" "$LIBRARY" "$LANE" "$NO_SINGLE_READS" "$FASTQ"
    if [[ $MERGE == 'True' ]]
    then
      # Write the name of the aligned BAM file to the bam list. The stem is
      # the fastq name up to its first '.'. It is used unchanged, matching
      # the single-end name built by write_data_to_align_file: there is no
      # "_1"/"_2" read suffix to remove here.
      OUTPUT=${FASTQ%%.*}
      printf '%s\n' "${PWD}/$OUTPUT.single_sorted.bam" >> "$SAMPLE.1000G.bam.list"
    fi
    NO_SINGLE_READS=$((NO_SINGLE_READS + 1))

    # Check that the fastq file exists.
    check_fastq "$FASTQ"

    # Add the sample to the list of processed samples.
    SAMPLE_LIST+=("$SAMPLE")
  fi

  # Handle the paired end reads. Only handle paired end reads if the name
  # ends with 'xxx_1.fastq.gz' to avoid double inclusion.
  if [[ $LIBRARY_TYPE == 'PAIRED' && $TYPE != 's' ]]
  then
    WHICH_READ=${FASTQ%%.*}
    if [[ $WHICH_READ == *_1 ]]
    then
      write_data_to_align_file 'paired' "$SAMPLE" "$CENTRE" "$TECH" "$READ_GROUP" "$LIBRARY" "$LANE" "$NO_PAIRED_READS" "$FASTQ" "$FASTQ2"
      if [[ $MERGE == 'True' ]]
      then
        # Write the name of the aligned BAM file to the bam list, dropping
        # the "_1" read suffix from the stem.
        OUTPUT=${WHICH_READ%_1}
        printf '%s\n' "${PWD}/$OUTPUT.paired_sorted.bam" >> "$SAMPLE.1000G.bam.list"
      fi
      NO_PAIRED_READS=$((NO_PAIRED_READS + 1))

      # Check that the fastq files exist.
      check_fastq "$FASTQ" "$FASTQ2"

      # Add the sample to the list of processed samples.
      SAMPLE_LIST+=("$SAMPLE")
    fi
  fi

done < "$INDEX"

# Find all of the unique samples in the sample list, in sorted order. The
# names are collected in an array, so no scratch file is created in (or
# deleted from) the working directory, and the count does not depend on the
# column layout of 'wc' output, which is space padded on some platforms.
UNIQUE_SAMPLES=()
while IFS= read -r sample
do
  UNIQUE_SAMPLES+=("$sample")
done < <(
  for sample in "${SAMPLE_LIST[@]}"
  do
    printf '%s\n' "$sample"
  done | sort | uniq
)

SAMPLE_ID=1
NO_SAMPLES=${#UNIQUE_SAMPLES[@]}
if [[ $MERGE == "True" ]]
then
  write_initial_merge_file
  for sample in "${UNIQUE_SAMPLES[@]}"
  do
    # The id and the total let write_merge_file omit the comma after the
    # last entry.
    write_merge_file "$sample" "$SAMPLE_ID" "$NO_SAMPLES"
    SAMPLE_ID=$((SAMPLE_ID + 1))
  done
  write_end_merge_file
fi

# Finish writing the json files, or delete them if nothing was added to them.
case "$TYPE" in
  p)
    rm -f ./single_end_reads.json
    write_end_align_file 'paired_end_reads.json'
    ;;
  s)
    rm -f ./paired_end_reads.json
    write_end_align_file 'single_end_reads.json'
    ;;
  b)
    # The counters start at 1 and are incremented once per record written, so
    # a value of 1 means the file holds only its header. Such a file is
    # removed; any other file still needs its closing brackets.
    if (( NO_SINGLE_READS <= 1 ))
    then
      rm -f ./single_end_reads.json
    else
      write_end_align_file 'single_end_reads.json'
    fi
    if (( NO_PAIRED_READS <= 1 ))
    then
      rm -f ./paired_end_reads.json
    else
      write_end_align_file 'paired_end_reads.json'
    fi
    ;;
esac

0 comments on commit acf03d7

Please sign in to comment.