Version 1 - April 2014

1. Added script to repo.
gkno · Apr 28, 2014 · acf03d7 · acf03d7
1 parent b3dc252
commit acf03d7
Show file tree

Hide file tree

Showing 3 changed files with 361 additions and 1 deletion.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2014 Alistair Ward
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
 parseSequenceIndex
 ==================
 
-Parse a 1000G sequence index and produce files for use with gkno
+Parse a 1000G sequence index and produce files for use with gkno.
diff --git a/sequence-index-1000G.sh b/sequence-index-1000G.sh
@@ -0,0 +1,341 @@
+#!/bin/bash
+
+TYPE=$1
+INDEX=$2
+MERGE=$3
+FASTQ_DIR=$4
+#FASTQ_DIR="$(pwd)"/"$(basename $4)"
+
+## FUNCTIONS
+
+# Start a fresh '<type>_end_reads.json' file: the opening brace, the complete
+# "arguments" list and the opening of the "values" list.
+#   $1 - read type; 'paired' adds the "--fastq2" argument.
+# An existing file of the same name is overwritten.
+write_initial_align_file()
+{
+  local read_type=$1
+  local json_file="${read_type}_end_reads.json"
+
+  # Everything inside the braces goes to the json file through one redirect.
+  {
+    printf '%s\n' '{'
+    printf '%s\n' '  "arguments" : ['
+    printf '%s\n' '    "--fastq",'
+    if [[ $read_type == 'paired' ]]
+    then
+      printf '%s\n' '    "--fastq2",'
+    fi
+    printf '%s\n' \
+      '    "--read-archive",' \
+      '    "--sample-name",' \
+      '    "--center-name",' \
+      '    "--sequencing-technology",' \
+      '    "--read-group-id",' \
+      '    "--lane",' \
+      '    "--platform"' \
+      '  ],' \
+      '  "values" : ['
+  } > "$json_file"
+}
+
+# Write the run specific information to the json file.
+write_data_to_align_file()
+{
+  local filename="$1_end_reads.json"
+  local sample=$2
+  local centre=$3
+  local sequencing_technology=$4
+  local read_group=$5
+  local library=$6
+  local lane=$7
+  local id=$8
+  local fastq=$9
+  local fastq2=${10}
+
+  # Define the output filename.
+  local output=`echo "$fastq" | cut -d '.' -f 1`
+  if [[ $1 == 'paired' ]]
+  then
+    output=${output%?}
+    output=${output%?}
+  fi
+  output="$output.$1.mkb"
+
+  if [[ $id == 1 ]]
+  then
+    echo -e "    [" >> $filename
+  else
+    echo -e ",\n    [" >> $filename
+  fi
+  echo -e "      \"$FASTQ_DIR/$fastq\"," >> $filename
+  if [[ $1 == 'paired' ]]
+  then
+    echo -e "      \"$FASTQ_DIR/$fastq2\"," >> $filename
+  fi
+  echo -e "      \"$output\"," >> $filename
+  echo -e "      \"\\\"$sample\\\"\"," >> $filename
+  echo -e "      \"\\\"$centre\\\"\"," >> $filename
+  echo -e "      \"\\\"$sequencing_technology\\\"\"," >> $filename
+  echo -e "      \"\\\"$read_group\\\"\"," >> $filename
+  echo -e "      \"\\\"$library\\\"\"," >> $filename
+  echo -e "      \"\\\"$lane\\\"\"" >> $filename
+  echo -e "    ]\c" >> $filename
+}
+
+# Finish writing the align json file.
+#   $1 - name of the json file to close
+# Appends the newline that ends the last record, then closes the "values"
+# list and the top-level object.
+write_end_align_file()
+{
+  # 'local' keeps the name from leaking into the script's global scope.
+  local filename=$1
+
+  {
+    echo
+    echo '  ]'
+    echo '}'
+  } >> "$filename"
+}
+
+# Write the initial information to the merge sample input json files.
+write_initial_merge_file()
+{
+  local filename="sample.merge.json"
+
+  echo '{' > $filename
+  echo '  "arguments" : [' >> $filename
+  echo '    "--bam-list",' >> $filename
+  echo '    "--out"' >> $filename
+  echo '  ],' >> $filename
+  echo '  "values" : [' >> $filename
+}
+
+# Write sample specific information to the merge file.
+#   $1 - sample name
+#   $2 - id of this sample
+#   $3 - total number of samples; the entry whose id equals this value is
+#        written without a trailing comma
+write_merge_file()
+{
+  local sample=$1
+  local id=$2
+  local total_number=$3
+  local filename="sample.merge.json"
+  local closing='    ],'
+
+  # The right-hand side is quoted so it is compared literally and never
+  # treated as a glob pattern.
+  if [[ $id == "$total_number" ]]
+  then
+    closing='    ]'
+  fi
+
+  {
+    echo "    ["
+    echo "      \"${PWD}/$sample.1000G.bam.list\","
+    echo "      \"$sample.merged.bam\""
+    echo "$closing"
+  } >> "$filename"
+}
+
+# Close the "values" list and the top-level object of 'sample.merge.json'.
+write_end_merge_file()
+{
+  local merge_json="sample.merge.json"
+
+  printf '%s\n' '  ]' '}' >> "$merge_json"
+}
+
+# Verify that the named fastq file(s) exist as regular files in FASTQ_DIR.
+#   $1 - first fastq file name
+#   $2 - second fastq file name; skipped when empty
+# On the first missing file an error is printed to stderr, both
+# '*_end_reads.json' files are removed and the whole script exits with 1.
+check_fastq()
+{
+  local first=$1
+  local second=$2
+  local -a candidates=("$FASTQ_DIR/$first")
+  local candidate
+
+  if [[ -n $second ]]
+  then
+    candidates+=("$FASTQ_DIR/$second")
+  fi
+
+  for candidate in "${candidates[@]}"
+  do
+    if [[ ! -f $candidate ]]
+    then
+      echo "Missing fastq file: $candidate" 1>&2
+      rm -f ./single_end_reads.json
+      rm -f ./paired_end_reads.json
+      exit 1
+    fi
+  done
+}
+
+## END OF FUNCTIONS
+
+
+# Check that the requested analysis is for single-end, paired end reads or both.
+if [[ $TYPE != "s" ]] && [[ $TYPE != "p" ]] && [[ $TYPE != "b" ]]
+then
+  echo -e "Usage: 1000G_script.sh <analysis type> <sequence index file> <merge samples>" 1>&2
+  echo -e "\tanalysis type:" 1>&2
+  echo -e "\t\t's' - single end reads only," 1>&2
+  echo -e "\t\t'p' - paired end reads only," 1>&2
+  echo -e "\t\t'b' - both single and paired end read." 1>&2
+  exit 1
+fi
+
+# Check that the index file exists.
+if [[ ! -f $INDEX ]]
+then
+  echo -e "Usage: 1000G_script.sh <analysis type> <sequence index file> <merge samples>" 1>&2
+  echo -e "\tIndex file does not exist." 1>&2
+  exit 1
+fi
+
+# Check if merge is set to 'true' or 'false'.
+if [[ $MERGE != 'True' ]] && [[ $MERGE != 'False' ]]
+then
+  echo -e "Usage: 1000G_script.sh <analysis type> <sequence index file> <merge samples>" 1>&2
+  echo -e "\tThe merge samples option must be set to 'True' or 'False'" 1>&2
+  echo -e "\tCurrent value is: $MERGE" 1>&2
+  exit 1
+fi
+
+# Check that no files of the form *.bam.list exist.
+if [[ $MERGE == 'True' ]]
+then
+  ls *.1000G.bam.list > /dev/null 2> /dev/null
+  if [[ $? == 0 ]]
+  then
+    echo "ERROR"
+    echo
+    echo "When executed, there can exist no files of the form *.1000G.bam.list"
+    echo "in the current directory.  Please move or remove those files from"
+    echo "this directory before proceeding."
+    exit 1
+  fi
+fi
+
+# Set the directory for the fastq files.
+if [[ $FASTQ_DIR == '' ]]
+then
+  FASTQ_DIR="${PWD}/fastq"
+fi
+
+# Define some variables.
+NO_SINGLE_READS=1
+NO_PAIRED_READS=1
+SINGLE_COUNT=0
+PAIRED_COUNT=0
+unset SAMPLE_LIST
+
+# Open json files for the single and paired end reads as necessary.
+if [[ $TYPE != 'p' ]]
+then
+  write_initial_align_file 'single'
+fi
+
+if [[ $TYPE != 's' ]]
+then
+  write_initial_align_file 'paired'
+fi
+
+# Parse the sequence index file, one tab separated record per line.  Fields
+# used (1-based): 1 fastq path, 3 read group, 6 centre, 10 sample,
+# 13 sequencing technology, 14 lane, 15 library, 20 path of the mate fastq.
+# 'IFS= read -r' keeps leading tabs and backslashes intact, and the
+# '|| [[ -n $line ]]' clause also handles a last line without a newline.
+while IFS= read -r line || [[ -n $line ]]
+do
+  FASTQ=$(printf '%s\n' "$line" | cut -f 1 | cut -d '/' -f 4)
+  MATE_FIELD=$(printf '%s\n' "$line" | cut -f 20)
+  FASTQ2=$(printf '%s\n' "$MATE_FIELD" | cut -d '/' -f 4)
+  SAMPLE=$(printf '%s\n' "$line" | cut -f 10)
+  CENTRE=$(printf '%s\n' "$line" | cut -f 6)
+  READ_GROUP=$(printf '%s\n' "$line" | cut -f 3)
+  LIBRARY=$(printf '%s\n' "$line" | cut -f 15)
+  LANE=$(printf '%s\n' "$line" | cut -f 14)
+  TECH=$(printf '%s\n' "$line" | cut -f 13)
+
+  # A record is treated as paired exactly when field 20 is not empty.
+  if [[ -z $MATE_FIELD ]]
+  then
+    LIBRARY_TYPE='SINGLE'
+  else
+    LIBRARY_TYPE='PAIRED'
+  fi
+
+  # Handle the single end reads.
+  if [[ $LIBRARY_TYPE == 'SINGLE' ]] && [[ $TYPE != 'p' ]]
+  then
+    # Check that the fastq file exists before anything is written for it.
+    check_fastq "$FASTQ"
+
+    write_data_to_align_file 'single' "$SAMPLE" "$CENTRE" "$TECH" "$READ_GROUP" "$LIBRARY" "$LANE" "$NO_SINGLE_READS" "$FASTQ"
+    if [[ $MERGE == 'True' ]]
+    then
+      # Write the name of the aligned BAM file to the bam list.  The stem is
+      # the fastq name up to its first '.', the same stem that
+      # write_data_to_align_file uses for the single end '.mkb' archive (no
+      # read suffix is stripped for single end files).
+      OUTPUT=${FASTQ%%.*}
+      printf '%s\n' "${PWD}/$OUTPUT.single_sorted.bam" >> "$SAMPLE.1000G.bam.list"
+    fi
+    NO_SINGLE_READS=$((NO_SINGLE_READS + 1))
+
+    # Add the sample to the list of processed samples.
+    SAMPLE_LIST+=("$SAMPLE")
+  fi
+
+  # Handle the paired end reads.  Only handle paired end reads if the name
+  # ends with 'xxx_1.fastq.gz' to avoid double inclusion.
+  if [[ $LIBRARY_TYPE == 'PAIRED' ]] && [[ $TYPE != 's' ]]
+  then
+    WHICH_READ=${FASTQ%%.*}
+    if [[ $WHICH_READ == *_1 ]]
+    then
+      # Check that both fastq files exist before anything is written for them.
+      check_fastq "$FASTQ" "$FASTQ2"
+
+      write_data_to_align_file 'paired' "$SAMPLE" "$CENTRE" "$TECH" "$READ_GROUP" "$LIBRARY" "$LANE" "$NO_PAIRED_READS" "$FASTQ" "$FASTQ2"
+      if [[ $MERGE == 'True' ]]
+      then
+        # Write the name of the aligned BAM file to the bam list.  The stem
+        # is the read name without its trailing '_1'.
+        OUTPUT=${WHICH_READ%_1}
+        printf '%s\n' "${PWD}/$OUTPUT.paired_sorted.bam" >> "$SAMPLE.1000G.bam.list"
+      fi
+      NO_PAIRED_READS=$((NO_PAIRED_READS + 1))
+
+      # Add the sample to the list of processed samples.
+      SAMPLE_LIST+=("$SAMPLE")
+    fi
+  fi
+
+done < "$INDEX"
+
+# Find all of the unique samples in the sample_list.
+for sample in ${SAMPLE_LIST[@]}
+do
+  echo $sample
+done | sort | uniq > temp_samples.txt
+
+SAMPLE_ID=1
+NO_SAMPLES=`wc -l temp_samples.txt | cut -d ' ' -f 1`
+if [[ $MERGE == "True" ]]
+then
+  write_initial_merge_file
+  while read sample
+  do
+    write_merge_file $sample $SAMPLE_ID $NO_SAMPLES
+    SAMPLE_ID=$(($SAMPLE_ID + 1))
+  done < temp_samples.txt
+  write_end_merge_file
+fi
+rm -f temp_samples.txt
+
+# Finish writing the json files, or delete if nothing was addded to them.
+if [[ $TYPE == 'p' ]]
+then
+  rm -f ./single_end_reads.json
+  write_end_align_file 'paired_end_reads.json'
+fi
+
+if [[ $TYPE == 's' ]]
+then
+  rm -f ./paired_end_reads.json
+  write_end_align_file 'single_end_reads.json'
+fi
+
+if [[ $TYPE == 'b' ]]
+then
+  if [[ $NO_SINGLE_READS == 0 ]]
+  then
+    rm -f ./single_end_reads.json
+  fi
+  if [[ $NO_PAIRED_READS == 0 ]]
+  then
+    rm -f ./paired_end_reads.json
+  fi
+fi