From d7d24583d26ee264e8f65607ab01665878637515 Mon Sep 17 00:00:00 2001
From: dc12-bcm <76194269+dc12-bcm@users.noreply.github.com>
Date: Thu, 17 Feb 2022 12:17:26 -0600
Subject: [PATCH] Add files via upload

---
 README.md           |   2 +-
 dataSlicerHelper.sh |  34 ++++++
 exRnaDataSlicer.rb  | 265 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 300 insertions(+), 1 deletion(-)
 create mode 100644 dataSlicerHelper.sh
 create mode 100644 exRnaDataSlicer.rb
diff --git a/README.md b/README.md
index 991acd7..7e14b8f 100644
--- a/README.md
+++ b/README.md
@@ -107,4 +107,4 @@ Merging intersections
 ## Improvement Ideas
 
 ---
-- All ideas have been implemented at this point.
+- All ideas have been implemented at this point.
\ No newline at end of file
diff --git a/dataSlicerHelper.sh b/dataSlicerHelper.sh
new file mode 100644
index 0000000..31ef374
--- /dev/null
+++ b/dataSlicerHelper.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/bash
+module load BEDTools/2.17
+
+# you can specify the path to bedtools in the next line and comment out line 2 if your system is not set up the same to load the module
+bedtools_cmd="bedtools"
+
+usage()
+{
+  echo "Usage: sh `basename $0` [-r <path to roi>] [-b <bedgraph location/path>] [-o <output file>] [-i (for inersection)] [-m (for merging intersections)] [-n <list of header names>]"
+}
+
+while getopts "r:b:o:imn:" params
+do
+  case $params in
+  r) rbp=$OPTARG;;
+  b) bedgraph=$OPTARG;;
+  o) output=$OPTARG;;
+  i) intersection="true";;
+  m) combine="true";;
+  n) names=$OPTARG;;
+  ?) usage
+    exit
+    ;;
+  esac
+done
+shift $((OPTIND-1))
+
+if [[ $combine == "true" ]]; then
+  echo "${bedtools_cmd} unionbedg -i $bedgraph -header -names $names > ${output}"
+  ${bedtools_cmd} unionbedg -i ${bedgraph} -header -names ${names} > ${output}
+elif [[ $intersection == "true" ]]; then
+  echo "${bedtools_cmd} map -a ${rbp} -b ${bedgraph} -c 4 | awk '\$4!~ /\./' | sed -e 's/\./0/g' > ${output}"
+  ${bedtools_cmd} map -a ${rbp} -b ${bedgraph} -c 4 | sed -e 's/\./0/g' > ${output}
+fi
\ No newline at end of file
diff --git a/exRnaDataSlicer.rb b/exRnaDataSlicer.rb
new file mode 100644
index 0000000..0787afd
--- /dev/null
+++ b/exRnaDataSlicer.rb
@@ -0,0 +1,265 @@
+#!/usr/bin/env ruby
+#
+# exRnaDataSlicer.rb This program will provide coverage from given region of interst Bed file and the the selected exRNA Atlas biosamples.
+#
+# Usage:  ruby exRnaDataSlicer.rb [options]
+#     -b, --bed bedFile                Path to the region of interest Bed file for intersection
+#     -s, --samples sampleFile         Path to the sample files, tab delimited format: each row with [analysis ID]\t[biosampleID]
+#     -o, --out outputPath             Designate output path (default at the current locaiton)
+#     -n, --filename outputName        The name of the output file (default as exRNA_data_slice_combined.bed)
+#     -m, --multirun                   Keep intermediate files to speed up the future run time
+#         --nocleanup                  Keep the tmp directory and do not remove anything
+#     -h, --help                       Display this screen
+#
+#
+# author: David Chen
+# email: dc12@bcm.edu
+
+require 'uri'
+require 'json'
+require 'optparse'
+
+def getAllJobsJson(outfile)
+  string = "https://genboree.org/REST/v1/grp/Extracellular%20RNA%20Atlas/kb/exRNA-atlas-v4/coll/Jobs/docs"
+  `curl --silent "#{string}" -o #{outfile}`
+end
+
+def curlAtlasApi(docType, docName, outfile)
+  string = "https://genboree.org/REST/v1/grp/Extracellular%20RNA%20Atlas/kb/exRNA-atlas-v4/coll/#{docType}/doc/#{docName}"
+  `curl --silent "#{string}" -o #{outfile}`
+end
+
+def curlFtpFile(ftpPath, outfile)
+  puts "curl --silent \"#{ftpPath}\" -o #{outfile}"
+  `curl --silent "#{ftpPath}" -o #{outfile}`
+end
+
+def getFromJson(data,query)
+  query.split('.').inject(data) { |memo,key|
+    key = key.to_i if memo.is_a?(Array)
+    memo.fetch(key)
+  }
+end
+
+def checkJsonStatus?(jsonFile,doc)
+  return TRUE if getFromJson(jsonFile, 'status.msg').match(/ok/i)
+  puts "something went wrong while requesting for Doc: #{doc}"
+  return FALSE
+end
+
+def readFileToJson(file)
+  puts "#{file} not found" unless File.exist?(file)
+  return JSON.load(File.read(file))
+end
+
+def foundRightJson?(json, analysis)
+  return TRUE if getFromJson(json, 'data.Job.properties.Related Analysis.value') == analysis
+  return FALSE
+end
+
+def foundRightRelatedBiosample?(json, biosample)
+  return TRUE if getFromJson(json, "Related Biosample.value") == biosample
+  return FALSE
+end
+
+def decompressBedgraphXZ(xzFile)
+  if File.exist?(xzFile)
+    `xz -d #{xzFile}`
+    return TRUE
+  else
+    puts ""
+    puts "Warning: The xz file: #{xzFile} does not exist!"
+    puts ""
+  end
+  return FALSE
+end
+
+def fileExists?(path,filetype)
+  return TRUE if File.exist?(path)
+  puts "#{filetype}: #{path} does not exist"
+  return FALSE
+end
+
+def readSampleFile(sampleFile)
+  output = {}
+  File.foreach(sampleFile) { |line|
+    next if line.match(/biosample|analysis/i)
+    tmp = line.chomp.split("\t")
+    output[tmp[1]] = tmp[0]
+  }
+  return output
+end
+
+def cleanup(workPath)
+  Dir["#{workPath}/*"].each { |entry|
+    if File.directory?(entry)
+      cleanup(entry)
+    else
+      File.delete("#{entry}")
+    end
+  }
+  Dir.delete("#{workPath}")
+end
+
+options = {}
+optparse = OptionParser.new { |opts|
+  opts.banner = "Usage:  ruby #{File.basename(__FILE__)} [options]\n"
+  opts.on('-b', '--bed bedFile', "Path to the region of interest Bed file for intersection") {|bed| options[:bed]=bed }
+  opts.on('-s', '--samples sampleFile', "Path to the sample files, tab delimited format: each row with [analysis ID]\\t[biosampleID]") { |samples| options[:samples]=samples}
+  opts.on('-o', '--out outputPath', "Designate output path (default at the current locaiton)") {|out| options[:out]=out}
+  opts.on('-n', '--filename outputName', "The name of the output file (default as exRNA_data_slice_combined.bed)") {|fname| options[:fname]=fname}
+  opts.on('-m', '--multirun', "Keep intermediate files to speed up the future run time") { options[:multirun]= true}
+  opts.on('--nocleanup', "Keep the tmp directory and do not remove anything") {options[:nocleanup] = true}
+  opts.on('-h', '--help', "Display this screen"){ puts optparse; exit }
+}
+optparse.parse!
+
+workingPath = Dir.pwd
+workingPath = options[:out] if options[:out]
+tmpPath = "#{workingPath}/tmp"
+bedgraphPath = "#{tmpPath}/bedgraphs"
+Dir.mkdir(tmpPath) unless File.directory?(tmpPath)
+Dir.mkdir(bedgraphPath) unless File.directory?(bedgraphPath)
+
+#check for the required parameters are passed in
+if options[:bed].nil?
+  puts "A region of interest bed file is required.  It can be passed in by using [-b|--bed] [path to bed]"
+  puts ""
+  puts optparse
+  exit(1)
+end
+if options[:samples].nil?
+  puts "A tsv for the samples is required.  It can be passed in by using [-s|--samples] [path to sample file]"
+  puts ""
+  puts optparse
+  exit(1)
+end
+
+sampleFile = options[:samples]
+roi = options[:bed]
+
+puts "Using roi bed file: #{roi}"
+puts "Using sample file: #{sampleFile}"
+puts "Using output directory: #{workingPath}"
+
+exit (1) unless fileExists?(roi,"ROI")  #make sure input file is there
+exit(1) unless fileExists?(sampleFile, "Samples") #make sure input file is there
+
+alljobsFile = "#{tmpPath}/alljobs.json"
+getAllJobsJson(alljobsFile) unless File.exist?(alljobsFile)
+allJobs = readFileToJson(alljobsFile)
+#make sure the name of the jobs could be found, otherwise exit
+exit(1) unless checkJsonStatus?(allJobs, "All Jobs")
+
+failed = []
+samples = readSampleFile(sampleFile)
+#locate and download the bedgraphs
+samples.each { |biosample, analysis|
+  bedgraphfile = "#{bedgraphPath}/#{biosample}_endogenousAlignments_genome_Aligned.bedgraph"
+  puts "Looking for #{biosample} bedgraph file: #{bedgraphfile}"
+  if File.exist?(bedgraphfile)
+    puts "#{bedgraphfile} exists already.. move on to the next"
+    next
+  end
+  puts "#{bedgraphfile} was not found.  Need to find the ftp path."
+  jobFound = FALSE
+  getFromJson(allJobs,"data").each { |item|
+    docName = getFromJson(item,"Job.value")
+    jobfile = "#{tmpPath}/#{docName}.metadata.tsv"
+    next if docName.match(/PCR/)
+    curlAtlasApi(:Jobs,docName,jobfile) unless File.exist?(jobfile)
+    docJson = readFileToJson(jobfile)
+    next unless checkJsonStatus?( docJson,docName ) #make sure the docName was retrieved correctly
+    next unless foundRightJson?( docJson,analysis )
+    puts "Found the correct FTPjob for the analysis: #{docName} and downloaded the doc file: #{jobfile}"
+    jobFound = TRUE
+    getFromJson( docJson,"data.Job.properties.Related Biosamples.items").each { |relatedBiosample|
+      next unless foundRightRelatedBiosample?(relatedBiosample,biosample)
+      relatedResultFiles =  getFromJson(relatedBiosample,"Related Biosample.properties.Related Result Files.value")
+      rfFile = "#{tmpPath}/#{relatedResultFiles}.metadata.tsv"
+      puts "Found result file doc.. download doc: #{rfFile}"
+      curlAtlasApi("Result Files",relatedResultFiles,rfFile) unless File.exist?(rfFile)
+      rfJson = readFileToJson(rfFile)
+      next unless checkJsonStatus?(rfJson, relatedResultFiles) #make sure the result files doc was retrieved correctly
+      getFromJson(rfJson,"data.Result Files.properties.Biosample ID.properties.Pipeline Result Files.items").each { |file|
+        next unless getFromJson(file,"File ID.properties.File Name.value") == "endogenousAlignments_genome_Aligned.bedgraph.xz"
+        bedgraphuri = URI(getFromJson(file,"File ID.properties.Genboree URL.value"))
+        bedgraphXzFile = "#{bedgraphPath}/#{biosample}_#{File.basename(bedgraphuri.to_s)}"
+        puts "downlaoding bedgraph.xz file: #{bedgraphXzFile}"
+        curlFtpFile(bedgraphuri.to_s, bedgraphXzFile)
+        if decompressBedgraphXZ(bedgraphXzFile) #decompressedBedgraphXZ will check if the file does exist prior to decompressing the file
+          puts "#{bedgraphXzFile} has been decompressed."
+        else
+          failed << "Failed to download bedgraph for #{biosample}"
+        end
+      }
+    }
+  }
+  unless jobFound
+    failed << failed << "Failed to download bedgraph for #{biosample}"
+    puts ""
+    puts "Warning: Cannot find the Job file for analysis: #{analysis}, biosample: #{biosample}."
+    puts "Please check to make sure if the IDs for the analysis and the biosamples are correct."
+    puts ""
+  end
+}
+
+# make sure the roi file is sorted
+sortedBedPath = "#{tmpPath}/sortedBed"
+Dir.mkdir(sortedBedPath) unless File.directory?(sortedBedPath)
+roiFname = File.basename(roi, File.extname(roi))
+sortedBedFile = "#{sortedBedPath}/#{roiFname}_sorted#{File.extname(roi)}"
+puts "Sorting the given roi bed file #{roi} and store it as #{sortedBedFile}"
+puts "sort -k1,1 -k2,2n #{roi} > #{sortedBedFile}"
+`sort -k1,1 -k2,2n #{roi} > #{sortedBedFile}`
+
+# create intersection of the roi with bedgraphs
+intersectPath = "#{tmpPath}/ind_intersection"
+Dir.mkdir(intersectPath) unless File.directory?(intersectPath)
+codePath = File.dirname(__FILE__)
+#gather all of the bedgraphs
+Dir["#{bedgraphPath}/*bedgraph"].each { |bedgraph|
+  puts "intersecting sorted roi: #{sortedBedFile} with bedgraph: #{bedgraph} "
+  fname = File.basename(bedgraph)
+  system("sh #{codePath}/dataSlicerHelper.sh -r #{sortedBedFile} -b #{bedgraph} -i -o #{intersectPath}/#{fname}_intersect.bed")
+  checkfailed = $?.exitstatus == 1
+  failed << "Intersecting bedgraph: #{bedgraph} with sortedBedFile: #{sortedBedFile}" if checkfailed
+}
+
+# gather the intersection paths and merge all together
+paths = []
+names = []
+Dir["#{intersectPath}/*bed"].each { |file|
+  paths << file
+  names << File.basename(file).split("_endogenousAlignments")[0]
+}
+outName = "exRNA_data_slice_combined.bed"
+outName = options[:fname] if options[:fname]
+
+puts "Merge intersections"
+system("sh #{codePath}/dataSlicerHelper.sh -n \"#{names.join(' ')}\" -b \"#{paths.join(' ')}\" -m -o #{workingPath}/#{outName}")
+checkfailed = $?.exitstatus == 1
+failed << "Merging intersections" if checkfailed
+
+unless options[:nocleanup]
+  puts "Clean up:"
+  if options[:multirun]
+    puts "Removing intermiediate files in #{tmpPath}/bedgraphs and #{tmpPath}/ind_intersection"
+    cleanup(bedgraphPath)
+    cleanup(sortedBedPath)
+    cleanup(intersectPath)
+  else
+    puts "Remove the all of the intermediate files"
+    cleanup(tmpPath)
+  end
+end
+
+# checks to see if any error was encounterred
+if failed.size > 0
+  puts ""
+  puts "Error(s):"
+  failed.each { |f| puts f}
+  puts ""
+else
+  puts "Finish."
+end