Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
dc12-bcm authored Feb 17, 2022
1 parent cb75555 commit d7d2458
Show file tree
Hide file tree
Showing 3 changed files with 300 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,4 @@ Merging intersections
## Improvement Ideas

---
- All ideas have been implemented at this point.
- All ideas have been implemented at this point.
34 changes: 34 additions & 0 deletions dataSlicerHelper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/bash
# Make bedtools available through the environment-modules `module` command.
module load BEDTools/2.17

# If your system does not provide the BEDTools module, comment out the `module load` line above
# and set bedtools_cmd below to the path of your bedtools executable.
bedtools_cmd="bedtools"

# Print the one-line command synopsis for this helper on stdout.
usage() {
    printf 'Usage: sh %s [-r <path to roi>] [-b <bedgraph location/path>] [-o <output file>] [-i (for inersection)] [-m (for merging intersections)] [-n <list of header names>]\n' "$(basename $0)"
}

# Parse the command-line flags:
#   -r <file>   region-of-interest BED file          -> rbp
#   -b <file>   bedgraph path(s)                      -> bedgraph
#   -o <file>   output file                           -> output
#   -i          run the intersection (bedtools map)   -> intersection="true"
#   -m          merge intersections (unionbedg)       -> combine="true"
#   -n <names>  header names for the merged output    -> names
while getopts "r:b:o:imn:" params
do
    case $params in
        r) rbp=$OPTARG ;;
        b) bedgraph=$OPTARG ;;
        o) output=$OPTARG ;;
        i) intersection="true" ;;
        m) combine="true" ;;
        n) names=$OPTARG ;;
        *)
            # getopts reports unknown flags and missing arguments as "?".
            # Exit non-zero so the calling script can detect the failure
            # (a bare `exit` would return the status of `usage`, i.e. 0).
            usage
            exit 1
            ;;
    esac
done
# Drop the parsed options so only positional arguments remain.
shift $((OPTIND-1))

# Run the requested bedtools operation; -m (merge) takes precedence over -i (intersection).
if [[ $combine == "true" ]]; then
    # $bedgraph and $names are deliberately left unquoted: each holds a
    # space-separated list that must split into separate arguments.
    echo "${bedtools_cmd} unionbedg -i $bedgraph -header -names $names > ${output}"
    ${bedtools_cmd} unionbedg -i ${bedgraph} -header -names ${names} > "${output}"
elif [[ $intersection == "true" ]]; then
    echo "${bedtools_cmd} map -a ${rbp} -b ${bedgraph} -c 4 | (set a '.' result column to 0) > ${output}"
    # bedtools map appends its result as the last column and prints "." when
    # nothing overlaps. Only that column is rewritten to 0; replacing every
    # "." on the line would corrupt decimal values, "." strand fields and
    # names that contain dots.
    ${bedtools_cmd} map -a "${rbp}" -b "${bedgraph}" -c 4 | awk 'BEGIN{FS=OFS="\t"} $NF=="."{$NF=0} 1' > "${output}"
else
    # Neither mode was requested: report it instead of silently doing nothing.
    echo "Nothing to do: specify -i (intersection) or -m (merge)." >&2
    usage
    exit 1
fi
265 changes: 265 additions & 0 deletions exRnaDataSlicer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
#!/usr/bin/env ruby
#
# exRnaDataSlicer.rb This program will provide coverage from the given region of interest Bed file and the selected exRNA Atlas biosamples.
#
# Usage: ruby exRnaDataSlicer.rb [options]
# -b, --bed bedFile Path to the region of interest Bed file for intersection
# -s, --samples sampleFile Path to the sample files, tab delimited format: each row with [analysis ID]\t[biosampleID]
# -o, --out outputPath Designate output path (default at the current location)
# -n, --filename outputName The name of the output file (default as exRNA_data_slice_combined.bed)
# -m, --multirun Keep intermediate files to speed up the future run time
# --nocleanup Keep the tmp directory and do not remove anything
# -h, --help Display this screen
#
#
# author: David Chen
# email: [email protected]

require 'fileutils'
require 'json'
require 'optparse'
require 'uri'

# Download the document listing of the exRNA Atlas "Jobs" collection.
#
# curl is started with an argument vector (no shell), so an outfile path that
# contains spaces or shell metacharacters is passed through intact.
#
# outfile - path the response body is written to (curl -o).
#
# Returns whatever curl printed on stdout as a String.
def getAllJobsJson(outfile)
  url = "https://genboree.org/REST/v1/grp/Extracellular%20RNA%20Atlas/kb/exRNA-atlas-v4/coll/Jobs/docs"
  IO.popen(['curl', '--silent', url, '-o', outfile], &:read)
end

# Download a single document from an exRNA Atlas collection.
#
# docType and docName are percent-encoded before being placed in the URL path,
# so a collection name such as "Result Files" yields a valid URL. curl is
# started with an argument vector (no shell), so outfile may contain spaces.
#
# docType - collection name (String or Symbol), e.g. :Jobs or "Result Files".
# docName - name of the document inside that collection.
# outfile - path the response body is written to (curl -o).
#
# Returns whatever curl printed on stdout as a String.
def curlAtlasApi(docType, docName, outfile)
  # encode_www_form_component turns a space into "+"; a URL path needs "%20".
  # A literal "+" is already encoded as "%2B", so the gsub only touches spaces.
  encode = lambda { |segment| URI.encode_www_form_component(segment.to_s).gsub('+', '%20') }
  url = "https://genboree.org/REST/v1/grp/Extracellular%20RNA%20Atlas/kb/exRNA-atlas-v4/coll/#{encode.call(docType)}/doc/#{encode.call(docName)}"
  IO.popen(['curl', '--silent', url, '-o', outfile], &:read)
end

# Download the file at ftpPath to outfile with curl, echoing the command first.
#
# curl is started with an argument vector (no shell), so neither the URL nor
# the output path is re-interpreted by a shell.
#
# ftpPath - URL of the file to download.
# outfile - path the downloaded file is written to (curl -o).
#
# Returns whatever curl printed on stdout as a String.
def curlFtpFile(ftpPath, outfile)
  puts "curl --silent \"#{ftpPath}\" -o #{outfile}"
  IO.popen(['curl', '--silent', ftpPath, '-o', outfile], &:read)
end

# Walk a parsed JSON structure along a dot-separated path.
#
# data  - parsed JSON (nested Hashes / Arrays).
# query - path such as "data.Job.properties"; a segment is converted to an
#         Integer index whenever the current node is an Array.
#
# Returns the value found at the end of the path. A missing key or index
# raises through #fetch (KeyError / IndexError).
def getFromJson(data,query)
  node = data
  query.split('.').each do |segment|
    lookup = node.is_a?(Array) ? segment.to_i : segment
    node = node.fetch(lookup)
  end
  node
end

# Check whether an API response reports success.
#
# jsonFile - parsed JSON response; its 'status.msg' value is tested against /ok/i.
# doc      - document name, used only in the failure message.
#
# Returns true when the status message matches; otherwise prints a message and
# returns false. Uses the true/false literals because the TRUE/FALSE constants
# were removed in Ruby 3.2.
def checkJsonStatus?(jsonFile,doc)
  return true if getFromJson(jsonFile, 'status.msg').match(/ok/i)
  puts "something went wrong while requesting for Doc: #{doc}"
  false
end

# Read a file and parse its contents as JSON.
#
# file - path of the JSON file.
#
# Prints "<file> not found" when the path does not exist; the read is still
# attempted afterwards, so File.read raises in that case.
# NOTE(review): JSON.load is applied to content downloaded from a remote
# server; consider JSON.parse if that content is not fully trusted.
#
# Returns the parsed JSON value.
def readFileToJson(file)
  if !File.exist?(file)
    puts "#{file} not found"
  end
  contents = File.read(file)
  return JSON.load(contents)
end

# Tell whether a Job document belongs to the given analysis.
#
# json     - parsed Job document.
# analysis - analysis ID compared with 'data.Job.properties.Related Analysis.value'.
#
# Returns true or false (literals; the TRUE/FALSE constants were removed in Ruby 3.2).
def foundRightJson?(json, analysis)
  getFromJson(json, 'data.Job.properties.Related Analysis.value') == analysis
end

# Tell whether a "Related Biosamples" item refers to the given biosample.
#
# json      - one item of a Job document's Related Biosamples list.
# biosample - biosample ID compared with 'Related Biosample.value'.
#
# Returns true or false (literals; the TRUE/FALSE constants were removed in Ruby 3.2).
def foundRightRelatedBiosample?(json, biosample)
  getFromJson(json, "Related Biosample.value") == biosample
end

# Decompress an .xz file in place with `xz -d`.
#
# xzFile - path of the compressed file.
#
# Returns true only when the file exists and xz exited successfully. Returns
# false (after printing a warning) when the file is missing, and false when xz
# fails or cannot be started, so a corrupt download is not reported as a
# success. xz is started with an argument vector (no shell), so the path may
# contain spaces.
def decompressBedgraphXZ(xzFile)
  unless File.exist?(xzFile)
    puts ""
    puts "Warning: The xz file: #{xzFile} does not exist!"
    puts ""
    return false
  end
  # system returns nil when xz cannot be executed; normalise to a boolean.
  system('xz', '-d', '--', xzFile) ? true : false
end

# Check that a path exists, printing a message when it does not.
#
# path     - path to test.
# filetype - label used as the prefix of the message (e.g. "ROI").
#
# Returns true or false (literals; the TRUE/FALSE constants were removed in Ruby 3.2).
def fileExists?(path,filetype)
  return true if File.exist?(path)
  puts "#{filetype}: #{path} does not exist"
  false
end

# Read the tab-delimited sample file into a Hash.
#
# Each data row is "<analysis ID>\t<biosample ID>". Lines that mention
# "biosample" or "analysis" (any case) are treated as header lines and skipped.
# Blank lines and rows without both columns are skipped too, so they no longer
# create a nil => nil entry. Surrounding whitespace is stripped from both IDs.
#
# sampleFile - path of the sample file.
#
# Returns a Hash mapping biosample ID => analysis ID.
def readSampleFile(sampleFile)
  output = {}
  File.foreach(sampleFile) do |line|
    next if line.match(/biosample|analysis/i)
    analysis, biosample = line.chomp.split("\t").map(&:strip)
    next if analysis.nil? || analysis.empty? || biosample.nil? || biosample.empty?
    output[biosample] = analysis
  end
  output
end

# Recursively delete workPath and everything below it.
#
# Relies on FileUtils.remove_entry, which also removes hidden (dot) files and
# does not descend into symlinked directories. A path that does not exist is
# ignored rather than raising.
#
# workPath - directory (or file) to remove.
def cleanup(workPath)
  return unless File.exist?(workPath) || File.symlink?(workPath)
  FileUtils.remove_entry(workPath)
end

# Command-line options collected by the parser below; keys are
# :bed, :samples, :out, :fname, :multirun and :nocleanup.
options = {}
optparse = OptionParser.new do |opts|
  opts.banner = "Usage: ruby #{File.basename(__FILE__)} [options]\n"
  opts.on('-b', '--bed bedFile', "Path to the region of interest Bed file for intersection") { |bed| options[:bed] = bed }
  opts.on('-s', '--samples sampleFile', "Path to the sample files, tab delimited format: each row with [analysis ID]\\t[biosampleID]") { |samples| options[:samples] = samples }
  opts.on('-o', '--out outputPath', "Designate output path (default at the current locaiton)") { |out| options[:out] = out }
  opts.on('-n', '--filename outputName', "The name of the output file (default as exRNA_data_slice_combined.bed)") { |fname| options[:fname] = fname }
  opts.on('-m', '--multirun', "Keep intermediate files to speed up the future run time") { options[:multirun] = true }
  opts.on('--nocleanup', "Keep the tmp directory and do not remove anything") { options[:nocleanup] = true }
  opts.on('-h', '--help', "Display this screen") { puts optparse; exit }
end
begin
  optparse.parse!
rescue OptionParser::ParseError => e
  # An unknown flag or a missing argument prints the problem and the usage
  # text, then exits non-zero, instead of ending in an uncaught exception.
  puts e.message
  puts ""
  puts optparse
  exit(1)
end

# check that the required parameters were passed in before anything is
# created on disk, so a bad invocation does not leave a tmp directory behind
if options[:bed].nil?
  puts "A region of interest bed file is required. It can be passed in by using [-b|--bed] [path to bed]"
  puts ""
  puts optparse
  exit(1)
end
if options[:samples].nil?
  puts "A tsv for the samples is required. It can be passed in by using [-s|--samples] [path to sample file]"
  puts ""
  puts optparse
  exit(1)
end

sampleFile = options[:samples]
roi = options[:bed]

# output goes to --out when given, otherwise to the current directory
workingPath = options[:out] || Dir.pwd
tmpPath = "#{workingPath}/tmp"
bedgraphPath = "#{tmpPath}/bedgraphs"

puts "Using roi bed file: #{roi}"
puts "Using sample file: #{sampleFile}"
puts "Using output directory: #{workingPath}"

exit(1) unless fileExists?(roi,"ROI") #make sure input file is there
exit(1) unless fileExists?(sampleFile, "Samples") #make sure input file is there

# mkdir_p creates workingPath, tmpPath and bedgraphPath in one call, including
# an output directory that does not exist yet
FileUtils.mkdir_p(bedgraphPath)

# The Jobs collection listing is stored under tmpPath and downloaded only
# when that file is not already present.
alljobsFile = "#{tmpPath}/alljobs.json"
if !File.exist?(alljobsFile)
  getAllJobsJson(alljobsFile)
end
allJobs = readFileToJson(alljobsFile)
# stop here when the listing does not report an ok status
if !checkJsonStatus?(allJobs, "All Jobs")
  exit(1)
end

# Messages describing every step that failed; reported at the end of the run.
failed = []
samples = readSampleFile(sampleFile)
# locate and download the bedgraphs
samples.each do |biosample, analysis|
  bedgraphfile = "#{bedgraphPath}/#{biosample}_endogenousAlignments_genome_Aligned.bedgraph"
  puts "Looking for #{biosample} bedgraph file: #{bedgraphfile}"
  if File.exist?(bedgraphfile)
    puts "#{bedgraphfile} exists already.. move on to the next"
    next
  end
  puts "#{bedgraphfile} was not found. Need to find the ftp path."
  jobFound = false
  # walk every Job document until the one related to this analysis is found
  getFromJson(allJobs,"data").each do |item|
    docName = getFromJson(item,"Job.value")
    jobfile = "#{tmpPath}/#{docName}.metadata.tsv"
    next if docName.match(/PCR/)
    # Job documents are kept under tmpPath and only downloaded when absent
    curlAtlasApi(:Jobs,docName,jobfile) unless File.exist?(jobfile)
    docJson = readFileToJson(jobfile)
    next unless checkJsonStatus?( docJson,docName ) #make sure the docName was retrieved correctly
    next unless foundRightJson?( docJson,analysis )
    puts "Found the correct FTPjob for the analysis: #{docName} and downloaded the doc file: #{jobfile}"
    jobFound = true
    getFromJson( docJson,"data.Job.properties.Related Biosamples.items").each do |relatedBiosample|
      next unless foundRightRelatedBiosample?(relatedBiosample,biosample)
      relatedResultFiles = getFromJson(relatedBiosample,"Related Biosample.properties.Related Result Files.value")
      rfFile = "#{tmpPath}/#{relatedResultFiles}.metadata.tsv"
      puts "Found result file doc.. download doc: #{rfFile}"
      curlAtlasApi("Result Files",relatedResultFiles,rfFile) unless File.exist?(rfFile)
      rfJson = readFileToJson(rfFile)
      next unless checkJsonStatus?(rfJson, relatedResultFiles) #make sure the result files doc was retrieved correctly
      getFromJson(rfJson,"data.Result Files.properties.Biosample ID.properties.Pipeline Result Files.items").each do |file|
        # only the genome-aligned bedgraph archive is of interest
        next unless getFromJson(file,"File ID.properties.File Name.value") == "endogenousAlignments_genome_Aligned.bedgraph.xz"
        bedgraphuri = URI(getFromJson(file,"File ID.properties.Genboree URL.value"))
        bedgraphXzFile = "#{bedgraphPath}/#{biosample}_#{File.basename(bedgraphuri.to_s)}"
        puts "downlaoding bedgraph.xz file: #{bedgraphXzFile}"
        curlFtpFile(bedgraphuri.to_s, bedgraphXzFile)
        if decompressBedgraphXZ(bedgraphXzFile) #decompressedBedgraphXZ will check if the file does exist prior to decompressing the file
          puts "#{bedgraphXzFile} has been decompressed."
        else
          failed << "Failed to download bedgraph for #{biosample}"
        end
      end
    end
  end
  unless jobFound
    # record just the message; the previous `failed << failed << ...` also
    # pushed the array into itself
    failed << "Failed to download bedgraph for #{biosample}"
    puts ""
    puts "Warning: Cannot find the Job file for analysis: #{analysis}, biosample: #{biosample}."
    puts "Please check to make sure if the IDs for the analysis and the biosamples are correct."
    puts ""
  end
end

# make sure the roi file is sorted (by column 1, then numerically by column 2)
sortedBedPath = "#{tmpPath}/sortedBed"
Dir.mkdir(sortedBedPath) unless File.directory?(sortedBedPath)
roiFname = File.basename(roi, File.extname(roi))
sortedBedFile = "#{sortedBedPath}/#{roiFname}_sorted#{File.extname(roi)}"
puts "Sorting the given roi bed file #{roi} and store it as #{sortedBedFile}"
puts "sort -k1,1 -k2,2n #{roi} > #{sortedBedFile}"
# sort is started with an argument vector (no shell) and writes through -o, so
# paths with spaces work; a failed sort is recorded instead of being ignored
sorted = system('sort', '-k1,1', '-k2,2n', '-o', sortedBedFile, '--', roi)
failed << "Sorting roi bed file: #{roi}" unless sorted

# create intersection of the roi with bedgraphs
intersectPath = "#{tmpPath}/ind_intersection"
Dir.mkdir(intersectPath) unless File.directory?(intersectPath)
codePath = File.dirname(__FILE__)
#gather all of the bedgraphs
Dir["#{bedgraphPath}/*bedgraph"].each do |bedgraph|
  puts "intersecting sorted roi: #{sortedBedFile} with bedgraph: #{bedgraph} "
  fname = File.basename(bedgraph)
  # The helper is started with an argument vector (no shell re-parsing).
  # system returns true only for exit status 0, so any non-zero status or a
  # helper that cannot be started counts as a failure, not just status 1.
  succeeded = system('sh', "#{codePath}/dataSlicerHelper.sh",
                     '-r', sortedBedFile, '-b', bedgraph, '-i',
                     '-o', "#{intersectPath}/#{fname}_intersect.bed")
  failed << "Intersecting bedgraph: #{bedgraph} with sortedBedFile: #{sortedBedFile}" unless succeeded
end

# gather the intersection paths and merge all together
# (sorted so the column order of the merged file is the same on every run)
paths = Dir["#{intersectPath}/*bed"].sort
# the header name of each column is the file name up to "_endogenousAlignments"
names = paths.map { |file| File.basename(file).split("_endogenousAlignments")[0] }
outName = options[:fname] || "exRNA_data_slice_combined.bed"

puts "Merge intersections"
if paths.empty?
  # nothing was intersected, so there is nothing to hand to the helper
  failed << "Merging intersections (no intersection files were found)"
else
  # The helper receives the names and paths as single space-separated
  # arguments. system returns true only for exit status 0, so any other
  # outcome is recorded as a failure.
  merged = system('sh', "#{codePath}/dataSlicerHelper.sh",
                  '-n', names.join(' '), '-b', paths.join(' '), '-m',
                  '-o', "#{workingPath}/#{outName}")
  failed << "Merging intersections" unless merged
end

# Remove intermediate files unless --nocleanup was given. With --multirun only
# the bedgraph, sorted-bed and intersection directories are removed; otherwise
# the whole tmp directory is removed.
# NOTE(review): --multirun is described as keeping intermediate files, yet the
# bedgraph directory is removed here - confirm this is intended.
if !options[:nocleanup]
  puts "Clean up:"
  if options[:multirun]
    puts "Removing intermiediate files in #{tmpPath}/bedgraphs and #{tmpPath}/ind_intersection"
    [bedgraphPath, sortedBedPath, intersectPath].each do |intermediateDir|
      cleanup(intermediateDir)
    end
  else
    puts "Remove the all of the intermediate files"
    cleanup(tmpPath)
  end
end

# Final report: list every recorded failure, or confirm a clean finish.
if failed.empty?
  puts "Finish."
else
  puts ""
  puts "Error(s):"
  failed.each do |message|
    puts message
  end
  puts ""
end

0 comments on commit d7d2458

Please sign in to comment.