Skip to content

Commit

Permalink
Bam index optimizations (#1572)
Browse files Browse the repository at this point in the history
* Bam and tabix indexed file refactorings.   Introduce cache for bgzip blocks.

Partially addresses issue #1561 (igv reloads already loaded blocks)
  • Loading branch information
jrobinso authored Jan 3, 2023
1 parent 51b94bb commit 2dde211
Show file tree
Hide file tree
Showing 20 changed files with 669 additions and 199 deletions.
50 changes: 50 additions & 0 deletions dev/alignment/bam.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>igv.js</title>
</head>

<body>

<div id="igvDiv" style="padding-top: 50px;padding-bottom: 20px; height: auto"></div>

<script type="module">

import igv from "../../js/index.js";

var options =
{
genome: "hg19",
locus: [
"chr1:155,153,822-155,155,105"
],
tracks: [
{
type: "alignment",
url: "https://1000genomes.s3.amazonaws.com/phase3/data/NA19625/exome_alignment/NA19625.mapped.ILLUMINA.bwa.ASW.exome.20120522.bam",
indexURL: "https://1000genomes.s3.amazonaws.com/phase3/data/NA19625/exome_alignment/NA19625.mapped.ILLUMINA.bwa.ASW.exome.20120522.bam.bai",
name: "NA12878 - cache on",
},
{
type: "alignment",
url: "https://1000genomes.s3.amazonaws.com/phase3/data/NA19625/exome_alignment/NA19625.mapped.ILLUMINA.bwa.ASW.exome.20120522.bam",
indexURL: "https://1000genomes.s3.amazonaws.com/phase3/data/NA19625/exome_alignment/NA19625.mapped.ILLUMINA.bwa.ASW.exome.20120522.bam.bai",
cacheBlocks: false,
name: "NA12878 - cache off",
}
]
};

var igvDiv = document.getElementById("igvDiv");

igv.createBrowser(igvDiv, options)
.then(function (browser) {
console.log("Created IGV browser");
})


</script>

</body>

</html>
11 changes: 1 addition & 10 deletions dev/alignment/small_sequence.html
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,7 @@

<body>

<h1>BAM file with initial sort</h1>

In this example a BAM file is loaded with an initial sort position set. Also demonstrated is the use of
<i>findTracks</i>and <i>sort</i> functions to programmatically trigger a sort at specific positions.

<div style="padding-top: 20px;">
<button id="gotoButton">Goto chr1:155,155,358</button>
<button id="button1">Sort position chr1:155,155,358</button>
<button id="button2">Sort position chr1:155,155,361</button>
</div>
<h1>Test BAM alignments on small sequence << bam index interval size</h1>

<div id="igvDiv" style="padding-top: 50px;padding-bottom: 20px; height: auto"></div>

Expand Down
6 changes: 6 additions & 0 deletions dev/misc/sequence_track.html
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@
fastaURL: "https://storage.googleapis.com/genomics-public-data/references/GRCh38_Verily/GRCh38_Verily_v1.genome.fa",
indexURL: "https://storage.googleapis.com/genomics-public-data/references/GRCh38_Verily/GRCh38_Verily_v1.genome.fa.fai"
},
{
type: "sequence",
name: "genepattern",
fastaURL: "https://igv.genepattern.org/genomes/seq/hg38/hg38.fa",
indexURL: "https://igv.genepattern.org/genomes/seq/hg38/hg38.fa.fai"
},
// Broad references not configured for CORS
// {
// type: "sequence",
Expand Down
12 changes: 12 additions & 0 deletions dev/misc/sequence_track_hg19.html
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@
fastaURL: "https://storage.googleapis.com/genomics-public-data/references/Homo_sapiens_assembly19_1000genomes_decoy/Homo_sapiens_assembly19_1000genomes_decoy.fasta",
indexURL: "https://storage.googleapis.com/genomics-public-data/references/Homo_sapiens_assembly19_1000genomes_decoy/Homo_sapiens_assembly19_1000genomes_decoy.fasta.fai"
},
{
type: "sequence",
name: "genepattern - cf",
fastaURL: "https://igv.genepattern.org/genomes/seq/hg19/hg19.fasta",
indexURL: "https://igv.genepattern.org/genomes/seq/hg19/hg19.fasta.fai"
},
{
type: "sequence",
name: "genepattern",
fastaURL: "https://s3.amazonaws.com/igv-genepattern-org/genomes/seq/hg19/hg19.fasta",
indexURL: "https://s3.amazonaws.com/igv-genepattern-org/genomes/seq/hg19/hg19.fasta.fai"
}
// BroadReferences not configured for CORS
// {
// type: "sequence",
Expand Down
87 changes: 20 additions & 67 deletions js/bam/bamIndex.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
// Represents a BAM index.
// Code is based heavily on bam.js, part of the Dalliance Genome Explorer, (c) Thomas Down 2006-2001.
// Represents a BAM or Tabix index.

import BinaryParser from "../binary.js"
import {optimizeChunks} from "./indexUtils.js"

const BAI_MAGIC = 21578050
const TABIX_MAGIC = 21578324
const MB = 1000000


async function parseBamIndex(arrayBuffer, genome) {
const index = new BamIndex()
Expand Down Expand Up @@ -129,14 +129,14 @@ class BamIndex {
}

/**
* Fetch blocks for a particular genomic range. This method is public so it can be unit-tested.
* Fetch chunks for a particular genomic range. This method is public so it can be unit-tested.
*
* @param refId the sequence dictionary index of the chromosome
* @param min genomic start position
* @param max genomic end position
* @param return an array of {minv: {filePointer, offset}, {maxv: {filePointer, offset}}
* @param return an array of objects representing chunks (file spans) {minv: {block, offset}, {maxv: {block, offset}}
*/
blocksForRange(refId, min, max) {
chunksForRange(refId, min, max) {

const bam = this
const ba = bam.indices[refId]
Expand All @@ -145,34 +145,38 @@ class BamIndex {
return []
} else {
const overlappingBins = reg2bins(min, max) // List of bin #s that overlap min, max
const chunks = []

//console.log("bin ranges")
//for(let b of overlappingBins) {
// console.log(`${b[0]} - ${b[1]}`)
//}

const chunks = []
// Find chunks in overlapping bins. Leaf bins (< 4681) are not pruned
for (let binRange of overlappingBins) {
for (let bin = binRange[0]; bin <= binRange[1]; bin++) {
if (ba.binIndex[bin]) {
const binChunks = ba.binIndex[bin];
const binChunks = ba.binIndex[bin]
for (let c of binChunks) {
const cs = c[0]
const ce = c[1]
chunks.push({minv: cs, maxv: ce, bin: bin})
chunks.push({minv: cs, maxv: ce})
}
}
}
}

// Use the linear index to find minimum file position of chunks that could contain alignments in the region
const nintv = ba.linearIndex.length
let lowest = null
const minLin = Math.min(min >> 14, nintv - 1)

let lowest
const minLin = Math.min(min >> 14, nintv - 1) // i.e. min / 16384
const maxLin = Math.min(max >> 14, nintv - 1)
for (let i = minLin; i < maxLin; i++) {
for (let i = minLin; i <= maxLin; i++) {
const vp = ba.linearIndex[i]
if (vp) {
// todo -- I think, but am not sure, that the values in the linear index have to be in increasing order. So the first non-null should be minimum
if (!lowest || vp.isLessThan(lowest)) {
lowest = vp
}
lowest = vp // lowest file offset that contains alignments overlapping (min, max)
break
}
}

Expand All @@ -181,58 +185,7 @@ class BamIndex {
}
}

function optimizeChunks(chunks, lowest) {

const mergedChunks = []
let lastChunk = null

if (chunks.length === 0) return chunks

chunks.sort(function (c0, c1) {
const dif = c0.minv.block - c1.minv.block
if (dif !== 0) {
return dif
} else {
return c0.minv.offset - c1.minv.offset
}
})

chunks.forEach(function (chunk) {

if (!lowest || chunk.maxv.isGreaterThan(lowest)) {
if (lastChunk === null) {
mergedChunks.push(chunk)
lastChunk = chunk
} else {
if (canMerge(lastChunk, chunk)) {
if (chunk.maxv.isGreaterThan(lastChunk.maxv)) {
lastChunk.maxv = chunk.maxv
}
} else {
mergedChunks.push(chunk)
lastChunk = chunk
}
}
} else {
//console.log(`skipping chunk ${chunk.minv.block} - ${chunk.maxv.block}`)
}
})

return mergedChunks
}


/**
* Merge 2 blocks if the gap between them is < 1kb and the total resulting size < 100mb
* @param chunk1
* @param chunk2
* @returns {boolean|boolean}
*/
function canMerge(chunk1, chunk2) {
const gap = chunk2.minv.block - chunk1.maxv.block
const total = chunk2.maxv.block - chunk1.minv.block
return gap < 30000 && total < 10 * MB
}

/**
* Calculate the list of bins that overlap with region [beg, end]
Expand Down
26 changes: 5 additions & 21 deletions js/bam/bamReader.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import AlignmentContainer from "./alignmentContainer.js"
import BamUtils from "./bamUtils.js"
import {BGZip, igvxhr} from "../../node_modules/igv-utils/src/index.js"
import {buildOptions} from "../util/igvUtils.js"
import BGZBlockLoader from "./bgzBlockLoader.js"

/**
* Class for reading a bam file
Expand All @@ -44,6 +45,8 @@ class BamReader {
this.bamPath = config.url
this.baiPath = config.indexURL
BamUtils.setReaderDefaults(this, config)

this._blockLoader = new BGZBlockLoader(config)
}

async readAlignments(chr, bpStart, bpEnd) {
Expand All @@ -59,37 +62,18 @@ class BamReader {
} else {

const bamIndex = await this.getIndex()
const chunks = bamIndex.blocksForRange(chrId, bpStart, bpEnd)
const chunks = bamIndex.chunksForRange(chrId, bpStart, bpEnd)

if (!chunks || chunks.length === 0) {
return alignmentContainer
}

let counter = 1
for (let c of chunks) {

let lastBlockSize
if (c.maxv.offset === 0) {
lastBlockSize = 0 // Don't need to read the last block.
} else {
const bsizeOptions = buildOptions(this.config, {range: {start: c.maxv.block, size: 26}})
const abuffer = await igvxhr.loadArrayBuffer(this.bamPath, bsizeOptions)
lastBlockSize = BGZip.bgzBlockSize(abuffer)
}
const fetchMin = c.minv.block
const fetchMax = c.maxv.block + lastBlockSize
const range = {start: fetchMin, size: fetchMax - fetchMin + 1}

const compressed = await igvxhr.loadArrayBuffer(this.bamPath, buildOptions(this.config, {range: range}))

var ba = BGZip.unbgzf(compressed) //new Uint8Array(BGZip.unbgzf(compressed)); //, c.maxv.block - c.minv.block + 1));
const ba = await this._blockLoader.getData(c.minv.block, c.maxv.block)
const done = BamUtils.decodeBamRecords(ba, c.minv.offset, alignmentContainer, this.indexToChr, chrId, bpStart, bpEnd, this.filter)

if (done) {
// console.log(`Loaded ${counter} chunks out of ${chunks.length}`);
break
}
counter++
}
alignmentContainer.finish()
return alignmentContainer
Expand Down
4 changes: 3 additions & 1 deletion js/bam/bamTrack.js
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,9 @@ class BAMTrack extends TrackBase {
hoverText(clickState) {
if (true === this.showCoverage && clickState.y >= this.coverageTrack.top && clickState.y < this.coverageTrack.height) {
const clickedObject = this.coverageTrack.getClickedObject(clickState)
return clickedObject.hoverText()
if(clickedObject) {
return clickedObject.hoverText()
}
}

}
Expand Down
Loading

0 comments on commit 2dde211

Please sign in to comment.