Skip to content

Commit

Permalink
feat: output the HN tag and optionally add the MD to the XA tag (#25)
Browse files Browse the repository at this point in the history
This requires us to patch bwa, which we store in the patches
sub-directory.  They are applied before we build, and reverted after
the build completes (regardless of success).  They can be removed
when the bwa PRs are merged and submodule updated.

See: lh3/bwa#438
See: lh3/bwa#439
  • Loading branch information
nh13 authored Jan 17, 2025
1 parent 4019e84 commit fa2872b
Show file tree
Hide file tree
Showing 7 changed files with 375 additions and 43 deletions.
115 changes: 75 additions & 40 deletions build.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,45 @@
import multiprocessing
import os
import platform
import subprocess
from contextlib import contextmanager
from pathlib import Path
from typing import List

from Cython.Build import cythonize
from Cython.Distutils.build_ext import new_build_ext as cython_build_ext
from setuptools import Extension, Distribution

@contextmanager
def changedir(path):
save_dir = os.getcwd()
os.chdir(path)
try:
yield
finally:
os.chdir(save_dir)

@contextmanager
def with_patches():
patches = sorted([
os.path.abspath(patch)
for patch in Path("patches").iterdir()
if patch.is_file() and patch.suffix == ".patch"
])
with changedir("bwa"):
for patch in patches:
retcode = subprocess.call(f"git apply {patch}", shell=True)
if retcode != 0:
raise RuntimeError(f"Failed to apply patch {patch}")
try:
yield
finally:
commands = ["git submodule deinit -f .", "git submodule update --init"]
for command in commands:
retcode = subprocess.call(command, shell=True)
if retcode != 0:
raise RuntimeError(f"Failed to reset submodules: {command}")

SOURCE_DIR = Path("pybwa")
BUILD_DIR = Path("cython_build")
compile_args = []
Expand Down Expand Up @@ -110,46 +143,48 @@ def cythonize_helper(extension_modules: List[Extension]) -> List[Extension]:


def build():
# Collect and cythonize all files
extension_modules = cythonize_helper([
libbwaindex_module,
libbwaaln_module,
libbwamem_module
])

# Use Setuptools to collect files
distribution = Distribution({
"name": "pybwa",
'version': '0.0.1',
'description': 'Python bindings for BWA',
'long_description': __doc__,
'long_description_content_type': 'text/x-rst',
'author': 'Nils Homer',
'author_email': '[email protected]',
'license': 'MIT',
'platforms': ['POSIX', 'UNIX', 'MacOS'],
'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f],
'url': 'https://github.com/fulcrumgenomics/pybwa',
'packages': ['pybwa', 'pybwa.include.bwa'],
'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'},
'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], },
"ext_modules": extension_modules,
"cmdclass": {
"build_ext": cython_build_ext,
},
})

# Grab the build_ext command and copy all files back to source dir.
# Done so Poetry grabs the files during the next step in its build.
build_ext_cmd = distribution.get_command_obj("build_ext")
build_ext_cmd.ensure_finalized()
# Set the value to 1 for "inplace", with the goal to build extensions
# in build directory, and then copy all files back to the source dir
# (under the hood, "copy_extensions_to_source" will be called after
# building the extensions). This is done so Poetry grabs the files
# during the next step in its build.
build_ext_cmd.inplace = 1
build_ext_cmd.run()
# apply patches to bwa, then revert them after
with with_patches():
# Collect and cythonize all files
extension_modules = cythonize_helper([
libbwaindex_module,
libbwaaln_module,
libbwamem_module
])

# Use Setuptools to collect files
distribution = Distribution({
"name": "pybwa",
'version': '0.0.1',
'description': 'Python bindings for BWA',
'long_description': __doc__,
'long_description_content_type': 'text/x-rst',
'author': 'Nils Homer',
'author_email': '[email protected]',
'license': 'MIT',
'platforms': ['POSIX', 'UNIX', 'MacOS'],
'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f],
'url': 'https://github.com/fulcrumgenomics/pybwa',
'packages': ['pybwa', 'pybwa.include.bwa'],
'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'},
'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], },
"ext_modules": extension_modules,
"cmdclass": {
"build_ext": cython_build_ext,
},
})

# Grab the build_ext command and copy all files back to source dir.
# Done so Poetry grabs the files during the next step in its build.
build_ext_cmd = distribution.get_command_obj("build_ext")
build_ext_cmd.ensure_finalized()
# Set the value to 1 for "inplace", with the goal to build extensions
# in build directory, and then copy all files back to the source dir
# (under the hood, "copy_extensions_to_source" will be called after
# building the extensions). This is done so Poetry grabs the files
# during the next step in its build.
build_ext_cmd.inplace = 1
build_ext_cmd.run()


if __name__ == "__main__":
Expand Down
8 changes: 8 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ As a result, the mapping quality may also differ slightly.
This is due to an implementation detail in which the order of random numbers used is different between this wrapper
and command-line.

Finally, the following additions have been made to :code:`bwa aln/samse`:

#. The standard SAM tag :code:`HN` is added. This is useful if we find too many hits
(:attr:`~pybwa.BwaAlnOptions.max_hits`) and therefore no hits are reported in the :code:`XA` tag, we can still
know how many were found.
#. The :py:attr:`~pybwa.BwaAlnOptions.with_md` option will add the standard SAM tag :code:`MD` to the :code:`XA` tag,
otherwise :code:`.` will be used. This provides additional information on the quality of alternative alignments.

===
API
===
Expand Down
76 changes: 76 additions & 0 deletions patches/0001-feat-add-HN-tag-to-bwa-aln.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
From 7d36ae830fc7391cfa69ffcdf239755f01317c1c Mon Sep 17 00:00:00 2001
From: Nils Homer <[email protected]>
Date: Thu, 16 Jan 2025 22:35:13 -0800
Subject: [PATCH 1/2] feat: add HN tag to bwa aln

---
bwase.c | 17 +++++++++--------
bwtaln.h | 1 +
2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/bwase.c b/bwase.c
index 18e8671..eb43c02 100644
--- a/bwase.c
+++ b/bwase.c
@@ -21,7 +21,7 @@ int g_log_n[256];

void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
{
- int i, cnt, best;
+ int i, k, cnt, best;
if (n_aln == 0) {
s->type = BWA_TYPE_NO_MATCH;
s->c1 = s->c2 = 0;
@@ -47,14 +47,14 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
}

+ for (k = s->n_occ = 0; k < n_aln; ++k) {
+ const bwt_aln1_t *q = aln + k;
+ s->n_occ += q->l - q->k + 1;
+ }
if (n_multi) {
- int k, rest, n_occ, z = 0;
- for (k = n_occ = 0; k < n_aln; ++k) {
- const bwt_aln1_t *q = aln + k;
- n_occ += q->l - q->k + 1;
- }
+ int rest, z = 0;
if (s->multi) free(s->multi);
- if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
+ if (s->n_occ > n_multi + 1) { // if there are too many hits, generate none of them
s->multi = 0; s->n_multi = 0;
return;
}
@@ -62,7 +62,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
* here. In principle, due to the requirement above, we can
* simply output all hits, but the following samples "rest"
* number of random hits. */
- rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
+ rest = s->n_occ > n_multi + 1? n_multi + 1 : s->n_occ; // find one additional for ->sa
s->multi = calloc(rest, sizeof(bwt_multi1_t));
for (k = 0; k < n_aln; ++k) {
const bwt_aln1_t *q = aln + k;
@@ -477,6 +477,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
}
}
}
+ err_printf("\tHN:i:%d", p->n_occ);
err_putchar('\n');
} else { // this read has no match
//ubyte_t *s = p->strand? p->rseq : p->seq;
diff --git a/bwtaln.h b/bwtaln.h
index 4616ff5..71ea627 100644
--- a/bwtaln.h
+++ b/bwtaln.h
@@ -76,6 +76,7 @@ typedef struct {
// multiple hits
int n_multi;
bwt_multi1_t *multi;
+ int n_occ; // total # of hits found, not just those reported in XA, output in HN
// alignment information
bwtint_t sa, pos;
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
--
2.39.5 (Apple Git-154)

Loading

0 comments on commit fa2872b

Please sign in to comment.