Skip to content

Commit

Permalink
added function to extract SA tag and return a list of supplementary a…
Browse files Browse the repository at this point in the history
…lignments (#685)

added a new function SAMUtils.getOtherCanonicalAlignments 
This function is used to extract the 'SA' tag of a SAMRecord as a List<SAMRecord>of supplementary alignments.
  • Loading branch information
lindenb authored and lbergelson committed Aug 25, 2016
1 parent 275cfe0 commit c3d5a88
Show file tree
Hide file tree
Showing 2 changed files with 200 additions and 1 deletion.
124 changes: 124 additions & 0 deletions src/main/java/htsjdk/samtools/SAMUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,18 @@
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;


/**
* Utilty methods.
*/
public final class SAMUtils {
/** regex for semicolon, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} */
private static final Pattern SEMICOLON_PAT = Pattern.compile("[;]");
/** regex for comma, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} */
private static final Pattern COMMA_PAT = Pattern.compile("[,]");

// Representation of bases, one for when in low-order nybble, one for when in high-order nybble.
private static final byte COMPRESSED_EQUAL_LOW = 0;
private static final byte COMPRESSED_A_LOW = 1;
Expand Down Expand Up @@ -1096,4 +1102,122 @@ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, fina
public static boolean isValidUnsignedIntegerAttribute(long value) {
return value >= 0 && value <= BinaryCodec.MAX_UINT;
}

/**
* Extract a List of 'other canonical alignments' from a SAM record. Those alignments are stored as a string in the 'SA' tag as defined
* in the SAM specification.
* The name, sequence and qualities, mate data are copied from the original record.
* @param record must be non null and must have a non-null associated header.
* @return a list of 'other canonical alignments' SAMRecords. The list is empty if the 'SA' attribute is missing.
*/
public static List<SAMRecord> getOtherCanonicalAlignments(final SAMRecord record) {
if( record == null ) throw new IllegalArgumentException("record is null");
if( record.getHeader() == null ) throw new IllegalArgumentException("record.getHeader() is null");
/* extract value of SA tag */
final Object saValue = record.getAttribute( SAMTagUtil.getSingleton().SA );
if( saValue == null ) return Collections.emptyList();
if( ! (saValue instanceof String) ) throw new SAMException(
"Expected a String for attribute 'SA' but got " + saValue.getClass() );

final SAMRecordFactory samReaderFactory = new DefaultSAMRecordFactory();

/* the spec says: "Other canonical alignments in a chimeric alignment, formatted as a
* semicolon-delimited list: (rname,pos,strand,CIGAR,mapQ,NM;)+.
* Each element in the list represents a part of the chimeric alignment.
* Conventionally, at a supplementary line, the 1rst element points to the primary line.
*/

/* break string using semicolon */
final String semiColonStrs[] = SEMICOLON_PAT.split((String)saValue);

/* the result list */
final List<SAMRecord> alignments = new ArrayList<>( semiColonStrs.length );

/* base SAM flag */
int record_flag = record.getFlags() ;
record_flag &= ~SAMFlag.PROPER_PAIR.flag;
record_flag &= ~SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag;
record_flag &= ~SAMFlag.READ_REVERSE_STRAND.flag;


for(int i=0; i< semiColonStrs.length;++i ) {
final String semiColonStr = semiColonStrs[i];
/* ignore empty string */
if( semiColonStr.isEmpty() ) continue;

/* break string using comma */
final String commaStrs[] = COMMA_PAT.split(semiColonStr);
if( commaStrs.length != 6 ) throw new SAMException("Bad 'SA' attribute in " + semiColonStr);

/* create the new record */
final SAMRecord otherRec = samReaderFactory.createSAMRecord( record.getHeader() );

/* copy fields from the original record */
otherRec.setReadName( record.getReadName() );
otherRec.setReadBases( record.getReadBases() );
otherRec.setBaseQualities( record.getBaseQualities() );
if( record.getReadPairedFlag() && !record.getMateUnmappedFlag()) {
otherRec.setMateReferenceIndex( record.getMateReferenceIndex() );
otherRec.setMateAlignmentStart( record.getMateAlignmentStart() );
}


/* get reference sequence */
final int tid = record.getHeader().getSequenceIndex( commaStrs[0] );
if( tid == -1 ) throw new SAMException("Unknown contig in " + semiColonStr);
otherRec.setReferenceIndex( tid );

/* fill POS */
final int alignStart;
try {
alignStart = Integer.parseInt(commaStrs[1]);
} catch( final NumberFormatException err ) {
throw new SAMException("bad POS in "+semiColonStr, err);
}

otherRec.setAlignmentStart( alignStart );

/* set TLEN */
if( record.getReadPairedFlag() &&
!record.getMateUnmappedFlag() &&
record.getMateReferenceIndex() == tid ) {
otherRec.setInferredInsertSize( record.getMateAlignmentStart() - alignStart );
}

/* set FLAG */
int other_flag = record_flag;
other_flag |= (commaStrs[2].equals("+") ? 0 : SAMFlag.READ_REVERSE_STRAND.flag) ;
/* spec: Conventionally, at a supplementary line, the 1st element points to the primary line */
if( !( record.getSupplementaryAlignmentFlag() && i==0 ) ) {
other_flag |= SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag;
}
otherRec.setFlags(other_flag);

/* set CIGAR */
otherRec.setCigar( TextCigarCodec.decode( commaStrs[3] ) );

/* set MAPQ */
try {
otherRec.setMappingQuality( Integer.parseInt(commaStrs[4]) );
} catch (final NumberFormatException err) {
throw new SAMException("bad MAPQ in "+semiColonStr, err);
}

/* fill NM */
try {
otherRec.setAttribute( SAMTagUtil.getSingleton().NM , Integer.parseInt(commaStrs[5]) );
} catch (final NumberFormatException err) {
throw new SAMException("bad NM in "+semiColonStr, err);
}

/* if strand is not the same: reverse-complement */
if( otherRec.getReadNegativeStrandFlag() != record.getReadNegativeStrandFlag() ) {
SAMRecordUtil.reverseComplement(otherRec);
}

/* add the alignment */
alignments.add( otherRec );
}
return alignments;
}
}
77 changes: 76 additions & 1 deletion src/test/java/htsjdk/samtools/SAMUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import org.testng.Assert;
import org.testng.annotations.Test;

import java.util.Arrays;
import java.util.List;

public class SAMUtilsTest {
@Test
Expand Down Expand Up @@ -173,4 +173,79 @@ public void testClippingOfRecordWithMateAtSamePosition() {
record.setSecondOfPairFlag(true);
Assert.assertEquals(SAMUtils.getNumOverlappingAlignedBasesToClip(record), 10);
}

@Test
public void testOtherCanonicalAlignments() {
// setup the record
final SAMFileHeader header = new SAMFileHeader();
header.addSequence(new SAMSequenceRecord("1", 1000));
header.addSequence(new SAMSequenceRecord("2", 1000));
final SAMRecord record = new SAMRecord(header);
record.setReadPairedFlag(true);
record.setFirstOfPairFlag(true);
record.setCigar(TextCigarCodec.decode("10M"));
record.setReferenceIndex(0);
record.setAlignmentStart(1);
record.setMateReferenceIndex(0);
record.setMateAlignmentStart(1);
record.setReadPairedFlag(true);
record.setSupplementaryAlignmentFlag(true);//spec says first 'SA' record will be the primary record

record.setMateReferenceIndex(0);
record.setMateAlignmentStart(100);
record.setInferredInsertSize(99);

record.setReadBases("AAAAAAAAAA".getBytes());
record.setBaseQualities("##########".getBytes());
// check no alignments if no SA tag */
Assert.assertEquals(SAMUtils.getOtherCanonicalAlignments(record).size(),0);


record.setAttribute(SAMTagUtil.getSingleton().SA,
"2,500,+,3S2=1X2=2S,60,1;" +
"1,191,-,8M2S,60,0;");

// extract suppl alignments
final List<SAMRecord> suppl = SAMUtils.getOtherCanonicalAlignments(record);
Assert.assertNotNull(suppl);
Assert.assertEquals(suppl.size(), 2);

for(final SAMRecord other: suppl) {
Assert.assertFalse(other.getReadUnmappedFlag());
Assert.assertTrue(other.getReadPairedFlag());
Assert.assertFalse(other.getMateUnmappedFlag());
Assert.assertEquals(other.getMateAlignmentStart(),record.getMateAlignmentStart());
Assert.assertEquals(other.getMateReferenceName(),record.getMateReferenceName());

Assert.assertEquals(other.getReadName(),record.getReadName());
if( other.getReadNegativeStrandFlag()==record.getReadNegativeStrandFlag()) {
Assert.assertEquals(other.getReadString(),record.getReadString());
Assert.assertEquals(other.getBaseQualityString(),record.getBaseQualityString());
}
}

SAMRecord other = suppl.get(0);
Assert.assertFalse(other.getSupplementaryAlignmentFlag());//1st of suppl and 'record' is supplementary
Assert.assertEquals(other.getReferenceName(),"2");
Assert.assertEquals(other.getAlignmentStart(),500);
Assert.assertFalse(other.getReadNegativeStrandFlag());
Assert.assertEquals(other.getMappingQuality(), 60);
Assert.assertEquals(other.getAttribute(SAMTagUtil.getSingleton().NM),1);
Assert.assertEquals(other.getCigarString(),"3S2=1X2=2S");
Assert.assertEquals(other.getInferredInsertSize(),0);


other = suppl.get(1);
Assert.assertTrue(other.getSupplementaryAlignmentFlag());
Assert.assertEquals(other.getReferenceName(),"1");
Assert.assertEquals(other.getAlignmentStart(),191);
Assert.assertTrue(other.getReadNegativeStrandFlag());
Assert.assertEquals(other.getMappingQuality(), 60);
Assert.assertEquals(other.getAttribute(SAMTagUtil.getSingleton().NM),0);
Assert.assertEquals(other.getCigarString(),"8M2S");
Assert.assertEquals(other.getInferredInsertSize(),-91);//100(mate) - 191(other)


}

}

0 comments on commit c3d5a88

Please sign in to comment.