Skip to content

Commit

Permalink
*v0.3.2 (Sep 24, 2024)*:
Browse files Browse the repository at this point in the history
1. add "INFO/DNP" score for all child's "0/1" output;
2. add "INFO/HDN" tag for "high quality de novo variant", default configured as DNP > 0.8;
3. add reference call in gVCF output;
4. add "INFO/RPL" tag for saving reference PL score for gvcf output.
  • Loading branch information
sujunhao committed Sep 24, 2024
1 parent 80127b0 commit e5a8c47
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 19 deletions.
13 changes: 11 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,17 @@ Clair3-Nova is the 2nd generation of [Clair3-Trio](https://github.com/HKU-BAL/Cl
----

## Latest Updates
*v0.3.1 (July 9, 2024)*: fix bug in multiple alternative sites
*v0.3 (June 23, 2024)*: add r10.4.1 hac model and add base_err feature
*v0.3.2 (Sep 24, 2024)*:

1. add "INFO/DNP" score for all child's "0/1" output;
2. add "INFO/HDN" tag for "high quality de novo variant", default configured as DNP > 0.8;
3. add reference call in gVCF output;
4. add "INFO/RPL" tag for saving reference PL score for gvcf output.

*v0.3.1 (July 9, 2024)*: fix bug in multiple alternative sites.

*v0.3 (June 23, 2024)*: add r10.4.1 hac model and add base_err feature.

1. add r10 HAC model trained at HG002 trio
2. add `--base_err` [flag](https://github.com/HKU-BAL/Clair3/issues/220) for reducing "./." in gvcf output
3. add `--keep_iupac_bases` [flag](https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry) to showing iupac char.
Expand Down
28 changes: 23 additions & 5 deletions clair3/CallVariants.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def output_header():
##FILTER=<ID=RefCall,Description="Reference call">
##INFO=<ID=P,Number=0,Type=Flag,Description="Result from pileup calling">
##INFO=<ID=F,Number=0,Type=Flag,Description="Result from full-alignment calling">
##INFO=<ID=RPL,Number=.,Type=String,Description="For reference call's Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ<20 or selected by 'samtools view -F 2316' are filtered)">
Expand Down Expand Up @@ -1331,9 +1332,14 @@ def decode_alt_info(alt_info_dict):
','.join(["%.4f" % (min(1.0, 1.0 * item / read_depth)) for item in alt_list_count])

if output_config.gvcf:
PLs = compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base,
alternate_base)
PLs = compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base, alternate_base)
PLs = ','.join([str(x) for x in PLs])
if alternate_base == ".":
RPLs = compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base,
alternate_base, is_output_ref_pl = True)
RPLs = [str(x) for x in RPLs]
information_string += ";RPL=%s" % (",".join(RPLs))


output_utilities.output("%s\t%d\t.\t%s\t%s\t%.2f\t%s\t%s\tGT:GQ:DP:AD:AF:PL\t%s:%d:%d:%s:%s:%s" % (
chromosome,
Expand Down Expand Up @@ -1368,7 +1374,7 @@ def decode_alt_info(alt_info_dict):



def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base, alternate_base):
def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base, alternate_base, is_output_ref_pl=False):
'''
PL computation
for bi-allelic: AA(00), AB(01), BB(11)
Expand All @@ -1393,7 +1399,10 @@ def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, refe
except:
#skip N positions
if alternate_base == ".":
return [990]
if is_output_ref_pl:
break
else:
return [990]
else:
return [990] * len(genotypes[alt_num])
genotype_prob_21 = gt21_probabilities[gt21_prob_index]
Expand All @@ -1406,7 +1415,16 @@ def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, refe
_p = genotype_prob_21 * genotype_prob_zygosity
# _p = genotype_prob_21
likelihoods.append(_p)
pass

# for reference call, compute pl as [p_ref, p_alt, p_alt^2]
if is_output_ref_pl and alternate_base == ".":
#likelihoods.append(1-likelihoods[-1])
try:
_rp = math.sqrt(1 - likelihoods[-1] + 0.25) - 0.5
likelihoods += [_rp, _rp**2]
except:
return [990] * 3


# genotype likelihood normalization
# p/sum(p)
Expand Down
31 changes: 27 additions & 4 deletions preprocess/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def readCalls(self, callPath, callType='variant', ctgName=None, ctgStart=None, c
cur_variant_start = int(line.strip('\n').split('\t')[1])
cur_variant_end = cur_variant_start - 1 + len(ref)
is_reference_call = (alt == '.') or (ref == alt)
#import pdb; pdb.set_trace()
if not is_reference_call:
# assuming AD is at the columns [-3], add 0 to AD for gVCF
ori_info = tmp[-1].split(':')
Expand All @@ -125,14 +126,36 @@ def readCalls(self, callPath, callType='variant', ctgName=None, ctgStart=None, c
# add <NON_REF> to variant calls
tmp[4] = tmp[4] + ',<NON_REF>'
if (n_alt == 1):

tmp[-1] = tmp[-1] + ',990,990,990'

elif (n_alt == 2):
tmp[-1] = tmp[-1] + ',990,990,990,990'
else:
# skip reference calls
continue
# reference calls
# original order GT:GQ:DP:AD:AF:PL
# assuming AD is at the columns [-3]
ori_info = tmp[-1].split(':')

# update DP to MIN_DP
t_n = tmp[-2].split(":")
t_n[2] = "MIN_DP"
tmp[-2] = ":".join(t_n)

# set AD field
_dp = int(ori_info[2])
_ad = int(ori_info[3])
new_ad = "%s,%s" % (_ad, _dp-_ad)
ori_info[3] = new_ad

# get pl and update INFO tage
_rpl = tmp[7].split(';')[1].split("=")[1]
ori_info[-1] = _rpl
tmp[7] = tmp[7].split(';')[0] + ";END=%s" % (tmp[1])
tmp[-1] = ':'.join(ori_info)

# add <NON_REF> to variant calls
tmp[4] = '<NON_REF>'
#import pdb; pdb.set_trace()

new_line = '\t'.join(tmp)

cur_variant_chr = tmp[0]
Expand Down
33 changes: 26 additions & 7 deletions trio/CallVariants_Denovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -1458,18 +1458,25 @@ def decode_alt_info(alt_info_dict):
allele_frequency_s = "%.4f" % allele_frequency if len(alt_list_count) <= 1 else \
','.join(["%.4f" % (min(1.0, 1.0 * item / read_depth)) for item in alt_list_count])


#import pdb; pdb.set_trace()
DNP_info=""
if output_config.is_denovo:
DNP_info = ";DNP=%.3f" % denovo_probabilities[1] if denovo_probabilities[1] > denovo_probabilities[0] else ""
if output_config.is_denovo and output_config.trio_n_id == 0:
#DNP_info = ";DNP=%.3f" % denovo_probabilities[1] if denovo_probabilities[1] > denovo_probabilities[0] else ""
DNP_info = ";DNP=%.3f" % denovo_probabilities[1] if ((denovo_probabilities[1] >= param.output_DNP_p) or (genotype_string == "0/1")) else ""
if denovo_probabilities[1] >= param.high_DNP_p:
DNP_info += ";HDN"


##INFO=<ID=DNP,Number=.,Type=Float,Description="de novo variant probability">

if output_config.gvcf:
PLs = compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base,
alternate_base)
PLs = ','.join([str(x) for x in PLs])
if alternate_base == ".":
RPLs = compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base,
alternate_base, is_output_ref_pl = True)
RPLs = [str(x) for x in RPLs]
information_string += ";RPL=%s" % (",".join(RPLs))

output_utilities.output("%s\t%d\t.\t%s\t%s\t%.2f\t%s\t%s%s\tGT:GQ:DP:AD:AF:PL\t%s:%d:%d:%s:%s:%s" % (
chromosome,
Expand Down Expand Up @@ -1506,7 +1513,7 @@ def decode_alt_info(alt_info_dict):



def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base, alternate_base):
def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base, alternate_base, is_output_ref_pl=False):
'''
PL computation
for bi-allelic: AA(00), AB(01), BB(11)
Expand All @@ -1531,7 +1538,11 @@ def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, refe
except:
#skip N positions
if alternate_base == ".":
return [990]
# for ref call, set PL as p(ref), p(non_ref)
if is_output_ref_pl:
break
else:
return [990]
else:
return [990] * len(genotypes[alt_num])
genotype_prob_21 = gt21_probabilities[gt21_prob_index]
Expand All @@ -1544,7 +1555,15 @@ def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, refe
_p = genotype_prob_21 * genotype_prob_zygosity
# _p = genotype_prob_21
likelihoods.append(_p)
pass

# for reference call, compute pl as [p_ref, p_alt, p_alt^2]
if is_output_ref_pl and alternate_base == ".":
#likelihoods.append(1-likelihoods[-1])
try:
_rp = math.sqrt(1 - likelihoods[-1] + 0.25) - 0.5
likelihoods += [_rp, _rp**2]
except:
return [990] * 3

# genotype likelihood normalization
# p/sum(p)
Expand Down
2 changes: 2 additions & 0 deletions trio/CheckEnvs_Trio.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,9 @@ def output_header(output_fn, reference_file_path, sample_name=None, cmdline=None
##INFO=<ID=P,Number=0,Type=Flag,Description="Result from pileup calling">
##INFO=<ID=F,Number=0,Type=Flag,Description="Result from full-alignment calling">
##INFO=<ID=T,Number=0,Type=Flag,Description="Result from trio calling">
##INFO=<ID=RPL,Number=.,Type=String,Description="For reference call's Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=DNP,Number=.,Type=Float,Description="de novo variant probability">
##INFO=<ID=HDN,Number=0,Type=Flag,Description="is high quality de novo variant">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ<20 or selected by 'samtools view -F 2316' are filtered)">
Expand Down
5 changes: 4 additions & 1 deletion trio/param_t.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
TOOL_NAME = "clair3_nova"
VERSION='v0.3.1'
VERSION='v0.3.2'

from itertools import accumulate

Expand Down Expand Up @@ -75,3 +75,6 @@
padding_value_p1 = "60"
padding_value_p2 = "90"


high_DNP_p = 0.8
output_DNP_p = 0.5
2 changes: 2 additions & 0 deletions trio/print_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ def get_header(tmp_path=None, sample_name="TMP"):
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ<20 or selected by 'samtools view -F 2316' are filtered)">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=RPL,Number=.,Type=String,Description="For reference call's Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=DNP,Number=.,Type=Float,Description="de novo variant probability">
##INFO=<ID=HDN,Number=0,Type=Flag,Description="is high quality de novo variant">
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Observed allele frequency in reads, for each ALT allele, in the same order as listed, or the REF allele for a RefCall">\n"""
)

Expand Down

0 comments on commit e5a8c47

Please sign in to comment.