1use super::tb_data::confidence_rank;
2use super::{
3 GappedAlignment, REF_PNCA, SeqIdHit, align_to_ref, base_at_ref_pos, parse_multi_fasta,
4 reverse_complement,
5};
6use regex::Regex;
7use serde::{Deserialize, Serialize};
8use std::collections::BTreeMap;
9use std::sync::LazyLock;
10
11const UPSTREAM_FLANK: isize = 50;
15
16type PncaNtSnpMap = BTreeMap<isize, (u8, BTreeMap<u8, (Vec<String>, String)>)>;
19
20type PncaAaSnpMap = BTreeMap<usize, (String, BTreeMap<String, (Vec<String>, String)>)>;
23
24const AA3: [&str; 20] = [
26 "Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile", "Leu", "Lys", "Met",
27 "Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val",
28];
29
30#[derive(Debug, Deserialize, Clone)]
31struct MutationRow {
32 #[serde(rename = "Gene")]
33 gene: String,
34 #[serde(rename = "Mutation")]
35 mutation: String,
36 #[serde(rename = "drug")]
37 drug: String,
38 #[serde(rename = "confidence")]
39 confidence: String,
40}
41
42fn translate_codon(codon: &[u8]) -> Option<&'static str> {
45 let c = [
46 codon.first()?.to_ascii_uppercase(),
47 codon.get(1)?.to_ascii_uppercase(),
48 codon.get(2)?.to_ascii_uppercase(),
49 ];
50 Some(match &c {
51 b"TTT" | b"TTC" => "Phe",
52 b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => "Leu",
53 b"ATT" | b"ATC" | b"ATA" => "Ile",
54 b"ATG" => "Met",
55 b"GTT" | b"GTC" | b"GTA" | b"GTG" => "Val",
56 b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => "Ser",
57 b"CCT" | b"CCC" | b"CCA" | b"CCG" => "Pro",
58 b"ACT" | b"ACC" | b"ACA" | b"ACG" => "Thr",
59 b"GCT" | b"GCC" | b"GCA" | b"GCG" => "Ala",
60 b"TAT" | b"TAC" => "Tyr",
61 b"TAA" | b"TAG" | b"TGA" => "*",
62 b"CAT" | b"CAC" => "His",
63 b"CAA" | b"CAG" => "Gln",
64 b"AAT" | b"AAC" => "Asn",
65 b"AAA" | b"AAG" => "Lys",
66 b"GAT" | b"GAC" => "Asp",
67 b"GAA" | b"GAG" => "Glu",
68 b"TGT" | b"TGC" => "Cys",
69 b"TGG" => "Trp",
70 b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => "Arg",
71 b"GGT" | b"GGC" | b"GGA" | b"GGG" => "Gly",
72 _ => return None,
73 })
74}
75
76fn nt_pos_to_ref_idx(c_pos: isize) -> Option<usize> {
79 let offset = if c_pos > 0 { c_pos - 1 } else { c_pos };
80 let idx = UPSTREAM_FLANK + offset;
81 if idx < 0 { None } else { Some(idx as usize) }
82}
83
84fn codon_to_ref_idx(codon: usize) -> Option<usize> {
86 nt_pos_to_ref_idx(((codon - 1) * 3 + 1) as isize)
87}
88
89fn parse_pnca_resistance_snps(csv: &str) -> (PncaNtSnpMap, PncaAaSnpMap) {
97 static NT_SNP_RE: LazyLock<Regex> =
98 LazyLock::new(|| Regex::new(r"^c\.(-?\d+)([ACGT])>([ACGT])$").unwrap());
99 static CODON_RE: LazyLock<Regex> =
100 LazyLock::new(|| Regex::new(r"^p\.([A-Za-z]{3})(\d+)([A-Za-z]{3}|\*)$").unwrap());
101
102 let mut rdr = csv::Reader::from_reader(csv.as_bytes());
103 let mut nt_map: PncaNtSnpMap = BTreeMap::new();
104 let mut aa_map: PncaAaSnpMap = BTreeMap::new();
105
106 for row in rdr.deserialize::<MutationRow>() {
107 let Ok(row) = row else { continue };
108 if row.gene.trim() != "pncA" {
109 continue;
110 }
111 let confidence = row.confidence.trim().to_string();
112 if confidence.is_empty() {
113 continue;
114 }
115 let drug = row.drug.trim().to_string();
116 let m = row.mutation.trim();
117
118 if let Some(caps) = NT_SNP_RE.captures(m) {
121 let Ok(c_pos) = caps[1].parse::<isize>() else { continue };
122 let wt = caps[2].as_bytes()[0];
123 let alt = caps[3].as_bytes()[0];
124 let entry = nt_map.entry(c_pos).or_insert_with(|| (wt, BTreeMap::new()));
125 let variant = entry.1.entry(alt).or_insert_with(|| (Vec::new(), confidence.clone()));
126 if !variant.0.contains(&drug) {
127 variant.0.push(drug);
128 }
129 if confidence_rank(&confidence) < confidence_rank(&variant.1) {
130 variant.1 = confidence.clone();
131 }
132 } else if let Some(caps) = CODON_RE.captures(m) {
133 let wt_aa = caps[1].to_string();
134 let alt_aa = caps[3].to_string();
135 if !AA3.contains(&wt_aa.as_str()) || (alt_aa != "*" && !AA3.contains(&alt_aa.as_str()))
136 {
137 continue;
138 }
139 let Ok(codon) = caps[2].parse::<usize>() else { continue };
140 let entry = aa_map.entry(codon).or_insert_with(|| (wt_aa.clone(), BTreeMap::new()));
141 let variant = entry.1.entry(alt_aa).or_insert_with(|| (Vec::new(), confidence.clone()));
142 if !variant.0.contains(&drug) {
143 variant.0.push(drug);
144 }
145 if confidence_rank(&confidence) < confidence_rank(&variant.1) {
146 variant.1 = confidence.clone();
147 }
148 }
149 }
150 (nt_map, aa_map)
151}
152
153static PNCA_RESISTANCE_SNPS: LazyLock<(PncaNtSnpMap, PncaAaSnpMap)> =
154 LazyLock::new(|| parse_pnca_resistance_snps(include_str!("../../tbprofiler/mutations.csv")));
155
156#[derive(Clone, Debug, Serialize, Deserialize)]
158pub enum PncaCallKind {
159 Nucleotide {
160 wt_base: u8,
161 query_base: Option<u8>,
162 },
163 Codon {
164 codon: usize,
165 wt_aa: String,
166 query_aa: Option<String>,
167 },
168}
169
170#[derive(Clone, Debug, Serialize, Deserialize)]
171pub struct PncaSnpCall {
172 pub ref_pos: usize,
175 pub kind: PncaCallKind,
176 pub resistance_alts: BTreeMap<String, (Vec<String>, String)>,
179}
180
181impl PncaSnpCall {
182 pub fn site_label(&self) -> String {
185 match &self.kind {
186 PncaCallKind::Nucleotide { wt_base, query_base } => {
187 let offset = self.ref_pos as isize - UPSTREAM_FLANK;
189 if offset >= 0 {
190 format!("c.{}{}>{}", offset + 1, *wt_base as char, query_base.map(|b| b as char).unwrap_or('?'))
191 } else {
192 format!("c.{}{}>{}", offset, *wt_base as char, query_base.map(|b| b as char).unwrap_or('?'))
193 }
194 }
195 PncaCallKind::Codon { codon, wt_aa, query_aa } => format!("p.{}{}{}", wt_aa, codon, query_aa.as_deref().unwrap_or("?")),
196 }
197 }
198
199 pub fn call_tag(&self) -> String {
200 match &self.kind {
201 PncaCallKind::Nucleotide { wt_base, query_base } => match query_base {
202 None => String::new(),
203 Some(b) if b == wt_base => format!(""),
204 Some(b) => {
205 let key = (*b as char).to_string();
206 match self.resistance_alts.get(&key) {
207 Some((drugs, confidence)) => {
208 format!("({}, {})", drugs.join(", "), confidence)
209 }
210 None => format!("{} (mutation, untested)", *b as char),
211 }
212 }
213 },
214 PncaCallKind::Codon { wt_aa, query_aa, .. } => match query_aa {
215 None => String::new(),
216 Some(aa) if aa == wt_aa => format!(""),
217 Some(aa) => match self.resistance_alts.get(aa) {
218 Some((drugs, confidence)) => {
219 format!("({}, {})", drugs.join(", "), confidence)
220 }
221 None => format!("{aa} (mutation, untested)"),
222 },
223 },
224 }
225 }
226
227 fn evidence(&self) -> Option<PncaEvidence> {
230 let (observed, wt) = match &self.kind {
231 PncaCallKind::Nucleotide { wt_base, query_base: Some(b) } => {
232 ((*b as char).to_string(), *b == *wt_base)
233 }
234 PncaCallKind::Codon { wt_aa, query_aa: Some(aa), .. } => (aa.clone(), aa == wt_aa),
235 _ => return None,
236 };
237 if wt {
238 return Some(PncaEvidence::Wildtype);
239 }
240 Some(match self.resistance_alts.get(&observed) {
241 Some((_, conf)) => PncaEvidence::Catalogued(confidence_rank(conf)),
242 None => PncaEvidence::Uncatalogued,
243 })
244 }
245}
246
247enum PncaEvidence {
249 Wildtype,
251 Uncatalogued,
254 Catalogued(u8),
257}
258
259#[derive(Debug, Clone, Default, Serialize, Deserialize)]
261pub struct PncaSusceptibilityCalls {
262 pub snp_calls: Vec<PncaSnpCall>,
263 pub is_susceptible: Option<bool>,
264}
265
266pub fn is_susceptible_pnca(snp_calls: &[PncaSnpCall]) -> Option<bool> {
272 let mut saw_wildtype = false;
273 let mut saw_problem = false;
274 let mut min_resistant_rank: Option<u8> = None;
275
276 for call in snp_calls {
277 match call.evidence() {
278 None => saw_problem = true,
279 Some(PncaEvidence::Wildtype) => saw_wildtype = true,
280 Some(PncaEvidence::Uncatalogued) => saw_problem = true,
281 Some(PncaEvidence::Catalogued(rank)) => {
282 saw_problem = true;
283 min_resistant_rank = Some(min_resistant_rank.map_or(rank, |r| r.min(rank)));
284 }
285 }
286 }
287
288 if min_resistant_rank.is_some_and(|rank| rank < 2) {
289 Some(false)
290 } else if saw_wildtype && !saw_problem {
291 Some(true)
292 } else {
293 None
294 }
295}
296
297fn codon_at_ref_pos(
301 gapped_query: &[u8],
302 gapped_ref: &[u8],
303 ref_start: usize,
304 ref_pos: usize,
305) -> Option<[u8; 3]> {
306 if ref_pos < ref_start {
307 return None;
308 }
309 let mut current = ref_start;
310 let mut codon = [0u8; 3];
311 let mut idx = 0usize;
312 let mut collecting = false;
313 for (&q, &r) in gapped_query.iter().zip(gapped_ref.iter()) {
314 if r != b'-' {
315 if current == ref_pos {
316 collecting = true;
317 }
318 if collecting {
319 if q == b'-' {
320 return None;
321 }
322 codon[idx] = q.to_ascii_uppercase();
323 idx += 1;
324 if idx == 3 {
325 return Some(codon);
326 }
327 }
328 current += 1;
329 }
330 }
331 None
332}
333
334fn call_pnca_nt_snps(map: &PncaNtSnpMap, ga: &GappedAlignment) -> Vec<PncaSnpCall> {
335 map.iter()
336 .filter_map(|(&c_pos, (wt_base, alts))| {
337 let ref_pos = nt_pos_to_ref_idx(c_pos)?;
338 let query_base =
339 base_at_ref_pos(&ga.gapped_query, &ga.gapped_ref, ga.ref_start, ref_pos);
340 let resistance_alts = alts
341 .iter()
342 .map(|(&b, v)| ((b as char).to_string(), v.clone()))
343 .collect();
344 Some(PncaSnpCall {
345 ref_pos,
346 kind: PncaCallKind::Nucleotide { wt_base: *wt_base, query_base },
347 resistance_alts,
348 })
349 })
350 .collect()
351}
352
353fn call_pnca_aa_snps(map: &PncaAaSnpMap, ga: &GappedAlignment) -> Vec<PncaSnpCall> {
354 map.iter()
355 .filter_map(|(&codon, (wt_aa, alts))| {
356 let ref_pos = codon_to_ref_idx(codon)?;
357 let query_aa =
358 codon_at_ref_pos(&ga.gapped_query, &ga.gapped_ref, ga.ref_start, ref_pos)
359 .and_then(|bases| translate_codon(&bases).map(str::to_string));
360 Some(PncaSnpCall {
361 ref_pos,
362 kind: PncaCallKind::Codon { codon, wt_aa: wt_aa.clone(), query_aa },
363 resistance_alts: alts.clone(),
364 })
365 })
366 .collect()
367}
368
369pub fn identify_sequence_pnca(query: &[u8]) -> Vec<SeqIdHit> {
381 let query = super::trim_start_end(query, super::PNCA_FWD_START, super::PNCA_FWD_END);
382 let rc = reverse_complement(query);
383 let (nt_map, aa_map) = &*PNCA_RESISTANCE_SNPS;
384
385 let mut hits: Vec<SeqIdHit> = parse_multi_fasta(REF_PNCA)
386 .into_iter()
387 .filter(|(_, _, refseq)| refseq.len() >= super::MIN_PNCA_REF_LEN)
388 .map(|(accession, description, refseq)| {
389 let fwd = align_to_ref(query, &refseq);
390 let rev = align_to_ref(&rc, &refseq);
391 let (ga, is_reverse) = if rev.identity > fwd.identity {
392 (rev, true)
393 } else {
394 (fwd, false)
395 };
396
397 let mut pnca_snp_calls = call_pnca_nt_snps(nt_map, &ga);
398 pnca_snp_calls.extend(call_pnca_aa_snps(aa_map, &ga));
399 pnca_snp_calls.sort_by_key(|c| c.ref_pos);
400
401 SeqIdHit {
402 accession,
403 description,
404 identity: ga.identity,
405 is_reverse,
406 kansasii_gastri_snp_calls: vec![],
407 marinum_ulcerans_snp_calls: vec![],
408 rrl_snp_calls: vec![],
409 rrs_snp_calls: vec![],
410 erm41_snp_calls: vec![],
411 pnca_snp_calls,
412 aligned_query: ga.gapped_query,
413 aligned_ref: ga.gapped_ref,
414 ref_start: ga.ref_start,
415 erm41_position_28_opt: None,
416 rrl_position_2058_2059_opt: None,
417 }
418 })
419 .collect();
420
421 hits.sort_by(|a, b| {
422 b.identity
423 .partial_cmp(&a.identity)
424 .unwrap_or(std::cmp::Ordering::Equal)
425 });
426 hits
427}
428
429#[cfg(test)]
430mod tests {
431 use super::*;
432
433 #[test]
434 fn test_translate_codon() {
435 assert_eq!(translate_codon(b"ATG"), Some("Met"));
436 assert_eq!(translate_codon(b"atg"), Some("Met")); assert_eq!(translate_codon(b"TGA"), Some("*"));
438 assert_eq!(translate_codon(b"GCC"), Some("Ala"));
439 assert_eq!(translate_codon(b"NNN"), None);
440 assert_eq!(translate_codon(b"AT"), None); }
442
443 #[test]
444 fn test_nt_pos_to_ref_idx_and_site_label_roundtrip() {
445 assert_eq!(nt_pos_to_ref_idx(1), Some(UPSTREAM_FLANK as usize));
447 assert_eq!(nt_pos_to_ref_idx(-1), Some(UPSTREAM_FLANK as usize - 1));
449 assert_eq!(nt_pos_to_ref_idx(103), Some(UPSTREAM_FLANK as usize + 102));
450
451 let nt_call = PncaSnpCall {
452 ref_pos: nt_pos_to_ref_idx(-11).unwrap(),
453 kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: None },
454 resistance_alts: BTreeMap::new(),
455 };
456 assert_eq!(nt_call.site_label(), "c.-11A>?");
457
458 let nt_call = PncaSnpCall {
459 ref_pos: nt_pos_to_ref_idx(103).unwrap(),
460 kind: PncaCallKind::Nucleotide { wt_base: b'C', query_base: None },
461 resistance_alts: BTreeMap::new(),
462 };
463 assert_eq!(nt_call.site_label(), "c.103C>?");
464
465 let nt_call = PncaSnpCall {
467 ref_pos: nt_pos_to_ref_idx(-11).unwrap(),
468 kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: Some(b'C') },
469 resistance_alts: BTreeMap::new(),
470 };
471 assert_eq!(nt_call.site_label(), "c.-11A>C");
472
473 let codon_call = PncaSnpCall {
475 ref_pos: codon_to_ref_idx(102).unwrap(),
476 kind: PncaCallKind::Codon {
477 codon: 102,
478 wt_aa: "Ala".to_string(),
479 query_aa: Some("Pro".to_string()),
480 },
481 resistance_alts: BTreeMap::new(),
482 };
483 assert_eq!(codon_call.site_label(), "p.Ala102Pro");
484 }
485
486 #[test]
487 fn test_codon_to_ref_idx() {
488 assert_eq!(codon_to_ref_idx(1), Some(UPSTREAM_FLANK as usize));
490 assert_eq!(codon_to_ref_idx(2), Some(UPSTREAM_FLANK as usize + 3));
492 }
493
494 #[test]
495 fn test_parse_pnca_resistance_snps() {
496 let csv = "Gene,Mutation,type,drug,original_mutation,confidence,source,comment\n\
497 pncA,c.-11A>C,drug_resistance,pyrazinamide,c.-11A>C,Assoc w R,WHO catalogue v2,\n\
498 pncA,p.Ala102Pro,drug_resistance,pyrazinamide,p.Ala102Pro,Assoc w R,WHO catalogue v2,\n\
499 pncA,p.Ala102fs,drug_resistance,pyrazinamide,p.Ala102fs,Assoc w R,WHO catalogue v2,\n\
500 pncA,c.-744_492del,drug_resistance,pyrazinamide,c.-744_492del,,tbdb,\n\
501 rrs,n.1473T>C,drug_resistance,amikacin,n.1473T>C,Assoc w R,WHO catalogue v2,\n";
502 let (nt_map, aa_map) = parse_pnca_resistance_snps(csv);
503
504 assert_eq!(nt_map.len(), 1);
506 let (wt, alts) = &nt_map[&-11];
507 assert_eq!(*wt, b'A');
508 assert!(alts.contains_key(&b'C'));
509
510 assert_eq!(aa_map.len(), 1);
513 let (wt_aa, alts) = &aa_map[&102];
514 assert_eq!(wt_aa, "Ala");
515 assert!(alts.contains_key("Pro"));
516 }
517
518 #[test]
519 fn test_is_susceptible_pnca() {
520 assert_eq!(is_susceptible_pnca(&[]), None);
522
523 let resistance_alts = || {
524 BTreeMap::from([(
525 "C".to_string(),
526 (vec!["pyrazinamide".to_string()], "Assoc w R".to_string()),
527 )])
528 };
529
530 let uncovered_call = PncaSnpCall {
532 ref_pos: 0,
533 kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: None },
534 resistance_alts: resistance_alts(),
535 };
536 assert_eq!(is_susceptible_pnca(&[uncovered_call]), None);
537
538 let wt_call = PncaSnpCall {
541 ref_pos: 0,
542 kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: Some(b'A') },
543 resistance_alts: resistance_alts(),
544 };
545 assert_eq!(is_susceptible_pnca(&[wt_call.clone()]), Some(true));
546
547 let resistant_call = PncaSnpCall {
550 ref_pos: 0,
551 kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: Some(b'C') },
552 resistance_alts: resistance_alts(),
553 };
554 assert_eq!(
555 is_susceptible_pnca(&[wt_call.clone(), resistant_call]),
556 Some(false)
557 );
558
559 let uncertain_call = PncaSnpCall {
561 ref_pos: 0,
562 kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: Some(b'G') },
563 resistance_alts: BTreeMap::from([(
564 "G".to_string(),
565 (vec!["pyrazinamide".to_string()], "Uncertain significance".to_string()),
566 )]),
567 };
568 assert_eq!(is_susceptible_pnca(&[uncertain_call]), None);
569
570 let uncatalogued_call = PncaSnpCall {
572 ref_pos: 0,
573 kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: Some(b'T') },
574 resistance_alts: resistance_alts(),
575 };
576 assert_eq!(is_susceptible_pnca(&[uncatalogued_call]), None);
577 }
578
579 #[test]
580 fn test_identify_sequence_pnca_perfect_match() {
581 let (_, _, refseq) = parse_multi_fasta(REF_PNCA).into_iter().next().unwrap();
585 let hits = identify_sequence_pnca(&refseq);
586 assert_eq!(hits.len(), 3);
587 let hit = &hits[0];
588 assert!(hit.identity > 99.0);
589 assert!(!hit.is_reverse);
590 }
591}