cosmic_files/sequencing/
pnca.rs

1use super::tb_data::confidence_rank;
2use super::{
3    GappedAlignment, REF_PNCA, SeqIdHit, align_to_ref, base_at_ref_pos, parse_multi_fasta,
4    reverse_complement,
5};
6use regex::Regex;
7use serde::{Deserialize, Serialize};
8use std::collections::BTreeMap;
9use std::sync::LazyLock;
10
11/// Extra bases fetched upstream of the start codon when building `pnca/pnca_h37rv.fasta`
12/// (must match `upstream_flank` in `res/sequences/sequences.toml`'s pncA `[[genome]]` entry).
13/// HGVS `c.-N` positions are resolved against this flank.
14const UPSTREAM_FLANK: isize = 50;
15
16/// Maps a signed HGVS coding-sequence position (negative = promoter, e.g. `c.-11`) to
17/// `(wt_base, alts)` where `alts` maps each resistance alt to `(drugs, WHO confidence label)`.
18type PncaNtSnpMap = BTreeMap<isize, (u8, BTreeMap<u8, (Vec<String>, String)>)>;
19
20/// Maps a 1-based codon number to `(wt_aa, alts)` where `alts` maps each resistance alt amino
21/// acid (3-letter code, or `"*"` for nonsense) to `(drugs, WHO confidence label)`.
22type PncaAaSnpMap = BTreeMap<usize, (String, BTreeMap<String, (Vec<String>, String)>)>;
23
24/// Standard 3-letter amino acid codes used by the WHO catalogue `p.` notation.
25const AA3: [&str; 20] = [
26    "Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile", "Leu", "Lys", "Met",
27    "Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val",
28];
29
30#[derive(Debug, Deserialize, Clone)]
31struct MutationRow {
32    #[serde(rename = "Gene")]
33    gene: String,
34    #[serde(rename = "Mutation")]
35    mutation: String,
36    #[serde(rename = "drug")]
37    drug: String,
38    #[serde(rename = "confidence")]
39    confidence: String,
40}
41
42/// Translate a single codon to its 3-letter amino acid code (`"*"` for a stop codon).
43/// Returns `None` for incomplete codons or non-ACGT bytes.
44fn translate_codon(codon: &[u8]) -> Option<&'static str> {
45    let c = [
46        codon.first()?.to_ascii_uppercase(),
47        codon.get(1)?.to_ascii_uppercase(),
48        codon.get(2)?.to_ascii_uppercase(),
49    ];
50    Some(match &c {
51        b"TTT" | b"TTC" => "Phe",
52        b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => "Leu",
53        b"ATT" | b"ATC" | b"ATA" => "Ile",
54        b"ATG" => "Met",
55        b"GTT" | b"GTC" | b"GTA" | b"GTG" => "Val",
56        b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => "Ser",
57        b"CCT" | b"CCC" | b"CCA" | b"CCG" => "Pro",
58        b"ACT" | b"ACC" | b"ACA" | b"ACG" => "Thr",
59        b"GCT" | b"GCC" | b"GCA" | b"GCG" => "Ala",
60        b"TAT" | b"TAC" => "Tyr",
61        b"TAA" | b"TAG" | b"TGA" => "*",
62        b"CAT" | b"CAC" => "His",
63        b"CAA" | b"CAG" => "Gln",
64        b"AAT" | b"AAC" => "Asn",
65        b"AAA" | b"AAG" => "Lys",
66        b"GAT" | b"GAC" => "Asp",
67        b"GAA" | b"GAG" => "Glu",
68        b"TGT" | b"TGC" => "Cys",
69        b"TGG" => "Trp",
70        b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => "Arg",
71        b"GGT" | b"GGC" | b"GGA" | b"GGG" => "Gly",
72        _ => return None,
73    })
74}
75
76/// 0-based index into `REF_PNCA` of a signed HGVS coding-sequence position. There is no
77/// `c.0` — `c.-1` sits immediately before `c.1` (the `A` of the start codon).
78fn nt_pos_to_ref_idx(c_pos: isize) -> Option<usize> {
79    let offset = if c_pos > 0 { c_pos - 1 } else { c_pos };
80    let idx = UPSTREAM_FLANK + offset;
81    if idx < 0 { None } else { Some(idx as usize) }
82}
83
84/// 0-based index into `REF_PNCA` of the first base of a 1-based codon number.
85fn codon_to_ref_idx(codon: usize) -> Option<usize> {
86    nt_pos_to_ref_idx(((codon - 1) * 3 + 1) as isize)
87}
88
89/// Parses `tbprofiler/mutations.csv`, keeping only `pncA` rows whose mutation is a simple
90/// nucleotide substitution (`c.POS[wt]>[alt]`, promoter or coding) or a single-codon amino
91/// acid substitution / nonsense call (`p.WtNNNAlt` / `p.WtNNN*`).
92///
93/// Frameshifts, in-frame indels, delins, and stop-codon extensions (`fs`, `del`, `dup`,
94/// `delins`, `ext`) are not positioned by this simple gapless-alignment model and are skipped —
95/// same limitation as the rest of this codebase, which only calls substitutions, not indels.
96fn parse_pnca_resistance_snps(csv: &str) -> (PncaNtSnpMap, PncaAaSnpMap) {
97    static NT_SNP_RE: LazyLock<Regex> =
98        LazyLock::new(|| Regex::new(r"^c\.(-?\d+)([ACGT])>([ACGT])$").unwrap());
99    static CODON_RE: LazyLock<Regex> =
100        LazyLock::new(|| Regex::new(r"^p\.([A-Za-z]{3})(\d+)([A-Za-z]{3}|\*)$").unwrap());
101
102    let mut rdr = csv::Reader::from_reader(csv.as_bytes());
103    let mut nt_map: PncaNtSnpMap = BTreeMap::new();
104    let mut aa_map: PncaAaSnpMap = BTreeMap::new();
105
106    for row in rdr.deserialize::<MutationRow>() {
107        let Ok(row) = row else { continue };
108        if row.gene.trim() != "pncA" {
109            continue;
110        }
111        let confidence = row.confidence.trim().to_string();
112        if confidence.is_empty() {
113            continue;
114        }
115        let drug = row.drug.trim().to_string();
116        let m = row.mutation.trim();
117
118        // Keep the strongest (lowest-rank) confidence seen for a given alt, since the same
119        // mutation can appear once under "who_confidence" and once under "drug_resistance".
120        if let Some(caps) = NT_SNP_RE.captures(m) {
121            let Ok(c_pos) = caps[1].parse::<isize>() else { continue };
122            let wt = caps[2].as_bytes()[0];
123            let alt = caps[3].as_bytes()[0];
124            let entry = nt_map.entry(c_pos).or_insert_with(|| (wt, BTreeMap::new()));
125            let variant = entry.1.entry(alt).or_insert_with(|| (Vec::new(), confidence.clone()));
126            if !variant.0.contains(&drug) {
127                variant.0.push(drug);
128            }
129            if confidence_rank(&confidence) < confidence_rank(&variant.1) {
130                variant.1 = confidence.clone();
131            }
132        } else if let Some(caps) = CODON_RE.captures(m) {
133            let wt_aa = caps[1].to_string();
134            let alt_aa = caps[3].to_string();
135            if !AA3.contains(&wt_aa.as_str()) || (alt_aa != "*" && !AA3.contains(&alt_aa.as_str()))
136            {
137                continue;
138            }
139            let Ok(codon) = caps[2].parse::<usize>() else { continue };
140            let entry = aa_map.entry(codon).or_insert_with(|| (wt_aa.clone(), BTreeMap::new()));
141            let variant = entry.1.entry(alt_aa).or_insert_with(|| (Vec::new(), confidence.clone()));
142            if !variant.0.contains(&drug) {
143                variant.0.push(drug);
144            }
145            if confidence_rank(&confidence) < confidence_rank(&variant.1) {
146                variant.1 = confidence.clone();
147            }
148        }
149    }
150    (nt_map, aa_map)
151}
152
153static PNCA_RESISTANCE_SNPS: LazyLock<(PncaNtSnpMap, PncaAaSnpMap)> =
154    LazyLock::new(|| parse_pnca_resistance_snps(include_str!("../../tbprofiler/mutations.csv")));
155
156/// One diagnostic pncA site: either a single nucleotide (promoter or coding) or a single codon.
157#[derive(Clone, Debug, Serialize, Deserialize)]
158pub enum PncaCallKind {
159    Nucleotide {
160        wt_base: u8,
161        query_base: Option<u8>,
162    },
163    Codon {
164        codon: usize,
165        wt_aa: String,
166        query_aa: Option<String>,
167    },
168}
169
170#[derive(Clone, Debug, Serialize, Deserialize)]
171pub struct PncaSnpCall {
172    /// 0-based position in the pncA reference sequence: the substituted base for nucleotide
173    /// calls, or the first base of the codon for codon calls.
174    pub ref_pos: usize,
175    pub kind: PncaCallKind,
176    /// Maps each resistance-conferring alt (a 1-character base string, or an amino acid
177    /// 3-letter code / `"*"`) to `(drugs, WHO confidence label)`.
178    pub resistance_alts: BTreeMap<String, (Vec<String>, String)>,
179}
180
181impl PncaSnpCall {
182    /// HGVS-style label for this site, e.g. `"c.-11A>C"` or `"p.Ala102Pro"`.
183    /// When the query didn't cover the site (`query_base`/`query_aa` is `None`), the alt is `"?"`.
184    pub fn site_label(&self) -> String {
185        match &self.kind {
186            PncaCallKind::Nucleotide { wt_base, query_base } => {
187                // Recover the signed c. position from ref_pos for display.
188                let offset = self.ref_pos as isize - UPSTREAM_FLANK;
189                if offset >= 0 {
190                    format!("c.{}{}>{}", offset + 1, *wt_base as char, query_base.map(|b| b as char).unwrap_or('?'))
191                } else {
192                    format!("c.{}{}>{}", offset, *wt_base as char, query_base.map(|b| b as char).unwrap_or('?'))
193                }
194            }
195            PncaCallKind::Codon { codon, wt_aa, query_aa } => format!("p.{}{}{}", wt_aa, codon, query_aa.as_deref().unwrap_or("?")),
196        }
197    }
198
199    pub fn call_tag(&self) -> String {
200        match &self.kind {
201            PncaCallKind::Nucleotide { wt_base, query_base } => match query_base {
202                None => String::new(),
203                Some(b) if b == wt_base => format!(""),
204                Some(b) => {
205                    let key = (*b as char).to_string();
206                    match self.resistance_alts.get(&key) {
207                        Some((drugs, confidence)) => {
208                            format!("({}, {})", drugs.join(", "), confidence)
209                        }
210                        None => format!("{} (mutation, untested)", *b as char),
211                    }
212                }
213            },
214            PncaCallKind::Codon { wt_aa, query_aa, .. } => match query_aa {
215                None => String::new(),
216                Some(aa) if aa == wt_aa => format!(""),
217                Some(aa) => match self.resistance_alts.get(aa) {
218                    Some((drugs, confidence)) => {
219                        format!("({}, {})", drugs.join(", "), confidence)
220                    }
221                    None => format!("{aa} (mutation, untested)"),
222                },
223            },
224        }
225    }
226
227    /// Classifies the allele observed at this site, or `None` when the read doesn't cover it
228    /// at all (as opposed to covering it and finding wildtype — those are not the same thing).
229    fn evidence(&self) -> Option<PncaEvidence> {
230        let (observed, wt) = match &self.kind {
231            PncaCallKind::Nucleotide { wt_base, query_base: Some(b) } => {
232                ((*b as char).to_string(), *b == *wt_base)
233            }
234            PncaCallKind::Codon { wt_aa, query_aa: Some(aa), .. } => (aa.clone(), aa == wt_aa),
235            _ => return None,
236        };
237        if wt {
238            return Some(PncaEvidence::Wildtype);
239        }
240        Some(match self.resistance_alts.get(&observed) {
241            Some((_, conf)) => PncaEvidence::Catalogued(confidence_rank(conf)),
242            None => PncaEvidence::Uncatalogued,
243        })
244    }
245}
246
247/// What the read showed at one diagnostic pncA site, for [`is_susceptible_pnca`].
248enum PncaEvidence {
249    /// Matches the wildtype allele — positive evidence of susceptibility at this site.
250    Wildtype,
251    /// Differs from wildtype but isn't in the WHO catalogue — neither confirms nor rules out
252    /// resistance, but does confirm the site was covered.
253    Uncatalogued,
254    /// Matches a catalogued resistance-conferring alt, at this confidence rank
255    /// (see [`confidence_rank`]; lower = stronger resistance evidence).
256    Catalogued(u8),
257}
258
259/// All pncA susceptibility evidence for one sample, ready for UI display.
260#[derive(Debug, Clone, Default, Serialize, Deserialize)]
261pub struct PncaSusceptibilityCalls {
262    pub snp_calls: Vec<PncaSnpCall>,
263    pub is_susceptible: Option<bool>,
264}
265
266/// Returns `Some(false)` when any covered site's allele matches a catalogued resistance-
267/// conferring alt at confidence rank `< 2` (e.g. "Assoc w R"). Returns `Some(true)` only when
268/// every diagnostic site is covered *and* every observed allele is wildtype. Returns `None`
269/// whenever any site went uncovered, an uncatalogued variant was found, weak/uncertain
270/// resistance evidence was seen, or no diagnostic site was reached at all.
271pub fn is_susceptible_pnca(snp_calls: &[PncaSnpCall]) -> Option<bool> {
272    let mut saw_wildtype = false;
273    let mut saw_problem = false;
274    let mut min_resistant_rank: Option<u8> = None;
275
276    for call in snp_calls {
277        match call.evidence() {
278            None => saw_problem = true,
279            Some(PncaEvidence::Wildtype) => saw_wildtype = true,
280            Some(PncaEvidence::Uncatalogued) => saw_problem = true,
281            Some(PncaEvidence::Catalogued(rank)) => {
282                saw_problem = true;
283                min_resistant_rank = Some(min_resistant_rank.map_or(rank, |r| r.min(rank)));
284            }
285        }
286    }
287
288    if min_resistant_rank.is_some_and(|rank| rank < 2) {
289        Some(false)
290    } else if saw_wildtype && !saw_problem {
291        Some(true)
292    } else {
293        None
294    }
295}
296
297/// Collect 3 consecutive non-deleted query bases at reference positions `ref_pos..ref_pos+3`.
298/// Returns `None` if the position is outside the aligned region or any of the 3 ref positions
299/// has a deletion in the query (consistent with the existing policy of skipping frameshifts).
300fn codon_at_ref_pos(
301    gapped_query: &[u8],
302    gapped_ref: &[u8],
303    ref_start: usize,
304    ref_pos: usize,
305) -> Option<[u8; 3]> {
306    if ref_pos < ref_start {
307        return None;
308    }
309    let mut current = ref_start;
310    let mut codon = [0u8; 3];
311    let mut idx = 0usize;
312    let mut collecting = false;
313    for (&q, &r) in gapped_query.iter().zip(gapped_ref.iter()) {
314        if r != b'-' {
315            if current == ref_pos {
316                collecting = true;
317            }
318            if collecting {
319                if q == b'-' {
320                    return None;
321                }
322                codon[idx] = q.to_ascii_uppercase();
323                idx += 1;
324                if idx == 3 {
325                    return Some(codon);
326                }
327            }
328            current += 1;
329        }
330    }
331    None
332}
333
334fn call_pnca_nt_snps(map: &PncaNtSnpMap, ga: &GappedAlignment) -> Vec<PncaSnpCall> {
335    map.iter()
336        .filter_map(|(&c_pos, (wt_base, alts))| {
337            let ref_pos = nt_pos_to_ref_idx(c_pos)?;
338            let query_base =
339                base_at_ref_pos(&ga.gapped_query, &ga.gapped_ref, ga.ref_start, ref_pos);
340            let resistance_alts = alts
341                .iter()
342                .map(|(&b, v)| ((b as char).to_string(), v.clone()))
343                .collect();
344            Some(PncaSnpCall {
345                ref_pos,
346                kind: PncaCallKind::Nucleotide { wt_base: *wt_base, query_base },
347                resistance_alts,
348            })
349        })
350        .collect()
351}
352
353fn call_pnca_aa_snps(map: &PncaAaSnpMap, ga: &GappedAlignment) -> Vec<PncaSnpCall> {
354    map.iter()
355        .filter_map(|(&codon, (wt_aa, alts))| {
356            let ref_pos = codon_to_ref_idx(codon)?;
357            let query_aa =
358                codon_at_ref_pos(&ga.gapped_query, &ga.gapped_ref, ga.ref_start, ref_pos)
359                    .and_then(|bases| translate_codon(&bases).map(str::to_string));
360            Some(PncaSnpCall {
361                ref_pos,
362                kind: PncaCallKind::Codon { codon, wt_aa: wt_aa.clone(), query_aa },
363                resistance_alts: alts.clone(),
364            })
365        })
366        .collect()
367}
368
369/// Database is `REF_PNCA`: the pncA CDS plus a 50bp upstream promoter flank for each
370/// *M. tuberculosis* complex member with a distinct sequence (H37Rv `Rv2043c`, bovis AF2122/97,
371/// canettii — see `res/sequences/sequences.toml` for the rest, including the bovis BCG
372/// Pasteur/africanum/mungi/orygis references that were tried and dropped as exact duplicates
373/// of one of these three), fetched from NCBI at build time (via `fetch_sequences_from_toml()`
374/// in build.rs).
375///
376/// Like [`super::rrl::identify_sequence_rrl_ntm`], this aligns the query against every reference
377/// in the database (forward and reverse-complement) and returns one [`SeqIdHit`] per reference,
378/// sorted by identity descending, so the caller can compare how well the read matches each
379/// member of the complex rather than assuming it's H37Rv.
380pub fn identify_sequence_pnca(query: &[u8]) -> Vec<SeqIdHit> {
381    let query = super::trim_start_end(query, super::PNCA_FWD_START, super::PNCA_FWD_END);
382    let rc = reverse_complement(query);
383    let (nt_map, aa_map) = &*PNCA_RESISTANCE_SNPS;
384
385    let mut hits: Vec<SeqIdHit> = parse_multi_fasta(REF_PNCA)
386        .into_iter()
387        .filter(|(_, _, refseq)| refseq.len() >= super::MIN_PNCA_REF_LEN)
388        .map(|(accession, description, refseq)| {
389            let fwd = align_to_ref(query, &refseq);
390            let rev = align_to_ref(&rc, &refseq);
391            let (ga, is_reverse) = if rev.identity > fwd.identity {
392                (rev, true)
393            } else {
394                (fwd, false)
395            };
396
397            let mut pnca_snp_calls = call_pnca_nt_snps(nt_map, &ga);
398            pnca_snp_calls.extend(call_pnca_aa_snps(aa_map, &ga));
399            pnca_snp_calls.sort_by_key(|c| c.ref_pos);
400
401            SeqIdHit {
402                accession,
403                description,
404                identity: ga.identity,
405                is_reverse,
406                kansasii_gastri_snp_calls: vec![],
407                marinum_ulcerans_snp_calls: vec![],
408                rrl_snp_calls: vec![],
409                rrs_snp_calls: vec![],
410                erm41_snp_calls: vec![],
411                pnca_snp_calls,
412                aligned_query: ga.gapped_query,
413                aligned_ref: ga.gapped_ref,
414                ref_start: ga.ref_start,
415                erm41_position_28_opt: None,
416                rrl_position_2058_2059_opt: None,
417            }
418        })
419        .collect();
420
421    hits.sort_by(|a, b| {
422        b.identity
423            .partial_cmp(&a.identity)
424            .unwrap_or(std::cmp::Ordering::Equal)
425    });
426    hits
427}
428
429#[cfg(test)]
430mod tests {
431    use super::*;
432
433    #[test]
434    fn test_translate_codon() {
435        assert_eq!(translate_codon(b"ATG"), Some("Met"));
436        assert_eq!(translate_codon(b"atg"), Some("Met")); // case-insensitive
437        assert_eq!(translate_codon(b"TGA"), Some("*"));
438        assert_eq!(translate_codon(b"GCC"), Some("Ala"));
439        assert_eq!(translate_codon(b"NNN"), None);
440        assert_eq!(translate_codon(b"AT"), None); // incomplete
441    }
442
443    #[test]
444    fn test_nt_pos_to_ref_idx_and_site_label_roundtrip() {
445        // c.1 (the A of ATG) sits right after the upstream flank.
446        assert_eq!(nt_pos_to_ref_idx(1), Some(UPSTREAM_FLANK as usize));
447        // c.-1 sits immediately before it; there is no c.0.
448        assert_eq!(nt_pos_to_ref_idx(-1), Some(UPSTREAM_FLANK as usize - 1));
449        assert_eq!(nt_pos_to_ref_idx(103), Some(UPSTREAM_FLANK as usize + 102));
450
451        let nt_call = PncaSnpCall {
452            ref_pos: nt_pos_to_ref_idx(-11).unwrap(),
453            kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: None },
454            resistance_alts: BTreeMap::new(),
455        };
456        assert_eq!(nt_call.site_label(), "c.-11A>?");
457
458        let nt_call = PncaSnpCall {
459            ref_pos: nt_pos_to_ref_idx(103).unwrap(),
460            kind: PncaCallKind::Nucleotide { wt_base: b'C', query_base: None },
461            resistance_alts: BTreeMap::new(),
462        };
463        assert_eq!(nt_call.site_label(), "c.103C>?");
464
465        // With a known alt base.
466        let nt_call = PncaSnpCall {
467            ref_pos: nt_pos_to_ref_idx(-11).unwrap(),
468            kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: Some(b'C') },
469            resistance_alts: BTreeMap::new(),
470        };
471        assert_eq!(nt_call.site_label(), "c.-11A>C");
472
473        // Codon label.
474        let codon_call = PncaSnpCall {
475            ref_pos: codon_to_ref_idx(102).unwrap(),
476            kind: PncaCallKind::Codon {
477                codon: 102,
478                wt_aa: "Ala".to_string(),
479                query_aa: Some("Pro".to_string()),
480            },
481            resistance_alts: BTreeMap::new(),
482        };
483        assert_eq!(codon_call.site_label(), "p.Ala102Pro");
484    }
485
486    #[test]
487    fn test_codon_to_ref_idx() {
488        // Codon 1 is the start codon, sitting right after the flank.
489        assert_eq!(codon_to_ref_idx(1), Some(UPSTREAM_FLANK as usize));
490        // Codon 2 starts 3 bases later.
491        assert_eq!(codon_to_ref_idx(2), Some(UPSTREAM_FLANK as usize + 3));
492    }
493
494    #[test]
495    fn test_parse_pnca_resistance_snps() {
496        let csv = "Gene,Mutation,type,drug,original_mutation,confidence,source,comment\n\
497                    pncA,c.-11A>C,drug_resistance,pyrazinamide,c.-11A>C,Assoc w R,WHO catalogue v2,\n\
498                    pncA,p.Ala102Pro,drug_resistance,pyrazinamide,p.Ala102Pro,Assoc w R,WHO catalogue v2,\n\
499                    pncA,p.Ala102fs,drug_resistance,pyrazinamide,p.Ala102fs,Assoc w R,WHO catalogue v2,\n\
500                    pncA,c.-744_492del,drug_resistance,pyrazinamide,c.-744_492del,,tbdb,\n\
501                    rrs,n.1473T>C,drug_resistance,amikacin,n.1473T>C,Assoc w R,WHO catalogue v2,\n";
502        let (nt_map, aa_map) = parse_pnca_resistance_snps(csv);
503
504        // c.-11A>C parsed as a nucleotide SNP.
505        assert_eq!(nt_map.len(), 1);
506        let (wt, alts) = &nt_map[&-11];
507        assert_eq!(*wt, b'A');
508        assert!(alts.contains_key(&b'C'));
509
510        // p.Ala102Pro parsed as a codon call; the unpositioned frameshift and large deletion,
511        // and the non-pncA row, are all skipped.
512        assert_eq!(aa_map.len(), 1);
513        let (wt_aa, alts) = &aa_map[&102];
514        assert_eq!(wt_aa, "Ala");
515        assert!(alts.contains_key("Pro"));
516    }
517
518    #[test]
519    fn test_is_susceptible_pnca() {
520        // No calls at all → unknown (nothing was covered).
521        assert_eq!(is_susceptible_pnca(&[]), None);
522
523        let resistance_alts = || {
524            BTreeMap::from([(
525                "C".to_string(),
526                (vec!["pyrazinamide".to_string()], "Assoc w R".to_string()),
527            )])
528        };
529
530        // A site that the read simply didn't cover → unknown, same as no calls at all.
531        let uncovered_call = PncaSnpCall {
532            ref_pos: 0,
533            kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: None },
534            resistance_alts: resistance_alts(),
535        };
536        assert_eq!(is_susceptible_pnca(&[uncovered_call]), None);
537
538        // Wildtype observed → susceptible. A fully wildtype pncA read is real evidence of
539        // susceptibility, not an absence of information.
540        let wt_call = PncaSnpCall {
541            ref_pos: 0,
542            kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: Some(b'A') },
543            resistance_alts: resistance_alts(),
544        };
545        assert_eq!(is_susceptible_pnca(&[wt_call.clone()]), Some(true));
546
547        // Strong resistance evidence observed → resistant, even alongside wildtype calls
548        // elsewhere (the strongest evidence wins).
549        let resistant_call = PncaSnpCall {
550            ref_pos: 0,
551            kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: Some(b'C') },
552            resistance_alts: resistance_alts(),
553        };
554        assert_eq!(
555            is_susceptible_pnca(&[wt_call.clone(), resistant_call]),
556            Some(false)
557        );
558
559        // Weak/uncertain catalogue entry → unknown (not enough to call susceptible or resistant).
560        let uncertain_call = PncaSnpCall {
561            ref_pos: 0,
562            kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: Some(b'G') },
563            resistance_alts: BTreeMap::from([(
564                "G".to_string(),
565                (vec!["pyrazinamide".to_string()], "Uncertain significance".to_string()),
566            )]),
567        };
568        assert_eq!(is_susceptible_pnca(&[uncertain_call]), None);
569
570        // A non-wildtype allele with no catalogue entry at all → uncatalogued, unknown.
571        let uncatalogued_call = PncaSnpCall {
572            ref_pos: 0,
573            kind: PncaCallKind::Nucleotide { wt_base: b'A', query_base: Some(b'T') },
574            resistance_alts: resistance_alts(),
575        };
576        assert_eq!(is_susceptible_pnca(&[uncatalogued_call]), None);
577    }
578
579    #[test]
580    fn test_identify_sequence_pnca_perfect_match() {
581        // Aligning the H37Rv reference against the whole database should return one hit per
582        // reference (H37Rv, bovis AF2122/97, canettii), with H37Rv itself sorted first at 100%
583        // identity and the start/stop codons translated correctly.
584        let (_, _, refseq) = parse_multi_fasta(REF_PNCA).into_iter().next().unwrap();
585        let hits = identify_sequence_pnca(&refseq);
586        assert_eq!(hits.len(), 3);
587        let hit = &hits[0];
588        assert!(hit.identity > 99.0);
589        assert!(!hit.is_reverse);
590    }
591}