cosmic_files/sequencing/
erm41.rs

1use super::reverse_complement;
2use super::{ERM41_ANCHOR_L, ERM41_ANCHOR_R, ERM41_FWD_END, ERM41_FWD_START};
3use super::{GappedAlignment, SeqIdHit, align_to_ref, base_at_ref_pos, parse_fasta_seq};
4use serde::{Deserialize, Serialize};
5use std::collections::BTreeMap;
6use std::sync::LazyLock;
7
8/// Maps a reference position to `(wt_base, alts)` where `alts` maps each LoF alt base to
9/// `(mutation_label, drug)`.
10type Erm41LofSnpMap = BTreeMap<usize, (u8, BTreeMap<u8, (String, Option<String>)>)>;
11
12#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
13pub enum Erm41Position28 {
14    C28,          // Cytosine — reference allele in some strains
15    T28,          // Thymine  — inducible macrolide resistance (ATCC 19977 type)
16    G28,          // Guanine
17    A28,          // Adenine
18    Undetermined, // Anchor not found in read
19}
20
21impl std::fmt::Display for Erm41Position28 {
22    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
23        match self {
24            Self::C28 => write!(f, "C28"),
25            Self::T28 => write!(f, "T28"),
26            Self::G28 => write!(f, "G28"),
27            Self::A28 => write!(f, "A28"),
28            Self::Undetermined => write!(f, "Position 28 not found"),
29        }
30    }
31}
32
33/// Returns susceptibility for an erm(41) call given the observed LOF SNP calls.
34///
35/// Returns `Some(true)` if any observed base is a LOF alt — a single LOF mutation is sufficient
36/// to render erm(41) non-functional, making the organism susceptible. Otherwise delegates to
37/// `pos_opt.is_susceptible()`, preserving `None` for ambiguous, undetermined, or absent positions.
38/// `pos_opt: None` (anchor not found) still allows LOF SNPs to return `Some(true)`.
39pub fn is_susceptible_erm41(pos_opt: Option<&Erm41Position28>, snp_calls: &[Erm41LofCall]) -> Option<bool> {
40    let has_lof = snp_calls
41        .iter()
42        .any(|c| c.query_base.is_some_and(|b| c.lof_alts.contains_key(&b)));
43    if has_lof {
44        Some(true)
45    } else {
46        pos_opt.and_then(|p| p.is_susceptible())
47    }
48}
49
50/// Returns `Some(true)` if any observed SNP base is a known LOF alt (erm(41) inactivated →
51/// susceptible), or `None` if no LOF alt is observed.
52pub fn is_susceptible_erm41_by_lof_snps(snp_calls: &[Erm41LofCall]) -> Option<bool> {
53    if snp_calls
54        .iter()
55        .any(|c| c.query_base.is_some_and(|b| c.lof_alts.contains_key(&b)))
56    {
57        Some(true)
58    } else {
59        None
60    }
61}
62
63/// Returns susceptibility based on position 28 alone, without considering LOF SNP calls.
64pub fn is_susceptible_erm41_by_position28(pos: &Erm41Position28) -> Option<bool> {
65    pos.is_susceptible()
66}
67
68/// All erm(41) susceptibility evidence for one sample, ready for UI display.
69#[derive(Debug, Clone, Default, Serialize, Deserialize)]
70pub struct Erm41SusceptibilityCalls {
71    pub position_28: Option<Erm41Position28>,
72    pub lof_snp_calls: Vec<Erm41LofCall>,
73    pub is_susceptible: Option<bool>,
74}
75
76impl Erm41Position28 {
77    /// Returns `Some(true)` for susceptible alleles (C28, G28, A28), `Some(false)` for T28
78    /// (inducible resistance), and `None` when the call is ambiguous or undetermined.
79    pub fn is_susceptible(&self) -> Option<bool> {
80        match self {
81            Self::C28 | Self::G28 | Self::A28 => Some(true),
82            Self::T28 => Some(false),
83            Self::Undetermined => None,
84        }
85    }
86
87    /// Returns the uppercase nucleotide byte (`b'C'`, `b'T'`, `b'G'`, `b'A'`) at erm41
88    /// position 28, located by bracketing it between `ERM41_ANCHOR_L` and `ERM41_ANCHOR_R`.
89    /// Returns `None` if either anchor is absent or the read is too short.
90    fn call_position28(read: &[u8]) -> Option<u8> {
91        let anchor_len = ERM41_ANCHOR_L.len();
92        let hit = read
93            .windows(anchor_len)
94            .position(|w| w.eq_ignore_ascii_case(ERM41_ANCHOR_L))?;
95        let pos28 = hit + anchor_len;
96        let base = read.get(pos28).copied()?.to_ascii_uppercase();
97        let right_start = pos28 + 1;
98        let right_end = right_start + ERM41_ANCHOR_R.len();
99        if right_end > read.len() {
100            return None;
101        }
102        let right_ok = read[right_start..right_end].eq_ignore_ascii_case(ERM41_ANCHOR_R);
103        if !right_ok {
104            return None;
105        }
106        Some(base)
107    }
108
109    /// Converts a raw nucleotide byte from [`call_position28`](Self::call_position28) into the
110    /// corresponding variant. Returns `None` for any byte that is not `C`, `T`, `G`, or `A`.
111    fn from_base(base: Option<u8>) -> Option<Self> {
112        match base {
113            Some(b'C') => Some(Self::C28),
114            Some(b'T') => Some(Self::T28),
115            Some(b'G') => Some(Self::G28),
116            Some(b'A') => Some(Self::A28),
117            _ => None,
118        }
119    }
120
121    /// Calls erm41 position 28 from a single read, trying the forward orientation first and
122    /// falling back to the reverse complement. Returns [`Undetermined`](Self::Undetermined) if
123    /// the anchors are not found in either orientation.
124    pub fn from_single_read(read: &[u8]) -> Self {
125        if let Some(call) = Self::from_base(Self::call_position28(read)) {
126            return call;
127        }
128        let rc = reverse_complement(read);
129        Self::from_base(Self::call_position28(&rc)).unwrap_or(Self::Undetermined)
130    }
131}
132
133fn translate_codon(codon: &[u8]) -> u8 {
134    if codon.len() < 3 {
135        return 0;
136    }
137    let c = [
138        codon[0].to_ascii_uppercase(),
139        codon[1].to_ascii_uppercase(),
140        codon[2].to_ascii_uppercase(),
141    ];
142    match &c[..] {
143        b"TTT" | b"TTC" => b'F',
144        b"TTA" | b"TTG" | b"CTT" | b"CTC" | b"CTA" | b"CTG" => b'L',
145        b"ATT" | b"ATC" | b"ATA" => b'I',
146        b"ATG" => b'M',
147        b"GTT" | b"GTC" | b"GTA" | b"GTG" => b'V',
148        b"TCT" | b"TCC" | b"TCA" | b"TCG" | b"AGT" | b"AGC" => b'S',
149        b"CCT" | b"CCC" | b"CCA" | b"CCG" => b'P',
150        b"ACT" | b"ACC" | b"ACA" | b"ACG" => b'T',
151        b"GCT" | b"GCC" | b"GCA" | b"GCG" => b'A',
152        b"TAT" | b"TAC" => b'Y',
153        b"TAA" | b"TAG" | b"TGA" => b'*',
154        b"CAT" | b"CAC" => b'H',
155        b"CAA" | b"CAG" => b'Q',
156        b"AAT" | b"AAC" => b'N',
157        b"AAA" | b"AAG" => b'K',
158        b"GAT" | b"GAC" => b'D',
159        b"GAA" | b"GAG" => b'E',
160        b"TGT" | b"TGC" => b'C',
161        b"TGG" => b'W',
162        b"CGT" | b"CGC" | b"CGA" | b"CGG" | b"AGA" | b"AGG" => b'R',
163        b"GGT" | b"GGC" | b"GGA" | b"GGG" => b'G',
164        _ => 0,
165    }
166}
167
168fn three_letter_to_one(aa3: &str) -> u8 {
169    match aa3.to_ascii_uppercase().as_str() {
170        "ALA" => b'A',
171        "ARG" => b'R',
172        "ASN" => b'N',
173        "ASP" => b'D',
174        "CYS" => b'C',
175        "GLN" => b'Q',
176        "GLU" => b'E',
177        "GLY" => b'G',
178        "HIS" => b'H',
179        "ILE" => b'I',
180        "LEU" => b'L',
181        "LYS" => b'K',
182        "MET" => b'M',
183        "PHE" => b'F',
184        "PRO" => b'P',
185        "SER" => b'S',
186        "THR" => b'T',
187        "TRP" => b'W',
188        "TYR" => b'Y',
189        "VAL" => b'V',
190        _ => 0,
191    }
192}
193
194fn empty_string_as_none<'de, D>(de: D) -> Result<Option<String>, D::Error>
195where
196    D: serde::Deserializer<'de>,
197{
198    let s = Option::<String>::deserialize(de)?;
199    Ok(s.filter(|s| !s.trim().is_empty()))
200}
201
202#[derive(Debug, Deserialize)]
203struct Erm41LofRow {
204    #[serde(rename = "Gene")]
205    gene: String,
206    #[serde(rename = "Mutation")]
207    mutation: String,
208    #[serde(rename = "type")]
209    variant_type: String,
210    /// `None` means no drug resistance annotated — interpret as susceptible (`Some(true)`).
211    #[serde(rename = "drug", default, deserialize_with = "empty_string_as_none")]
212    drug: Option<String>,
213}
214
215/// Parse loss-of-function SNPs for erm(41) from a ntm-db resistance CSV.
216///
217/// Returns a map keyed by **0-based nucleotide position** in the erm(41) reference sequence.
218/// Each value is a tuple:
219/// - `.0` — the wild-type base at that position (`u8` ASCII, e.g. `b'A'`)
220/// - `.1` — `lof_alts`: a map from **alternative base** (`u8` ASCII) to a
221///   `(mutation_label, drug)` pair, where `mutation_label` is the HGVS protein annotation
222///   (e.g. `"p.Trp28*"`) and `drug` is the affected drug (`None` if absent in the CSV,
223///   interpreted as susceptible).
224///   Only single-nucleotide substitutions that produce the annotated LoF amino-acid change
225///   (stop codon or annotated replacement) are included; synonymous changes are skipped.
226fn parse_erm41_lof_snps(
227    csv: &str,
228    refseq: &[u8],
229) -> Erm41LofSnpMap {
230    let mut rdr = csv::Reader::from_reader(csv.as_bytes());
231    let mut map: Erm41LofSnpMap = BTreeMap::new();
232    for row in rdr.deserialize::<Erm41LofRow>() {
233        let row = match row {
234            Ok(r) => r,
235            Err(_) => continue,
236        };
237        if row.gene.trim() != "erm(41)" || row.variant_type.trim() != "loss_of_function" {
238            continue;
239        }
240        let m = row.mutation.trim();
241        let rest = match m.strip_prefix("p.") {
242            Some(r) if r.len() >= 4 => r,
243            _ => continue,
244        };
245        let digits_end = rest[3..]
246            .find(|c: char| !c.is_ascii_digit())
247            .map(|i| i + 3)
248            .unwrap_or(rest.len());
249        if digits_end <= 3 {
250            continue;
251        }
252        let prot_pos: usize = match rest[3..digits_end].parse() {
253            Ok(p) => p,
254            Err(_) => continue,
255        };
256        let target_str = &rest[digits_end..];
257        let target_aa = if target_str == "*" {
258            b'*'
259        } else if target_str.len() >= 3 {
260            three_letter_to_one(&target_str[..3])
261        } else {
262            continue;
263        };
264        if target_aa == 0 {
265            continue;
266        }
267        let codon_start = (prot_pos - 1) * 3;
268        if codon_start + 3 > refseq.len() {
269            continue;
270        }
271        let ref_codon = [
272            refseq[codon_start].to_ascii_uppercase(),
273            refseq[codon_start + 1].to_ascii_uppercase(),
274            refseq[codon_start + 2].to_ascii_uppercase(),
275        ];
276        for codon_pos in 0..3usize {
277            let wt_base = ref_codon[codon_pos];
278            for &alt in b"ACGT" {
279                if alt == wt_base {
280                    continue;
281                }
282                let mut alt_codon = ref_codon;
283                alt_codon[codon_pos] = alt;
284                if translate_codon(&alt_codon) == target_aa {
285                    let ref_pos = codon_start + codon_pos;
286                    let entry = map
287                        .entry(ref_pos)
288                        .or_insert_with(|| (wt_base, BTreeMap::new()));
289                    entry
290                        .1
291                        .entry(alt)
292                        .or_insert_with(|| (m.to_string(), row.drug.clone()));
293                }
294            }
295        }
296    }
297    map
298}
299
300static ERM41_LOF_SNPS: LazyLock<BTreeMap<&'static str, Erm41LofSnpMap>> = LazyLock::new(|| {
301    [(
302        super::DESC_ABSCESSUS,
303        include_str!("../../res/sequences/ntm-db/db/Mycobacterium_abscessus/variants.csv"),
304        super::REF_ERM41_ABSCESSUS,
305    )]
306    .into_iter()
307    .map(|(desc, csv, fasta)| (desc, parse_erm41_lof_snps(csv, &parse_fasta_seq(fasta))))
308    .collect()
309});
310
311#[derive(Clone, Debug, Serialize, Deserialize)]
312pub struct Erm41LofCall {
313    pub ref_pos: usize,
314    pub query_base: Option<u8>,
315    pub wt_base: u8,
316    /// Maps each loss-of-function alt base to `(mutation_label, drug)`.
317    #[serde(with = "super::serde_helpers::u8_btree_map")]
318    pub lof_alts: BTreeMap<u8, (String, Option<String>)>,
319}
320
321impl Erm41LofCall {
322    pub fn call_tag(&self) -> String {
323        match self.query_base {
324            None => "".to_string(),
325            Some(b) if b == self.wt_base => String::new(),
326            Some(b) if self.lof_alts.contains_key(&b) => {
327                format!("{}{}{} ({})", self.wt_base as char, self.ref_pos, b as char, self.lof_alts[&b].0)
328            }
329            Some(b) => format!("{}{}{}", self.wt_base as char, self.ref_pos, b as char),
330        }
331    }
332}
333
334/// Maps each known loss-of-function SNP position to the base observed in the query sequence,
335/// adjusting for the alignment offset between reference and query coordinates.
336///
337/// `snps` maps each reference position to `(wt_base, lof_alts)`, where `lof_alts` maps each
338/// loss-of-function alternate base to a `(mutation_label, drug)` pair.
339fn call_erm41_lof_snps(
340    snps: &Erm41LofSnpMap,
341    ga: &GappedAlignment,
342) -> Vec<Erm41LofCall> {
343    snps.iter()
344        .map(|(&ref_pos, (wt_base, lof_alts))| {
345            let query_base =
346                base_at_ref_pos(&ga.gapped_query, &ga.gapped_ref, ga.ref_start, ref_pos);
347            Erm41LofCall {
348                ref_pos,
349                query_base,
350                wt_base: *wt_base,
351                lof_alts: lof_alts.clone(),
352            }
353        })
354        .collect()
355}
356
357pub(super) fn find_erm41_display_window(
358    bases: &[u8],
359    peak_locs: &[u16],
360) -> Option<(u16, u16, bool, u16)> {
361    // Flanking bases to show: 9 before pos28, 11 after (pos28 is the first of the 11)
362    const LEFT: usize = 9;
363    const RIGHT: usize = 11;
364
365    let anchor_len: u16 = ERM41_ANCHOR_L.len() as u16;
366
367    // --- Forward orientation: ANCHOR_L directly in PBAS ---
368    if let Some(hit) = bases
369        .windows(anchor_len as usize)
370        .position(|w| w.eq_ignore_ascii_case(ERM41_ANCHOR_L))
371    {
372        let pos28 = hit + anchor_len as usize;
373        if let Some(window) = super::scan_window(pos28, LEFT, RIGHT, peak_locs) {
374            return Some((window.0, window.1, false, pos28 as u16));
375        }
376    }
377
378    let rc_anchor_r: Vec<u8> = reverse_complement(ERM41_ANCHOR_R);
379    if let Some(hit) = bases
380        .windows(rc_anchor_r.len())
381        .position(|w| w.eq_ignore_ascii_case(&rc_anchor_r))
382    {
383        let pos28_comp = hit + rc_anchor_r.len();
384        if let Some(window) = super::scan_window(pos28_comp, RIGHT, LEFT, peak_locs) {
385            return Some((window.0, window.1, true, pos28_comp as u16));
386        }
387    }
388
389    None
390}
391
392/// Unlike identify_sequence_hsp65() and identify_sequence_rrl_ntm(), this only uses reference sequences extracted via sequences.toml and not the database fetched via fetch_myco_sequences().
393pub fn identify_sequence_erm41(query: &[u8]) -> Vec<SeqIdHit> {
394    let query = super::trim_start_end(query, ERM41_FWD_START, ERM41_FWD_END);
395    let rc = reverse_complement(query);
396    let erm41_position_28 = Erm41Position28::from_single_read(query);
397
398    let refs: &[(&str, &str, &str)] = &[
399        (
400            super::REF_ERM41_ABSCESSUS,
401            "REF_ERM41_ABSCESSUS",
402            super::DESC_ABSCESSUS,
403        ),
404        (
405            super::REF_ERM41_BOLLETII,
406            "REF_ERM41_BOLLETII",
407            super::DESC_BOLLETII,
408        ),
409        (
410            super::REF_ERM41_MASSILENSE,
411            "REF_ERM41_MASSILENSE",
412            super::DESC_MASSILIENSE,
413        ),
414    ];
415
416    let mut hits: Vec<SeqIdHit> = refs
417        .iter()
418        .map(|(fasta, accession, description)| {
419            let refseq = parse_fasta_seq(fasta);
420            let fwd = align_to_ref(query, &refseq);
421            let rev = align_to_ref(&rc, &refseq);
422            let (ga, is_reverse) = if rev.identity > fwd.identity {
423                (rev, true)
424            } else {
425                (fwd, false)
426            };
427            // Abscessus LOF SNP positions apply to all three subspecies — bolletii and
428            // massiliense are sequence-similar enough, and no separate variants.csv exists for them.
429            let erm41_snp_calls = ERM41_LOF_SNPS
430                .get(super::DESC_ABSCESSUS)
431                .map(|snps| call_erm41_lof_snps(snps, &ga))
432                .unwrap_or_default();
433            SeqIdHit {
434                accession: accession.to_string(),
435                description: description.to_string(),
436                identity: ga.identity,
437                is_reverse,
438                kansasii_gastri_snp_calls: vec![],
439                marinum_ulcerans_snp_calls: vec![],
440                rrl_snp_calls: vec![],
441                rrs_snp_calls: vec![],
442                erm41_snp_calls,
443                pnca_snp_calls: vec![],
444                aligned_query: ga.gapped_query,
445                aligned_ref: ga.gapped_ref,
446                ref_start: ga.ref_start,
447                erm41_position_28_opt: Some(erm41_position_28),
448                rrl_position_2058_2059_opt: None,
449            }
450        })
451        .collect();
452    hits.sort_by(|a, b| {
453        b.identity
454            .partial_cmp(&a.identity)
455            .unwrap_or(std::cmp::Ordering::Equal)
456    });
457    hits
458}