cosmic_files/sequencing/
rrl.rs

1use super::reverse_complement;
2use super::{RRL_ANCHOR_L, RRL_ANCHOR_R, REF_MYCO_RRL};
3use super::{GappedAlignment, SeqIdHit, align_to_ref, base_at_ref_pos, dedup_substring_same_desc, parse_multi_fasta};
4use serde::{Deserialize, Serialize};
5use std::collections::BTreeMap;
6use std::sync::LazyLock;
7
8/// Maps a reference position to `(wt_base, alts)` where `alts` maps each resistance alt to
9/// `(drugs, E.coli nomenclature)`.
10type RrlSnpMap = BTreeMap<usize, (u8, BTreeMap<u8, (Vec<String>, String)>)>;
11
12
13#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
14pub enum RrlPosition2058_2059 {
15    SusceptibleWildtype, // A2058 and A2059
16    ResistanceConferringMutation, // Any mutation at 2058 or 2059 that is not wildtype.
17    Undetermined,
18}
19
20
21impl std::fmt::Display for RrlPosition2058_2059 {
22    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
23        match self {
24            Self::SusceptibleWildtype => write!(f, "A2058/A2059 wildtype"),
25            Self::ResistanceConferringMutation => write!(f, "Mutation 2058/2059"),
26            Self::Undetermined => write!(f, "Undetermined"),
27        }
28    }
29}
30
31impl RrlPosition2058_2059 {
32    /// Returns `Some(true)` for wildtype (susceptible), `Some(false)` for a
33    /// resistance-conferring mutation, and `None` when the call is undetermined.
34    pub fn is_susceptible(&self) -> Option<bool> {
35        match self {
36            Self::SusceptibleWildtype => Some(true),
37            Self::ResistanceConferringMutation => Some(false),
38            Self::Undetermined => None,
39        }
40    }
41
42    fn call_position_2058_2059(read: &[u8]) -> Option<(u8, u8)> {
43        let anchor_len = RRL_ANCHOR_L.len();
44        let hit = read
45            .windows(anchor_len)
46            .position(|w| w.eq_ignore_ascii_case(RRL_ANCHOR_L))?;
47        let pos2058 = hit + anchor_len;
48        let base2058 = read.get(pos2058).copied()?.to_ascii_uppercase();
49        let pos2059 = pos2058 + 1;
50        let base2059 = read.get(pos2059).copied()?.to_ascii_uppercase();
51        let right_start = pos2059 + 1;
52        let right_end = right_start + RRL_ANCHOR_R.len();
53        if right_end > read.len() {
54            return None;
55        }
56        let right_ok = read[right_start..right_end].eq_ignore_ascii_case(RRL_ANCHOR_R);
57        if !right_ok {
58            return None;
59        }
60        Some((base2058, base2059))
61    }
62
63    fn from_bases(bases: Option<(u8, u8)>) -> Option<Self> {
64        match bases {
65            Some((b'A', b'A')) => Some(Self::SusceptibleWildtype),
66            Some((_, _)) => Some(Self::ResistanceConferringMutation),
67            None => None,
68        }
69    }
70
71    pub fn from_single_read(read: &[u8]) -> Option<Self> {
72        if let Some(call) = Self::from_bases(Self::call_position_2058_2059(read)) {
73            return Some(call);
74        }
75        let rc = reverse_complement(read);
76        Self::from_bases(Self::call_position_2058_2059(&rc))
77    }
78}
79
80/// Returns `Some(true)` for wildtype (susceptible), `Some(false)` for a resistance-conferring
81/// mutation, and `None` when the call is undetermined.
82///
83/// If any `snp_calls` entry shows a resistance-conferring alt base, returns `Some(false)`
84/// regardless of `pos_opt`. Otherwise delegates to the position-based call.
85/// `pos_opt: None` (anchor not found) still allows SNP calls to return `Some(false)`.
86pub fn is_susceptible_rrl(pos_opt: Option<&RrlPosition2058_2059>, snp_calls: &[RrlSnpCall]) -> Option<bool> {
87    let has_resistance = snp_calls
88        .iter()
89        .any(|c| c.query_base.is_some_and(|b| c.resistance_bases.contains_key(&b)));
90    if has_resistance {
91        Some(false)
92    } else {
93        pos_opt.and_then(|p| p.is_susceptible())
94    }
95}
96
97/// Returns susceptibility based on position 2058/2059 alone, without considering SNP calls.
98pub fn is_susceptible_rrl_by_position(pos: &RrlPosition2058_2059) -> Option<bool> {
99    pos.is_susceptible()
100}
101
102/// Returns `Some(false)` if any observed SNP base is a resistance-conferring alt, or `None` if
103/// no resistance alt is observed.
104pub fn is_susceptible_rrl_by_snp_calls(snp_calls: &[RrlSnpCall]) -> Option<bool> {
105    if snp_calls
106        .iter()
107        .any(|c| c.query_base.is_some_and(|b| c.resistance_bases.contains_key(&b)))
108    {
109        Some(false)
110    } else {
111        None
112    }
113}
114
115/// Returns `Some(false)` only when position 2058/2059 is wildtype (`Some(true)`) **and** a
116/// resistance-conferring SNP alt is observed elsewhere; otherwise returns `None`.
117/// This captures rare mutations where resistance is not due to 2058/2059.
118pub fn is_susceptible_rrl_by_snp_calls_rare(pos_opt: Option<&RrlPosition2058_2059>, snp_calls: &[RrlSnpCall]) -> Option<bool> {
119    if is_susceptible_rrl_by_position(pos_opt?) != Some(true) {
120        return None;
121    }
122    is_susceptible_rrl_by_snp_calls(snp_calls)
123}
124
125/// All rrl susceptibility evidence for one sample, ready for UI display.
126#[derive(Debug, Clone, Default, Serialize, Deserialize)]
127pub struct RrlSusceptibilityCalls {
128    pub position_2058_2059: Option<RrlPosition2058_2059>,
129    pub snp_calls: Vec<RrlSnpCall>,
130    pub is_susceptible: Option<bool>,
131    pub is_susceptible_rare: Option<bool>,
132}
133
134
135
136#[derive(Debug, Deserialize, Clone)]
137struct ResistanceVariant {
138    #[serde(rename = "Gene")]
139    gene: String,
140    #[serde(rename = "Mutation")]
141    mutation: String,
142    #[serde(rename = "drug")]
143    drug: String,
144    #[serde(rename = "type")]
145    confers: String,
146    #[serde(rename = "E.coli-nomenclature", default)]
147    ecoli_nomenclature: String,
148}
149
150fn parse_rrl_resistance_snps(csv: &str) -> RrlSnpMap {
151    let mut rdr = csv::Reader::from_reader(csv.as_bytes());
152    let mut map: RrlSnpMap = BTreeMap::new();
153    for row in rdr.deserialize::<ResistanceVariant>() {
154        let row = row.unwrap();
155        if row.gene.trim() != "rrl" {
156            continue;
157        }
158        if row.confers.trim() != "drug_resistance" {
159            continue;
160        }
161        let drug = row.drug.trim().to_string();
162        let ecoli = row.ecoli_nomenclature.trim().to_string();
163        let m = row.mutation.trim();
164        if let Some(rest) = m.strip_prefix("n.") {
165            let digits_end = rest
166                .find(|c: char| !c.is_ascii_digit())
167                .unwrap_or(rest.len());
168            if let Ok(pos1) = rest[..digits_end].parse::<usize>() {
169                let after_pos = &rest[digits_end..];
170                if let (Some(wt), Some(alt)) = (
171                    after_pos.bytes().next(),
172                    after_pos
173                        .strip_prefix(|_: char| true)
174                        .and_then(|s| s.strip_prefix('>'))
175                        .and_then(|s| s.bytes().next()),
176                ) {
177                    let entry = map.entry(pos1 - 1).or_insert_with(|| (wt, BTreeMap::new()));
178                    let variant = entry.1.entry(alt).or_insert_with(|| (Vec::new(), ecoli.clone()));
179                    if !variant.0.contains(&drug) {
180                        variant.0.push(drug);
181                    }
182                }
183            }
184        }
185    }
186    map
187}
188
189/// Parsed and substring-deduplicated rrl reference sequences, initialised once.
190static RRL_REFS: LazyLock<Vec<(String, String, Vec<u8>)>> =
191    LazyLock::new(|| dedup_substring_same_desc(parse_multi_fasta(REF_MYCO_RRL)));
192
193static RRL_RESISTANCE_SNPS: LazyLock<BTreeMap<&'static str, RrlSnpMap>> = LazyLock::new(|| {
194    [
195        ("Mycobacterium abscessus", include_str!("../../res/sequences/ntm-db/db/Mycobacterium_abscessus/variants.csv")),
196        ("Mycobacterium avium",          include_str!("../../res/sequences/ntm-db/db/Mycobacterium_avium/variants.csv")),
197        ("Mycobacterium intracellulare", include_str!("../../res/sequences/ntm-db/db/Mycobacterium_intracellulare/variants.csv")),
198    ]
199    .into_iter()
200    .map(|(species, csv)| (species, parse_rrl_resistance_snps(csv)))
201    .collect()
202});
203
204
205#[derive(Clone, Debug, Serialize, Deserialize)]
206pub struct RrlSnpCall {
207    /// 0-based position in the rrl reference sequence.
208    pub ref_pos: usize,
209    /// Base observed in the query at this position, or `None` if not covered.
210    pub query_base: Option<u8>,
211    /// Wild-type base at this position.
212    pub wt_base: u8,
213    /// Maps each resistance-conferring alt base to `(drugs, E.coli nomenclature)`.
214    #[serde(with = "super::serde_helpers::u8_btree_map")]
215    pub resistance_bases: BTreeMap<u8, (Vec<String>, String)>,
216}
217
218impl RrlSnpCall {
219    pub fn call_tag(&self) -> String {
220        // E.coli position prefix shared by all alts at this position (e.g. "A2059").
221        let ecoli_prefix: Option<&str> = self.resistance_bases.values()
222            .next()
223            .map(|(_, nom)| &nom[..nom.len().saturating_sub(1)]);
224
225        match self.query_base {
226            None => "".to_string(),
227            Some(b) if b == self.wt_base => match ecoli_prefix {
228                Some(p) => format!("{} (wt, E.coli {})", b as char, p),
229                None    => format!("{} (wt)", b as char),
230            },
231            Some(b) if self.resistance_bases.contains_key(&b) => {
232                let (drugs, ecoli) = &self.resistance_bases[&b];
233                format!("{} ({}, E.coli {})", b as char, drugs.join(", "), ecoli)
234            }
235            Some(b) => match ecoli_prefix {
236                Some(p) => format!("{} (mutation, E.coli {}{})", b as char, p, b as char),
237                None    => format!("{} (mutation)", b as char),
238            },
239        }
240    }
241}
242
243fn call_rrl_snps(snps: &RrlSnpMap, ga: &GappedAlignment) -> Vec<RrlSnpCall> {
244    snps.iter()
245        .map(|(&ref_pos, (wt_base, alt_to_drugs))| {
246            let query_base =
247                base_at_ref_pos(&ga.gapped_query, &ga.gapped_ref, ga.ref_start, ref_pos);
248            RrlSnpCall {
249                ref_pos,
250                query_base,
251                wt_base: *wt_base,
252                resistance_bases: alt_to_drugs.clone(),
253            }
254        })
255        .collect()
256}
257
258
259/// Locate the SNP site in a Sanger read and return the scan-coordinate window
260/// to display around it.
261///
262/// Returns `Some((start_scan, end_scan, is_reverse_complement, snp_base_idx))`
263/// or `None` if the anchor sequence cannot be found or the window falls
264/// outside the trace.
265///
266/// # How it works
267///
268/// The rrl SNP of interest sits at a fixed position relative to two flanking
269/// anchor sequences (`RRL_ANCHOR_L` on the left, `RRL_ANCHOR_R` on the right).
270/// The function tries both read orientations:
271///
272/// 1. **Forward** — searches `bases` for `RRL_ANCHOR_L`.  If found, the SNP
273///    position is the base immediately after the anchor.  A display window of
274///    9 bases to the left and 10 to the right is converted to scan coordinates
275///    via [`super::scan_window`], and `is_reverse_complement` is set to
276///    `false`.
277///
278/// 2. **Reverse-complement** — searches `bases` for the reverse complement of
279///    `RRL_ANCHOR_R`.  If found, the SNP position is again the base
280///    immediately after that match (which corresponds to the SNP on the
281///    complementary strand).  The left/right margins are swapped (10 left, 9
282///    right) so the displayed window covers the same biological region.
283///    `is_reverse_complement` is set to `true`.
284///
285/// The forward orientation is tried first; the reverse-complement is only
286/// attempted if the forward search fails.
287pub(super) fn find_rrl_ntm_display_window(
288    bases: &[u8],
289    peak_locs: &[u16],
290) -> Option<(u16, u16, bool, u16)> {
291    const LEFT: usize = 9;
292    const RIGHT: usize = 10;
293
294    let anchor_len: u16 = RRL_ANCHOR_L.len() as u16;
295
296    if let Some(hit) = bases
297        .windows(anchor_len as usize)
298        .position(|w| w.eq_ignore_ascii_case(RRL_ANCHOR_L))
299    {
300        let snp_pos = hit + anchor_len as usize;
301        if let Some(window) = super::scan_window(snp_pos, LEFT, RIGHT, peak_locs) {
302            return Some((window.0, window.1, false, snp_pos as u16));
303        }
304    }
305
306    let rc_anchor_r: Vec<u8> = reverse_complement(RRL_ANCHOR_R);
307    if let Some(hit) = bases
308        .windows(rc_anchor_r.len())
309        .position(|w| w.eq_ignore_ascii_case(&rc_anchor_r))
310    {
311        let snp_pos_comp = hit + rc_anchor_r.len();
312        if let Some(window) = super::scan_window(snp_pos_comp, RIGHT, LEFT, peak_locs) {
313            return Some((window.0, window.1, true, snp_pos_comp as u16));
314        }
315    }
316
317    None
318}
319
320/// Database was generated by filtering NCBI Nucleotide for Mycobacteriaceae[Organism] AND (23S ribosomal RNA[Title] OR rrl[Gene Name]) AND 1000:3200[SLEN] AND type_material[Filter] (via fetch_myco_sequences() in build.rs).
321/// Added to this database are the rrl sequences found in [ntm-db](https://github.com/pathogen-profiler/ntm-db) (via extract_ntm_db_sequences() in build.rs).
322pub fn identify_sequence_rrl_ntm(query: &[u8]) -> Vec<SeqIdHit> {
323    let rc = reverse_complement(query);
324    let rrl_position_2058_2059_opt = RrlPosition2058_2059::from_single_read(query);
325
326    let mut hits: Vec<SeqIdHit> = RRL_REFS
327        .iter()
328        .filter(|(_, _, refseq)| refseq.len() >= super::MIN_RRL_REF_LEN)
329        .map(|(accession, description, refseq)| {
330            let fwd = align_to_ref(query, refseq);
331            let rev = align_to_ref(&rc, refseq);
332            let (ga, is_reverse) = if rev.identity > fwd.identity {
333                (rev, true)
334            } else {
335                (fwd, false)
336            };
337            let rrl_snp_calls = if accession.contains(':') {
338                RRL_RESISTANCE_SNPS
339                    .get(description.as_str())
340                    .map(|snps| call_rrl_snps(snps, &ga))
341                    .unwrap_or_default()
342            } else {
343                vec![]
344            };
345            SeqIdHit {
346                accession: accession.clone(),
347                description: description.clone(),
348                identity: ga.identity,
349                is_reverse,
350                kansasii_gastri_snp_calls: vec![],
351                marinum_ulcerans_snp_calls: vec![],
352                rrl_snp_calls,
353                rrs_snp_calls: vec![],
354                erm41_snp_calls: vec![],
355                pnca_snp_calls: vec![],
356                aligned_query: ga.gapped_query,
357                aligned_ref: ga.gapped_ref,
358                ref_start: ga.ref_start,
359                erm41_position_28_opt: None,
360                rrl_position_2058_2059_opt,
361            }
362        })
363        .collect();
364
365    hits.sort_by(|a, b| {
366        b.identity
367            .partial_cmp(&a.identity)
368            .unwrap_or(std::cmp::Ordering::Equal)
369    });
370    hits
371}