cosmic_files/sequencing/
rrs.rs

1use super::{
2    GappedAlignment, REF_MYCO_RRS, SeqIdHit, align_to_ref, base_at_ref_pos,
3    dedup_substring_same_desc, parse_multi_fasta, reverse_complement,
4};
5use serde::{Deserialize, Serialize};
6use std::collections::BTreeMap;
7use std::sync::LazyLock;
8
9/// Maps a reference position to `(wt_base, alts)` where `alts` maps each resistance alt to
10/// `(drugs, E.coli nomenclature)`.
11type RrsSnpMap = BTreeMap<usize, (u8, BTreeMap<u8, (Vec<String>, String)>)>;
12
13#[derive(Debug, Deserialize, Clone)]
14struct ResistanceVariant {
15    #[serde(rename = "Gene")]
16    gene: String,
17    #[serde(rename = "Mutation")]
18    mutation: String,
19    #[serde(rename = "drug")]
20    drug: String,
21    #[serde(rename = "type")]
22    confers: String,
23    #[serde(rename = "E.coli-nomenclature", default)]
24    ecoli_nomenclature: String,
25}
26
27fn parse_rrs_resistance_snps(csv: &str) -> RrsSnpMap {
28    let mut rdr = csv::Reader::from_reader(csv.as_bytes());
29    let mut map: RrsSnpMap = BTreeMap::new();
30    for row in rdr.deserialize::<ResistanceVariant>() {
31        let row = row.unwrap();
32        if row.gene.trim() != "rrs" {
33            continue;
34        }
35        if row.confers.trim() != "drug_resistance" {
36            continue;
37        }
38        let drug = row.drug.trim().to_string();
39        let ecoli = row.ecoli_nomenclature.trim().to_string();
40        let m = row.mutation.trim();
41        if let Some(rest) = m.strip_prefix("n.") {
42            let digits_end = rest
43                .find(|c: char| !c.is_ascii_digit())
44                .unwrap_or(rest.len());
45            if let Ok(pos1) = rest[..digits_end].parse::<usize>() {
46                let after_pos = &rest[digits_end..];
47                if let (Some(wt), Some(alt)) = (
48                    after_pos.bytes().next(),
49                    after_pos
50                        .strip_prefix(|_: char| true)
51                        .and_then(|s| s.strip_prefix('>'))
52                        .and_then(|s| s.bytes().next()),
53                ) {
54                    let entry = map.entry(pos1 - 1).or_insert_with(|| (wt, BTreeMap::new()));
55                    let variant = entry.1.entry(alt).or_insert_with(|| (Vec::new(), ecoli.clone()));
56                    if !variant.0.contains(&drug) {
57                        variant.0.push(drug);
58                    }
59                }
60            }
61        }
62    }
63    map
64}
65
66static RRS_RESISTANCE_SNPS: LazyLock<BTreeMap<&'static str, RrsSnpMap>> = LazyLock::new(|| {
67    [
68        ("Mycobacterium abscessus",      include_str!("../../res/sequences/ntm-db/db/Mycobacterium_abscessus/variants.csv")),
69        ("Mycobacterium avium",          include_str!("../../res/sequences/ntm-db/db/Mycobacterium_avium/variants.csv")),
70        ("Mycobacterium intracellulare", include_str!("../../res/sequences/ntm-db/db/Mycobacterium_intracellulare/variants.csv")),
71    ]
72    .into_iter()
73    .map(|(species, csv)| (species, parse_rrs_resistance_snps(csv)))
74    .collect()
75});
76
77/// Parsed and substring-deduplicated rrs reference sequences, initialised once.
78static RRS_REFS: LazyLock<Vec<(String, String, Vec<u8>)>> =
79    LazyLock::new(|| dedup_substring_same_desc(parse_multi_fasta(REF_MYCO_RRS)));
80
81#[derive(Clone, Debug, Serialize, Deserialize)]
82pub struct RrsSnpCall {
83    /// 0-based position in the rrs reference sequence.
84    pub ref_pos: usize,
85    /// Base observed in the query at this position, or `None` if not covered.
86    pub query_base: Option<u8>,
87    /// Wild-type base at this position.
88    pub wt_base: u8,
89    /// Maps each resistance-conferring alt base to `(drugs, E.coli nomenclature)`.
90    #[serde(with = "super::serde_helpers::u8_btree_map")]
91    pub resistance_bases: BTreeMap<u8, (Vec<String>, String)>,
92}
93
94impl RrsSnpCall {
95    pub fn call_tag(&self) -> String {
96        let ecoli_prefix: Option<&str> = self.resistance_bases.values()
97            .next()
98            .map(|(_, nom)| &nom[..nom.len().saturating_sub(1)]);
99
100        match self.query_base {
101            None => "".to_string(),
102            Some(b) if b == self.wt_base => match ecoli_prefix {
103                Some(p) => format!("{} (wt, E.coli {})", b as char, p),
104                None    => format!("{} (wt)", b as char),
105            },
106            Some(b) if self.resistance_bases.contains_key(&b) => {
107                let (drugs, ecoli) = &self.resistance_bases[&b];
108                format!("{} ({}, E.coli {})", b as char, drugs.join(", "), ecoli)
109            }
110            Some(b) => match ecoli_prefix {
111                Some(p) => format!("{} (mutation, E.coli {}{})", b as char, p, b as char),
112                None    => format!("{} (mutation)", b as char),
113            },
114        }
115    }
116}
117
118/// All rrs susceptibility evidence for one sample, ready for UI display.
119#[derive(Debug, Clone, Default, Serialize, Deserialize)]
120pub struct RrsSusceptibilityCalls {
121    pub snp_calls: Vec<RrsSnpCall>,
122    pub is_susceptible: Option<bool>,
123    pub is_susceptible_rare: Option<bool>,
124}
125
126/// Returns `Some(false)` if any observed SNP base is a resistance-conferring alt, or `None` if
127/// no resistance alt is observed.
128pub fn is_susceptible_rrs(snp_calls: &[RrsSnpCall]) -> Option<bool> {
129    if snp_calls
130        .iter()
131        .any(|c| c.query_base.is_some_and(|b| c.resistance_bases.contains_key(&b)))
132    {
133        Some(false)
134    } else {
135        None
136    }
137}
138
139/// Returns `Some(false)` if any observed SNP base is a resistance-conferring alt, or `None`
140/// otherwise. Parallel to `is_susceptible_rrl_by_snp_calls_rare`: rrs has no position-based
141/// anchor call, so all SNP resistance here is inherently not explained by a position mutation.
142pub fn is_susceptible_rrs_by_snp_calls_rare(snp_calls: &[RrsSnpCall]) -> Option<bool> {
143    is_susceptible_rrs(snp_calls)
144}
145
146fn call_rrs_snps(snps: &RrsSnpMap, ga: &GappedAlignment) -> Vec<RrsSnpCall> {
147    snps.iter()
148        .map(|(&ref_pos, (wt_base, alt_to_drugs))| {
149            let query_base =
150                base_at_ref_pos(&ga.gapped_query, &ga.gapped_ref, ga.ref_start, ref_pos);
151            RrsSnpCall {
152                ref_pos,
153                query_base,
154                wt_base: *wt_base,
155                resistance_bases: alt_to_drugs.clone(),
156            }
157        })
158        .collect()
159}
160
161/// Align `query` against every Mycobacteriaceae 16S rRNA reference in [`REF_MYCO_RRS`] and
162/// return all hits sorted by identity (highest first).
163///
164/// # Algorithm
165///
166/// For each reference sequence (filtered to ≥ [`MIN_RRS_REF_LEN`] bp to avoid inflated scores
167/// from truncated entries):
168///
169/// 1. **Strand**: both forward and reverse-complement alignments are scored via [`best_alignment`];
170///    the strand with the higher identity wins.
171/// 2. **Identity**: gapless (shift-only) alignment — the shorter sequence is slid along the longer
172///    and the best-matching offset is chosen. Identity = matching bases / shorter length.
173/// 3. **SNP calls**: for species that have an entry in [`RRS_RESISTANCE_SNPS`] (accession
174///    contains `':'`), aminoglycoside-resistance SNPs are mapped from reference coordinates to
175///    query coordinates using the alignment offset.
176pub fn identify_sequence_16s(query: &[u8]) -> Vec<SeqIdHit> {
177    let rc = reverse_complement(query);
178    let mut hits: Vec<SeqIdHit> = RRS_REFS
179        .iter()
180        .filter(|(_, _, refseq)| refseq.len() >= super::MIN_RRS_REF_LEN)
181        .map(|(accession, description, refseq)| {
182            let fwd = align_to_ref(query, refseq);
183            let rev = align_to_ref(&rc, refseq);
184            let (ga, is_reverse) = if rev.identity > fwd.identity {
185                (rev, true)
186            } else {
187                (fwd, false)
188            };
189            let rrs_snp_calls = if accession.contains(':') {
190                RRS_RESISTANCE_SNPS
191                    .get(description.as_str())
192                    .map(|snps| call_rrs_snps(snps, &ga))
193                    .unwrap_or_default()
194            } else {
195                vec![]
196            };
197            SeqIdHit {
198                accession: accession.clone(),
199                description: description.clone(),
200                identity: ga.identity,
201                is_reverse,
202                kansasii_gastri_snp_calls: vec![],
203                marinum_ulcerans_snp_calls: vec![],
204                rrl_snp_calls: vec![],
205                rrs_snp_calls,
206                erm41_snp_calls: vec![],
207                pnca_snp_calls: vec![],
208                aligned_query: ga.gapped_query,
209                aligned_ref: ga.gapped_ref,
210                ref_start: ga.ref_start,
211                erm41_position_28_opt: None,
212                rrl_position_2058_2059_opt: None,
213            }
214        })
215        .collect();
216    hits.sort_by(|a, b| {
217        b.identity
218            .partial_cmp(&a.identity)
219            .unwrap_or(std::cmp::Ordering::Equal)
220    });
221    hits
222}