cosmic_files/sequencing/
hsp65.rs

1use super::reverse_complement;
2use super::{GappedAlignment, REF_MYCO_HSP65, SeqIdHit, align_to_ref, base_at_ref_pos, dedup_substring_same_desc, parse_multi_fasta};
3use super::{KANSASII_GASTRI_ACCS, MARINUM_ULCERANS_ACCS};
4use serde::{Deserialize, Serialize};
5use std::sync::LazyLock;
6
7const KANSASII_GASTRI_SNPS: &[(usize, u8, u8)] = &[
8    // (0-based ref pos, gastri_base, kansasii_base)
9    (100, b'C', b'T'),
10    (130, b'C', b'T'),
11    (148, b'G', b'A'),
12    (193, b'G', b'C'),
13    (283, b'C', b'G'),
14    (304, b'T', b'C'),
15    (349, b'G', b'C'),
16    (399, b'G', b'A'),
17];
18
19const MARINUM_ULCERANS_SNPS: &[(usize, u8, u8)] = &[
20    // (0-based ref pos, marinum_base, ulcerans_base)
21    (20, b'C', b'T'),
22    (204, b'T', b'C'),
23    (212, b'G', b'A'),
24    (482, b'T', b'C'),
25    (500, b'G', b'C'),
26];
27
28
29fn call_kansasii_gastri_snps(ga: &GappedAlignment) -> Vec<KansasiiGastriSnpCall> {
30    KANSASII_GASTRI_SNPS
31        .iter()
32        .filter_map(|&(ref_pos, gastri_base, kansasii_base)| {
33            let query_base =
34                base_at_ref_pos(&ga.gapped_query, &ga.gapped_ref, ga.ref_start, ref_pos)?;
35            Some(KansasiiGastriSnpCall {
36                ref_pos,
37                query_base,
38                gastri_base,
39                kansasii_base,
40            })
41        })
42        .collect()
43}
44
45fn call_marinum_ulcerans_snps(ga: &GappedAlignment) -> Vec<MarinumUlceransSnpCall> {
46    MARINUM_ULCERANS_SNPS
47        .iter()
48        .filter_map(|&(ref_pos, marinum_base, ulcerans_base)| {
49            let query_base =
50                base_at_ref_pos(&ga.gapped_query, &ga.gapped_ref, ga.ref_start, ref_pos)?;
51            Some(MarinumUlceransSnpCall {
52                ref_pos,
53                query_base,
54                marinum_base,
55                ulcerans_base,
56            })
57        })
58        .collect()
59}
60
61/// Parsed and substring-deduplicated hsp65 reference sequences, initialised once.
62static HSP65_REFS: LazyLock<Vec<(String, String, Vec<u8>)>> =
63    LazyLock::new(|| dedup_substring_same_desc(parse_multi_fasta(REF_MYCO_HSP65)));
64
65/// Trim hsp65 amplification primers from a sequencing read.
66///
67/// Unlike [`trim_start_end`](super::trim_start_end), which takes a single primer per boundary,
68/// hsp65 amplification uses two alternative primer pairs (Telenti and Patel sets), so each
69/// boundary is defined by a *set* of candidates — the earliest match wins for the start and the
70/// latest for the end.
71pub fn trim_hsp65_primers(seq: &[u8]) -> Vec<u8> {
72    const FWD_START: &[&[u8]] = &[b"ATGGTGTGTCCATCGCCAAG", b"GAGGACCCGTACGAGAAGATCGGCGCTGA"];
73    const FWD_END: &[&[u8]] = &[b"GAGCTCACCGAGGGTATGCG", b"CGCTGTCCACCCTGGTCGTC"];
74
75    // On an RC-strand query the start primers appear as RC(end primers) and vice versa,
76    // so we search both orientations for each boundary.
77    let rc_start: Vec<Vec<u8>> = FWD_END.iter().map(|p| reverse_complement(p)).collect();
78    let rc_end: Vec<Vec<u8>> = FWD_START.iter().map(|p| reverse_complement(p)).collect();
79
80    let find_start = |p: &[u8]| seq.windows(p.len()).position(|w| w.eq_ignore_ascii_case(p));
81    let find_end = |p: &[u8]| {
82        seq.windows(p.len())
83            .rposition(|w| w.eq_ignore_ascii_case(p))
84            .map(|pos| pos + p.len())
85    };
86
87    let start = FWD_START
88        .iter()
89        .map(|p| p as &[u8])
90        .chain(rc_start.iter().map(|p| p.as_slice()))
91        .filter_map(find_start)
92        .min()
93        .unwrap_or(0);
94
95    let end = FWD_END
96        .iter()
97        .map(|p| p as &[u8])
98        .chain(rc_end.iter().map(|p| p.as_slice()))
99        .filter_map(find_end)
100        .max()
101        .unwrap_or(seq.len());
102
103    seq[start..end.min(seq.len())].to_vec()
104}
105
106/// Database was generated by filtering NCBI Nucleotide for Mycobacteriaceae[Organism] AND (hsp65[Gene Name] OR groEL2[Gene Name]) AND 400:3000[SLEN] AND type_material[Filter] (via fetch_myco_sequences() in build.rs).  
107/// Added to this database are the hsp65 sequences found in [ntm-db](https://github.com/pathogen-profiler/ntm-db) (via extract_ntm_db_sequences() in build.rs).  
108pub fn identify_sequence_hsp65(query: &[u8]) -> Vec<SeqIdHit> {
109    let query = trim_hsp65_primers(query);
110    let query = query.as_slice();
111    let rc = reverse_complement(query);
112
113    let mut hits: Vec<SeqIdHit> = HSP65_REFS
114        .iter()
115        .map(|(raw_acc, description, refseq)| {
116            // Strip version suffix so SNP dispatch and is_kansasii() etc. work correctly.
117            let accession = raw_acc
118                .rsplit_once('.')
119                .map(|(base, _)| base.to_string())
120                .unwrap_or_else(|| raw_acc.clone());
121            let fwd = align_to_ref(query, refseq);
122            let rev = align_to_ref(&rc, refseq);
123            let (ga, is_reverse) = if rev.identity > fwd.identity {
124                (rev, true)
125            } else {
126                (fwd, false)
127            };
128            let kansasii_gastri_snp_calls =
129                if KANSASII_GASTRI_ACCS.contains(&accession.as_str()) {
130                    call_kansasii_gastri_snps(&ga)
131                } else {
132                    vec![]
133                };
134            let marinum_ulcerans_snp_calls =
135                if MARINUM_ULCERANS_ACCS.contains(&accession.as_str()) {
136                    call_marinum_ulcerans_snps(&ga)
137                } else {
138                    vec![]
139                };
140            SeqIdHit {
141                accession,
142                description: description.clone(),
143                identity: ga.identity,
144                is_reverse,
145                kansasii_gastri_snp_calls,
146                marinum_ulcerans_snp_calls,
147                rrl_snp_calls: vec![],
148                rrs_snp_calls: vec![],
149                erm41_snp_calls: vec![],
150                pnca_snp_calls: vec![],
151                aligned_query: ga.gapped_query,
152                aligned_ref: ga.gapped_ref,
153                ref_start: ga.ref_start,
154                erm41_position_28_opt: None,
155                rrl_position_2058_2059_opt: None,
156            }
157        })
158        .collect();
159
160    hits.sort_by(|a, b| {
161        b.identity
162            .partial_cmp(&a.identity)
163            .unwrap_or(std::cmp::Ordering::Equal)
164    });
165    hits
166}
167
168
169/// A single diagnostic SNP position in the kansasii/gastri comparison.
170#[derive(Clone, Debug, Serialize, Deserialize)]
171pub struct KansasiiGastriSnpCall {
172    /// 0-based position in the reference sequence.
173    pub ref_pos: usize,
174    /// Base observed in the query at this position (uppercase ASCII).
175    pub query_base: u8,
176    /// Expected base for M. gastri (uppercase ASCII).
177    pub gastri_base: u8,
178    /// Expected base for M. kansasii (uppercase ASCII).
179    pub kansasii_base: u8,
180}
181
182impl KansasiiGastriSnpCall {
183    pub fn is_gastri(&self) -> bool {
184        self.query_base == self.gastri_base
185    }
186    pub fn is_kansasii(&self) -> bool {
187        self.query_base == self.kansasii_base
188    }
189    /// Human-readable species tag: "M. gastri", "M. kansasii", or "?".
190    pub fn species_tag(&self) -> &'static str {
191        if self.is_gastri() {
192            "M. gastri"
193        } else if self.is_kansasii() {
194            "M. kansasii"
195        } else {
196            "?"
197        }
198    }
199}
200
201/// A single diagnostic SNP position in the marinum/ulcerans comparison.
202#[derive(Clone, Debug, Serialize, Deserialize)]
203pub struct MarinumUlceransSnpCall {
204    /// 0-based position in the reference sequence.
205    pub ref_pos: usize,
206    /// Base observed in the query at this position (uppercase ASCII).
207    pub query_base: u8,
208    /// Expected base for M. marinum (uppercase ASCII).
209    pub marinum_base: u8,
210    /// Expected base for M. ulcerans (uppercase ASCII).
211    pub ulcerans_base: u8,
212}
213
214impl MarinumUlceransSnpCall {
215    pub fn is_marinum(&self) -> bool {
216        self.query_base == self.marinum_base
217    }
218    pub fn is_ulcerans(&self) -> bool {
219        self.query_base == self.ulcerans_base
220    }
221    /// Human-readable species tag: "M. marinum", "M. ulcerans", or "?".
222    pub fn species_tag(&self) -> &'static str {
223        if self.is_marinum() {
224            "M. marinum"
225        } else if self.is_ulcerans() {
226            "M. ulcerans"
227        } else {
228            "?"
229        }
230    }
231}
232