cosmic_files/sequencing/
mod.rs

1//! Sequence-based species identification for Mycobacteriaceae.
2//!
3//! ### Generation of species identification databases
4//!
5//! The database sequences are flagged as type material using the
6//! [INSD Collaboration `type_material` qualifier](https://pmc.ncbi.nlm.nih.gov/articles/PMC4383940/).
7//! Type material ties formal species names to physical specimens (culture collections for prokaryotes,
8//! museum or herbarium specimens for eukaryotes), as annotated in the
9//! [NCBI Taxonomy Database](http://www.ncbi.nlm.nih.gov/taxonomy).
10//! 
11//! See fn fetch_myco_sequences() in build.rs for details on how the sequences were fetched from NCBI at build time.
12//! 
13//! `myco_erm41.fasta` is generated at build time but unused; erm41 identification uses
14//! per-subspecies references (`erm41_abscessus_ATCC_19977.fasta`, `erm41_bolletii_CIP_108541.fasta`, `erm41_massiliense_CCUG_48898.fasta`) instead.
15
16pub mod batch;
17pub mod bed;
18pub mod erm41;
19pub mod hsp65;
20pub mod ntfy_notify;
21pub mod pnca;
22pub mod rpob;
23pub mod rrs;
24pub mod rrl;
25pub mod serde_helpers;
26pub mod tb_data;
27
28pub use batch::SampleSusceptibilityRecord;
29
30use serde::{Deserialize, Serialize};
31
32use erm41::{Erm41LofCall, Erm41Position28, Erm41SusceptibilityCalls};
33use hsp65::{KansasiiGastriSnpCall, MarinumUlceransSnpCall};
34use pnca::{PncaSnpCall, PncaSusceptibilityCalls};
35use rrl::{RrlPosition2058_2059, RrlSnpCall, RrlSusceptibilityCalls};
36use rrs::{RrsSnpCall, RrsSusceptibilityCalls};
37
38pub const MIN_SEQ_ID_IDENTITY: f32 = 80.0;
39
40const MIN_RRS_REF_LEN: usize = 1200;
41const MIN_RRL_REF_LEN: usize = 1200;
42const MIN_RPOB_REF_LEN: usize = 400;
43const MIN_PNCA_REF_LEN: usize = 500;
44
45pub const DESC_ABSCESSUS: &str = "M. abscessus subsp. abscessus";
46pub const DESC_BOLLETII: &str = "M. abscessus subsp. bolletii";
47pub const DESC_MASSILIENSE: &str = "M. abscessus subsp. massiliense";
48
49const ERM41_FWD_START: &[u8] = b"gtgtccggccaacggtcgcg";
50const ERM41_FWD_END: &[u8] = b"tggtgatcaggcggcgctga";
51const ERM41_ANCHOR_L: &[u8] = b"GCCAACGGTCGCGACGCCAG";
52const ERM41_ANCHOR_R: &[u8] = b"GGGGCTGGTATCCGCTCACT";
53
54const RRL_ANCHOR_L: &[u8] = b"CGTTACGCGCGGCAGGACGA";
55const RRL_ANCHOR_R: &[u8] = b"AGACCCCGGGACCTTCACTA";
56
57const PNCA_FWD_START: &[u8] = b"GCGTCGGTAGGCAAACTGCC";
58const PNCA_FWD_END: &[u8] = b"AGTTGGTTTGCAGCTCCTGA";
59
60const REF_ERM41_ABSCESSUS: &str =
61    include_str!("../../res/sequences/erm41/erm41_abscessus_ATCC_19977.fasta");
62const REF_ERM41_BOLLETII: &str =
63    include_str!("../../res/sequences/erm41/erm41_bolletii_CIP_108541.fasta");
64const REF_ERM41_MASSILENSE: &str =
65    include_str!("../../res/sequences/erm41/erm41_massiliense_CCUG_48898.fasta");
66
67const ACC_GASTRI: &str = "AF547836";
68const ACC_KANSASII: &str = "AF547849";
69const ACC_MARINUM: &str = "AY299134";
70const ACC_ULCERANS: &str = "AY299145";
71const KANSASII_GASTRI_ACCS: &[&str] = &[ACC_GASTRI, ACC_KANSASII];
72const MARINUM_ULCERANS_ACCS: &[&str] = &[ACC_MARINUM, ACC_ULCERANS];
73
74/// 16S rRNA (rrs) reference sequences — Mycobacteriaceae type strains, fetched from NCBI at build time.
75const REF_MYCO_RRS: &str = include_str!("../../res/sequences/myco_rrs.fasta");
76/// hsp65 / groEL2 reference sequences — Mycobacteriaceae type strains, fetched from NCBI at build time.
77const REF_MYCO_HSP65: &str = include_str!("../../res/sequences/myco_hsp65.fasta");
78/// rpoB reference sequences — Mycobacteriaceae type strains, fetched from NCBI at build time.
79const REF_MYCO_RPOB: &str = include_str!("../../res/sequences/myco_rpob.fasta");
80/// 23S rRNA (rrl) reference sequences — Mycobacteriaceae type strains, fetched from NCBI at build time.
81const REF_MYCO_RRL: &str = include_str!("../../res/sequences/myco_rrl.fasta");
82/// pncA CDS + 50bp upstream promoter flank for each M. tuberculosis complex member with a
83/// distinct reference sequence, fetched from NCBI at build time (see `pnca` module docs).
84/// Concatenated into one multi-FASTA so `identify_sequence_pnca()` can search all of them via
85/// [`parse_multi_fasta`], the same way `identify_sequence_rrl_ntm()` searches `REF_MYCO_RRL`.
86const REF_PNCA: &str = concat!(
87    include_str!("../../res/sequences/pnca/pnca_h37rv.fasta"),
88    include_str!("../../res/sequences/pnca/pnca_bovis_AF2122_97.fasta"),
89    include_str!("../../res/sequences/pnca/pnca_canettii_CIPT_140010059.fasta"),
90);
91
92
93/// Susceptibility calls derived from AB1 capillary sequencing, keyed by gene target.
94#[derive(Debug, Clone, Default, Serialize, Deserialize)]
95pub struct SusceptibilityCalls {
96    pub erm41: Erm41SusceptibilityCalls,
97    pub rrl: RrlSusceptibilityCalls,
98    pub rrs: RrsSusceptibilityCalls,
99    pub pnca: PncaSusceptibilityCalls,
100}
101
102impl std::fmt::Display for SusceptibilityCalls {
103    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
104        let mut parts: Vec<String> = Vec::new();
105
106        let mut erm: Vec<String> = Vec::new();
107        if let Some(pos) = &self.erm41.position_28 {
108            erm.push(pos.to_string());
109        }
110        for c in &self.erm41.lof_snp_calls {
111            let tag = c.call_tag();
112            if !tag.is_empty() {
113                erm.push(tag);
114            }
115        }
116        if !erm.is_empty() {
117            parts.push(format!("{}", erm.join(", ")));
118        }
119
120        let mut rrl: Vec<String> = Vec::new();
121        if let Some(pos) = &self.rrl.position_2058_2059 {
122            rrl.push(pos.to_string());
123        }
124        for c in &self.rrl.snp_calls {
125            rrl.push(c.call_tag());
126        }
127        if !rrl.is_empty() {
128            parts.push(format!("{}", rrl.join(", ")));
129        }
130
131        let rrs: Vec<String> = self.rrs.snp_calls.iter()
132            .map(|c| c.call_tag())
133            .filter(|t| !t.is_empty())
134            .collect();
135        if !rrs.is_empty() {
136            parts.push(format!("{}", rrs.join(", ")));
137        }
138
139        let pnca: Vec<String> = self
140            .pnca
141            .snp_calls
142            .iter()
143            .filter(|c| !c.call_tag().is_empty())
144            .map(|c| format!("{} {}", c.site_label(), c.call_tag()))
145            .collect();
146        if !pnca.is_empty() {
147            parts.push(format!("{}", pnca.join(", ")));
148        }
149
150        write!(f, "{}", parts.join(" | "))
151    }
152}
153
154pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
155    seq.iter()
156        .rev()
157        .map(|&b| match b.to_ascii_uppercase() {
158            b'A' => b'T',
159            b'T' => b'A',
160            b'G' => b'C',
161            b'C' => b'G',
162            _ => b'N',
163        })
164        .collect()
165}
166
167/// Trim a basecall sequence to the amplicon region defined by a primer pair.
168///
169/// Searches `seq` for `fwd_start` (forward primer) and `fwd_end` (reverse primer) using
170/// case-insensitive matching. Both orientations are tried: if the read is reverse-complemented,
171/// `rc(fwd_end)` anchors the left boundary and `rc(fwd_start)` anchors the right. The returned
172/// slice starts at the earliest primer match and ends just after the latest, so both primers are
173/// included. If a boundary primer is not found, the corresponding end of `seq` is used as a
174/// fallback, leaving that side untrimmed.
175pub fn trim_start_end<'a>(seq: &'a [u8], fwd_start: &[u8], fwd_end: &[u8]) -> &'a [u8] {
176    let rc_start: Vec<u8> = reverse_complement(fwd_end);
177    let rc_end: Vec<u8> = reverse_complement(fwd_start);
178
179    let find_start = |p: &[u8]| seq.windows(p.len()).position(|w| w.eq_ignore_ascii_case(p));
180    let find_end = |p: &[u8]| {
181        seq.windows(p.len())
182            .rposition(|w| w.eq_ignore_ascii_case(p))
183            .map(|pos| pos + p.len())
184    };
185
186    let start = [fwd_start, rc_start.as_slice()]
187        .into_iter()
188        .filter_map(find_start)
189        .min()
190        .unwrap_or(0);
191
192    let end = [fwd_end, rc_end.as_slice()]
193        .into_iter()
194        .filter_map(find_end)
195        .max()
196        .unwrap_or(seq.len());
197
198    &seq[start..end.min(seq.len())]
199}
200
201
202/// Trim leading and trailing low-quality bases using a sliding-window average.
203///
204/// Scans inward from each end with a window of [`WINDOW`] bases; the first
205/// window position (from each end) whose mean Phred quality ≥ `min_q` defines
206/// the trim boundary. Falls back to single-base scan when `seq` is shorter
207/// than the window. Returns `None` when no region meets the threshold.
208pub fn trim_to_min_quality<'a>(seq: &'a [u8], qual: &[u8], min_q: u8) -> Option<&'a [u8]> {
209    const WINDOW: usize = 15;
210
211    let n = seq.len().min(qual.len());
212
213    // Short read: single-base scan (same semantics as before, just on the
214    // clamped length so qual and seq indices stay in bounds).
215    if n < WINDOW {
216        let start = (0..n).find(|&i| qual[i] >= min_q).unwrap_or(n);
217        let end = (0..n).rev().find(|&i| qual[i] >= min_q).map(|i| i + 1).unwrap_or(0);
218        return if start < end { Some(&seq[start..end]) } else { None };
219    }
220
221    let threshold = min_q as u32 * WINDOW as u32;
222
223    // Scan left → right: first window whose average ≥ min_q.
224    let start = (0..=(n - WINDOW))
225        .find(|&i| qual[i..i + WINDOW].iter().map(|&q| q as u32).sum::<u32>() >= threshold)
226        .unwrap_or(n);
227
228    // Scan right → left: last window-end whose average ≥ min_q.
229    let end = (WINDOW..=n)
230        .rev()
231        .find(|&e| qual[e - WINDOW..e].iter().map(|&q| q as u32).sum::<u32>() >= threshold)
232        .unwrap_or(0);
233
234    if start < end { Some(&seq[start..end]) } else { None }
235}
236
237/// Parse a FASTA string, returning just the sequence bytes (ignores header).
238pub(crate) fn parse_fasta_seq(fasta: &str) -> Vec<u8> {
239    fasta
240        .lines()
241        .filter(|l| !l.starts_with('>'))
242        .flat_map(|l| l.bytes().filter(|b| b.is_ascii_alphabetic()))
243        .collect()
244}
245
246/// Parse a multi-FASTA string into `(accession, description, sequence)` tuples.
247fn parse_multi_fasta(fasta: &str) -> Vec<(String, String, Vec<u8>)> {
248    let mut result = Vec::new();
249    let mut cur_acc = String::new();
250    let mut cur_desc = String::new();
251    let mut cur_seq: Vec<u8> = Vec::new();
252    for line in fasta.lines() {
253        if let Some(rest) = line.strip_prefix('>') {
254            if !cur_acc.is_empty() {
255                result.push((
256                    cur_acc.clone(),
257                    cur_desc.clone(),
258                    std::mem::take(&mut cur_seq),
259                ));
260            }
261            let mut words = rest.splitn(4, ' ');
262            cur_acc = words.next().unwrap_or("").to_string();
263            let genus = words.next().unwrap_or("");
264            let species = words.next().unwrap_or("");
265            cur_desc = format!("{} {}", genus, species).trim().to_string();
266            // NCBI deflines for M. tuberculosis complex subspecies and other infrasubspecific
267            // taxa spell the subspecies out as "variant bovis"/"subsp. bolletii" right after
268            // the binomial (e.g. "Mycobacterium tuberculosis variant bovis AF2122/97 ...",
269            // "Mycobacterium avium subsp. paratuberculosis ..."). Without this, every such
270            // entry collapses to the same genus+species description as its parent species —
271            // e.g. the bovis pncA reference showing up indistinguishably as "Mycobacterium
272            // tuberculosis", same as H37Rv — so fold the qualifier + its epithet in too.
273            let mut qualifier_words = words.next().unwrap_or("").split_whitespace();
274            if let Some(qualifier @ ("variant" | "subsp." | "subspecies")) =
275                qualifier_words.next()
276                && let Some(epithet) = qualifier_words.next()
277            {
278                cur_desc = format!("{cur_desc} {qualifier} {epithet}");
279            }
280        } else {
281            cur_seq.extend(line.bytes().filter(|b| b.is_ascii_alphabetic()));
282        }
283    }
284    if !cur_acc.is_empty() {
285        result.push((cur_acc, cur_desc, cur_seq));
286    }
287    result
288}
289
290/// Within each `description` group, remove entries whose sequence (uppercased) is a contiguous
291/// substring of a longer entry that shares the same description. Longer entries survive; the
292/// shorter entries are redundant for alignment purposes because the aligner will find the same
293/// best position inside the longer reference.
294fn dedup_substring_same_desc(
295    mut entries: Vec<(String, String, Vec<u8>)>,
296) -> Vec<(String, String, Vec<u8>)> {
297    // Sort longest-first so inner loop only has to check shorter against longer.
298    entries.sort_by_key(|(_, _, s)| std::cmp::Reverse(s.len()));
299    let upper: Vec<Vec<u8>> = entries
300        .iter()
301        .map(|(_, _, s)| s.iter().map(|b| b.to_ascii_uppercase()).collect())
302        .collect();
303    let mut keep = vec![true; entries.len()];
304    for i in 0..entries.len() {
305        if !keep[i] {
306            continue;
307        }
308        for j in (i + 1)..entries.len() {
309            if !keep[j] {
310                continue;
311            }
312            if entries[i].1 == entries[j].1
313                && upper[i].windows(upper[j].len()).any(|w| w == upper[j])
314            {
315                keep[j] = false;
316            }
317        }
318    }
319    entries
320        .into_iter()
321        .zip(keep)
322        .filter_map(|(e, k)| k.then_some(e))
323        .collect()
324}
325
326fn format_pairwise_alignment(
327    accession: &str,
328    description: &str,
329    identity: f32,
330    is_reverse: bool,
331    gapped_query: &[u8],
332    gapped_ref: &[u8],
333    ref_start: usize,
334) -> String {
335    let match_line: Vec<u8> = gapped_ref
336        .iter()
337        .zip(gapped_query.iter())
338        .map(|(&r, &q)| {
339            if r == b'-' || q == b'-' {
340                b' '
341            } else if r.eq_ignore_ascii_case(&q) {
342                b'|'
343            } else {
344                b'.'
345            }
346        })
347        .collect();
348
349    let orient = if is_reverse {
350        "Reverse Complement"
351    } else {
352        "Forward"
353    };
354    let mut out = format!(
355        "Query vs {} ({}) — {:.1}% identity\n\n",
356        accession, description, identity
357    );
358    out.push_str(&format!("Orientation: {orient}\n\n"));
359
360    let line_width = 60usize;
361    let len = gapped_ref.len();
362    let mut ref_pos = ref_start + 1;
363    let mut query_pos = 1usize;
364    for chunk_start in (0..len).step_by(line_width) {
365        let chunk_end = (chunk_start + line_width).min(len);
366        let ref_chunk = std::str::from_utf8(&gapped_ref[chunk_start..chunk_end]).unwrap_or("");
367        let match_chunk = std::str::from_utf8(&match_line[chunk_start..chunk_end]).unwrap_or("");
368        let query_chunk = std::str::from_utf8(&gapped_query[chunk_start..chunk_end]).unwrap_or("");
369
370        out.push_str(&format!("Ref   {:5}: {}\n", ref_pos, ref_chunk));
371        out.push_str(&format!("             {match_chunk}\n"));
372        out.push_str(&format!("Query {:5}: {}\n\n", query_pos, query_chunk));
373
374        ref_pos += ref_chunk.bytes().filter(|&b| b != b'-').count();
375        query_pos += query_chunk.bytes().filter(|&b| b != b'-').count();
376    }
377    out
378}
379
380/// Alignment result from [`align_to_ref`]: gapped strings plus reference start position.
381#[derive(Debug, Clone)]
382pub struct GappedAlignment {
383    /// Percent identity: matches / reference span × 100.
384    pub identity: f32,
385    /// Query sequence with `'-'` inserted at each deletion relative to the reference.
386    pub gapped_query: Vec<u8>,
387    /// Reference slice (aligned region only) with `'-'` inserted at each query insertion.
388    /// Always the same length as `gapped_query`.
389    pub gapped_ref: Vec<u8>,
390    /// 0-based index into the full reference where this alignment starts.
391    pub ref_start: usize,
392}
393
394/// Align `query` (Sanger read) against `reference` (gene sequence) using semiglobal
395/// Smith-Waterman (free reference end-gaps, full query placed within reference).
396///
397/// Returns a [`GappedAlignment`] with gapped strings for display and SNP calling.
398/// Identity is computed as `matches / reference_span × 100`, which naturally penalises
399/// deletions in the query.
400pub fn align_to_ref(query: &[u8], reference: &[u8]) -> GappedAlignment {
401    use bio::alignment::AlignmentOperation::{Del, Ins, Match, Subst, Xclip, Yclip};
402    use bio::alignment::pairwise::Aligner;
403
404    if query.is_empty() || reference.is_empty() {
405        return GappedAlignment {
406            identity: 0.0,
407            gapped_query: vec![],
408            gapped_ref: vec![],
409            ref_start: 0,
410        };
411    }
412
413    let score_fn = |a: u8, b: u8| -> i32 {
414        if a.to_ascii_uppercase() == b.to_ascii_uppercase() { 1 } else { -1 }
415    };
416    let mut aligner = Aligner::new(-5, -1, &score_fn);
417    let alignment = aligner.semiglobal(query, reference);
418
419    let mut gapped_query = Vec::new();
420    let mut gapped_ref = Vec::new();
421    let mut qi = alignment.xstart;
422    let mut ri = alignment.ystart;
423
424    for op in &alignment.operations {
425        match op {
426            Match | Subst => {
427                gapped_query.push(query[qi]);
428                gapped_ref.push(reference[ri]);
429                qi += 1;
430                ri += 1;
431            }
432            Del => {
433                gapped_query.push(b'-');
434                gapped_ref.push(reference[ri]);
435                ri += 1;
436            }
437            Ins => {
438                gapped_query.push(query[qi]);
439                gapped_ref.push(b'-');
440                qi += 1;
441            }
442            Xclip(k) => { qi += k; }
443            Yclip(k) => { ri += k; }
444        }
445    }
446
447    let ref_span = gapped_ref.iter().filter(|&&b| b != b'-').count();
448    let matches = gapped_query
449        .iter()
450        .zip(gapped_ref.iter())
451        .filter(|&(&q, &r)| q != b'-' && r != b'-' && q.eq_ignore_ascii_case(&r))
452        .count();
453    let identity = if ref_span > 0 {
454        matches as f32 / ref_span as f32 * 100.0
455    } else {
456        0.0
457    };
458
459    GappedAlignment {
460        identity,
461        gapped_query,
462        gapped_ref,
463        ref_start: alignment.ystart,
464    }
465}
466
467/// Return the query base at a given reference position, or `None` if the position is outside
468/// the aligned region or the query has a deletion (`'-'`) there.
469pub fn base_at_ref_pos(
470    gapped_query: &[u8],
471    gapped_ref: &[u8],
472    ref_start: usize,
473    ref_pos: usize,
474) -> Option<u8> {
475    if ref_pos < ref_start {
476        return None;
477    }
478    let mut current = ref_start;
479    for (&q, &r) in gapped_query.iter().zip(gapped_ref.iter()) {
480        if r != b'-' {
481            if current == ref_pos {
482                return if q == b'-' { None } else { Some(q.to_ascii_uppercase()) };
483            }
484            current += 1;
485        }
486    }
487    None
488}
489
490fn scan_window(
491    center: usize,
492    left: usize,
493    right: usize,
494    peak_locs: &[u16],
495) -> Option<(u16, u16)> {
496    let base_start = center.checked_sub(left)?;
497    let base_end = center + right;
498    if base_end >= peak_locs.len() {
499        return None;
500    }
501    let start_scan = peak_locs[base_start];
502    let end_scan = peak_locs[base_end];
503    if start_scan >= end_scan {
504        return None;
505    }
506    Some((start_scan, end_scan))
507}
508
509/// Tries the edited basecalls (PBAS tag 2) first, falling back to raw basecalls (PBAS tag 1).
510pub fn parse_ab1_sequence(data: &[u8]) -> Option<Vec<u8>> {
511    if data.len() < 34 || &data[0..4] != b"ABIF" {
512        return None;
513    }
514
515    // Root directory entry sits at byte 6 (28 bytes long).
516    // num_elements (i32 BE) at root+12 = byte 18
517    // data_offset  (i32 BE) at root+20 = byte 26
518    let dir_count = i32::from_be_bytes(data[18..22].try_into().ok()?) as usize;
519    let dir_offset = i32::from_be_bytes(data[26..30].try_into().ok()?) as usize;
520
521    let mut pbas1: Option<Vec<u8>> = None;
522
523    for i in 0..dir_count {
524        let e = dir_offset + i * 28;
525        if e + 28 > data.len() {
526            break;
527        }
528        let tag_name = &data[e..e + 4];
529        let tag_number = i32::from_be_bytes(data[e + 4..e + 8].try_into().ok()?);
530        // num_elements at e+12, data_size at e+16, data_offset at e+20
531        let num_elems = i32::from_be_bytes(data[e + 12..e + 16].try_into().ok()?) as usize;
532        let data_size = i32::from_be_bytes(data[e + 16..e + 20].try_into().ok()?) as usize;
533        let data_off = i32::from_be_bytes(data[e + 20..e + 24].try_into().ok()?) as usize;
534
535        if tag_name == b"PBAS" {
536            // When data fits in 4 bytes it is stored inline at the data_offset field position
537            let offset = if data_size <= 4 { e + 20 } else { data_off };
538            if offset + num_elems <= data.len() {
539                let seq = data[offset..offset + num_elems].to_vec();
540                if tag_number == 2 {
541                    return Some(seq); // edited basecalls — best quality
542                } else if tag_number == 1 {
543                    pbas1 = Some(seq); // raw basecalls — keep as fallback
544                }
545            }
546        }
547    }
548
549    pbas1
550}
551
552/// Tries edited quality scores (PCON tag 2) first, falling back to raw (PCON tag 1).
553/// Each byte is a Phred quality score corresponding to the base at the same index in PBAS.
554pub fn parse_ab1_quality(data: &[u8]) -> Option<Vec<u8>> {
555    if data.len() < 34 || &data[0..4] != b"ABIF" {
556        return None;
557    }
558
559    let dir_count = i32::from_be_bytes(data[18..22].try_into().ok()?) as usize;
560    let dir_offset = i32::from_be_bytes(data[26..30].try_into().ok()?) as usize;
561
562    let mut pcon1: Option<Vec<u8>> = None;
563
564    for i in 0..dir_count {
565        let e = dir_offset + i * 28;
566        if e + 28 > data.len() {
567            break;
568        }
569        let tag_name = &data[e..e + 4];
570        let tag_number = i32::from_be_bytes(data[e + 4..e + 8].try_into().ok()?);
571        let num_elems = i32::from_be_bytes(data[e + 12..e + 16].try_into().ok()?) as usize;
572        let data_size = i32::from_be_bytes(data[e + 16..e + 20].try_into().ok()?) as usize;
573        let data_off = i32::from_be_bytes(data[e + 20..e + 24].try_into().ok()?) as usize;
574
575        if tag_name == b"PCON" {
576            let offset = if data_size <= 4 { e + 20 } else { data_off };
577            if offset + num_elems <= data.len() {
578                let qual = data[offset..offset + num_elems].to_vec();
579                if tag_number == 2 {
580                    return Some(qual); // edited quality — best
581                } else if tag_number == 1 {
582                    pcon1 = Some(qual); // raw quality — keep as fallback
583                }
584            }
585        }
586    }
587
588    pcon1
589}
590
591impl Ab1Channels {
592    /// Parse an AB1 chromatogram from raw file bytes.
593    ///
594    /// Returns `None` if the data is not a valid ABIF file or required tags are missing.
595    /// Prefers edited/analyzed data (PBAS 2, PLOC 2, DATA 9–12) over raw (PBAS 1, PLOC 1, DATA 1–4).
596    pub fn from_bytes(data: &[u8]) -> Option<Self> {
597        if data.len() < 34 || &data[0..4] != b"ABIF" {
598            return None;
599        }
600
601        let dir_count = i32::from_be_bytes(data[18..22].try_into().ok()?) as usize;
602        let dir_offset = i32::from_be_bytes(data[26..30].try_into().ok()?) as usize;
603
604        // Collect all tag entries we care about
605        let mut data_tags: std::collections::HashMap<i32, Vec<i16>> =
606            std::collections::HashMap::new();
607        let mut pbas1: Option<Vec<u8>> = None;
608        let mut pbas2: Option<Vec<u8>> = None;
609        let mut ploc1: Option<Vec<u16>> = None;
610        let mut ploc2: Option<Vec<u16>> = None;
611        let mut fwo: Option<[u8; 4]> = None;
612
613        for i in 0..dir_count {
614            let e = dir_offset + i * 28;
615            if e + 28 > data.len() {
616                break;
617            }
618            let tag_name = &data[e..e + 4];
619            let tag_number = i32::from_be_bytes(data[e + 4..e + 8].try_into().ok()?);
620            let num_elems = i32::from_be_bytes(data[e + 12..e + 16].try_into().ok()?) as usize;
621            let data_size = i32::from_be_bytes(data[e + 16..e + 20].try_into().ok()?) as usize;
622            let data_off = i32::from_be_bytes(data[e + 20..e + 24].try_into().ok()?) as usize;
623
624            let offset = if data_size <= 4 { e + 20 } else { data_off };
625
626            if tag_name == b"DATA" && (1..=12).contains(&tag_number) {
627                // Each element is an i16 BE (2 bytes)
628                if offset + num_elems * 2 <= data.len() {
629                    let values: Vec<i16> = (0..num_elems)
630                        .map(|j| {
631                            i16::from_be_bytes([data[offset + j * 2], data[offset + j * 2 + 1]])
632                        })
633                        .collect();
634                    data_tags.insert(tag_number, values);
635                }
636            } else if tag_name == b"PBAS" {
637                if offset + num_elems <= data.len() {
638                    let seq = data[offset..offset + num_elems].to_vec();
639                    match tag_number {
640                        2 => pbas2 = Some(seq),
641                        1 => pbas1 = Some(seq),
642                        _ => {}
643                    }
644                }
645            } else if tag_name == b"PLOC" {
646                // i16 BE peak scan positions
647                if offset + num_elems * 2 <= data.len() {
648                    let locs: Vec<u16> = (0..num_elems)
649                        .map(|j| {
650                            u16::from_be_bytes([data[offset + j * 2], data[offset + j * 2 + 1]])
651                        })
652                        .collect();
653                    match tag_number {
654                        2 => ploc2 = Some(locs),
655                        1 => ploc1 = Some(locs),
656                        _ => {}
657                    }
658                }
659            } else if tag_name == b"FWO_" && tag_number == 1 {
660                // 4 bytes: base letter for each channel in order
661                if offset + 4 <= data.len() {
662                    let mut arr = [0u8; 4];
663                    arr.copy_from_slice(&data[offset..offset + 4]);
664                    fwo = Some(arr);
665                }
666            }
667        }
668
669        // Prefer analyzed channels (9-12), fall back to raw (1-4)
670        let channel_indices: [(i32, i32); 4] = [(9, 1), (10, 2), (11, 3), (12, 4)];
671        let channels: [Vec<i16>; 4] = channel_indices.map(|(preferred, fallback)| {
672            data_tags
673                .remove(&preferred)
674                .or_else(|| data_tags.remove(&fallback))
675                .unwrap_or_default()
676        });
677
678        let bases = pbas2.or(pbas1)?;
679        let peak_locs = ploc2.or(ploc1).unwrap_or_else(|| vec![0u16; bases.len()]);
680        let base_order = fwo.unwrap_or(*b"ACGT");
681
682        let erm41_view_state_opt = erm41::find_erm41_display_window(&bases, &peak_locs).map(
683            |(start, end, is_reverse, pos28_base_idx)| Erm41ViewState {
684                window: (start, end),
685                is_reverse,
686                pos28_base_idx,
687            },
688        );
689
690        let rrl_ntm_view_state_opt = rrl::find_rrl_ntm_display_window(&bases, &peak_locs).map(
691            |(start, end, is_reverse, snp_base_idx)| RrlNtmViewState {
692                window: (start, end),
693                is_reverse,
694                snp_base_idx,
695            },
696        );
697
698        Some(Self {
699            channels,
700            bases,
701            peak_locs,
702            base_order,
703            erm41_view_state_opt,
704            rrl_ntm_view_state_opt,
705        })
706    }
707}
708
709/// Chromatogram display parameters for the erm(41) region.
710///
711/// Built in [`Ab1Channels::parse`] via [`erm41::find_erm41_display_window`]
712/// when [`ERM41_ANCHOR_L`] is found in the basecall sequence. Stored inside
713/// [`Ab1Channels`] and used by the UI to scroll the chromatogram to the
714/// diagnostic position 28 site.
715#[derive(Clone, Copy, Debug)]
716pub struct Erm41ViewState {
717    /// Scan-index range (inclusive start, exclusive end) of the display window.
718    pub window: (u16, u16),
719    /// `true` when the reverse complement matched the anchor.
720    pub is_reverse: bool,
721    /// Index into `bases` / `peak_locs` that corresponds to position 28.
722    pub pos28_base_idx: u16,
723}
724
725/// Chromatogram display parameters for the rrl / NTM macrolide-resistance region.
726///
727/// Built in [`Ab1Channels::parse`] via [`rrl::find_rrl_ntm_display_window`]
728/// when [`RRL_ANCHOR_L`] is found in the basecall sequence. Stored inside
729/// [`Ab1Channels`] and used by the UI to scroll the chromatogram to the
730/// diagnostic SNP site (positions 2058–2059).
731#[derive(Clone, Copy, Debug)]
732pub struct RrlNtmViewState {
733    /// Scan-index range (inclusive start, exclusive end) of the display window.
734    pub window: (u16, u16),
735    /// `true` when the reverse complement matched the anchor.
736    pub is_reverse: bool,
737    /// Index into `bases` / `peak_locs` that corresponds to the SNP site.
738    pub snp_base_idx: u16,
739}
740
741/// Parsed channel intensity data from an AB1 chromatogram.
742#[derive(Clone, Debug)]
743pub struct Ab1Channels {
744    /// Four intensity arrays in the order given by `base_order`.
745    /// Each Vec has the same length (number of scans).
746    pub channels: [Vec<i16>; 4],
747    /// Called bases (from PBAS tag).
748    pub bases: Vec<u8>,
749    /// Scan index of each base call (from PLOC tag), same length as `bases`.
750    pub peak_locs: Vec<u16>,
751    /// Which base each channel corresponds to, e.g. b"ACGT" (from FWO_ tag).
752    pub base_order: [u8; 4],
753    /// Erm41 view state; `None` when the anchor was not found in the basecall sequence.
754    pub erm41_view_state_opt: Option<Erm41ViewState>,
755    /// Rrl/NTM view state; `None` when the anchor was not found in the basecall sequence.
756    pub rrl_ntm_view_state_opt: Option<RrlNtmViewState>,
757}
758
759impl Ab1Channels {
760    /// Return the channel index for a given base byte (A/C/G/T).
761    pub fn channel_for_base(&self, base: u8) -> Option<usize> {
762        self.base_order
763            .iter()
764            .position(|&b| b.eq_ignore_ascii_case(&base))
765    }
766}
767
768/// Best-hit result from aligning an AB1 read against the reference sequences.
769#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
770pub struct SeqIdHit {
771    /// Accession of the best-matching reference (e.g. "AF547836").
772    pub accession: String,
773    /// Species name stripped to genus + species (e.g. "Mycobacterium gastri").
774    pub description: String,
775    /// Percent identity of the best local alignment window (0.0–100.0).
776    pub identity: f32,
777    /// `true` when the reverse complement of the query was the better match.
778    pub is_reverse: bool,
779    /// Calls at each diagnostic kansasii/gastri SNP position.
780    pub kansasii_gastri_snp_calls: Vec<KansasiiGastriSnpCall>,
781    /// Calls at each diagnostic marinum/ulcerans SNP position.
782    pub marinum_ulcerans_snp_calls: Vec<MarinumUlceransSnpCall>,
783    /// Calls at each rrl macrolide-resistance SNP position (23S rRNA).
784    pub rrl_snp_calls: Vec<RrlSnpCall>,
785    /// Calls at each rrs aminoglycoside-resistance SNP position (16S rRNA).
786    pub rrs_snp_calls: Vec<RrsSnpCall>,
787    /// Calls at each erm(41) loss-of-function variant position.
788    pub erm41_snp_calls: Vec<Erm41LofCall>,
789    /// Calls at each pncA pyrazinamide-resistance nucleotide/codon position.
790    pub pnca_snp_calls: Vec<PncaSnpCall>,
791    /// Gapped query sequence (forward or RC, whichever scored best). `'-'` marks deletions.
792    pub aligned_query: Vec<u8>,
793    /// Gapped reference sequence for the aligned span. Same length as `aligned_query`.
794    pub aligned_ref: Vec<u8>,
795    /// 0-based position in the full reference where the alignment starts.
796    pub ref_start: usize,
797    /// Erm41 position 28 call; `None` for non-erm41 targets.
798    pub erm41_position_28_opt: Option<Erm41Position28>,
799    /// rrl position 2058 2059 call; `None` for non-rrl targets.
800    pub rrl_position_2058_2059_opt: Option<RrlPosition2058_2059>,
801}
802
803impl SeqIdHit {
804    /// Format the alignment as a human-readable pairwise text (60-column wrapping).
805    pub fn format_pairwise_alignment(&self) -> String {
806        format_pairwise_alignment(
807            &self.accession,
808            &self.description,
809            self.identity,
810            self.is_reverse,
811            &self.aligned_query,
812            &self.aligned_ref,
813            self.ref_start,
814        )
815    }
816
817    pub fn is_kansasii(&self) -> bool {
818        self.accession == ACC_KANSASII
819    }
820    pub fn is_gastri(&self) -> bool {
821        self.accession == ACC_GASTRI
822    }
823    pub fn is_marinum(&self) -> bool {
824        self.accession == ACC_MARINUM
825    }
826    pub fn is_ulcerans(&self) -> bool {
827        self.accession == ACC_ULCERANS
828    }
829    pub fn kansasii_gastri_snp_species_call(&self) -> Option<&'static str> {
830        let gastri = self
831            .kansasii_gastri_snp_calls
832            .iter()
833            .filter(|c| c.is_gastri())
834            .count();
835        let kansasii = self
836            .kansasii_gastri_snp_calls
837            .iter()
838            .filter(|c| c.is_kansasii())
839            .count();
840        match gastri.cmp(&kansasii) {
841            std::cmp::Ordering::Greater => Some("M. gastri"),
842            std::cmp::Ordering::Less => Some("M. kansasii"),
843            std::cmp::Ordering::Equal => None,
844        }
845    }
846    pub fn marinum_ulcerans_snp_species_call(&self) -> Option<&'static str> {
847        let marinum = self
848            .marinum_ulcerans_snp_calls
849            .iter()
850            .filter(|c| c.is_marinum())
851            .count();
852        let ulcerans = self
853            .marinum_ulcerans_snp_calls
854            .iter()
855            .filter(|c| c.is_ulcerans())
856            .count();
857        match marinum.cmp(&ulcerans) {
858            std::cmp::Ordering::Greater => Some("M. marinum"),
859            std::cmp::Ordering::Less => Some("M. ulcerans"),
860            std::cmp::Ordering::Equal => None,
861        }
862    }
863}
864
865
866/// Top-level result for a processed AB1 read.
867///
868/// Owns the raw chromatogram and read-quality statistics, plus all
869/// [`SeqIdHit`] entries produced by aligning the read against the reference
870/// database. Each [`SeqIdHit`] carries the species identification and all
871/// downstream SNP / resistance calls for one reference match.
872#[derive(Clone, Debug)]
873pub struct SeqData {
874    pub chromatogram_opt: Option<Ab1Channels>,
875    pub seq_id_hits: Vec<SeqIdHit>,
876    pub length: usize,
877    pub trimmed_length: usize,
878    pub trimmed_avg_quality_opt: Option<f32>,
879}
880
881// ── PDF report ───────────────────────────────────────────────────────────────
882
883const PDF_PAGE_W: f32 = 297.0; // A4 landscape mm
884const PDF_PAGE_H: f32 = 210.0;
885const PDF_MARGIN_L: f32 = 10.0;
886const PDF_MARGIN_B: f32 = 12.0;
887const PDF_MARGIN_T: f32 = 8.0;
888const PDF_ROW_H: f32 = 5.5;
889
890// Column x-offsets from PDF_MARGIN_L (mm)
891const PDF_COL_X: [f32; 6] = [0.0, 24.0, 39.0, 110.0, 130.0, 200.0];
892const PDF_TABLE_W: f32 = 270.0; // right edge of last column relative to PDF_MARGIN_L
893const PDF_COL_HEADERS: [&str; 6] = [
894    "Sample ID", "Gene", "Species", "Susceptible", "Calls", "Filename",
895];
896
897/// Build a landscape A4 PDF report from AB1 scan records. Filtered to gene-identified
898/// records with identity ≥ `MIN_SEQ_ID_IDENTITY`, same as the CSV output, and to
899/// samples no older than `report_max_age_days` (see `TBConfig::report_max_age_days`).
900pub fn build_report_pdf(
901    records: &[batch::SampleSusceptibilityRecord],
902    report_max_age_days: u32,
903) -> Vec<u8> {
904    use printpdf::*;
905
906    let max_age = std::time::Duration::from_secs(u64::from(report_max_age_days) * 24 * 60 * 60);
907    let now = std::time::SystemTime::now();
908
909    let filtered: Vec<&batch::SampleSusceptibilityRecord> = records
910        .iter()
911        .filter(|r| r.gene.is_some() && r.identity.is_some_and(|i| i >= MIN_SEQ_ID_IDENTITY))
912        .filter(|r| {
913            r.file_created
914                .is_none_or(|created| now.duration_since(created).is_ok_and(|age| age <= max_age))
915        })
916        .collect();
917
918    let (doc, page1, layer1) = PdfDocument::new(
919        "AB1 Susceptibility Report",
920        Mm(PDF_PAGE_W),
921        Mm(PDF_PAGE_H),
922        "Layer 1",
923    );
924
925    let font = doc.add_builtin_font(BuiltinFont::Helvetica).unwrap();
926    let font_bold = doc.add_builtin_font(BuiltinFont::HelveticaBold).unwrap();
927
928    let mut layer = doc.get_page(page1).get_layer(layer1);
929    let title_y = PDF_PAGE_H - PDF_MARGIN_T - 2.0;
930    let mut y = title_y - PDF_ROW_H * 1.5;
931
932    layer.use_text(
933        format!(
934            "AB1 Susceptibility Report  -  {}  ({} records)",
935            pdf_current_date(),
936            filtered.len()
937        ),
938        10.0_f32,
939        Mm(PDF_MARGIN_L),
940        Mm(title_y),
941        &font_bold,
942    );
943
944    pdf_hline(&layer, y + 4.0);
945    pdf_write_row(&layer, &font_bold, 7.5_f32, y, &PDF_COL_HEADERS);
946    y -= PDF_ROW_H;
947    pdf_hline(&layer, y + 4.0);
948
949    let mut layer_n = 2usize;
950
951    for rec in &filtered {
952        if y < PDF_MARGIN_B {
953            let (new_page, new_layer) =
954                doc.add_page(Mm(PDF_PAGE_W), Mm(PDF_PAGE_H), format!("Layer {layer_n}"));
955            layer_n += 1;
956            layer = doc.get_page(new_page).get_layer(new_layer);
957            y = PDF_PAGE_H - PDF_MARGIN_T - PDF_ROW_H * 1.5;
958            pdf_hline(&layer, y + 4.0);
959            pdf_write_row(&layer, &font_bold, 7.5_f32, y, &PDF_COL_HEADERS);
960            y -= PDF_ROW_H;
961            pdf_hline(&layer, y + 4.0);
962        }
963
964        let species = rec.species.as_deref().unwrap_or("");
965        let species_trunc = pdf_truncate(species, 50);
966        let calls_str = rec.susceptibility_calls.to_string();
967        let calls_trunc = pdf_truncate(&calls_str, 50);
968        let fname_trunc = pdf_truncate(&rec.file_name, 50);
969
970        let cells: [&str; 6] = [
971            rec.sample_id.as_str(),
972            rec.gene.as_deref().unwrap_or(""),
973            &species_trunc,
974            pdf_sus(rec.is_susceptible),
975            &calls_trunc,
976            &fname_trunc,
977        ];
978        pdf_write_row(&layer, &font, 7.0_f32, y, &cells);
979        y -= PDF_ROW_H;
980        pdf_hline(&layer, y + 4.0);
981    }
982
983    doc.save_to_bytes().unwrap_or_default()
984}
985
986fn pdf_write_row(
987    layer: &printpdf::PdfLayerReference,
988    font: &printpdf::IndirectFontRef,
989    size: f32,
990    y: f32,
991    cells: &[&str],
992) {
993    use printpdf::Mm;
994    for (i, text) in cells.iter().enumerate() {
995        layer.use_text(*text, size, Mm(PDF_MARGIN_L + PDF_COL_X[i]), Mm(y), font);
996    }
997}
998
999fn pdf_sus(v: Option<bool>) -> &'static str {
1000    match v {
1001        Some(true) => "S",
1002        Some(false) => "R",
1003        None => "",
1004    }
1005}
1006
1007fn pdf_hline(layer: &printpdf::PdfLayerReference, y: f32) {
1008    use printpdf::{Color, Greyscale, Line, Mm, Point};
1009    layer.set_outline_color(Color::Greyscale(Greyscale::new(0.5, None)));
1010    layer.set_outline_thickness(0.3_f32);
1011    layer.add_line(Line::from_iter(vec![
1012        (Point::new(Mm(PDF_MARGIN_L), Mm(y)), false),
1013        (Point::new(Mm(PDF_MARGIN_L + PDF_TABLE_W), Mm(y)), false),
1014    ]));
1015}
1016
1017fn pdf_truncate(s: &str, max_chars: usize) -> String {
1018    if s.len() <= max_chars {
1019        s.to_string()
1020    } else {
1021        format!("{}..", &s[..max_chars])
1022    }
1023}
1024
1025fn pdf_current_date() -> String {
1026    let secs = std::time::SystemTime::now()
1027        .duration_since(std::time::UNIX_EPOCH)
1028        .unwrap_or_default()
1029        .as_secs();
1030    let (y, m, d) = pdf_days_to_ymd((secs / 86400) as u32);
1031    format!("{y:04}-{m:02}-{d:02}")
1032}
1033
1034fn pdf_days_to_ymd(days: u32) -> (u32, u32, u32) {
1035    let jdn = days + 2_440_588;
1036    let a = jdn + 32044;
1037    let b = (4 * a + 3) / 146097;
1038    let c = a - (146097 * b) / 4;
1039    let d = (4 * c + 3) / 1461;
1040    let e = c - (1461 * d) / 4;
1041    let m = (5 * e + 2) / 153;
1042    let day = e - (153 * m + 2) / 5 + 1;
1043    let month = m + 3 - 12 * (m / 10);
1044    let year = 100 * b + d - 4800 + m / 10;
1045    (year, month, day)
1046}
1047
1048#[cfg(test)]
1049mod tests {
1050    use super::parse_multi_fasta;
1051
1052    #[test]
1053    fn test_parse_multi_fasta_distinguishes_infrasubspecific_descriptions() {
1054        // The bovis and H37Rv pncA references share a binomial name, differing only in the
1055        // "variant bovis" qualifier NCBI tucks into the defline after the species — without
1056        // surfacing it, both entries describe themselves identically as "Mycobacterium
1057        // tuberculosis" in the UI (see res/sequences/pnca/pnca_bovis_AF2122_97.fasta).
1058        let fasta = "\
1059>NC_000962.3:c2289291-2288681 Mycobacterium tuberculosis H37Rv, complete genome
1060ACGT
1061>NC_002945.4:c2277615-2277005 Mycobacterium tuberculosis variant bovis AF2122/97 chromosome, complete sequence
1062ACGT
1063>HM007606.1 Mycobacterium avium subsp. paratuberculosis strain DSM 44133 23S ribosomal RNA gene, partial sequence
1064ACGT
1065>NC_xxx:1-4 Mycobacterium abscessus rrl
1066ACGT
1067";
1068        let entries = parse_multi_fasta(fasta);
1069        let descs: Vec<&str> = entries.iter().map(|(_, d, _)| d.as_str()).collect();
1070        assert_eq!(
1071            descs,
1072            vec![
1073                "Mycobacterium tuberculosis",
1074                "Mycobacterium tuberculosis variant bovis",
1075                "Mycobacterium avium subsp. paratuberculosis",
1076                // ntm-db colon-accession entries keep their plain genus+species description —
1077                // the word after species here is the gene tag ("rrl"), not a qualifier, so the
1078                // RRL/RRS_RESISTANCE_SNPS lookups keyed on it (see rrl.rs, rrs.rs) still match.
1079                "Mycobacterium abscessus",
1080            ]
1081        );
1082    }
1083}
cosmic_files/sequencing/mod.rs

cosmic_files/sequencing/
mod.rs