Skip to main content

uor_addr/canonical/nfc/
mod.rs

1//! UAX #15 Unicode NFC normalization — streaming, `no_std`, `no_alloc`.
2//!
3//! Reads UTF-8 from an `&[u8]` input slice, writes NFC-normalized
4//! UTF-8 into an `&mut [u8]` output slice, returns the number of
5//! bytes written. No allocator, no `std`, no panics on well-formed
6//! input.
7//!
8//! # Algorithm (UAX #15)
9//!
10//! 1. **Canonical decomposition (NFD).** For each input code point,
11//!    expand to its fully-recursive canonical decomposition. Hangul
12//!    syllables are decomposed algorithmically per UAX #15 §3.12; all
13//!    other decompositions are looked up in
14//!    [`tables::DECOMP_TABLE`] / [`tables::DECOMP_DATA`].
15//!
16//! 2. **Canonical reordering.** Within each "combining run" (a
17//!    starter followed by zero or more non-starters), sort the
18//!    non-starters in stable ascending order by canonical combining
19//!    class (UAX #15 §1.3).
20//!
21//! 3. **Canonical composition.** Walk the decomposed-and-reordered
22//!    sequence; for each starter, greedily compose with following
23//!    non-blocked combining marks via the canonical composition table
24//!    ([`tables::COMP_TABLE`]) plus Hangul algorithmic composition
25//!    (UAX #15 §3.12). A mark is "blocked" from a starter if any
26//!    intervening mark has canonical combining class ≥ the mark's own
27//!    class (UAX #15 D119).
28//!
29//! # Stream-safe bound
30//!
31//! The stream-safe text format pins the maximum number of consecutive
32//! non-starters at 30 (UAX #15 §3). The implementation uses a fixed
33//! 32-entry combining-run buffer (2-entry headroom) on the stack. No
34//! allocator. Input streams that violate the stream-safe bound emit
35//! [`NfcError::CombiningRunOverflow`].
36//!
37//! # UCD version pin
38//!
39//! Tables are generated from UCD `tables::UCD_VERSION` (currently
40//! `15.1.0`). Regenerate via `python3 tools/gen_nfc_tables.py` after
41//! bumping the version pin in [`crate::canonical::nfc::tables`] and
42//! the vendored data files in `data/ucd/<version>/`.
43
44mod tables;
45
46pub use tables::UCD_VERSION;
47
48/// Maximum number of consecutive non-starters per UAX #15 §3
49/// stream-safe text format (30), with 2 entries of headroom so a
50/// 30-non-starter run does not immediately overflow.
51const COMBINING_RUN_CAPACITY: usize = 32;
52
53/// Maximum length of a canonical decomposition for any single code
54/// point. Per UCD 15.1.0 the longest canonical decomposition is 18
55/// code points (Arabic ligatures, certain CJK compatibility forms);
56/// 32 gives headroom against future UCD bumps.
57const DECOMP_BUF_CAPACITY: usize = 32;
58
59/// Streaming NFC normalizer error.
60#[derive(Debug, Clone, Copy, PartialEq, Eq)]
61pub enum NfcError {
62    /// `input` is not well-formed UTF-8 per RFC 3629.
63    InvalidUtf8 {
64        /// Byte offset within `input` where validation failed.
65        at: usize,
66    },
67    /// `out` is too small to hold the NFC-normalized form.
68    OutputOverflow,
69    /// A combining run in `input` exceeds the UAX #15 stream-safe
70    /// bound of 30 consecutive non-starters.
71    CombiningRunOverflow,
72}
73
74/// UAX #15 NFC_Quick_Check property values.
75#[derive(Debug, Clone, Copy, PartialEq, Eq)]
76pub enum NfcQc {
77    /// Input is unambiguously already NFC. No normalization needed.
78    Yes,
79    /// Input is definitely not NFC.
80    No,
81    /// Input may or may not be NFC. A full normalization pass is
82    /// required to decide and to produce the NFC form.
83    Maybe,
84}
85
86/// UAX #15 NFC_Quick_Check (UAX #15 §6 Quick_Check). Walks `input`
87/// once; returns `Yes` if every code point has `NFC_QC = Yes` and the
88/// canonical combining classes are non-decreasing within each
89/// combining run; returns `No` if any code point has `NFC_QC = No`
90/// or any reorder is required; returns `Maybe` otherwise.
91///
92/// # Errors
93///
94/// Returns `NfcQc::No` for input containing invalid UTF-8 — the
95/// caller's input must be valid UTF-8 to be NFC.
96#[must_use]
97pub fn quick_check(input: &[u8]) -> NfcQc {
98    let s = match core::str::from_utf8(input) {
99        Ok(s) => s,
100        Err(_) => return NfcQc::No,
101    };
102    let mut last_cc: u8 = 0;
103    let mut result = NfcQc::Yes;
104    for c in s.chars() {
105        let cp = c as u32;
106        let cc = combining_class(cp);
107        if cc != 0 && cc < last_cc {
108            // Reorder required — definitely not NFC.
109            return NfcQc::No;
110        }
111        match nfc_qc_lookup(cp) {
112            NfcQc::No => return NfcQc::No,
113            NfcQc::Maybe => result = NfcQc::Maybe,
114            NfcQc::Yes => {}
115        }
116        last_cc = if cc == 0 { 0 } else { cc };
117    }
118    result
119}
120
121/// Normalize `input` (well-formed UTF-8) into NFC, writing to `out`.
122/// Returns the number of bytes written.
123///
124/// # Errors
125///
126/// - [`NfcError::InvalidUtf8`] — `input` is not well-formed UTF-8.
127/// - [`NfcError::OutputOverflow`] — `out` is too small to hold the
128///   normalized form.
129/// - [`NfcError::CombiningRunOverflow`] — a combining run in `input`
130///   exceeds UAX #15's stream-safe bound (30 non-starters).
131///
132/// # Idempotence
133///
134/// `normalize_into(normalize_into(x)) == normalize_into(x)` byte-for-byte
135/// per UAX #15 stability — verified in tests against UCD
136/// `NormalizationTest.txt`.
137pub fn normalize_into(input: &[u8], out: &mut [u8]) -> Result<usize, NfcError> {
138    // Validate UTF-8 up front so the per-char iteration can index by
139    // code point without re-validating each step.
140    let s = core::str::from_utf8(input).map_err(|e| NfcError::InvalidUtf8 {
141        at: e.valid_up_to(),
142    })?;
143
144    let mut writer = Writer::new(out);
145    let mut state = NfcState::new();
146    let mut decomp_buf = [0u32; DECOMP_BUF_CAPACITY];
147
148    for c in s.chars() {
149        let cp = c as u32;
150        let len = decompose_recursive(cp, &mut decomp_buf);
151        for &dp in &decomp_buf[..len] {
152            state.feed(dp, &mut writer)?;
153        }
154    }
155    state.flush(&mut writer)?;
156    Ok(writer.pos)
157}
158
159// ─── Buffered NFC state machine ────────────────────────────────────
160//
161// The algorithm buffers a "combining run" — a starter plus all
162// following non-starters until the next starter — and *then* resolves
163// composition. Composing eagerly per-mark is wrong: a later mark with
164// a lower combining class can become a better composition candidate
165// for the starter after canonical reorder. The buffered form per
166// UAX #15 §1.3 Algorithm 4 produces correct NFC by first sorting marks
167// in the run by ccc and then composing greedily left-to-right against
168// the starter.
169
170struct NfcState {
171    /// Current starter code point. `None` until the first starter is
172    /// seen.
173    starter: Option<u32>,
174    /// Pending non-starter marks since `starter`, kept sorted ascending
175    /// by canonical combining class (stable insertion-sort preserves
176    /// input order among equal-ccc marks).
177    pending: [u32; COMBINING_RUN_CAPACITY],
178    pending_len: u8,
179}
180
181impl NfcState {
182    fn new() -> Self {
183        Self {
184            starter: None,
185            pending: [0; COMBINING_RUN_CAPACITY],
186            pending_len: 0,
187        }
188    }
189
190    fn feed(&mut self, cp: u32, writer: &mut Writer<'_>) -> Result<(), NfcError> {
191        let cc = combining_class(cp);
192        if cc == 0 {
193            // New starter. If pending is empty, the new starter is not
194            // blocked from the current starter (no intervening marks
195            // separate them), so we may compose the two — this is how
196            // Hangul L+V→LV and LV+T→LVT compositions land, plus any
197            // non-Hangul starter+starter canonical decomposition that
198            // is not Full_Composition_Exclusion-excluded.
199            if self.pending_len == 0 {
200                if let Some(prev) = self.starter {
201                    if let Some(composed) = compose_pair(prev, cp) {
202                        self.starter = Some(composed);
203                        return Ok(());
204                    }
205                }
206            }
207            // Either no prior starter, or the pair does not compose,
208            // or marks intervene — resolve the current run and start
209            // fresh.
210            self.resolve_run(writer)?;
211            self.starter = Some(cp);
212            return Ok(());
213        }
214        // Non-starter combining mark — insert into pending at the
215        // ccc-sorted position. Stable insertion preserves input order
216        // among equal-ccc marks.
217        if (self.pending_len as usize) >= COMBINING_RUN_CAPACITY {
218            return Err(NfcError::CombiningRunOverflow);
219        }
220        let mut idx = self.pending_len as usize;
221        while idx > 0 && combining_class(self.pending[idx - 1]) > cc {
222            self.pending[idx] = self.pending[idx - 1];
223            idx -= 1;
224        }
225        self.pending[idx] = cp;
226        self.pending_len += 1;
227        Ok(())
228    }
229
230    fn flush(&mut self, writer: &mut Writer<'_>) -> Result<(), NfcError> {
231        self.resolve_run(writer)?;
232        self.starter = None;
233        Ok(())
234    }
235
236    /// Resolve the current `starter + pending` combining run: compose
237    /// `starter` greedily with non-blocked pending marks, then emit
238    /// the resulting starter plus any uncomposed marks in ccc order.
239    ///
240    /// Orphan-mark case (input starts with combining marks before any
241    /// starter): `starter` is `None`; pending marks are emitted as-is
242    /// in their already-ccc-sorted order, with no composition attempt
243    /// (there is no starter to compose against).
244    fn resolve_run(&mut self, writer: &mut Writer<'_>) -> Result<(), NfcError> {
245        if let Some(mut l) = self.starter.take() {
246            let mut max_cc_seen: u8 = 0;
247            let mut i: usize = 0;
248            let mut len = self.pending_len as usize;
249            while i < len {
250                let m = self.pending[i];
251                let cc_m = combining_class(m);
252                // M can compose with L only if no prior mark in the
253                // run has ccc >= ccc(M) — i.e. cc_m > max_cc_seen
254                // (UAX #15 D119 blocking rule).
255                if cc_m > max_cc_seen {
256                    if let Some(composed) = compose_pair(l, m) {
257                        l = composed;
258                        // Remove pending[i]: shift everything down.
259                        let mut j = i;
260                        while j + 1 < len {
261                            self.pending[j] = self.pending[j + 1];
262                            j += 1;
263                        }
264                        len -= 1;
265                        // Do not advance i; do not update max_cc_seen.
266                        continue;
267                    }
268                }
269                max_cc_seen = cc_m;
270                i += 1;
271            }
272            self.pending_len = len as u8;
273            writer.write_code_point(l)?;
274        }
275        for k in 0..(self.pending_len as usize) {
276            writer.write_code_point(self.pending[k])?;
277        }
278        self.pending_len = 0;
279        Ok(())
280    }
281}
282
283// ─── UTF-8 byte writer (no_alloc) ──────────────────────────────────
284
285struct Writer<'a> {
286    out: &'a mut [u8],
287    pos: usize,
288}
289
290impl<'a> Writer<'a> {
291    fn new(out: &'a mut [u8]) -> Self {
292        Self { out, pos: 0 }
293    }
294
295    fn write_code_point(&mut self, cp: u32) -> Result<(), NfcError> {
296        // Encode cp as UTF-8 via core::char + encode_utf8.
297        let c = char::from_u32(cp).ok_or(NfcError::OutputOverflow)?;
298        let mut buf = [0u8; 4];
299        let s = c.encode_utf8(&mut buf);
300        let bytes = s.as_bytes();
301        if self.pos + bytes.len() > self.out.len() {
302            return Err(NfcError::OutputOverflow);
303        }
304        self.out[self.pos..self.pos + bytes.len()].copy_from_slice(bytes);
305        self.pos += bytes.len();
306        Ok(())
307    }
308}
309
310// ─── Decomposition (UAX #15 §3 + §3.12) ────────────────────────────
311
312/// Recursively-expanded canonical decomposition of `cp`. Writes
313/// decomposed code points into `out` (which must be at least 18
314/// entries — the longest canonical decomposition in UCD 15.1.0).
315/// Returns the number of code points written.
316fn decompose_recursive(cp: u32, out: &mut [u32; DECOMP_BUF_CAPACITY]) -> usize {
317    // Hangul algorithmic decomposition (UAX #15 §3.12).
318    if (tables::HANGUL_S_BASE..=tables::HANGUL_S_LAST).contains(&cp) {
319        let s_index = cp - tables::HANGUL_S_BASE;
320        let l_index = s_index / tables::HANGUL_N_COUNT;
321        let v_index = (s_index % tables::HANGUL_N_COUNT) / tables::HANGUL_T_COUNT;
322        let t_index = s_index % tables::HANGUL_T_COUNT;
323        out[0] = tables::HANGUL_L_BASE + l_index;
324        out[1] = tables::HANGUL_V_BASE + v_index;
325        if t_index == 0 {
326            return 2;
327        }
328        out[2] = tables::HANGUL_T_BASE + t_index;
329        return 3;
330    }
331    // Non-Hangul: look up in DECOMP_TABLE.
332    match tables::DECOMP_TABLE.binary_search_by_key(&cp, |&(c, _, _)| c) {
333        Ok(idx) => {
334            let (_, off, len) = tables::DECOMP_TABLE[idx];
335            let off = off as usize;
336            let len = len as usize;
337            // `tables::DECOMP_DATA` is already fully recursive — copy
338            // directly.
339            out[..len].copy_from_slice(&tables::DECOMP_DATA[off..off + len]);
340            len
341        }
342        Err(_) => {
343            // No decomposition — identity.
344            out[0] = cp;
345            1
346        }
347    }
348}
349
350// ─── Composition (UAX #15 §3.12 + table) ───────────────────────────
351
352/// Canonical composition of `(starter, mark)`. Returns the composed
353/// code point if `(starter, mark)` is a primary canonical composite
354/// not in Full_Composition_Exclusion; `None` otherwise.
355fn compose_pair(starter: u32, mark: u32) -> Option<u32> {
356    // Hangul algorithmic composition (UAX #15 §3.12).
357    // L + V → LV
358    if (tables::HANGUL_L_BASE..tables::HANGUL_L_BASE + tables::HANGUL_L_COUNT).contains(&starter)
359        && (tables::HANGUL_V_BASE..tables::HANGUL_V_BASE + tables::HANGUL_V_COUNT).contains(&mark)
360    {
361        let l_index = starter - tables::HANGUL_L_BASE;
362        let v_index = mark - tables::HANGUL_V_BASE;
363        let s_index = l_index * tables::HANGUL_N_COUNT + v_index * tables::HANGUL_T_COUNT;
364        return Some(tables::HANGUL_S_BASE + s_index);
365    }
366    // LV + T → LVT  (only valid when starter is an LV syllable —
367    // i.e. (starter - S_BASE) is a multiple of T_COUNT and mark is in
368    // (T_BASE, T_BASE+T_COUNT). T_BASE itself is the "no trailing"
369    // sentinel and is not a valid composing mark.)
370    if (tables::HANGUL_S_BASE..tables::HANGUL_S_BASE + tables::HANGUL_S_COUNT).contains(&starter)
371        && (starter - tables::HANGUL_S_BASE) % tables::HANGUL_T_COUNT == 0
372        && mark > tables::HANGUL_T_BASE
373        && mark < tables::HANGUL_T_BASE + tables::HANGUL_T_COUNT
374    {
375        let t_index = mark - tables::HANGUL_T_BASE;
376        return Some(starter + t_index);
377    }
378    // Non-Hangul: look up in COMP_TABLE.
379    let key = (starter, mark);
380    tables::COMP_TABLE
381        .binary_search_by_key(&key, |&(s, m, _)| (s, m))
382        .ok()
383        .map(|idx| tables::COMP_TABLE[idx].2)
384}
385
386// ─── Combining class + NFC_QC lookups ──────────────────────────────
387
388fn combining_class(cp: u32) -> u8 {
389    match tables::CCC_TABLE.binary_search_by_key(&cp, |&(c, _)| c) {
390        Ok(idx) => tables::CCC_TABLE[idx].1,
391        Err(_) => 0,
392    }
393}
394
395fn nfc_qc_lookup(cp: u32) -> NfcQc {
396    if tables::NFC_QC_NO.binary_search(&cp).is_ok() {
397        NfcQc::No
398    } else if tables::NFC_QC_MAYBE.binary_search(&cp).is_ok() {
399        NfcQc::Maybe
400    } else {
401        NfcQc::Yes
402    }
403}
404
405#[cfg(test)]
406mod algorithm_tests {
407    extern crate alloc;
408
409    use super::*;
410    use alloc::string::String;
411
412    fn nfc(input: &str) -> String {
413        let mut out = [0u8; 1024];
414        let n = normalize_into(input.as_bytes(), &mut out).expect("ok");
415        String::from_utf8(out[..n].to_vec()).expect("utf8")
416    }
417
418    #[test]
419    fn ascii_passes_through_unchanged() {
420        assert_eq!(nfc("hello"), "hello");
421        assert_eq!(nfc(""), "");
422        assert_eq!(nfc("foo bar baz"), "foo bar baz");
423    }
424
425    #[test]
426    fn nfd_to_nfc_recomposes_combining_marks() {
427        // "café" with combining acute = "cafe\u{0301}" → NFC "caf\u{00E9}"
428        assert_eq!(nfc("cafe\u{0301}"), "caf\u{00E9}");
429    }
430
431    #[test]
432    fn nfc_input_is_idempotent() {
433        assert_eq!(nfc("caf\u{00E9}"), "caf\u{00E9}");
434        assert_eq!(nfc("\u{00C5}ngstr\u{00F6}m"), "\u{00C5}ngstr\u{00F6}m");
435    }
436
437    #[test]
438    fn double_normalisation_yields_same_output() {
439        let inputs = ["cafe\u{0301}", "A\u{030A}", "\u{1E0B}\u{0323}", "한국어"];
440        for input in inputs.iter() {
441            let once = nfc(input);
442            let twice = nfc(&once);
443            assert_eq!(once, twice, "idempotence broken for {input:?}");
444        }
445    }
446
447    #[test]
448    fn hangul_lv_composes_algorithmically() {
449        // L=U+1100 (ᄀ) + V=U+1161 (ᅡ) → LV=U+AC00 (가)
450        assert_eq!(nfc("\u{1100}\u{1161}"), "\u{AC00}");
451    }
452
453    #[test]
454    fn hangul_lvt_composes_algorithmically() {
455        // L=U+1100 + V=U+1161 + T=U+11A8 → LVT=U+AC01
456        assert_eq!(nfc("\u{1100}\u{1161}\u{11A8}"), "\u{AC01}");
457    }
458
459    #[test]
460    fn hangul_syllable_already_composed_is_idempotent() {
461        assert_eq!(nfc("\u{AC00}"), "\u{AC00}");
462        assert_eq!(nfc("한국어"), "한국어");
463    }
464
465    #[test]
466    fn canonical_reorder_then_compose_picks_lower_ccc_first() {
467        // d (U+0064) + U+0307 (ccc=230, dot above) + U+0323 (ccc=220,
468        // dot below). NFC must (a) reorder by ccc — U+0323 before
469        // U+0307 — and (b) attempt compose against the starter in
470        // ccc-sorted order. d + U+0323 → U+1E0D (LATIN SMALL LETTER D
471        // WITH DOT BELOW); U+1E0D + U+0307 has no precomposed form
472        // and remains a combining sequence. Result: U+1E0D U+0307.
473        assert_eq!(nfc("d\u{0307}\u{0323}"), "\u{1E0D}\u{0307}");
474        // Inputs in input-reordered order normalize to the same NFC.
475        assert_eq!(nfc("d\u{0323}\u{0307}"), "\u{1E0D}\u{0307}");
476    }
477
478    #[test]
479    fn composition_exclusion_pairs_do_not_recompose() {
480        // U+212B Å (Angstrom sign) decomposes to U+0041 U+030A but is
481        // a singleton — the canonical NFC is U+00C5 (Latin A with
482        // ring above), not U+212B.
483        assert_eq!(nfc("\u{212B}"), "\u{00C5}");
484    }
485
486    #[test]
487    fn quick_check_recognises_ascii_as_nfc_yes() {
488        assert_eq!(quick_check(b"hello"), NfcQc::Yes);
489        assert_eq!(quick_check(b""), NfcQc::Yes);
490    }
491
492    #[test]
493    fn quick_check_recognises_decomposed_input_as_non_nfc() {
494        // U+0301 has NFC_QC = Maybe (it's a combining mark that
495        // might recompose) — the quick check returns Maybe, but for
496        // input where reorder is required it returns No.
497        let qc = quick_check("cafe\u{0301}".as_bytes());
498        assert!(matches!(qc, NfcQc::Maybe | NfcQc::No));
499    }
500
501    #[test]
502    fn rejects_output_buffer_too_small() {
503        let mut tiny = [0u8; 1];
504        let err = normalize_into("café".as_bytes(), &mut tiny).expect_err("must error");
505        assert_eq!(err, NfcError::OutputOverflow);
506    }
507}