uor_addr/canonical/nfc/mod.rs
1//! UAX #15 Unicode NFC normalization — streaming, `no_std`, `no_alloc`.
2//!
3//! Reads UTF-8 from an `&[u8]` input slice, writes NFC-normalized
4//! UTF-8 into an `&mut [u8]` output slice, returns the number of
5//! bytes written. No allocator, no `std`, no panics on well-formed
6//! input.
7//!
8//! # Algorithm (UAX #15)
9//!
10//! 1. **Canonical decomposition (NFD).** For each input code point,
11//! expand to its fully-recursive canonical decomposition. Hangul
12//! syllables are decomposed algorithmically per UAX #15 §3.12; all
13//! other decompositions are looked up in
14//! [`tables::DECOMP_TABLE`] / [`tables::DECOMP_DATA`].
15//!
16//! 2. **Canonical reordering.** Within each "combining run" (a
17//! starter followed by zero or more non-starters), sort the
18//! non-starters in stable ascending order by canonical combining
19//! class (UAX #15 §1.3).
20//!
21//! 3. **Canonical composition.** Walk the decomposed-and-reordered
22//! sequence; for each starter, greedily compose with following
23//! non-blocked combining marks via the canonical composition table
24//! ([`tables::COMP_TABLE`]) plus Hangul algorithmic composition
25//! (UAX #15 §3.12). A mark is "blocked" from a starter if any
26//! intervening mark has canonical combining class ≥ the mark's own
27//! class (UAX #15 D119).
28//!
29//! # Stream-safe bound
30//!
31//! The stream-safe text format pins the maximum number of consecutive
32//! non-starters at 30 (UAX #15 §3). The implementation uses a fixed
33//! 32-entry combining-run buffer (2-entry headroom) on the stack. No
34//! allocator. Input streams that violate the stream-safe bound emit
35//! [`NfcError::CombiningRunOverflow`].
36//!
37//! # UCD version pin
38//!
39//! Tables are generated from UCD `tables::UCD_VERSION` (currently
40//! `15.1.0`). Regenerate via `python3 tools/gen_nfc_tables.py` after
41//! bumping the version pin in [`crate::canonical::nfc::tables`] and
42//! the vendored data files in `data/ucd/<version>/`.
43
44mod tables;
45
46pub use tables::UCD_VERSION;
47
48/// Maximum number of consecutive non-starters per UAX #15 §3
49/// stream-safe text format (30), with 2 entries of headroom so a
50/// 30-non-starter run does not immediately overflow.
51const COMBINING_RUN_CAPACITY: usize = 32;
52
53/// Maximum length of a canonical decomposition for any single code
54/// point. Per UCD 15.1.0 the longest canonical decomposition is 18
55/// code points (Arabic ligatures, certain CJK compatibility forms);
56/// 32 gives headroom against future UCD bumps.
57const DECOMP_BUF_CAPACITY: usize = 32;
58
59/// Streaming NFC normalizer error.
60#[derive(Debug, Clone, Copy, PartialEq, Eq)]
61pub enum NfcError {
62 /// `input` is not well-formed UTF-8 per RFC 3629.
63 InvalidUtf8 {
64 /// Byte offset within `input` where validation failed.
65 at: usize,
66 },
67 /// `out` is too small to hold the NFC-normalized form.
68 OutputOverflow,
69 /// A combining run in `input` exceeds the UAX #15 stream-safe
70 /// bound of 30 consecutive non-starters.
71 CombiningRunOverflow,
72}
73
74/// UAX #15 NFC_Quick_Check property values.
75#[derive(Debug, Clone, Copy, PartialEq, Eq)]
76pub enum NfcQc {
77 /// Input is unambiguously already NFC. No normalization needed.
78 Yes,
79 /// Input is definitely not NFC.
80 No,
81 /// Input may or may not be NFC. A full normalization pass is
82 /// required to decide and to produce the NFC form.
83 Maybe,
84}
85
86/// UAX #15 NFC_Quick_Check (UAX #15 §6 Quick_Check). Walks `input`
87/// once; returns `Yes` if every code point has `NFC_QC = Yes` and the
88/// canonical combining classes are non-decreasing within each
89/// combining run; returns `No` if any code point has `NFC_QC = No`
90/// or any reorder is required; returns `Maybe` otherwise.
91///
92/// # Errors
93///
94/// Returns `NfcQc::No` for input containing invalid UTF-8 — the
95/// caller's input must be valid UTF-8 to be NFC.
96#[must_use]
97pub fn quick_check(input: &[u8]) -> NfcQc {
98 let s = match core::str::from_utf8(input) {
99 Ok(s) => s,
100 Err(_) => return NfcQc::No,
101 };
102 let mut last_cc: u8 = 0;
103 let mut result = NfcQc::Yes;
104 for c in s.chars() {
105 let cp = c as u32;
106 let cc = combining_class(cp);
107 if cc != 0 && cc < last_cc {
108 // Reorder required — definitely not NFC.
109 return NfcQc::No;
110 }
111 match nfc_qc_lookup(cp) {
112 NfcQc::No => return NfcQc::No,
113 NfcQc::Maybe => result = NfcQc::Maybe,
114 NfcQc::Yes => {}
115 }
116 last_cc = if cc == 0 { 0 } else { cc };
117 }
118 result
119}
120
121/// Normalize `input` (well-formed UTF-8) into NFC, writing to `out`.
122/// Returns the number of bytes written.
123///
124/// # Errors
125///
126/// - [`NfcError::InvalidUtf8`] — `input` is not well-formed UTF-8.
127/// - [`NfcError::OutputOverflow`] — `out` is too small to hold the
128/// normalized form.
129/// - [`NfcError::CombiningRunOverflow`] — a combining run in `input`
130/// exceeds UAX #15's stream-safe bound (30 non-starters).
131///
132/// # Idempotence
133///
134/// `normalize_into(normalize_into(x)) == normalize_into(x)` byte-for-byte
135/// per UAX #15 stability — verified in tests against UCD
136/// `NormalizationTest.txt`.
137pub fn normalize_into(input: &[u8], out: &mut [u8]) -> Result<usize, NfcError> {
138 // Validate UTF-8 up front so the per-char iteration can index by
139 // code point without re-validating each step.
140 let s = core::str::from_utf8(input).map_err(|e| NfcError::InvalidUtf8 {
141 at: e.valid_up_to(),
142 })?;
143
144 let mut writer = Writer::new(out);
145 let mut state = NfcState::new();
146 let mut decomp_buf = [0u32; DECOMP_BUF_CAPACITY];
147
148 for c in s.chars() {
149 let cp = c as u32;
150 let len = decompose_recursive(cp, &mut decomp_buf);
151 for &dp in &decomp_buf[..len] {
152 state.feed(dp, &mut writer)?;
153 }
154 }
155 state.flush(&mut writer)?;
156 Ok(writer.pos)
157}
158
159// ─── Buffered NFC state machine ────────────────────────────────────
160//
161// The algorithm buffers a "combining run" — a starter plus all
162// following non-starters until the next starter — and *then* resolves
163// composition. Composing eagerly per-mark is wrong: a later mark with
164// a lower combining class can become a better composition candidate
165// for the starter after canonical reorder. The buffered form per
166// UAX #15 §1.3 Algorithm 4 produces correct NFC by first sorting marks
167// in the run by ccc and then composing greedily left-to-right against
168// the starter.
169
170struct NfcState {
171 /// Current starter code point. `None` until the first starter is
172 /// seen.
173 starter: Option<u32>,
174 /// Pending non-starter marks since `starter`, kept sorted ascending
175 /// by canonical combining class (stable insertion-sort preserves
176 /// input order among equal-ccc marks).
177 pending: [u32; COMBINING_RUN_CAPACITY],
178 pending_len: u8,
179}
180
181impl NfcState {
182 fn new() -> Self {
183 Self {
184 starter: None,
185 pending: [0; COMBINING_RUN_CAPACITY],
186 pending_len: 0,
187 }
188 }
189
190 fn feed(&mut self, cp: u32, writer: &mut Writer<'_>) -> Result<(), NfcError> {
191 let cc = combining_class(cp);
192 if cc == 0 {
193 // New starter. If pending is empty, the new starter is not
194 // blocked from the current starter (no intervening marks
195 // separate them), so we may compose the two — this is how
196 // Hangul L+V→LV and LV+T→LVT compositions land, plus any
197 // non-Hangul starter+starter canonical decomposition that
198 // is not Full_Composition_Exclusion-excluded.
199 if self.pending_len == 0 {
200 if let Some(prev) = self.starter {
201 if let Some(composed) = compose_pair(prev, cp) {
202 self.starter = Some(composed);
203 return Ok(());
204 }
205 }
206 }
207 // Either no prior starter, or the pair does not compose,
208 // or marks intervene — resolve the current run and start
209 // fresh.
210 self.resolve_run(writer)?;
211 self.starter = Some(cp);
212 return Ok(());
213 }
214 // Non-starter combining mark — insert into pending at the
215 // ccc-sorted position. Stable insertion preserves input order
216 // among equal-ccc marks.
217 if (self.pending_len as usize) >= COMBINING_RUN_CAPACITY {
218 return Err(NfcError::CombiningRunOverflow);
219 }
220 let mut idx = self.pending_len as usize;
221 while idx > 0 && combining_class(self.pending[idx - 1]) > cc {
222 self.pending[idx] = self.pending[idx - 1];
223 idx -= 1;
224 }
225 self.pending[idx] = cp;
226 self.pending_len += 1;
227 Ok(())
228 }
229
230 fn flush(&mut self, writer: &mut Writer<'_>) -> Result<(), NfcError> {
231 self.resolve_run(writer)?;
232 self.starter = None;
233 Ok(())
234 }
235
236 /// Resolve the current `starter + pending` combining run: compose
237 /// `starter` greedily with non-blocked pending marks, then emit
238 /// the resulting starter plus any uncomposed marks in ccc order.
239 ///
240 /// Orphan-mark case (input starts with combining marks before any
241 /// starter): `starter` is `None`; pending marks are emitted as-is
242 /// in their already-ccc-sorted order, with no composition attempt
243 /// (there is no starter to compose against).
244 fn resolve_run(&mut self, writer: &mut Writer<'_>) -> Result<(), NfcError> {
245 if let Some(mut l) = self.starter.take() {
246 let mut max_cc_seen: u8 = 0;
247 let mut i: usize = 0;
248 let mut len = self.pending_len as usize;
249 while i < len {
250 let m = self.pending[i];
251 let cc_m = combining_class(m);
252 // M can compose with L only if no prior mark in the
253 // run has ccc >= ccc(M) — i.e. cc_m > max_cc_seen
254 // (UAX #15 D119 blocking rule).
255 if cc_m > max_cc_seen {
256 if let Some(composed) = compose_pair(l, m) {
257 l = composed;
258 // Remove pending[i]: shift everything down.
259 let mut j = i;
260 while j + 1 < len {
261 self.pending[j] = self.pending[j + 1];
262 j += 1;
263 }
264 len -= 1;
265 // Do not advance i; do not update max_cc_seen.
266 continue;
267 }
268 }
269 max_cc_seen = cc_m;
270 i += 1;
271 }
272 self.pending_len = len as u8;
273 writer.write_code_point(l)?;
274 }
275 for k in 0..(self.pending_len as usize) {
276 writer.write_code_point(self.pending[k])?;
277 }
278 self.pending_len = 0;
279 Ok(())
280 }
281}
282
283// ─── UTF-8 byte writer (no_alloc) ──────────────────────────────────
284
285struct Writer<'a> {
286 out: &'a mut [u8],
287 pos: usize,
288}
289
290impl<'a> Writer<'a> {
291 fn new(out: &'a mut [u8]) -> Self {
292 Self { out, pos: 0 }
293 }
294
295 fn write_code_point(&mut self, cp: u32) -> Result<(), NfcError> {
296 // Encode cp as UTF-8 via core::char + encode_utf8.
297 let c = char::from_u32(cp).ok_or(NfcError::OutputOverflow)?;
298 let mut buf = [0u8; 4];
299 let s = c.encode_utf8(&mut buf);
300 let bytes = s.as_bytes();
301 if self.pos + bytes.len() > self.out.len() {
302 return Err(NfcError::OutputOverflow);
303 }
304 self.out[self.pos..self.pos + bytes.len()].copy_from_slice(bytes);
305 self.pos += bytes.len();
306 Ok(())
307 }
308}
309
310// ─── Decomposition (UAX #15 §3 + §3.12) ────────────────────────────
311
312/// Recursively-expanded canonical decomposition of `cp`. Writes
313/// decomposed code points into `out` (which must be at least 18
314/// entries — the longest canonical decomposition in UCD 15.1.0).
315/// Returns the number of code points written.
316fn decompose_recursive(cp: u32, out: &mut [u32; DECOMP_BUF_CAPACITY]) -> usize {
317 // Hangul algorithmic decomposition (UAX #15 §3.12).
318 if (tables::HANGUL_S_BASE..=tables::HANGUL_S_LAST).contains(&cp) {
319 let s_index = cp - tables::HANGUL_S_BASE;
320 let l_index = s_index / tables::HANGUL_N_COUNT;
321 let v_index = (s_index % tables::HANGUL_N_COUNT) / tables::HANGUL_T_COUNT;
322 let t_index = s_index % tables::HANGUL_T_COUNT;
323 out[0] = tables::HANGUL_L_BASE + l_index;
324 out[1] = tables::HANGUL_V_BASE + v_index;
325 if t_index == 0 {
326 return 2;
327 }
328 out[2] = tables::HANGUL_T_BASE + t_index;
329 return 3;
330 }
331 // Non-Hangul: look up in DECOMP_TABLE.
332 match tables::DECOMP_TABLE.binary_search_by_key(&cp, |&(c, _, _)| c) {
333 Ok(idx) => {
334 let (_, off, len) = tables::DECOMP_TABLE[idx];
335 let off = off as usize;
336 let len = len as usize;
337 // `tables::DECOMP_DATA` is already fully recursive — copy
338 // directly.
339 out[..len].copy_from_slice(&tables::DECOMP_DATA[off..off + len]);
340 len
341 }
342 Err(_) => {
343 // No decomposition — identity.
344 out[0] = cp;
345 1
346 }
347 }
348}
349
350// ─── Composition (UAX #15 §3.12 + table) ───────────────────────────
351
352/// Canonical composition of `(starter, mark)`. Returns the composed
353/// code point if `(starter, mark)` is a primary canonical composite
354/// not in Full_Composition_Exclusion; `None` otherwise.
355fn compose_pair(starter: u32, mark: u32) -> Option<u32> {
356 // Hangul algorithmic composition (UAX #15 §3.12).
357 // L + V → LV
358 if (tables::HANGUL_L_BASE..tables::HANGUL_L_BASE + tables::HANGUL_L_COUNT).contains(&starter)
359 && (tables::HANGUL_V_BASE..tables::HANGUL_V_BASE + tables::HANGUL_V_COUNT).contains(&mark)
360 {
361 let l_index = starter - tables::HANGUL_L_BASE;
362 let v_index = mark - tables::HANGUL_V_BASE;
363 let s_index = l_index * tables::HANGUL_N_COUNT + v_index * tables::HANGUL_T_COUNT;
364 return Some(tables::HANGUL_S_BASE + s_index);
365 }
366 // LV + T → LVT (only valid when starter is an LV syllable —
367 // i.e. (starter - S_BASE) is a multiple of T_COUNT and mark is in
368 // (T_BASE, T_BASE+T_COUNT). T_BASE itself is the "no trailing"
369 // sentinel and is not a valid composing mark.)
370 if (tables::HANGUL_S_BASE..tables::HANGUL_S_BASE + tables::HANGUL_S_COUNT).contains(&starter)
371 && (starter - tables::HANGUL_S_BASE) % tables::HANGUL_T_COUNT == 0
372 && mark > tables::HANGUL_T_BASE
373 && mark < tables::HANGUL_T_BASE + tables::HANGUL_T_COUNT
374 {
375 let t_index = mark - tables::HANGUL_T_BASE;
376 return Some(starter + t_index);
377 }
378 // Non-Hangul: look up in COMP_TABLE.
379 let key = (starter, mark);
380 tables::COMP_TABLE
381 .binary_search_by_key(&key, |&(s, m, _)| (s, m))
382 .ok()
383 .map(|idx| tables::COMP_TABLE[idx].2)
384}
385
386// ─── Combining class + NFC_QC lookups ──────────────────────────────
387
388fn combining_class(cp: u32) -> u8 {
389 match tables::CCC_TABLE.binary_search_by_key(&cp, |&(c, _)| c) {
390 Ok(idx) => tables::CCC_TABLE[idx].1,
391 Err(_) => 0,
392 }
393}
394
395fn nfc_qc_lookup(cp: u32) -> NfcQc {
396 if tables::NFC_QC_NO.binary_search(&cp).is_ok() {
397 NfcQc::No
398 } else if tables::NFC_QC_MAYBE.binary_search(&cp).is_ok() {
399 NfcQc::Maybe
400 } else {
401 NfcQc::Yes
402 }
403}
404
405#[cfg(test)]
406mod algorithm_tests {
407 extern crate alloc;
408
409 use super::*;
410 use alloc::string::String;
411
412 fn nfc(input: &str) -> String {
413 let mut out = [0u8; 1024];
414 let n = normalize_into(input.as_bytes(), &mut out).expect("ok");
415 String::from_utf8(out[..n].to_vec()).expect("utf8")
416 }
417
418 #[test]
419 fn ascii_passes_through_unchanged() {
420 assert_eq!(nfc("hello"), "hello");
421 assert_eq!(nfc(""), "");
422 assert_eq!(nfc("foo bar baz"), "foo bar baz");
423 }
424
425 #[test]
426 fn nfd_to_nfc_recomposes_combining_marks() {
427 // "café" with combining acute = "cafe\u{0301}" → NFC "caf\u{00E9}"
428 assert_eq!(nfc("cafe\u{0301}"), "caf\u{00E9}");
429 }
430
431 #[test]
432 fn nfc_input_is_idempotent() {
433 assert_eq!(nfc("caf\u{00E9}"), "caf\u{00E9}");
434 assert_eq!(nfc("\u{00C5}ngstr\u{00F6}m"), "\u{00C5}ngstr\u{00F6}m");
435 }
436
437 #[test]
438 fn double_normalisation_yields_same_output() {
439 let inputs = ["cafe\u{0301}", "A\u{030A}", "\u{1E0B}\u{0323}", "한국어"];
440 for input in inputs.iter() {
441 let once = nfc(input);
442 let twice = nfc(&once);
443 assert_eq!(once, twice, "idempotence broken for {input:?}");
444 }
445 }
446
447 #[test]
448 fn hangul_lv_composes_algorithmically() {
449 // L=U+1100 (ᄀ) + V=U+1161 (ᅡ) → LV=U+AC00 (가)
450 assert_eq!(nfc("\u{1100}\u{1161}"), "\u{AC00}");
451 }
452
453 #[test]
454 fn hangul_lvt_composes_algorithmically() {
455 // L=U+1100 + V=U+1161 + T=U+11A8 → LVT=U+AC01
456 assert_eq!(nfc("\u{1100}\u{1161}\u{11A8}"), "\u{AC01}");
457 }
458
459 #[test]
460 fn hangul_syllable_already_composed_is_idempotent() {
461 assert_eq!(nfc("\u{AC00}"), "\u{AC00}");
462 assert_eq!(nfc("한국어"), "한국어");
463 }
464
465 #[test]
466 fn canonical_reorder_then_compose_picks_lower_ccc_first() {
467 // d (U+0064) + U+0307 (ccc=230, dot above) + U+0323 (ccc=220,
468 // dot below). NFC must (a) reorder by ccc — U+0323 before
469 // U+0307 — and (b) attempt compose against the starter in
470 // ccc-sorted order. d + U+0323 → U+1E0D (LATIN SMALL LETTER D
471 // WITH DOT BELOW); U+1E0D + U+0307 has no precomposed form
472 // and remains a combining sequence. Result: U+1E0D U+0307.
473 assert_eq!(nfc("d\u{0307}\u{0323}"), "\u{1E0D}\u{0307}");
474 // Inputs in input-reordered order normalize to the same NFC.
475 assert_eq!(nfc("d\u{0323}\u{0307}"), "\u{1E0D}\u{0307}");
476 }
477
478 #[test]
479 fn composition_exclusion_pairs_do_not_recompose() {
480 // U+212B Å (Angstrom sign) decomposes to U+0041 U+030A but is
481 // a singleton — the canonical NFC is U+00C5 (Latin A with
482 // ring above), not U+212B.
483 assert_eq!(nfc("\u{212B}"), "\u{00C5}");
484 }
485
486 #[test]
487 fn quick_check_recognises_ascii_as_nfc_yes() {
488 assert_eq!(quick_check(b"hello"), NfcQc::Yes);
489 assert_eq!(quick_check(b""), NfcQc::Yes);
490 }
491
492 #[test]
493 fn quick_check_recognises_decomposed_input_as_non_nfc() {
494 // U+0301 has NFC_QC = Maybe (it's a combining mark that
495 // might recompose) — the quick check returns Maybe, but for
496 // input where reorder is required it returns No.
497 let qc = quick_check("cafe\u{0301}".as_bytes());
498 assert!(matches!(qc, NfcQc::Maybe | NfcQc::No));
499 }
500
501 #[test]
502 fn rejects_output_buffer_too_small() {
503 let mut tiny = [0u8; 1];
504 let err = normalize_into("café".as_bytes(), &mut tiny).expect_err("must error");
505 assert_eq!(err, NfcError::OutputOverflow);
506 }
507}