Skip to main content

uor_addr/sexp/
value.rs

1//! `SExprValue` — the typed S-expression input handle (ADR-023 amended
2//! by ADR-060).
3//!
4//! Under ADR-060 the S-expression realization no longer copies a
5//! structurally-tagged byte form into a fixed buffer. The host-boundary
6//! parser [`SExprCanon::validate`] checks the grammar in a single pass
7//! over the borrowed input — balanced parentheses (tracked with an
8//! unbounded `usize` depth counter), well-formed canonical atoms, exactly
9//! one top-level value — and the input handle then flows through the
10//! pipeline as a [`TermValue::Stream`] carrier ([`SExprCanon`]). The
11//! carrier re-tokenizes the borrowed bytes on demand and emits Rivest
12//! canonical S-expression bytes chunk-by-chunk; ψ₉ folds those chunks
13//! through the σ-axis with bounded resident memory. There is no input
14//! size ceiling, no atom-width cap, no element-count cap, and no nesting
15//! depth cap.
16//!
17//! # Input syntax
18//!
19//! [`SExprCanon::validate`] admits two equivalent surface syntaxes:
20//!
21//! - **Canonical (Rivest 1997 §4.3)** — `<n>:<bytes>` for atoms,
22//!   `(<value> <value>)` for cons, `()` for nil.
23//! - **Token list** — whitespace-separated tokens between parentheses,
24//!   each token interpreted as an atom whose bytes are the token's UTF-8
25//!   representation.
26//!
27//! # Canonical form
28//!
29//! The carrier emits Rivest's canonical S-expression form: atoms as
30//! `<n>:<bytes>` (raw length prefix, no quoting), proper lists as
31//! `(s₁ s₂ … sₙ)` with single-space separators, nil as `()`. The
32//! emission is a streaming structural rewrite — element order is
33//! preserved, so it is reproducible chunk-by-chunk from the borrowed
34//! input with no intermediate buffer.
35
36use prism::operation::TermValue;
37use prism::pipeline::{
38    ConstrainedTypeShape, ConstraintRef, IntoBindingValue, PartitionProductFields, ShapeViolation,
39    ViolationKind,
40};
41use prism::uor_foundation::pipeline::ChunkSource;
42
43// ─── ShapeViolation IRI ─────────────────────────────────────────────────
44
45/// The single grammar-validity violation the host-boundary parser raises.
46/// ADR-060 removed the depth / atom-width / element-count / total-width
47/// ceilings, so the only rejection is "not a well-formed UTF-8
48/// S-expression".
49pub(crate) const INVALID_SEXPR_VIOLATION: ShapeViolation = ShapeViolation {
50    shape_iri: "https://uor.foundation/addr/SExprValue",
51    constraint_iri: "https://uor.foundation/addr/SExprValue/validUtf8SExpr",
52    property_iri: "https://uor.foundation/addr/inputBytes",
53    expected_range: "https://uor.foundation/addr/ValidUtf8SExpr",
54    min_count: 0,
55    max_count: 1,
56    kind: ViolationKind::ValueCheck,
57};
58
59// ─── Tokenizer ──────────────────────────────────────────────────────────
60
61/// A lexical token of the S-expression surface syntax.
62enum Tok<'a> {
63    /// `(`
64    Open,
65    /// `)`
66    Close,
67    /// An atom's raw byte payload (canonical `<n>:<bytes>` or a bare
68    /// whitespace-delimited token).
69    Atom(&'a [u8]),
70}
71
72/// Single-pass tokenizer over `raw`. Invokes `on_tok` for each lexical
73/// token in source order. Reports the lexical errors that are independent
74/// of nesting structure (malformed canonical-atom length prefix,
75/// truncated canonical atom); structural validation (balanced parens,
76/// single top-level value) layers on top via the callback.
77fn for_each_token<'a>(
78    raw: &'a [u8],
79    on_tok: &mut dyn FnMut(Tok<'a>),
80) -> Result<(), ShapeViolation> {
81    let mut pos = 0;
82    while pos < raw.len() {
83        let b = raw[pos];
84        if b.is_ascii_whitespace() {
85            pos += 1;
86            continue;
87        }
88        if b == b'(' {
89            on_tok(Tok::Open);
90            pos += 1;
91            continue;
92        }
93        if b == b')' {
94            on_tok(Tok::Close);
95            pos += 1;
96            continue;
97        }
98        if b.is_ascii_digit() {
99            // Canonical atom iff the leading digit run is terminated by ':'.
100            let mut i = pos;
101            while i < raw.len() && raw[i].is_ascii_digit() {
102                i += 1;
103            }
104            if i < raw.len() && raw[i] == b':' {
105                let len = parse_usize(&raw[pos..i]).ok_or(INVALID_SEXPR_VIOLATION)?;
106                let start = i + 1;
107                let end = start.checked_add(len).ok_or(INVALID_SEXPR_VIOLATION)?;
108                if end > raw.len() {
109                    return Err(INVALID_SEXPR_VIOLATION);
110                }
111                on_tok(Tok::Atom(&raw[start..end]));
112                pos = end;
113                continue;
114            }
115            // Digits not followed by ':' — fall through to a token atom.
116        }
117        // Token atom: a maximal run of non-whitespace, non-paren bytes.
118        let start = pos;
119        while pos < raw.len() {
120            let c = raw[pos];
121            if c.is_ascii_whitespace() || c == b'(' || c == b')' {
122                break;
123            }
124            pos += 1;
125        }
126        // Non-empty by construction (`b` was neither whitespace nor paren).
127        on_tok(Tok::Atom(&raw[start..pos]));
128    }
129    Ok(())
130}
131
132/// Parse an ASCII decimal digit run into a `usize`, returning `None` on
133/// overflow (an atom length wider than the address space — rejected, not
134/// capped at an arbitrary ceiling).
135fn parse_usize(digits: &[u8]) -> Option<usize> {
136    if digits.is_empty() {
137        return None;
138    }
139    let mut n: usize = 0;
140    for &d in digits {
141        let v = (d.wrapping_sub(b'0')) as usize;
142        if v > 9 {
143            return None;
144        }
145        n = n.checked_mul(10)?.checked_add(v)?;
146    }
147    Some(n)
148}
149
150/// `usize → ASCII decimal` into a 20-byte scratch (`u64::MAX` is 20
151/// digits).
152fn format_usize_into(buf: &mut [u8; 20], mut n: usize) -> &[u8] {
153    if n == 0 {
154        buf[0] = b'0';
155        return &buf[..1];
156    }
157    let mut idx = buf.len();
158    while n > 0 {
159        idx -= 1;
160        buf[idx] = b'0' + (n % 10) as u8;
161        n /= 10;
162    }
163    &buf[idx..]
164}
165
166// ─── SExprCanon — the streaming canonical-form carrier ───────────────────
167
168/// The Rivest-canonical-form [`ChunkSource`] over a borrowed,
169/// grammar-validated S-expression byte slice. Constructed by
170/// [`crate::sexp::address`] after [`SExprCanon::validate`] succeeds; lives
171/// in the caller's stack frame while the model folds it.
172#[derive(Clone, Copy, Debug)]
173pub struct SExprCanon<'a> {
174    raw: &'a [u8],
175}
176
177impl<'a> SExprCanon<'a> {
178    /// Wrap a grammar-validated raw S-expression slice. Call
179    /// [`SExprCanon::validate`] first; the carrier assumes well-formed
180    /// input (its [`ChunkSource`] emission cannot surface errors).
181    #[must_use]
182    pub fn new(raw: &'a [u8]) -> Self {
183        Self { raw }
184    }
185
186    /// Validate `raw` against the S-expression grammar in a single pass:
187    /// UTF-8, balanced parentheses, exactly one top-level value, and
188    /// well-formed canonical atoms.
189    ///
190    /// # Errors
191    ///
192    /// [`INVALID_SEXPR_VIOLATION`] (`validUtf8SExpr`) if `raw` is not a
193    /// well-formed UTF-8 S-expression.
194    pub fn validate(raw: &[u8]) -> Result<(), ShapeViolation> {
195        core::str::from_utf8(raw).map_err(|_| INVALID_SEXPR_VIOLATION)?;
196        let mut depth: usize = 0;
197        let mut seen_top = false;
198        let mut err: Option<ShapeViolation> = None;
199        for_each_token(raw, &mut |tok| {
200            if err.is_some() {
201                return;
202            }
203            match tok {
204                Tok::Atom(_) => {
205                    if depth == 0 {
206                        if seen_top {
207                            err = Some(INVALID_SEXPR_VIOLATION);
208                        } else {
209                            seen_top = true;
210                        }
211                    }
212                }
213                Tok::Open => {
214                    if depth == 0 && seen_top {
215                        err = Some(INVALID_SEXPR_VIOLATION);
216                    } else {
217                        depth += 1;
218                    }
219                }
220                Tok::Close => {
221                    if depth == 0 {
222                        err = Some(INVALID_SEXPR_VIOLATION);
223                    } else {
224                        depth -= 1;
225                        if depth == 0 {
226                            seen_top = true;
227                        }
228                    }
229                }
230            }
231        })?;
232        if let Some(e) = err {
233            return Err(e);
234        }
235        if depth != 0 || !seen_top {
236            return Err(INVALID_SEXPR_VIOLATION);
237        }
238        Ok(())
239    }
240}
241
242impl ChunkSource for SExprCanon<'_> {
243    fn for_each_chunk(&self, f: &mut dyn FnMut(&[u8])) {
244        // Streaming Rivest canonicalization: a single space separates
245        // consecutive elements of a list; no space immediately follows an
246        // opening paren. A single running flag captures this across any
247        // nesting depth — `need_space` is set after every atom and `)`,
248        // cleared after every `(`.
249        let mut need_space = false;
250        // Validation has already succeeded for `self.raw`, so the
251        // tokenizer cannot error here; ignore its Result.
252        let _ = for_each_token(self.raw, &mut |tok| match tok {
253            Tok::Open => {
254                if need_space {
255                    f(b" ");
256                }
257                f(b"(");
258                need_space = false;
259            }
260            Tok::Close => {
261                f(b")");
262                need_space = true;
263            }
264            Tok::Atom(bytes) => {
265                if need_space {
266                    f(b" ");
267                }
268                let mut buf = [0u8; 20];
269                f(format_usize_into(&mut buf, bytes.len()));
270                f(b":");
271                f(bytes);
272                need_space = true;
273            }
274        });
275    }
276}
277
278// ─── SExprValue — the typed input handle ─────────────────────────────────
279
280/// Typed S-expression input handle (ADR-060 stream carrier). A thin,
281/// `Copy` borrow of a [`SExprCanon`]; `as_binding_value` returns the
282/// `Stream` carrier zero-copy.
283#[derive(Clone, Copy, Debug)]
284pub struct SExprValue<'a>(&'a SExprCanon<'a>);
285
286impl<'a> SExprValue<'a> {
287    /// Wrap a validated canonical-form carrier as a model input handle.
288    #[must_use]
289    pub fn new(canon: &'a SExprCanon<'a>) -> Self {
290        Self(canon)
291    }
292}
293
294impl ConstrainedTypeShape for SExprValue<'_> {
295    const IRI: &'static str = "https://uor.foundation/addr/SExprValue";
296    const SITE_COUNT: usize = 1;
297    const CONSTRAINTS: &'static [ConstraintRef] = &[];
298    const CYCLE_SIZE: u64 = u64::MAX;
299}
300
301impl prism::uor_foundation::pipeline::__sdk_seal::Sealed for SExprValue<'_> {}
302
303impl<'a> IntoBindingValue<'a> for SExprValue<'a> {
304    fn as_binding_value<const INLINE_BYTES: usize>(&self) -> TermValue<'a, INLINE_BYTES> {
305        // The Rivest canonical form streams from the borrowed carrier; ψ₉
306        // folds it chunk-by-chunk. `self.0` is `&'a SExprCanon`, so the
307        // returned carrier borrows the input's `'a`-lived data
308        // independently of the `&self` call borrow.
309        TermValue::stream(self.0)
310    }
311}
312
313impl PartitionProductFields for SExprValue<'_> {
314    const FIELDS: &'static [(u32, u32)] = &[];
315    const FIELD_NAMES: &'static [&'static str] = &[];
316}
317
318// ─── Convenience alloc surface (feature = "alloc") ──────────────────────
319
320/// Validate `raw` and materialize its Rivest canonical S-expression bytes
321/// — the same byte sequence ψ₉ folds through the σ-axis.
322///
323/// **Available only under the `alloc` feature.** The no_alloc pipeline
324/// path never materializes the canonical form; it streams it through the
325/// hasher via [`SExprCanon`]'s [`ChunkSource`] impl.
326///
327/// # Errors
328///
329/// Surfaces the [`ShapeViolation`] [`SExprCanon::validate`] would raise.
330#[cfg(feature = "alloc")]
331pub fn canonicalize(raw: &[u8]) -> Result<alloc::vec::Vec<u8>, ShapeViolation> {
332    extern crate alloc;
333    SExprCanon::validate(raw)?;
334    let canon = SExprCanon::new(raw);
335    let mut out = alloc::vec::Vec::new();
336    canon.for_each_chunk(&mut |chunk| out.extend_from_slice(chunk));
337    Ok(out)
338}
339
340#[cfg(test)]
341mod tests {
342    use super::*;
343
344    #[test]
345    fn validates_nil() {
346        SExprCanon::validate(b"()").expect("valid nil");
347    }
348
349    #[test]
350    fn validates_atom_canonical_form() {
351        SExprCanon::validate(b"5:hello").expect("valid canonical atom");
352    }
353
354    #[test]
355    fn validates_token_list() {
356        SExprCanon::validate(b"(a b c)").expect("valid token list");
357    }
358
359    #[test]
360    fn rejects_unbalanced_parens() {
361        let err = SExprCanon::validate(b"((").expect_err("unbalanced parens");
362        assert_eq!(err.constraint_iri, INVALID_SEXPR_VIOLATION.constraint_iri);
363    }
364
365    #[test]
366    fn rejects_two_top_level_values() {
367        // Two bare token atoms, and two sibling lists — neither is a single
368        // top-level value. (`5:a 5:b` would *not* qualify: the `5:` length
369        // prefix swallows ` 5:b` as one canonical atom payload.)
370        for raw in [b"abc def".as_slice(), b"(a)(b)".as_slice()] {
371            let err = SExprCanon::validate(raw).expect_err("two top-level");
372            assert_eq!(err.constraint_iri, INVALID_SEXPR_VIOLATION.constraint_iri);
373        }
374    }
375
376    #[test]
377    fn rejects_truncated_canonical_atom() {
378        let err = SExprCanon::validate(b"5:abc").expect_err("declared 5, has 3");
379        assert_eq!(err.constraint_iri, INVALID_SEXPR_VIOLATION.constraint_iri);
380    }
381
382    #[test]
383    fn accepts_arbitrary_nesting_depth() {
384        extern crate alloc;
385        let mut s = alloc::string::String::new();
386        for _ in 0..4096 {
387            s.push('(');
388        }
389        s.push('x');
390        for _ in 0..4096 {
391            s.push(')');
392        }
393        // No depth cap (ADR-060): deep nesting is accepted, not rejected.
394        SExprCanon::validate(s.as_bytes()).expect("deep nesting is valid");
395    }
396
397    #[cfg(feature = "alloc")]
398    const CANONICAL_FIXTURES: &[(&[u8], &[u8])] = &[
399        (b"()", b"()"),
400        (b"(a b c)", b"(1:a 1:b 1:c)"),
401        (b"5:hello", b"5:hello"),
402        (b"(hello world)", b"(5:hello 5:world)"),
403        (b"((a) (b))", b"((1:a) (1:b))"),
404        (b"(a (b c) d)", b"(1:a (1:b 1:c) 1:d)"),
405        (b"(  a\t b\n c  )", b"(1:a 1:b 1:c)"),
406        (b"(1:a 1:b 1:c)", b"(1:a 1:b 1:c)"),
407        (b"(())", b"(())"),
408    ];
409
410    #[cfg(feature = "alloc")]
411    #[test]
412    fn canonicalizer_matches_rivest_canonical_form() {
413        for (raw, expected) in CANONICAL_FIXTURES {
414            let canon = canonicalize(raw).expect("valid");
415            assert_eq!(canon, *expected, "raw={raw:?}");
416        }
417    }
418
419    #[cfg(feature = "alloc")]
420    #[test]
421    fn canonicalize_is_idempotent_on_its_own_output() {
422        for (raw, _expected) in CANONICAL_FIXTURES {
423            let once = canonicalize(raw).expect("valid");
424            let twice = canonicalize(&once).expect("re-canonicalises");
425            assert_eq!(once, twice, "idempotence broken for {raw:?}");
426        }
427    }
428}