uor_addr/sexp/value.rs
1//! `SExprValue` — the typed S-expression input handle (ADR-023 amended
2//! by ADR-060).
3//!
4//! Under ADR-060 the S-expression realization no longer copies a
5//! structurally-tagged byte form into a fixed buffer. The host-boundary
6//! parser [`SExprCanon::validate`] checks the grammar in a single pass
7//! over the borrowed input — balanced parentheses (tracked with an
8//! unbounded `usize` depth counter), well-formed canonical atoms, exactly
9//! one top-level value — and the input handle then flows through the
10//! pipeline as a [`TermValue::Stream`] carrier ([`SExprCanon`]). The
11//! carrier re-tokenizes the borrowed bytes on demand and emits Rivest
12//! canonical S-expression bytes chunk-by-chunk; ψ₉ folds those chunks
13//! through the σ-axis with bounded resident memory. There is no input
14//! size ceiling, no atom-width cap, no element-count cap, and no nesting
15//! depth cap.
16//!
17//! # Input syntax
18//!
19//! [`SExprCanon::validate`] admits two equivalent surface syntaxes:
20//!
21//! - **Canonical (Rivest 1997 §4.3)** — `<n>:<bytes>` for atoms,
22//! `(<value> <value>)` for cons, `()` for nil.
23//! - **Token list** — whitespace-separated tokens between parentheses,
24//! each token interpreted as an atom whose bytes are the token's UTF-8
25//! representation.
26//!
27//! # Canonical form
28//!
29//! The carrier emits Rivest's canonical S-expression form: atoms as
30//! `<n>:<bytes>` (raw length prefix, no quoting), proper lists as
31//! `(s₁ s₂ … sₙ)` with single-space separators, nil as `()`. The
32//! emission is a streaming structural rewrite — element order is
33//! preserved, so it is reproducible chunk-by-chunk from the borrowed
34//! input with no intermediate buffer.
35
36use prism::operation::TermValue;
37use prism::pipeline::{
38 ConstrainedTypeShape, ConstraintRef, IntoBindingValue, PartitionProductFields, ShapeViolation,
39 ViolationKind,
40};
41use prism::uor_foundation::pipeline::ChunkSource;
42
43// ─── ShapeViolation IRI ─────────────────────────────────────────────────
44
45/// The single grammar-validity violation the host-boundary parser raises.
46/// ADR-060 removed the depth / atom-width / element-count / total-width
47/// ceilings, so the only rejection is "not a well-formed UTF-8
48/// S-expression".
49pub(crate) const INVALID_SEXPR_VIOLATION: ShapeViolation = ShapeViolation {
50 shape_iri: "https://uor.foundation/addr/SExprValue",
51 constraint_iri: "https://uor.foundation/addr/SExprValue/validUtf8SExpr",
52 property_iri: "https://uor.foundation/addr/inputBytes",
53 expected_range: "https://uor.foundation/addr/ValidUtf8SExpr",
54 min_count: 0,
55 max_count: 1,
56 kind: ViolationKind::ValueCheck,
57};
58
59// ─── Tokenizer ──────────────────────────────────────────────────────────
60
61/// A lexical token of the S-expression surface syntax.
62enum Tok<'a> {
63 /// `(`
64 Open,
65 /// `)`
66 Close,
67 /// An atom's raw byte payload (canonical `<n>:<bytes>` or a bare
68 /// whitespace-delimited token).
69 Atom(&'a [u8]),
70}
71
72/// Single-pass tokenizer over `raw`. Invokes `on_tok` for each lexical
73/// token in source order. Reports the lexical errors that are independent
74/// of nesting structure (malformed canonical-atom length prefix,
75/// truncated canonical atom); structural validation (balanced parens,
76/// single top-level value) layers on top via the callback.
77fn for_each_token<'a>(
78 raw: &'a [u8],
79 on_tok: &mut dyn FnMut(Tok<'a>),
80) -> Result<(), ShapeViolation> {
81 let mut pos = 0;
82 while pos < raw.len() {
83 let b = raw[pos];
84 if b.is_ascii_whitespace() {
85 pos += 1;
86 continue;
87 }
88 if b == b'(' {
89 on_tok(Tok::Open);
90 pos += 1;
91 continue;
92 }
93 if b == b')' {
94 on_tok(Tok::Close);
95 pos += 1;
96 continue;
97 }
98 if b.is_ascii_digit() {
99 // Canonical atom iff the leading digit run is terminated by ':'.
100 let mut i = pos;
101 while i < raw.len() && raw[i].is_ascii_digit() {
102 i += 1;
103 }
104 if i < raw.len() && raw[i] == b':' {
105 let len = parse_usize(&raw[pos..i]).ok_or(INVALID_SEXPR_VIOLATION)?;
106 let start = i + 1;
107 let end = start.checked_add(len).ok_or(INVALID_SEXPR_VIOLATION)?;
108 if end > raw.len() {
109 return Err(INVALID_SEXPR_VIOLATION);
110 }
111 on_tok(Tok::Atom(&raw[start..end]));
112 pos = end;
113 continue;
114 }
115 // Digits not followed by ':' — fall through to a token atom.
116 }
117 // Token atom: a maximal run of non-whitespace, non-paren bytes.
118 let start = pos;
119 while pos < raw.len() {
120 let c = raw[pos];
121 if c.is_ascii_whitespace() || c == b'(' || c == b')' {
122 break;
123 }
124 pos += 1;
125 }
126 // Non-empty by construction (`b` was neither whitespace nor paren).
127 on_tok(Tok::Atom(&raw[start..pos]));
128 }
129 Ok(())
130}
131
132/// Parse an ASCII decimal digit run into a `usize`, returning `None` on
133/// overflow (an atom length wider than the address space — rejected, not
134/// capped at an arbitrary ceiling).
135fn parse_usize(digits: &[u8]) -> Option<usize> {
136 if digits.is_empty() {
137 return None;
138 }
139 let mut n: usize = 0;
140 for &d in digits {
141 let v = (d.wrapping_sub(b'0')) as usize;
142 if v > 9 {
143 return None;
144 }
145 n = n.checked_mul(10)?.checked_add(v)?;
146 }
147 Some(n)
148}
149
150/// `usize → ASCII decimal` into a 20-byte scratch (`u64::MAX` is 20
151/// digits).
152fn format_usize_into(buf: &mut [u8; 20], mut n: usize) -> &[u8] {
153 if n == 0 {
154 buf[0] = b'0';
155 return &buf[..1];
156 }
157 let mut idx = buf.len();
158 while n > 0 {
159 idx -= 1;
160 buf[idx] = b'0' + (n % 10) as u8;
161 n /= 10;
162 }
163 &buf[idx..]
164}
165
166// ─── SExprCanon — the streaming canonical-form carrier ───────────────────
167
168/// The Rivest-canonical-form [`ChunkSource`] over a borrowed,
169/// grammar-validated S-expression byte slice. Constructed by
170/// [`crate::sexp::address`] after [`SExprCanon::validate`] succeeds; lives
171/// in the caller's stack frame while the model folds it.
172#[derive(Clone, Copy, Debug)]
173pub struct SExprCanon<'a> {
174 raw: &'a [u8],
175}
176
177impl<'a> SExprCanon<'a> {
178 /// Wrap a grammar-validated raw S-expression slice. Call
179 /// [`SExprCanon::validate`] first; the carrier assumes well-formed
180 /// input (its [`ChunkSource`] emission cannot surface errors).
181 #[must_use]
182 pub fn new(raw: &'a [u8]) -> Self {
183 Self { raw }
184 }
185
186 /// Validate `raw` against the S-expression grammar in a single pass:
187 /// UTF-8, balanced parentheses, exactly one top-level value, and
188 /// well-formed canonical atoms.
189 ///
190 /// # Errors
191 ///
192 /// [`INVALID_SEXPR_VIOLATION`] (`validUtf8SExpr`) if `raw` is not a
193 /// well-formed UTF-8 S-expression.
194 pub fn validate(raw: &[u8]) -> Result<(), ShapeViolation> {
195 core::str::from_utf8(raw).map_err(|_| INVALID_SEXPR_VIOLATION)?;
196 let mut depth: usize = 0;
197 let mut seen_top = false;
198 let mut err: Option<ShapeViolation> = None;
199 for_each_token(raw, &mut |tok| {
200 if err.is_some() {
201 return;
202 }
203 match tok {
204 Tok::Atom(_) => {
205 if depth == 0 {
206 if seen_top {
207 err = Some(INVALID_SEXPR_VIOLATION);
208 } else {
209 seen_top = true;
210 }
211 }
212 }
213 Tok::Open => {
214 if depth == 0 && seen_top {
215 err = Some(INVALID_SEXPR_VIOLATION);
216 } else {
217 depth += 1;
218 }
219 }
220 Tok::Close => {
221 if depth == 0 {
222 err = Some(INVALID_SEXPR_VIOLATION);
223 } else {
224 depth -= 1;
225 if depth == 0 {
226 seen_top = true;
227 }
228 }
229 }
230 }
231 })?;
232 if let Some(e) = err {
233 return Err(e);
234 }
235 if depth != 0 || !seen_top {
236 return Err(INVALID_SEXPR_VIOLATION);
237 }
238 Ok(())
239 }
240}
241
242impl ChunkSource for SExprCanon<'_> {
243 fn for_each_chunk(&self, f: &mut dyn FnMut(&[u8])) {
244 // Streaming Rivest canonicalization: a single space separates
245 // consecutive elements of a list; no space immediately follows an
246 // opening paren. A single running flag captures this across any
247 // nesting depth — `need_space` is set after every atom and `)`,
248 // cleared after every `(`.
249 let mut need_space = false;
250 // Validation has already succeeded for `self.raw`, so the
251 // tokenizer cannot error here; ignore its Result.
252 let _ = for_each_token(self.raw, &mut |tok| match tok {
253 Tok::Open => {
254 if need_space {
255 f(b" ");
256 }
257 f(b"(");
258 need_space = false;
259 }
260 Tok::Close => {
261 f(b")");
262 need_space = true;
263 }
264 Tok::Atom(bytes) => {
265 if need_space {
266 f(b" ");
267 }
268 let mut buf = [0u8; 20];
269 f(format_usize_into(&mut buf, bytes.len()));
270 f(b":");
271 f(bytes);
272 need_space = true;
273 }
274 });
275 }
276}
277
278// ─── SExprValue — the typed input handle ─────────────────────────────────
279
280/// Typed S-expression input handle (ADR-060 stream carrier). A thin,
281/// `Copy` borrow of a [`SExprCanon`]; `as_binding_value` returns the
282/// `Stream` carrier zero-copy.
283#[derive(Clone, Copy, Debug)]
284pub struct SExprValue<'a>(&'a SExprCanon<'a>);
285
286impl<'a> SExprValue<'a> {
287 /// Wrap a validated canonical-form carrier as a model input handle.
288 #[must_use]
289 pub fn new(canon: &'a SExprCanon<'a>) -> Self {
290 Self(canon)
291 }
292}
293
294impl ConstrainedTypeShape for SExprValue<'_> {
295 const IRI: &'static str = "https://uor.foundation/addr/SExprValue";
296 const SITE_COUNT: usize = 1;
297 const CONSTRAINTS: &'static [ConstraintRef] = &[];
298 const CYCLE_SIZE: u64 = u64::MAX;
299}
300
301impl prism::uor_foundation::pipeline::__sdk_seal::Sealed for SExprValue<'_> {}
302
303impl<'a> IntoBindingValue<'a> for SExprValue<'a> {
304 fn as_binding_value<const INLINE_BYTES: usize>(&self) -> TermValue<'a, INLINE_BYTES> {
305 // The Rivest canonical form streams from the borrowed carrier; ψ₉
306 // folds it chunk-by-chunk. `self.0` is `&'a SExprCanon`, so the
307 // returned carrier borrows the input's `'a`-lived data
308 // independently of the `&self` call borrow.
309 TermValue::stream(self.0)
310 }
311}
312
313impl PartitionProductFields for SExprValue<'_> {
314 const FIELDS: &'static [(u32, u32)] = &[];
315 const FIELD_NAMES: &'static [&'static str] = &[];
316}
317
318// ─── Convenience alloc surface (feature = "alloc") ──────────────────────
319
320/// Validate `raw` and materialize its Rivest canonical S-expression bytes
321/// — the same byte sequence ψ₉ folds through the σ-axis.
322///
323/// **Available only under the `alloc` feature.** The no_alloc pipeline
324/// path never materializes the canonical form; it streams it through the
325/// hasher via [`SExprCanon`]'s [`ChunkSource`] impl.
326///
327/// # Errors
328///
329/// Surfaces the [`ShapeViolation`] [`SExprCanon::validate`] would raise.
330#[cfg(feature = "alloc")]
331pub fn canonicalize(raw: &[u8]) -> Result<alloc::vec::Vec<u8>, ShapeViolation> {
332 extern crate alloc;
333 SExprCanon::validate(raw)?;
334 let canon = SExprCanon::new(raw);
335 let mut out = alloc::vec::Vec::new();
336 canon.for_each_chunk(&mut |chunk| out.extend_from_slice(chunk));
337 Ok(out)
338}
339
340#[cfg(test)]
341mod tests {
342 use super::*;
343
344 #[test]
345 fn validates_nil() {
346 SExprCanon::validate(b"()").expect("valid nil");
347 }
348
349 #[test]
350 fn validates_atom_canonical_form() {
351 SExprCanon::validate(b"5:hello").expect("valid canonical atom");
352 }
353
354 #[test]
355 fn validates_token_list() {
356 SExprCanon::validate(b"(a b c)").expect("valid token list");
357 }
358
359 #[test]
360 fn rejects_unbalanced_parens() {
361 let err = SExprCanon::validate(b"((").expect_err("unbalanced parens");
362 assert_eq!(err.constraint_iri, INVALID_SEXPR_VIOLATION.constraint_iri);
363 }
364
365 #[test]
366 fn rejects_two_top_level_values() {
367 // Two bare token atoms, and two sibling lists — neither is a single
368 // top-level value. (`5:a 5:b` would *not* qualify: the `5:` length
369 // prefix swallows ` 5:b` as one canonical atom payload.)
370 for raw in [b"abc def".as_slice(), b"(a)(b)".as_slice()] {
371 let err = SExprCanon::validate(raw).expect_err("two top-level");
372 assert_eq!(err.constraint_iri, INVALID_SEXPR_VIOLATION.constraint_iri);
373 }
374 }
375
376 #[test]
377 fn rejects_truncated_canonical_atom() {
378 let err = SExprCanon::validate(b"5:abc").expect_err("declared 5, has 3");
379 assert_eq!(err.constraint_iri, INVALID_SEXPR_VIOLATION.constraint_iri);
380 }
381
382 #[test]
383 fn accepts_arbitrary_nesting_depth() {
384 extern crate alloc;
385 let mut s = alloc::string::String::new();
386 for _ in 0..4096 {
387 s.push('(');
388 }
389 s.push('x');
390 for _ in 0..4096 {
391 s.push(')');
392 }
393 // No depth cap (ADR-060): deep nesting is accepted, not rejected.
394 SExprCanon::validate(s.as_bytes()).expect("deep nesting is valid");
395 }
396
397 #[cfg(feature = "alloc")]
398 const CANONICAL_FIXTURES: &[(&[u8], &[u8])] = &[
399 (b"()", b"()"),
400 (b"(a b c)", b"(1:a 1:b 1:c)"),
401 (b"5:hello", b"5:hello"),
402 (b"(hello world)", b"(5:hello 5:world)"),
403 (b"((a) (b))", b"((1:a) (1:b))"),
404 (b"(a (b c) d)", b"(1:a (1:b 1:c) 1:d)"),
405 (b"( a\t b\n c )", b"(1:a 1:b 1:c)"),
406 (b"(1:a 1:b 1:c)", b"(1:a 1:b 1:c)"),
407 (b"(())", b"(())"),
408 ];
409
410 #[cfg(feature = "alloc")]
411 #[test]
412 fn canonicalizer_matches_rivest_canonical_form() {
413 for (raw, expected) in CANONICAL_FIXTURES {
414 let canon = canonicalize(raw).expect("valid");
415 assert_eq!(canon, *expected, "raw={raw:?}");
416 }
417 }
418
419 #[cfg(feature = "alloc")]
420 #[test]
421 fn canonicalize_is_idempotent_on_its_own_output() {
422 for (raw, _expected) in CANONICAL_FIXTURES {
423 let once = canonicalize(raw).expect("valid");
424 let twice = canonicalize(&once).expect("re-canonicalises");
425 assert_eq!(once, twice, "idempotence broken for {raw:?}");
426 }
427 }
428}