Skip to main content

uor_addr/cbor/
value.rs

1//! CBOR typed input (ADR-023 amended by ADR-060) with RFC 8949 §4.2
2//! Deterministic-Encoding canonical-form byte output.
3//!
4//! CBOR canonicalization is **not** a streaming transform: §4.2.1 sorts
5//! map keys by the bytewise lexicographic order of their *encoded* keys,
6//! which needs per-map storage. The realization therefore materializes the
7//! canonical form once, in an `alloc` buffer ([`canonicalize`]), with **no**
8//! width / depth / count ceilings beyond the single
9//! [`MAX_CBOR_DEPTH`](crate::cbor::shapes::bounds::MAX_CBOR_DEPTH)
10//! native-stack overflow guard. The handle then flows through the pipeline
11//! as a zero-copy [`TermValue::Borrowed`] carrier and ψ₉ folds it through
12//! the σ-axis.
13//!
14//! # RFC 8949 §4.2 Deterministic Encoding (the canonical form)
15//!
16//! The canonicalizer accepts any well-formed CBOR data item and re-emits
17//! it under the deterministic-encoding rules:
18//!
19//! - **Preferred (shortest) integer/argument encoding** (§4.2.1 / §4.1):
20//!   every head's argument uses the fewest bytes (inline 0–23, then 1, 2,
21//!   4, 8).
22//! - **Definite-length only** (§4.2.1): indefinite-length byte/text
23//!   strings, arrays, and maps in the *input* are folded to their
24//!   definite-length canonical form.
25//! - **Map keys sorted** (§4.2.1) bytewise-lexicographically by their
26//!   canonical encodings; duplicate keys are rejected.
27//! - **Shortest-float / canonical NaN** (§4.2.2): a float is emitted in
28//!   the shortest of half / single / double that round-trips its value
29//!   exactly; every NaN collapses to the canonical half-precision
30//!   `0xf9 0x7e 0x00`.
31//!
32//! Exactly one top-level data item is admitted (trailing bytes are
33//! rejected).
34
35use prism::operation::TermValue;
36use prism::pipeline::{
37    ConstrainedTypeShape, ConstraintRef, IntoBindingValue, PartitionProductFields,
38};
39
40// ─── ShapeViolation IRIs (alloc-gated canonicalizer) ────────────────────
41
/// Violation returned by the canonicalizer whenever the input bytes are
/// rejected as CBOR (truncated input, reserved heads, invalid text,
/// duplicate map keys, trailing bytes).
#[cfg(feature = "alloc")]
const INVALID_CBOR_VIOLATION: prism::pipeline::ShapeViolation = prism::pipeline::ShapeViolation {
    kind: prism::pipeline::ViolationKind::ValueCheck,
    // Which shape / constraint / property the violation is reported against.
    shape_iri: "https://uor.foundation/addr/CborValue",
    constraint_iri: "https://uor.foundation/addr/CborValue/wellFormedCbor",
    property_iri: "https://uor.foundation/addr/inputBytes",
    expected_range: "https://uor.foundation/addr/WellFormedCbor",
    min_count: 0,
    max_count: 1,
};
52
/// Violation returned by the canonicalizer when an item nests deeper than
/// `MAX_CBOR_DEPTH`; the bound itself is reported as `max_count`.
#[cfg(feature = "alloc")]
const DEPTH_BOUND_VIOLATION: prism::pipeline::ShapeViolation = prism::pipeline::ShapeViolation {
    kind: prism::pipeline::ViolationKind::CardinalityViolation,
    // Which shape / constraint / property the violation is reported against.
    shape_iri: "https://uor.foundation/addr/CborValue",
    constraint_iri: "https://uor.foundation/addr/CborValue/depthBound",
    property_iri: "https://uor.foundation/addr/CborValue/depth",
    expected_range: "http://www.w3.org/2001/XMLSchema#nonNegativeInteger",
    min_count: 0,
    max_count: crate::cbor::shapes::bounds::MAX_CBOR_DEPTH as u32,
};
63
64// ─── CborCarrier — the borrowed model-input handle (no_alloc) ───────────
65
/// Model-input handle that borrows canonical-CBOR bytes (ADR-060 borrowed
/// carrier).
///
/// The handle is a thin, `Copy` wrapper around a byte slice — typically
/// the output of [`canonicalize`] — and never copies or re-validates the
/// bytes it wraps.
#[derive(Clone, Copy, Debug)]
pub struct CborCarrier<'a>(&'a [u8]);

impl<'a> CborCarrier<'a> {
    /// Build a handle over `canonical_bytes`. The slice is stored as-is;
    /// no canonicalization or validation happens here.
    #[must_use]
    pub fn new(canonical_bytes: &'a [u8]) -> Self {
        CborCarrier(canonical_bytes)
    }

    /// Return the wrapped canonical-CBOR bytes with their original
    /// lifetime `'a` (not tied to the borrow of `self`).
    #[must_use]
    pub fn canonical_bytes(&self) -> &'a [u8] {
        let Self(bytes) = *self;
        bytes
    }
}
85
86impl ConstrainedTypeShape for CborCarrier<'_> {
87    const IRI: &'static str = "https://uor.foundation/addr/CborValue";
88    const SITE_COUNT: usize = 1;
89    const CONSTRAINTS: &'static [ConstraintRef] = &[];
90    const CYCLE_SIZE: u64 = u64::MAX;
91}
92
93impl prism::uor_foundation::pipeline::__sdk_seal::Sealed for CborCarrier<'_> {}
94
95impl<'a> IntoBindingValue<'a> for CborCarrier<'a> {
96    fn as_binding_value<const INLINE_BYTES: usize>(&self) -> TermValue<'a, INLINE_BYTES> {
97        TermValue::borrowed(self.0)
98    }
99}
100
101impl PartitionProductFields for CborCarrier<'_> {
102    const FIELDS: &'static [(u32, u32)] = &[];
103    const FIELD_NAMES: &'static [&'static str] = &[];
104}
105
106// ═════════════════════════════════════════════════════════════════════
107// alloc-gated RFC 8949 §4.2 deterministic-encoding canonicalizer
108// ═════════════════════════════════════════════════════════════════════
109
110#[cfg(feature = "alloc")]
111pub use alloc_impl::canonicalize;
112
113#[cfg(feature = "alloc")]
114mod alloc_impl {
115    extern crate alloc;
116    use alloc::vec::Vec;
117    use prism::pipeline::ShapeViolation;
118
119    use super::{DEPTH_BOUND_VIOLATION, INVALID_CBOR_VIOLATION};
120    use crate::cbor::shapes::bounds::MAX_CBOR_DEPTH;
121
122    /// Re-encode `raw` (any well-formed CBOR item) into its RFC 8949 §4.2
123    /// deterministic-encoding canonical form.
124    ///
125    /// # Errors
126    ///
127    /// [`ShapeViolation`] if `raw` is not exactly one well-formed CBOR data
128    /// item, contains a reserved/invalid head, a non-UTF-8 text string, a
129    /// map with duplicate keys, or nests deeper than [`MAX_CBOR_DEPTH`].
130    pub fn canonicalize(raw: &[u8]) -> Result<Vec<u8>, ShapeViolation> {
131        let mut p = Parser { data: raw, pos: 0 };
132        let mut out = Vec::new();
133        p.item(&mut out, 0)?;
134        if p.pos != raw.len() {
135            return Err(INVALID_CBOR_VIOLATION); // trailing bytes — not a single item
136        }
137        Ok(out)
138    }
139
140    const BREAK: u8 = 0xff;
141
142    struct Parser<'a> {
143        data: &'a [u8],
144        pos: usize,
145    }
146
147    impl<'a> Parser<'a> {
148        fn byte(&mut self) -> Result<u8, ShapeViolation> {
149            let b = *self.data.get(self.pos).ok_or(INVALID_CBOR_VIOLATION)?;
150            self.pos += 1;
151            Ok(b)
152        }
153
154        fn take(&mut self, n: usize) -> Result<&'a [u8], ShapeViolation> {
155            let end = self.pos.checked_add(n).ok_or(INVALID_CBOR_VIOLATION)?;
156            let s = self.data.get(self.pos..end).ok_or(INVALID_CBOR_VIOLATION)?;
157            self.pos = end;
158            Ok(s)
159        }
160
161        /// Read a head, returning `(major, additional_info, argument)`. For
162        /// `additional_info == 31` the argument is meaningless (indefinite /
163        /// break); callers branch on `ai` first.
164        fn head(&mut self) -> Result<(u8, u8, u64), ShapeViolation> {
165            let ib = self.byte()?;
166            let major = ib >> 5;
167            let ai = ib & 0x1f;
168            let arg = match ai {
169                0..=23 => u64::from(ai),
170                24 => u64::from(self.byte()?),
171                25 => {
172                    let b = self.take(2)?;
173                    u64::from(u16::from_be_bytes([b[0], b[1]]))
174                }
175                26 => {
176                    let b = self.take(4)?;
177                    u64::from(u32::from_be_bytes([b[0], b[1], b[2], b[3]]))
178                }
179                27 => {
180                    let b = self.take(8)?;
181                    u64::from_be_bytes([b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]])
182                }
183                31 => 0, // indefinite / break — caller handles
184                _ => return Err(INVALID_CBOR_VIOLATION), // 28,29,30 reserved
185            };
186            Ok((major, ai, arg))
187        }
188
189        fn item(&mut self, out: &mut Vec<u8>, depth: usize) -> Result<(), ShapeViolation> {
190            if depth > MAX_CBOR_DEPTH {
191                return Err(DEPTH_BOUND_VIOLATION);
192            }
193            let (major, ai, arg) = self.head()?;
194            match major {
195                0 => emit_head(out, 0, arg), // unsigned int
196                1 => emit_head(out, 1, arg), // negative int
197                2 => {
198                    let bytes = self.string_payload(ai, arg, 2)?;
199                    emit_head(out, 2, bytes.len() as u64);
200                    out.extend_from_slice(&bytes);
201                }
202                3 => {
203                    let bytes = self.string_payload(ai, arg, 3)?;
204                    if core::str::from_utf8(&bytes).is_err() {
205                        return Err(INVALID_CBOR_VIOLATION);
206                    }
207                    emit_head(out, 3, bytes.len() as u64);
208                    out.extend_from_slice(&bytes);
209                }
210                4 => self.array(out, ai, arg, depth)?,
211                5 => self.map(out, ai, arg, depth)?,
212                6 => {
213                    if ai == 31 {
214                        return Err(INVALID_CBOR_VIOLATION);
215                    }
216                    emit_head(out, 6, arg); // tag
217                    self.item(out, depth + 1)?; // tagged content
218                }
219                7 => self.simple_or_float(out, ai, arg)?,
220                _ => unreachable!("major is 3 bits"),
221            }
222            Ok(())
223        }
224
225        /// Collect a (possibly indefinite-length) byte/text string payload
226        /// into a contiguous buffer. `expect_major` is 2 or 3; indefinite
227        /// chunks must each be definite strings of the same major type.
228        fn string_payload(
229            &mut self,
230            ai: u8,
231            arg: u64,
232            expect_major: u8,
233        ) -> Result<Vec<u8>, ShapeViolation> {
234            if ai != 31 {
235                return Ok(self.take(usize_arg(arg)?)?.to_vec());
236            }
237            let mut buf = Vec::new();
238            loop {
239                let ib = self.byte()?;
240                if ib == BREAK {
241                    break;
242                }
243                let major = ib >> 5;
244                let cai = ib & 0x1f;
245                if major != expect_major || cai == 31 {
246                    return Err(INVALID_CBOR_VIOLATION); // nested indefinite / wrong type
247                }
248                let n = self.arg_for(cai)?;
249                buf.extend_from_slice(self.take(usize_arg(n)?)?);
250            }
251            Ok(buf)
252        }
253
254        /// Read just the argument for an already-consumed initial byte's
255        /// additional-info `ai` (no major byte read).
256        fn arg_for(&mut self, ai: u8) -> Result<u64, ShapeViolation> {
257            Ok(match ai {
258                0..=23 => u64::from(ai),
259                24 => u64::from(self.byte()?),
260                25 => {
261                    let b = self.take(2)?;
262                    u64::from(u16::from_be_bytes([b[0], b[1]]))
263                }
264                26 => {
265                    let b = self.take(4)?;
266                    u64::from(u32::from_be_bytes([b[0], b[1], b[2], b[3]]))
267                }
268                27 => {
269                    let b = self.take(8)?;
270                    u64::from_be_bytes([b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]])
271                }
272                _ => return Err(INVALID_CBOR_VIOLATION),
273            })
274        }
275
276        fn array(
277            &mut self,
278            out: &mut Vec<u8>,
279            ai: u8,
280            arg: u64,
281            depth: usize,
282        ) -> Result<(), ShapeViolation> {
283            if ai != 31 {
284                let n = usize_arg(arg)?;
285                emit_head(out, 4, n as u64);
286                for _ in 0..n {
287                    self.item(out, depth + 1)?;
288                }
289                return Ok(());
290            }
291            // Indefinite: canonicalize each element into a buffer, count the
292            // emitted items, then emit the definite-length head + body.
293            let mut elems = Vec::new();
294            loop {
295                if *self.data.get(self.pos).ok_or(INVALID_CBOR_VIOLATION)? == BREAK {
296                    self.pos += 1;
297                    break;
298                }
299                self.item(&mut elems, depth + 1)?;
300            }
301            emit_head(out, 4, count_items(&elems));
302            out.extend_from_slice(&elems);
303            Ok(())
304        }
305
306        fn map(
307            &mut self,
308            out: &mut Vec<u8>,
309            ai: u8,
310            arg: u64,
311            depth: usize,
312        ) -> Result<(), ShapeViolation> {
313            let mut pairs: Vec<(Vec<u8>, Vec<u8>)> = Vec::new();
314            if ai != 31 {
315                let n = usize_arg(arg)?;
316                for _ in 0..n {
317                    let mut k = Vec::new();
318                    self.item(&mut k, depth + 1)?;
319                    let mut v = Vec::new();
320                    self.item(&mut v, depth + 1)?;
321                    pairs.push((k, v));
322                }
323            } else {
324                loop {
325                    if *self.data.get(self.pos).ok_or(INVALID_CBOR_VIOLATION)? == BREAK {
326                        self.pos += 1;
327                        break;
328                    }
329                    let mut k = Vec::new();
330                    self.item(&mut k, depth + 1)?;
331                    let mut v = Vec::new();
332                    self.item(&mut v, depth + 1)?;
333                    pairs.push((k, v));
334                }
335            }
336            // §4.2.1: sort by bytewise-lexicographic order of encoded keys.
337            pairs.sort_by(|a, b| a.0.cmp(&b.0));
338            // Reject duplicate keys.
339            for w in pairs.windows(2) {
340                if w[0].0 == w[1].0 {
341                    return Err(INVALID_CBOR_VIOLATION);
342                }
343            }
344            emit_head(out, 5, pairs.len() as u64);
345            for (k, v) in pairs {
346                out.extend_from_slice(&k);
347                out.extend_from_slice(&v);
348            }
349            Ok(())
350        }
351
352        fn simple_or_float(
353            &mut self,
354            out: &mut Vec<u8>,
355            ai: u8,
356            arg: u64,
357        ) -> Result<(), ShapeViolation> {
358            match ai {
359                // simple value (false/true/null/undefined/simple 0..=23)
360                0..=23 => {
361                    out.push(0xe0 | (arg as u8));
362                    Ok(())
363                }
364                // 1-byte simple value (32..=255; 0..=31 are not well-formed here)
365                24 => {
366                    if arg < 32 {
367                        return Err(INVALID_CBOR_VIOLATION);
368                    }
369                    out.push(0xf8);
370                    out.push(arg as u8);
371                    Ok(())
372                }
373                25 => {
374                    // float16 → f64 → canonical
375                    emit_canonical_float(out, half_to_f64(arg as u16));
376                    Ok(())
377                }
378                26 => {
379                    emit_canonical_float(out, f32::from_bits(arg as u32) as f64);
380                    Ok(())
381                }
382                27 => {
383                    emit_canonical_float(out, f64::from_bits(arg));
384                    Ok(())
385                }
386                _ => Err(INVALID_CBOR_VIOLATION), // 28,29,30,31(break)
387            }
388        }
389    }
390
391    fn usize_arg(arg: u64) -> Result<usize, ShapeViolation> {
392        usize::try_from(arg).map_err(|_| INVALID_CBOR_VIOLATION)
393    }
394
395    /// Emit a CBOR head for `major` with the shortest argument encoding
396    /// (RFC 8949 §4.1 preferred serialization).
397    fn emit_head(out: &mut Vec<u8>, major: u8, arg: u64) {
398        let m = major << 5;
399        if arg < 24 {
400            out.push(m | (arg as u8));
401        } else if arg <= u64::from(u8::MAX) {
402            out.push(m | 24);
403            out.push(arg as u8);
404        } else if arg <= u64::from(u16::MAX) {
405            out.push(m | 25);
406            out.extend_from_slice(&(arg as u16).to_be_bytes());
407        } else if arg <= u64::from(u32::MAX) {
408            out.push(m | 26);
409            out.extend_from_slice(&(arg as u32).to_be_bytes());
410        } else {
411            out.push(m | 27);
412            out.extend_from_slice(&arg.to_be_bytes());
413        }
414    }
415
416    /// Count the number of top-level canonical CBOR items in `buf` (used
417    /// only for indefinite-length array/map element counting). `buf` is
418    /// always well-formed canonical output we just produced.
419    fn count_items(buf: &[u8]) -> u64 {
420        let mut p = Walker { data: buf, pos: 0 };
421        let mut n = 0u64;
422        while p.pos < buf.len() {
423            p.skip();
424            n += 1;
425        }
426        n
427    }
428
429    struct Walker<'a> {
430        data: &'a [u8],
431        pos: usize,
432    }
433    impl Walker<'_> {
434        fn b(&mut self) -> u8 {
435            let v = self.data[self.pos];
436            self.pos += 1;
437            v
438        }
439        fn arg(&mut self, ai: u8) -> u64 {
440            match ai {
441                0..=23 => u64::from(ai),
442                24 => u64::from(self.b()),
443                25 => {
444                    let v = u16::from_be_bytes([self.data[self.pos], self.data[self.pos + 1]]);
445                    self.pos += 2;
446                    u64::from(v)
447                }
448                26 => {
449                    let mut a = [0u8; 4];
450                    a.copy_from_slice(&self.data[self.pos..self.pos + 4]);
451                    self.pos += 4;
452                    u64::from(u32::from_be_bytes(a))
453                }
454                27 => {
455                    let mut a = [0u8; 8];
456                    a.copy_from_slice(&self.data[self.pos..self.pos + 8]);
457                    self.pos += 8;
458                    u64::from_be_bytes(a)
459                }
460                _ => 0,
461            }
462        }
463        fn skip(&mut self) {
464            let ib = self.b();
465            let major = ib >> 5;
466            let ai = ib & 0x1f;
467            let arg = self.arg(ai);
468            match major {
469                0 | 1 => {}
470                2 | 3 => self.pos += arg as usize,
471                4 => {
472                    for _ in 0..arg {
473                        self.skip();
474                    }
475                }
476                5 => {
477                    for _ in 0..arg {
478                        self.skip();
479                        self.skip();
480                    }
481                }
482                6 => self.skip(),
483                7 => {
484                    // canonical output: simple inline (handled by arg), 1-byte
485                    // simple already consumed via ai==24, floats via ai 25/26/27
486                    // already consumed by `arg`.
487                }
488                _ => {}
489            }
490        }
491    }
492
493    // ─── IEEE-754 half-precision helpers (RFC 8949 §4.2.2 shortest float) ──
494
495    /// Decode an IEEE-754 binary16 bit pattern to the bit pattern of the
496    /// exactly-equal `f32` (every binary16 value is representable in
497    /// binary32). Pure integer arithmetic — `no_std` / no-libm safe.
498    fn half_to_f32_bits(h: u16) -> u32 {
499        let sign = (u32::from(h) & 0x8000) << 16;
500        let exp = (h >> 10) & 0x1f;
501        let mant = u32::from(h & 0x03ff);
502        if exp == 0 {
503            if mant == 0 {
504                return sign; // ±0
505            }
506            // Subnormal: normalize into a binary32 normal number.
507            let mut e: i32 = -1;
508            let mut m = mant;
509            loop {
510                e += 1;
511                m <<= 1;
512                if m & 0x0400 != 0 {
513                    break;
514                }
515            }
516            let mant32 = (m & 0x03ff) << 13;
517            let exp32 = (127 - 15 - e) as u32;
518            return sign | (exp32 << 23) | mant32;
519        }
520        if exp == 0x1f {
521            return sign | 0x7f80_0000 | (mant << 13); // Inf / NaN
522        }
523        let exp32 = (i32::from(exp) - 15 + 127) as u32;
524        sign | (exp32 << 23) | (mant << 13)
525    }
526
527    /// Decode an IEEE-754 binary16 bit pattern to `f64` (exact, via the
528    /// exactly-equal binary32). The `as f64` widening is a core float cast,
529    /// not a libm intrinsic.
530    fn half_to_f64(h: u16) -> f64 {
531        f64::from(f32::from_bits(half_to_f32_bits(h)))
532    }
533
534    /// Round-to-nearest-even encode `f32` to a binary16 bit pattern. The
535    /// caller verifies the round-trip, so an imperfect edge case only costs
536    /// shortness, never correctness.
537    fn f32_to_half_bits(f: f32) -> u16 {
538        let x = f.to_bits();
539        let sign = ((x >> 16) & 0x8000) as u16;
540        let exp = ((x >> 23) & 0xff) as i32;
541        let mant = x & 0x007f_ffff;
542        if exp == 0xff {
543            return if mant == 0 {
544                sign | 0x7c00
545            } else {
546                sign | 0x7e00
547            };
548        }
549        let e = exp - 127 + 15;
550        if e >= 0x1f {
551            return sign | 0x7c00; // overflow → ±Inf
552        }
553        if e <= 0 {
554            if e < -10 {
555                return sign; // underflow → ±0
556            }
557            let mant_full = mant | 0x0080_0000;
558            let shift = (14 - e) as u32;
559            let half_mant = (mant_full >> shift) as u16;
560            let round_rem = mant_full & ((1 << shift) - 1);
561            let halfway = 1u32 << (shift - 1);
562            let mut bits = sign | half_mant;
563            if round_rem > halfway || (round_rem == halfway && (half_mant & 1) == 1) {
564                bits += 1;
565            }
566            return bits;
567        }
568        let half_mant = (mant >> 13) as u16;
569        let round_rem = mant & 0x1fff;
570        let mut bits = sign | ((e as u16) << 10) | half_mant;
571        if round_rem > 0x1000 || (round_rem == 0x1000 && (half_mant & 1) == 1) {
572            bits += 1; // may carry into the exponent — round-trip check guards
573        }
574        bits
575    }
576
577    /// Emit `v` as the shortest of half / single / double precision that
578    /// round-trips it exactly (§4.2.2); every NaN collapses to the
579    /// canonical half-precision quiet NaN `0xf9 0x7e 0x00`.
580    fn emit_canonical_float(out: &mut Vec<u8>, v: f64) {
581        if v.is_nan() {
582            out.extend_from_slice(&[0xf9, 0x7e, 0x00]);
583            return;
584        }
585        let single = v as f32;
586        if f64::from(single) == v || (v.is_infinite() && single.is_infinite()) {
587            let hb = f32_to_half_bits(single);
588            if half_to_f64(hb).to_bits() == v.to_bits() {
589                out.push(0xf9);
590                out.extend_from_slice(&hb.to_be_bytes());
591                return;
592            }
593            out.push(0xfa);
594            out.extend_from_slice(&single.to_be_bytes());
595            return;
596        }
597        out.push(0xfb);
598        out.extend_from_slice(&v.to_bits().to_be_bytes());
599    }
600}