Skip to main content

uor_addr/xml/
value.rs

1//! `XmlValue` — the typed XML input handle (ADR-023 amended by ADR-060)
2//! with W3C Canonical XML 1.1 (subset) byte-output discipline.
3//!
4//! See [`crate::xml`] for the supported subset and deviations from full
5//! XML-C14N 1.1.
6//!
7//! # ADR-060 carrier model
8//!
9//! XML canonicalization is **not** a streaming transform: XML-C14N 1.1
10//! §1.1 rule 3 sorts each element's attributes lexicographically, and
11//! well-formedness checking matches nested close tags — both inherently
12//! need storage proportional to the element / nesting size. The
13//! realization therefore materializes the canonical form once, in an
14//! `alloc` buffer ([`canonicalize`]), with **no** width or count
15//! ceilings: element names, attribute values, text runs, attribute
16//! counts, and child counts are unbounded. The handle then flows through
17//! the pipeline as a zero-copy [`TermValue::Borrowed`] carrier over those
18//! canonical bytes, and ψ₉ folds them through the σ-axis.
19//!
20//! The single bound retained is [`MAX_XML_DEPTH`] — a native-stack
21//! overflow guard on the recursive-descent canonicalizer, not a content
22//! ceiling.
23
24use prism::operation::TermValue;
25use prism::pipeline::{
26    ConstrainedTypeShape, ConstraintRef, IntoBindingValue, PartitionProductFields,
27};
28// `ShapeViolation` / `ViolationKind` / `MAX_XML_DEPTH` are consumed only by
29// the `alloc`-gated canonicalizer below.
30#[cfg(feature = "alloc")]
31use crate::xml::shapes::bounds::MAX_XML_DEPTH;
32#[cfg(feature = "alloc")]
33use prism::pipeline::{ShapeViolation, ViolationKind};
34
35// ─── ShapeViolation IRIs ────────────────────────────────────────────────
36
/// Violation returned by the canonicalizer whenever the raw input is
/// rejected as XML: the bytes are not valid UTF-8, or a syntax check in
/// the parser fails.
#[cfg(feature = "alloc")]
const INVALID_XML_VIOLATION: ShapeViolation = ShapeViolation {
    kind: ViolationKind::ValueCheck,
    // Identifies the shape and the specific constraint that was violated.
    shape_iri: "https://uor.foundation/addr/XmlValue",
    constraint_iri: "https://uor.foundation/addr/XmlValue/validXml",
    // Identifies the offending property and the range it was expected in.
    property_iri: "https://uor.foundation/addr/inputBytes",
    expected_range: "https://uor.foundation/addr/ValidUtf8Xml",
    min_count: 0,
    max_count: 1,
};
47
/// Violation returned by the canonicalizer when element nesting goes past
/// [`MAX_XML_DEPTH`]. `max_count` carries that bound, cast to `u32`.
#[cfg(feature = "alloc")]
const DEPTH_BOUND_VIOLATION: ShapeViolation = ShapeViolation {
    kind: ViolationKind::CardinalityViolation,
    // Identifies the shape and the specific constraint that was violated.
    shape_iri: "https://uor.foundation/addr/XmlValue",
    constraint_iri: "https://uor.foundation/addr/XmlValue/depthBound",
    // Identifies the offending property and the range it was expected in.
    property_iri: "https://uor.foundation/addr/XmlValue/depth",
    expected_range: "http://www.w3.org/2001/XMLSchema#nonNegativeInteger",
    min_count: 0,
    max_count: MAX_XML_DEPTH as u32,
};
58
59// ─── XmlValue — the typed input handle ──────────────────────────────────
60
/// Typed XML input handle (ADR-060 borrowed carrier).
///
/// A thin, `Copy` borrow of canonical-XML bytes produced by
/// [`canonicalize`]; `as_binding_value` returns the `Borrowed` carrier
/// zero-copy.
///
/// Equality and hashing are byte-wise over the wrapped slice, so two
/// handles compare equal exactly when their canonical bytes are equal.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct XmlValue<'a>(&'a [u8]);

impl<'a> XmlValue<'a> {
    /// Wrap a canonical-XML byte slice as a model input handle.
    ///
    /// The slice is stored as-is: this constructor neither validates nor
    /// canonicalizes it. Usable in `const` contexts.
    #[must_use]
    pub const fn new(canonical_bytes: &'a [u8]) -> Self {
        Self(canonical_bytes)
    }

    /// Borrow the canonical-XML bytes.
    ///
    /// The returned slice carries the handle's `'a` lifetime, not the
    /// lifetime of the `&self` borrow.
    #[must_use]
    pub const fn canonical_bytes(&self) -> &'a [u8] {
        self.0
    }
}
80
81impl ConstrainedTypeShape for XmlValue<'_> {
82    const IRI: &'static str = "https://uor.foundation/addr/XmlValue";
83    const SITE_COUNT: usize = 1;
84    const CONSTRAINTS: &'static [ConstraintRef] = &[];
85    const CYCLE_SIZE: u64 = u64::MAX;
86}
87
88impl prism::uor_foundation::pipeline::__sdk_seal::Sealed for XmlValue<'_> {}
89
90impl<'a> IntoBindingValue<'a> for XmlValue<'a> {
91    fn as_binding_value<const INLINE_BYTES: usize>(&self) -> TermValue<'a, INLINE_BYTES> {
92        // The canonical form is materialized by `canonicalize`; ψ₉ folds
93        // it. `self.0` is `&'a [u8]`, so the carrier borrows the input's
94        // `'a`-lived data independently of the `&self` call borrow.
95        TermValue::borrowed(self.0)
96    }
97}
98
99impl PartitionProductFields for XmlValue<'_> {
100    const FIELDS: &'static [(u32, u32)] = &[];
101    const FIELD_NAMES: &'static [&'static str] = &[];
102}
103
104// ─── Canonicalizer (alloc) ──────────────────────────────────────────────
105
/// Parse `raw` and produce its canonical form per the W3C XML-C14N 1.1
/// subset documented in [`crate::xml`].
///
/// The input is first checked to be valid UTF-8. A recursive-descent
/// parser then walks exactly one root element and appends the canonical
/// bytes to a growable buffer — no fixed buffer, no width/count caps.
/// ASCII white space before and after the root element is skipped; any
/// other trailing byte is an error.
///
/// **Available only under the `alloc` feature.** The model handle
/// ([`XmlValue`]) is `no_alloc`; canonicalization itself needs heap
/// storage (per-element attribute sort scratch + the canonical output).
///
/// # Errors
///
/// - [`INVALID_XML_VIOLATION`] (`validXml`) — `raw` is not a well-formed
///   UTF-8 document in the supported subset.
/// - [`DEPTH_BOUND_VIOLATION`] (`depthBound`) — nesting exceeds the
///   [`MAX_XML_DEPTH`] native-stack-safety bound.
#[cfg(feature = "alloc")]
pub fn canonicalize(raw: &[u8]) -> Result<alloc::vec::Vec<u8>, ShapeViolation> {
    extern crate alloc;

    // Reject non-UTF-8 input up front; the parser itself works on bytes.
    if core::str::from_utf8(raw).is_err() {
        return Err(INVALID_XML_VIOLATION);
    }

    let mut parser = Parser::new(raw);
    let mut canonical = alloc::vec::Vec::new();

    parser.skip_ws();
    // The root element is nesting depth 0.
    emit_element(&mut parser, &mut canonical, 0)?;
    parser.skip_ws();

    // Only white space may follow the root element.
    if parser.is_eof() {
        Ok(canonical)
    } else {
        Err(INVALID_XML_VIOLATION)
    }
}
134
/// Byte cursor over the raw document, shared by the parsing helpers.
#[cfg(feature = "alloc")]
struct Parser<'a> {
    /// The complete input document.
    src: &'a [u8],
    /// Offset of the next unread byte in `src`.
    pos: usize,
}

#[cfg(feature = "alloc")]
impl<'a> Parser<'a> {
    /// Start a cursor at the first byte of `src`.
    fn new(src: &'a [u8]) -> Self {
        Self { src, pos: 0 }
    }

    /// Advance the cursor past any run of XML white space.
    ///
    /// XML 1.0 defines white space (production `S`) as exactly space, tab,
    /// line feed and carriage return. `u8::is_ascii_whitespace` would also
    /// accept form feed (0x0C), which is not XML white space, so the set is
    /// spelled out explicitly here.
    fn skip_ws(&mut self) {
        while let Some(&byte) = self.src.get(self.pos) {
            if !matches!(byte, b' ' | b'\t' | b'\n' | b'\r') {
                break;
            }
            self.pos += 1;
        }
    }

    /// `true` once the cursor has consumed every byte of the input.
    fn is_eof(&self) -> bool {
        self.pos >= self.src.len()
    }
}
157
/// Parse one element from `p` and append its canonical form to `out`.
///
/// The cursor must be on the element's `<`. On success it is left just
/// past the matching close tag (or past `/>` for a self-closing element).
/// Child elements are handled by recursion with `depth + 1`; the root
/// element is expected to be passed `depth == 0`.
///
/// Canonical output produced here:
/// - attributes are sorted by name and their entity-decoded values are
///   re-escaped;
/// - `<name/>` is expanded to `<name></name>`;
/// - CDATA sections are emitted as escaped text;
/// - processing instructions are emitted as `<?target data?>` with the
///   white space around `data` trimmed;
/// - text is entity-decoded, then re-escaped.
///
/// # Errors
///
/// - `DEPTH_BOUND_VIOLATION` when `depth` exceeds `MAX_XML_DEPTH`.
/// - `INVALID_XML_VIOLATION` for any syntax error, including truncated
///   input, a mismatched close tag, and a repeated attribute name on one
///   element (XML well-formedness constraint "Unique Att Spec").
#[cfg(feature = "alloc")]
fn emit_element(
    p: &mut Parser<'_>,
    out: &mut alloc::vec::Vec<u8>,
    depth: usize,
) -> Result<(), ShapeViolation> {
    use alloc::vec::Vec;

    const CDATA_OPEN: &[u8] = b"<![CDATA[";
    const CDATA_CLOSE: &[u8] = b"]]>";
    const PI_CLOSE: &[u8] = b"?>";

    if depth > MAX_XML_DEPTH {
        return Err(DEPTH_BOUND_VIOLATION);
    }

    // Copy the input slice out of the parser so slices of it do not keep
    // `p` borrowed while the cursor is advanced.
    let src = p.src;

    // ── Open tag ────────────────────────────────────────────────────────
    if src.get(p.pos) != Some(&b'<') {
        return Err(INVALID_XML_VIOLATION);
    }
    p.pos += 1;
    // Declarations, comments and processing instructions are not accepted
    // where an element is expected.
    if let Some(&b'!') | Some(&b'?') = src.get(p.pos) {
        return Err(INVALID_XML_VIOLATION);
    }
    let name_start = p.pos;
    let name_len = parse_name_len(p)?;
    let name = &src[name_start..name_start + name_len];

    // Collect this element's attributes (entity-decoded values), then sort
    // lexicographically by name per XML-C14N 1.1 §1.1 rule 3.
    let mut attrs: Vec<(&[u8], Vec<u8>)> = Vec::new();
    loop {
        p.skip_ws();
        match src.get(p.pos) {
            None => return Err(INVALID_XML_VIOLATION),
            Some(&b'>') | Some(&b'/') => break,
            Some(_) => attrs.push(parse_attr(p)?),
        }
    }
    attrs.sort_by(|a, b| a.0.cmp(b.0));
    // After sorting, a repeated name shows up as two equal neighbours. An
    // element with a repeated attribute name is not well-formed XML.
    if attrs.windows(2).any(|pair| pair[0].0 == pair[1].0) {
        return Err(INVALID_XML_VIOLATION);
    }

    out.push(b'<');
    out.extend_from_slice(name);
    for (key, value) in &attrs {
        out.push(b' ');
        out.extend_from_slice(key);
        out.extend_from_slice(b"=\"");
        escape_attr_into(value, out);
        out.push(b'"');
    }

    // The attribute loop only exits with the cursor on `/` or `>`.
    if src[p.pos] == b'/' {
        // Self-closing — canonical form expands to `<name…></name>`.
        p.pos += 1;
        if src.get(p.pos) != Some(&b'>') {
            return Err(INVALID_XML_VIOLATION);
        }
        p.pos += 1;
        out.extend_from_slice(b"></");
        out.extend_from_slice(name);
        out.push(b'>');
        return Ok(());
    }
    p.pos += 1;
    out.push(b'>');

    // ── Children ────────────────────────────────────────────────────────
    loop {
        let rest = &src[p.pos..];
        if rest.is_empty() {
            // Input ended before the close tag.
            return Err(INVALID_XML_VIOLATION);
        }

        if rest[0] != b'<' {
            // Text content — entity-decoded, then escaped.
            let text_len = rest
                .iter()
                .position(|&b| b == b'<')
                .unwrap_or(rest.len());
            p.pos += text_len;
            let decoded = decode_entities(&rest[..text_len])?;
            escape_text_into(&decoded, out);
        } else if rest.starts_with(b"</") {
            // Close tag — must match the open name.
            p.pos += 2;
            let close_start = p.pos;
            let close_len = parse_name_len(p)?;
            if &src[close_start..close_start + close_len] != name {
                return Err(INVALID_XML_VIOLATION);
            }
            p.skip_ws();
            if src.get(p.pos) != Some(&b'>') {
                return Err(INVALID_XML_VIOLATION);
            }
            p.pos += 1;
            out.extend_from_slice(b"</");
            out.extend_from_slice(name);
            out.push(b'>');
            return Ok(());
        } else if rest.starts_with(CDATA_OPEN) {
            // CDATA collapses to escaped text per XML-C14N 1.1 §1.1. The
            // section body is taken verbatim (no entity decoding).
            let body = &rest[CDATA_OPEN.len()..];
            let body_len = body
                .windows(CDATA_CLOSE.len())
                .position(|w| w == CDATA_CLOSE)
                .ok_or(INVALID_XML_VIOLATION)?;
            p.pos += CDATA_OPEN.len() + body_len + CDATA_CLOSE.len();
            escape_text_into(&body[..body_len], out);
        } else if rest.starts_with(b"<?") {
            // Processing instruction → `<?target data?>`.
            p.pos += 2;
            let target_start = p.pos;
            let target_len = parse_name_len(p)?;
            let target = &src[target_start..target_start + target_len];
            p.skip_ws();
            let tail = &src[p.pos..];
            let data_len = tail
                .windows(PI_CLOSE.len())
                .position(|w| w == PI_CLOSE)
                .ok_or(INVALID_XML_VIOLATION)?;
            p.pos += data_len + PI_CLOSE.len();
            // Trailing white space in the data is dropped; the separating
            // space is emitted only when data remains.
            let mut end = data_len;
            while end > 0 && tail[end - 1].is_ascii_whitespace() {
                end -= 1;
            }
            out.extend_from_slice(b"<?");
            out.extend_from_slice(target);
            if end > 0 {
                out.push(b' ');
                out.extend_from_slice(&tail[..end]);
            }
            out.extend_from_slice(PI_CLOSE);
        } else {
            // Nested element.
            emit_element(p, out, depth + 1)?;
        }
    }
}
305
/// Consume the run of name bytes at the cursor and return its length.
///
/// A name byte is an ASCII letter or digit, `_`, `-` or `.`. The cursor is
/// advanced past the run. An empty run is rejected with
/// `INVALID_XML_VIOLATION` and leaves the cursor where it was.
///
/// NOTE(review): every name byte is also accepted in first position, so a
/// name may begin with a digit, `-` or `.` — confirm this is an intended
/// deviation documented in `crate::xml`.
#[cfg(feature = "alloc")]
fn parse_name_len(p: &mut Parser<'_>) -> Result<usize, ShapeViolation> {
    let unread = p.src.get(p.pos..).unwrap_or(&[]);
    let run = unread
        .iter()
        .take_while(|&&byte| byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'-' | b'.'))
        .count();
    if run == 0 {
        return Err(INVALID_XML_VIOLATION);
    }
    p.pos += run;
    Ok(run)
}
323
/// Parse one attribute, `name="value"` or `name='value'`, at the cursor.
///
/// White space is allowed on either side of `=`. The value runs up to the
/// next occurrence of the opening quote byte. Returns the attribute name
/// borrowed from the input together with the entity-decoded value, and
/// leaves the cursor just past the closing quote.
///
/// Fails with `INVALID_XML_VIOLATION` when the name is empty, `=` or the
/// opening quote is missing, the value is unterminated, or the value
/// contains an entity that `decode_entities` rejects.
#[cfg(feature = "alloc")]
fn parse_attr<'a>(p: &mut Parser<'a>) -> Result<(&'a [u8], alloc::vec::Vec<u8>), ShapeViolation> {
    let src: &'a [u8] = p.src;

    let key_begin = p.pos;
    let key_end = key_begin + parse_name_len(p)?;
    let key = &src[key_begin..key_end];

    p.skip_ws();
    match src.get(p.pos) {
        Some(&b'=') => p.pos += 1,
        _ => return Err(INVALID_XML_VIOLATION),
    }
    p.skip_ws();

    // Either quote style opens a value; the same byte must close it.
    let quote = match src.get(p.pos) {
        Some(&q) if q == b'"' || q == b'\'' => q,
        _ => return Err(INVALID_XML_VIOLATION),
    };
    p.pos += 1;

    let body_begin = p.pos;
    let body_len = match src[body_begin..].iter().position(|&byte| byte == quote) {
        Some(found) => found,
        None => {
            // Unterminated value: the scan ran to the end of the input.
            p.pos = src.len();
            return Err(INVALID_XML_VIOLATION);
        }
    };
    let body = &src[body_begin..body_begin + body_len];
    // Step over the closing quote before decoding.
    p.pos = body_begin + body_len + 1;

    let decoded = decode_entities(body)?;
    Ok((key, decoded))
}
356
/// Resolve the five predefined entities plus numeric character
/// references into a freshly-allocated UTF-8 byte sequence.
///
/// Bytes outside `&…;` references are copied through unchanged. Accepted
/// references are `&lt;`, `&gt;`, `&amp;`, `&quot;`, `&apos;`, decimal
/// `&#N;` and hexadecimal `&#xN;` (an upper-case `X` is also accepted, as
/// before).
///
/// # Errors
///
/// Returns `INVALID_XML_VIOLATION` when
/// - an `&` has no terminating `;`,
/// - the entity name is not one of the five predefined ones,
/// - a numeric reference has no digits, contains anything other than
///   digits of its radix (including a sign such as `+`), or overflows
///   `u32`,
/// - the referenced code point is not a Unicode scalar value.
#[cfg(feature = "alloc")]
fn decode_entities(text: &[u8]) -> Result<alloc::vec::Vec<u8>, ShapeViolation> {
    use alloc::vec::Vec;

    /// Parse the digit run of a numeric character reference.
    ///
    /// The integer parsers in `core` accept an optional leading `+`, which
    /// is not part of XML's `CharRef` grammar, so the digits are validated
    /// before parsing.
    fn parse_code_point(digits: &[u8], radix: u32) -> Option<u32> {
        let digits_only = !digits.is_empty()
            && digits.iter().all(|&d| char::from(d).is_digit(radix));
        if !digits_only {
            return None;
        }
        let s = core::str::from_utf8(digits).ok()?;
        u32::from_str_radix(s, radix).ok()
    }

    // Every reference is longer than the UTF-8 it decodes to, so the
    // decoded form never exceeds the input length.
    let mut out = Vec::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.iter().position(|&b| b == b'&') {
        // Literal run before the reference.
        out.extend_from_slice(&rest[..amp]);

        let after_amp = &rest[amp + 1..];
        let semi = after_amp
            .iter()
            .position(|&b| b == b';')
            .ok_or(INVALID_XML_VIOLATION)?;
        let entity = &after_amp[..semi];

        let cp = match entity {
            b"lt" => '<' as u32,
            b"gt" => '>' as u32,
            b"amp" => '&' as u32,
            b"quot" => '"' as u32,
            b"apos" => '\'' as u32,
            _ if entity.starts_with(b"#x") || entity.starts_with(b"#X") => {
                parse_code_point(&entity[2..], 16).ok_or(INVALID_XML_VIOLATION)?
            }
            _ if entity.starts_with(b"#") => {
                parse_code_point(&entity[1..], 10).ok_or(INVALID_XML_VIOLATION)?
            }
            _ => return Err(INVALID_XML_VIOLATION),
        };

        // Surrogates and values above U+10FFFF are not scalar values.
        let c = char::from_u32(cp).ok_or(INVALID_XML_VIOLATION)?;
        let mut buf = [0u8; 4];
        out.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());

        rest = &after_amp[semi + 1..];
    }

    // Literal tail after the last reference (or the whole input).
    out.extend_from_slice(rest);
    Ok(out)
}
405
406/// XML-C14N 1.1 §1.1 rule 4 — attribute-value character replacement.
407#[cfg(feature = "alloc")]
fn escape_attr_into(bytes: &[u8], out: &mut alloc::vec::Vec<u8>) {
    // Append `bytes` to `out`, replacing `<`, `>`, `&`, `"`, tab, line feed
    // and carriage return with their character/entity references. All other
    // bytes are copied through in unbroken runs.
    //
    // NOTE(review): Canonical XML lists `&`, `<`, `"`, #x9, #xA and #xD for
    // attribute values; `>` is additionally escaped here. The output is part
    // of the canonical byte form, so this is kept as-is — confirm it is a
    // documented deviation.
    let mut run_start = 0;
    for (idx, &byte) in bytes.iter().enumerate() {
        let replacement: &[u8] = match byte {
            b'<' => b"&lt;",
            b'>' => b"&gt;",
            b'&' => b"&amp;",
            b'"' => b"&quot;",
            b'\t' => b"&#x9;",
            b'\n' => b"&#xA;",
            b'\r' => b"&#xD;",
            _ => continue,
        };
        // Flush the literal run that precedes this special byte.
        out.extend_from_slice(&bytes[run_start..idx]);
        out.extend_from_slice(replacement);
        run_start = idx + 1;
    }
    out.extend_from_slice(&bytes[run_start..]);
}
422
423/// XML-C14N 1.1 §1.1 rule 5 — text-content character replacement.
424#[cfg(feature = "alloc")]
fn escape_text_into(bytes: &[u8], out: &mut alloc::vec::Vec<u8>) {
    // Append `bytes` to `out`, replacing `<`, `>`, `&` and carriage return
    // with their references. Everything else — including quotes, tabs and
    // line feeds — is copied through in unbroken runs.
    let mut run_start = 0;
    for (idx, &byte) in bytes.iter().enumerate() {
        let replacement: &[u8] = match byte {
            b'<' => b"&lt;",
            b'>' => b"&gt;",
            b'&' => b"&amp;",
            b'\r' => b"&#xD;",
            _ => continue,
        };
        // Flush the literal run that precedes this special byte.
        out.extend_from_slice(&bytes[run_start..idx]);
        out.extend_from_slice(replacement);
        run_start = idx + 1;
    }
    out.extend_from_slice(&bytes[run_start..]);
}
436
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;

    /// Canonicalize `raw` (which must be accepted) and compare the result
    /// with `expected`.
    fn assert_canonical(raw: &[u8], expected: &[u8]) {
        let actual = canonicalize(raw).expect("valid");
        assert_eq!(actual, expected);
    }

    /// Attributes come out sorted by name and `<root/>` is expanded.
    #[test]
    fn canonicalizes_with_lexicographic_attribute_ordering() {
        assert_canonical(br#"<root b="2" a="1"/>"#, br#"<root a="1" b="2"></root>"#);
    }

    /// A CDATA section is emitted as escaped character data.
    #[test]
    fn canonicalizer_collapses_cdata_to_text() {
        assert_canonical(
            b"<root><![CDATA[<hello>]]></root>",
            b"<root>&lt;hello&gt;</root>",
        );
    }

    /// Decoded attribute values are re-escaped on output.
    #[test]
    fn canonicalizer_escapes_attribute_values() {
        assert_canonical(
            br#"<root attr="&lt;v&gt;"/>"#,
            br#"<root attr="&lt;v&gt;"></root>"#,
        );
    }

    /// Canonicalizing canonical output yields the same bytes.
    #[test]
    fn canonicalizer_is_idempotent() {
        let inputs: [&[u8]; 3] = [
            b"<root/>",
            b"<root><child/></root>",
            br#"<root a="1" b="2"><child>text</child></root>"#,
        ];
        for raw in inputs.iter() {
            let once = canonicalize(raw).expect("valid");
            let twice = canonicalize(&once).expect("re-canonicalises");
            assert_eq!(once, twice, "idempotence broken for {raw:?}");
        }
    }

    /// A close tag naming a different element is a `validXml` violation.
    #[test]
    fn rejects_mismatched_close_tag() {
        let violation = canonicalize(b"<a></b>").expect_err("mismatch");
        assert_eq!(
            violation.constraint_iri,
            INVALID_XML_VIOLATION.constraint_iri
        );
    }

    /// Element name and attribute value far exceed the old fixed-buffer
    /// ceilings; ADR-060 admits them.
    #[test]
    fn accepts_unbounded_attribute_and_name_widths() {
        extern crate alloc;
        let long_name = "n".repeat(5000);
        let long_val = "v".repeat(20_000);
        let doc = alloc::format!(r#"<{long_name} attr="{long_val}"/>"#);
        let expected = alloc::format!(r#"<{long_name} attr="{long_val}"></{long_name}>"#);
        let canon = canonicalize(doc.as_bytes()).expect("unbounded widths admitted");
        assert_eq!(canon, expected.as_bytes());
    }

    /// Nesting two levels past `MAX_XML_DEPTH` trips the depth bound.
    #[test]
    fn rejects_overdeep_nesting() {
        extern crate alloc;
        use alloc::format;
        use alloc::string::String;

        let levels = MAX_XML_DEPTH + 2;
        let mut doc = String::new();
        // Open tags outermost-first, then close tags innermost-first.
        (0..levels).for_each(|i| doc.push_str(&format!("<n{i}>")));
        (0..levels)
            .rev()
            .for_each(|i| doc.push_str(&format!("</n{i}>")));

        let violation = canonicalize(doc.as_bytes()).expect_err("overdeep");
        assert_eq!(
            violation.constraint_iri,
            DEPTH_BOUND_VIOLATION.constraint_iri
        );
    }
}