1use prism::operation::TermValue;
25use prism::pipeline::{
26 ConstrainedTypeShape, ConstraintRef, IntoBindingValue, PartitionProductFields,
27};
28#[cfg(feature = "alloc")]
31use crate::xml::shapes::bounds::MAX_XML_DEPTH;
32#[cfg(feature = "alloc")]
33use prism::pipeline::{ShapeViolation, ViolationKind};
34
#[cfg(feature = "alloc")]
/// Violation returned by every grammar-level failure in this file: input that
/// is not valid UTF-8, malformed or mismatched tags, unterminated attribute
/// values / CDATA sections / processing instructions, unknown or malformed
/// entity references, and trailing bytes after the root element.
const INVALID_XML_VIOLATION: ShapeViolation = ShapeViolation {
    // Same IRI that `XmlValue` reports as its `ConstrainedTypeShape::IRI`.
    shape_iri: "https://uor.foundation/addr/XmlValue",
    constraint_iri: "https://uor.foundation/addr/XmlValue/validXml",
    property_iri: "https://uor.foundation/addr/inputBytes",
    expected_range: "https://uor.foundation/addr/ValidUtf8Xml",
    min_count: 0,
    max_count: 1,
    kind: ViolationKind::ValueCheck,
};
47
#[cfg(feature = "alloc")]
/// Violation returned by `emit_element` when the nesting depth passed to it
/// exceeds `MAX_XML_DEPTH`.
const DEPTH_BOUND_VIOLATION: ShapeViolation = ShapeViolation {
    shape_iri: "https://uor.foundation/addr/XmlValue",
    constraint_iri: "https://uor.foundation/addr/XmlValue/depthBound",
    property_iri: "https://uor.foundation/addr/XmlValue/depth",
    expected_range: "http://www.w3.org/2001/XMLSchema#nonNegativeInteger",
    min_count: 0,
    // NOTE(review): `as` truncates silently if `MAX_XML_DEPTH` ever exceeds
    // `u32::MAX`; the constant's value is not visible here — confirm it fits.
    max_count: MAX_XML_DEPTH as u32,
    kind: ViolationKind::CardinalityViolation,
};
58
/// Borrowed, zero-copy handle to a byte slice holding an XML document.
///
/// The wrapper performs no validation of its own; callers are expected to pass
/// bytes that are already in canonical form.
#[derive(Clone, Copy, Debug)]
pub struct XmlValue<'a>(&'a [u8]);

impl<'a> XmlValue<'a> {
    /// Wraps `canonical_bytes` as-is, without copying or inspecting them.
    #[must_use]
    pub fn new(canonical_bytes: &'a [u8]) -> Self {
        XmlValue(canonical_bytes)
    }

    /// Returns the wrapped slice, borrowed for the full `'a` lifetime rather
    /// than for the lifetime of `&self`.
    #[must_use]
    pub fn canonical_bytes(&self) -> &'a [u8] {
        let XmlValue(bytes) = *self;
        bytes
    }
}
80
// Shape metadata for `XmlValue`. The meaning of each constant is defined by
// `prism::pipeline::ConstrainedTypeShape`, which is not visible in this file.
impl ConstrainedTypeShape for XmlValue<'_> {
    // Same IRI the violation constants in this file report as `shape_iri`.
    const IRI: &'static str = "https://uor.foundation/addr/XmlValue";
    // NOTE(review): presumably one site because the value is a single opaque
    // byte run — confirm against the trait documentation.
    const SITE_COUNT: usize = 1;
    // No constraints are declared for this shape.
    const CONSTRAINTS: &'static [ConstraintRef] = &[];
    // NOTE(review): `u64::MAX` looks like an "unbounded" sentinel — confirm.
    const CYCLE_SIZE: u64 = u64::MAX;
}
87
// Marker impl of the SDK seal trait for `XmlValue`.
// NOTE(review): presumably required before the pipeline traits in this file
// can be implemented for the type — confirm against `prism`'s `__sdk_seal`.
impl prism::uor_foundation::pipeline::__sdk_seal::Sealed for XmlValue<'_> {}
89
90impl<'a> IntoBindingValue<'a> for XmlValue<'a> {
91 fn as_binding_value<const INLINE_BYTES: usize>(&self) -> TermValue<'a, INLINE_BYTES> {
92 TermValue::borrowed(self.0)
96 }
97}
98
// `XmlValue` declares no partition-product fields: both tables are empty and
// therefore trivially the same length.
impl PartitionProductFields for XmlValue<'_> {
    const FIELDS: &'static [(u32, u32)] = &[];
    const FIELD_NAMES: &'static [&'static str] = &[];
}
103
#[cfg(feature = "alloc")]
/// Parses `raw` as a single XML element and returns its canonical byte form.
///
/// The input must be valid UTF-8 and consist of exactly one root element,
/// optionally surrounded by ASCII whitespace. Attributes are emitted sorted by
/// name, empty-element tags are expanded to start/end pairs, CDATA sections
/// become escaped text, and entity references are decoded and re-escaped.
///
/// # Errors
///
/// * `INVALID_XML_VIOLATION` — `raw` is not UTF-8, the root element is
///   malformed (including a leading `<?...?>` or `<!...>` construct, which the
///   element parser rejects), or bytes other than whitespace follow the root.
/// * `DEPTH_BOUND_VIOLATION` — elements nest deeper than `MAX_XML_DEPTH`.
pub fn canonicalize(raw: &[u8]) -> Result<alloc::vec::Vec<u8>, ShapeViolation> {
    extern crate alloc;

    if core::str::from_utf8(raw).is_err() {
        return Err(INVALID_XML_VIOLATION);
    }

    let mut canonical = alloc::vec::Vec::new();
    let mut parser = Parser::new(raw);

    parser.skip_ws();
    // The root element sits at depth 0.
    emit_element(&mut parser, &mut canonical, 0)?;
    parser.skip_ws();

    // Only whitespace may follow the root element.
    if parser.is_eof() {
        Ok(canonical)
    } else {
        Err(INVALID_XML_VIOLATION)
    }
}
134
#[cfg(feature = "alloc")]
/// Cursor over the raw input bytes, shared by the parsing helpers in this file.
struct Parser<'a> {
    /// The complete input being parsed; never modified.
    src: &'a [u8],
    /// Index into `src` of the next unread byte.
    pos: usize,
}
140
#[cfg(feature = "alloc")]
impl<'a> Parser<'a> {
    /// Creates a cursor positioned at the first byte of `src`.
    fn new(src: &'a [u8]) -> Self {
        Parser { src, pos: 0 }
    }

    /// Advances the cursor past any run of ASCII whitespace; stops at the
    /// first non-whitespace byte or at end of input.
    fn skip_ws(&mut self) {
        while matches!(self.src.get(self.pos), Some(byte) if byte.is_ascii_whitespace()) {
            self.pos += 1;
        }
    }

    /// Returns `true` once the cursor has consumed every input byte.
    fn is_eof(&self) -> bool {
        self.src.len() <= self.pos
    }
}
157
#[cfg(feature = "alloc")]
/// Parses one element starting at the cursor and appends its canonical form
/// to `out`.
///
/// Canonical form, as produced here:
/// * attributes are sorted bytewise by name, values are entity-decoded and
///   re-escaped with `escape_attr_into`, and always double-quoted;
/// * `<a/>` is expanded to `<a></a>`;
/// * text is entity-decoded and re-escaped with `escape_text_into`
///   (whitespace is kept verbatim);
/// * CDATA sections are replaced by their escaped content;
/// * processing instructions inside an element are kept, with a single space
///   between target and data and trailing whitespace trimmed from the data;
/// * child elements are handled recursively with `depth + 1`.
///
/// `depth` is the nesting level of this element (the root is 0).
///
/// # Errors
///
/// * `DEPTH_BOUND_VIOLATION` when `depth` exceeds `MAX_XML_DEPTH`.
/// * `INVALID_XML_VIOLATION` for any malformed construct: missing `<`,
///   comments / DOCTYPE / a processing instruction where an element is
///   expected, bad names or attributes, **duplicate attribute names**,
///   mismatched or unterminated tags, unterminated CDATA or processing
///   instructions, and invalid entity references.
fn emit_element(
    p: &mut Parser<'_>,
    out: &mut alloc::vec::Vec<u8>,
    depth: usize,
) -> Result<(), ShapeViolation> {
    use alloc::vec::Vec;

    if depth > MAX_XML_DEPTH {
        return Err(DEPTH_BOUND_VIOLATION);
    }

    // `Parser::src` is a shared slice; copying the reference out lets slices of
    // the input stay alive across the `&mut` borrows taken by helper calls.
    let src = p.src;

    if src.get(p.pos) != Some(&b'<') {
        return Err(INVALID_XML_VIOLATION);
    }
    p.pos += 1;
    // `<!...` (comment, DOCTYPE, CDATA) and `<?...` are not element starts.
    if matches!(src.get(p.pos), Some(b'!' | b'?')) {
        return Err(INVALID_XML_VIOLATION);
    }
    let name_start = p.pos;
    let name_len = parse_name_len(p)?;
    let name = &src[name_start..name_start + name_len];

    // Collect attributes until the tag is closed by `>` or `/`.
    let mut attrs: Vec<(&[u8], Vec<u8>)> = Vec::new();
    loop {
        p.skip_ws();
        match src.get(p.pos) {
            None => return Err(INVALID_XML_VIOLATION),
            Some(b'>' | b'/') => break,
            Some(_) => attrs.push(parse_attr(p)?),
        }
    }
    attrs.sort_by(|a, b| a.0.cmp(b.0));
    // XML forbids repeating an attribute name within one tag; after sorting,
    // duplicates are adjacent.
    if attrs.windows(2).any(|pair| pair[0].0 == pair[1].0) {
        return Err(INVALID_XML_VIOLATION);
    }

    out.push(b'<');
    out.extend_from_slice(name);
    for (key, value) in &attrs {
        out.push(b' ');
        out.extend_from_slice(key);
        out.extend_from_slice(b"=\"");
        escape_attr_into(value, out);
        out.push(b'"');
    }

    // The attribute loop only exits with the cursor on `>` or `/`.
    if src[p.pos] == b'/' {
        p.pos += 1;
        if src.get(p.pos) != Some(&b'>') {
            return Err(INVALID_XML_VIOLATION);
        }
        p.pos += 1;
        // Empty-element tags are expanded to an explicit start/end pair.
        out.extend_from_slice(b"></");
        out.extend_from_slice(name);
        out.push(b'>');
        return Ok(());
    }
    if src[p.pos] != b'>' {
        return Err(INVALID_XML_VIOLATION);
    }
    p.pos += 1;
    out.push(b'>');

    // Element content: runs until the matching end tag.
    loop {
        if p.pos >= src.len() {
            // End of input before the end tag.
            return Err(INVALID_XML_VIOLATION);
        }
        let rest = &src[p.pos..];

        if rest[0] != b'<' {
            // Character data up to the next markup (or end of input).
            let text_len = rest.iter().position(|&b| b == b'<').unwrap_or(rest.len());
            let decoded = decode_entities(&rest[..text_len])?;
            escape_text_into(&decoded, out);
            p.pos += text_len;
            continue;
        }

        if rest.starts_with(b"</") {
            p.pos += 2;
            let close_start = p.pos;
            let close_len = parse_name_len(p)?;
            if &src[close_start..close_start + close_len] != name {
                return Err(INVALID_XML_VIOLATION);
            }
            p.skip_ws();
            if src.get(p.pos) != Some(&b'>') {
                return Err(INVALID_XML_VIOLATION);
            }
            p.pos += 1;
            out.extend_from_slice(b"</");
            out.extend_from_slice(name);
            out.push(b'>');
            return Ok(());
        }

        const CDATA_OPEN: &[u8] = b"<![CDATA[";
        const CDATA_CLOSE: &[u8] = b"]]>";
        if rest.starts_with(CDATA_OPEN) {
            let body = &rest[CDATA_OPEN.len()..];
            let end = body
                .windows(CDATA_CLOSE.len())
                .position(|w| w == CDATA_CLOSE)
                .ok_or(INVALID_XML_VIOLATION)?;
            // CDATA content is literal: escape it, but do not decode entities.
            escape_text_into(&body[..end], out);
            p.pos += CDATA_OPEN.len() + end + CDATA_CLOSE.len();
            continue;
        }

        if rest.starts_with(b"<?") {
            p.pos += 2;
            let target_start = p.pos;
            let target_len = parse_name_len(p)?;
            let target = &src[target_start..target_start + target_len];
            p.skip_ws();
            let data = &src[p.pos..];
            let end = data
                .windows(2)
                .position(|w| w == b"?>")
                .ok_or(INVALID_XML_VIOLATION)?;
            // Drop trailing whitespace before `?>`.
            let kept = data[..end]
                .iter()
                .rposition(|b| !b.is_ascii_whitespace())
                .map_or(0, |last| last + 1);
            out.extend_from_slice(b"<?");
            out.extend_from_slice(target);
            if kept > 0 {
                out.push(b' ');
                out.extend_from_slice(&data[..kept]);
            }
            out.extend_from_slice(b"?>");
            p.pos += end + 2;
            continue;
        }

        // Anything else starting with `<` must be a child element.
        emit_element(p, out, depth + 1)?;
    }
}
305
#[cfg(feature = "alloc")]
/// Consumes a name at the cursor and returns its length in bytes.
///
/// A name is a non-empty run of ASCII letters, ASCII digits, `_`, `-` and `.`.
/// On success the cursor is left just past the name.
///
/// # Errors
///
/// Returns `INVALID_XML_VIOLATION` (cursor unchanged) when no name byte is
/// present at the cursor.
fn parse_name_len(p: &mut Parser<'_>) -> Result<usize, ShapeViolation> {
    let remaining = p.src.get(p.pos..).unwrap_or(&[]);
    let taken = remaining
        .iter()
        .take_while(|&&byte| byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'-' | b'.'))
        .count();

    if taken == 0 {
        return Err(INVALID_XML_VIOLATION);
    }
    p.pos += taken;
    Ok(taken)
}
323
#[cfg(feature = "alloc")]
/// Parses one `name = "value"` (or `name = 'value'`) pair at the cursor.
///
/// Whitespace is allowed on either side of `=`. Returns the attribute name as
/// a slice of the input and the value with entity references decoded. On
/// success the cursor is left just past the closing quote.
///
/// # Errors
///
/// Returns `INVALID_XML_VIOLATION` when the name is missing, `=` is missing,
/// the value is not quoted, the closing quote is never found, or the value
/// contains an invalid entity reference.
fn parse_attr<'a>(
    p: &mut Parser<'a>,
) -> Result<(&'a [u8], alloc::vec::Vec<u8>), ShapeViolation> {
    let src = p.src;

    let key_from = p.pos;
    let key_len = parse_name_len(p)?;
    let key = &src[key_from..key_from + key_len];

    p.skip_ws();
    if src.get(p.pos) != Some(&b'=') {
        return Err(INVALID_XML_VIOLATION);
    }
    p.pos += 1;
    p.skip_ws();

    // Either quote style opens a value; the same byte must close it.
    let delimiter = match src.get(p.pos).copied() {
        Some(q) if q == b'"' || q == b'\'' => q,
        _ => return Err(INVALID_XML_VIOLATION),
    };
    p.pos += 1;

    let body = &src[p.pos..];
    let value_len = match body.iter().position(|&byte| byte == delimiter) {
        Some(len) => len,
        None => {
            // Unterminated value: the scan ran to end of input.
            p.pos = src.len();
            return Err(INVALID_XML_VIOLATION);
        }
    };
    let raw_value = &body[..value_len];
    // Step over the value and its closing quote before decoding.
    p.pos += value_len + 1;

    let value = decode_entities(raw_value)?;
    Ok((key, value))
}
356
#[cfg(feature = "alloc")]
/// Replaces every entity / character reference in `text` with the UTF-8
/// encoding of the character it denotes; all other bytes are copied verbatim.
///
/// Recognised references:
/// * the five predefined entities `&lt;` `&gt;` `&amp;` `&quot;` `&apos;`;
/// * decimal character references `&#NNN;`;
/// * hexadecimal character references `&#xHHH;` (`&#XHHH;` is tolerated too).
///
/// # Errors
///
/// Returns `INVALID_XML_VIOLATION` when
/// * an `&` is not followed by a `;`,
/// * the entity name is not one of the five predefined ones,
/// * a numeric reference is empty or contains anything but digits of its
///   radix (Rust's integer parsers would otherwise accept a leading `+`),
/// * the number overflows `u32`, or
/// * the code point is not a legal XML 1.0 character (e.g. `&#0;`,
///   surrogates, `&#xFFFE;`).
fn decode_entities(text: &[u8]) -> Result<alloc::vec::Vec<u8>, ShapeViolation> {
    use alloc::vec::Vec;

    /// Parses a digits-only numeric reference body in the given radix.
    fn parse_code_point(digits: &[u8], radix: u32) -> Option<u32> {
        let digits_only =
            !digits.is_empty() && digits.iter().all(|&b| char::from(b).is_digit(radix));
        if !digits_only {
            return None;
        }
        let s = core::str::from_utf8(digits).ok()?;
        u32::from_str_radix(s, radix).ok()
    }

    /// XML 1.0 `Char` production.
    fn is_xml_char(cp: u32) -> bool {
        matches!(
            cp,
            0x9 | 0xA | 0xD | 0x20..=0xD7FF | 0xE000..=0xFFFD | 0x1_0000..=0x10_FFFF
        )
    }

    // A reference is never shorter than the character it decodes to, so the
    // output cannot outgrow the input.
    let mut out = Vec::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.iter().position(|&b| b == b'&') {
        out.extend_from_slice(&rest[..amp]);

        let body = &rest[amp + 1..];
        let semi = body
            .iter()
            .position(|&b| b == b';')
            .ok_or(INVALID_XML_VIOLATION)?;
        let entity = &body[..semi];

        let cp = match entity {
            b"lt" => u32::from('<'),
            b"gt" => u32::from('>'),
            b"amp" => u32::from('&'),
            b"quot" => u32::from('"'),
            b"apos" => u32::from('\''),
            // The hex form must be tested first: it also starts with `#`.
            _ if entity.starts_with(b"#x") || entity.starts_with(b"#X") => {
                parse_code_point(&entity[2..], 16).ok_or(INVALID_XML_VIOLATION)?
            }
            _ if entity.starts_with(b"#") => {
                parse_code_point(&entity[1..], 10).ok_or(INVALID_XML_VIOLATION)?
            }
            _ => return Err(INVALID_XML_VIOLATION),
        };

        let c = char::from_u32(cp)
            .filter(|_| is_xml_char(cp))
            .ok_or(INVALID_XML_VIOLATION)?;
        let mut buf = [0u8; 4];
        out.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());

        rest = &body[semi + 1..];
    }

    out.extend_from_slice(rest);
    Ok(out)
}
405
406#[cfg(feature = "alloc")]
/// Appends `bytes` to `out`, escaped for use inside a double-quoted attribute
/// value.
///
/// `<`, `>`, `&` and `"` become `&lt;`, `&gt;`, `&amp;` and `&quot;`. Tab,
/// line feed and carriage return become the character references `&#x9;`,
/// `&#xA;` and `&#xD;`, so a parser re-reading the output sees the original
/// whitespace character rather than raw whitespace. Every other byte is
/// copied unchanged.
fn escape_attr_into(bytes: &[u8], out: &mut alloc::vec::Vec<u8>) {
    for &b in bytes {
        let replacement: &[u8] = match b {
            b'<' => b"&lt;",
            b'>' => b"&gt;",
            b'&' => b"&amp;",
            b'"' => b"&quot;",
            b'\t' => b"&#x9;",
            b'\n' => b"&#xA;",
            b'\r' => b"&#xD;",
            _ => {
                out.push(b);
                continue;
            }
        };
        out.extend_from_slice(replacement);
    }
}
422
423#[cfg(feature = "alloc")]
/// Appends `bytes` to `out`, escaped for use as element character data.
///
/// `<`, `>` and `&` become `&lt;`, `&gt;` and `&amp;`; a carriage return
/// becomes the character reference `&#xD;`. Every other byte — including
/// quotes, tabs and line feeds — is copied unchanged.
fn escape_text_into(bytes: &[u8], out: &mut alloc::vec::Vec<u8>) {
    for &b in bytes {
        let replacement: &[u8] = match b {
            b'<' => b"&lt;",
            b'>' => b"&gt;",
            b'&' => b"&amp;",
            b'\r' => b"&#xD;",
            _ => {
                out.push(b);
                continue;
            }
        };
        out.extend_from_slice(replacement);
    }
}
436
#[cfg(all(test, feature = "alloc"))]
mod tests {
    //! Unit tests for `canonicalize`.

    use super::*;

    #[test]
    fn canonicalizes_with_lexicographic_attribute_ordering() {
        let canon = canonicalize(br#"<root b="2" a="1"/>"#).expect("valid");
        assert_eq!(canon, br#"<root a="1" b="2"></root>"#);
    }

    #[test]
    fn canonicalizer_collapses_cdata_to_text() {
        // CDATA content is literal text, so its markup characters must come
        // out escaped.
        let canon = canonicalize(b"<root><![CDATA[<hello>]]></root>").expect("valid");
        assert_eq!(canon, b"<root>&lt;hello&gt;</root>");
    }

    #[test]
    fn canonicalizer_escapes_attribute_values() {
        // The references are decoded while parsing and re-escaped on output.
        let canon = canonicalize(br#"<root attr="&lt;v&gt;"/>"#).expect("valid");
        assert_eq!(canon, br#"<root attr="&lt;v&gt;"></root>"#);
    }

    #[test]
    fn canonicalizer_is_idempotent() {
        let inputs: &[&[u8]] = &[
            b"<root/>",
            b"<root><child/></root>",
            br#"<root a="1" b="2"><child>text</child></root>"#,
        ];
        for raw in inputs {
            let once = canonicalize(raw).expect("valid");
            let twice = canonicalize(&once).expect("re-canonicalises");
            assert_eq!(once, twice, "idempotence broken for {raw:?}");
        }
    }

    #[test]
    fn rejects_mismatched_close_tag() {
        let err = canonicalize(b"<a></b>").expect_err("mismatch");
        assert_eq!(err.constraint_iri, INVALID_XML_VIOLATION.constraint_iri);
    }

    #[test]
    fn accepts_unbounded_attribute_and_name_widths() {
        extern crate alloc;
        let long_name = "n".repeat(5000);
        let long_val = "v".repeat(20_000);
        let doc = alloc::format!(r#"<{long_name} attr="{long_val}"/>"#);
        let canon = canonicalize(doc.as_bytes()).expect("unbounded widths admitted");
        let expected = alloc::format!(r#"<{long_name} attr="{long_val}"></{long_name}>"#);
        assert_eq!(canon, expected.as_bytes());
    }

    #[test]
    fn rejects_overdeep_nesting() {
        extern crate alloc;
        use alloc::format;
        use alloc::string::String;
        // The root is depth 0 and depth `MAX_XML_DEPTH` is still allowed, so
        // `MAX_XML_DEPTH + 2` nested elements are needed to trip the bound.
        let mut s = String::new();
        for i in 0..(MAX_XML_DEPTH + 2) {
            s.push_str(&format!("<n{i}>"));
        }
        for i in (0..(MAX_XML_DEPTH + 2)).rev() {
            s.push_str(&format!("</n{i}>"));
        }
        let err = canonicalize(s.as_bytes()).expect_err("overdeep");
        assert_eq!(err.constraint_iri, DEPTH_BOUND_VIOLATION.constraint_iri);
    }
}