From e6f1eb0f085439e0bc4e0907d4d48edbd7d38d92 Mon Sep 17 00:00:00 2001 From: Shautvast Date: Sat, 19 Jul 2025 13:31:11 +0200 Subject: [PATCH] skip comments --- src/maven/xml/mod.rs | 2 +- src/maven/xml/sax_parser.rs | 73 +++++++++++++++++++++++++------- src/maven/xml/sax_parser_test.rs | 15 +++++++ src/maven/xml/test/comment.xml | 4 ++ 4 files changed, 77 insertions(+), 17 deletions(-) create mode 100644 src/maven/xml/test/comment.xml diff --git a/src/maven/xml/mod.rs b/src/maven/xml/mod.rs index d403a79..2638317 100644 --- a/src/maven/xml/mod.rs +++ b/src/maven/xml/mod.rs @@ -5,7 +5,7 @@ mod debug; #[derive(Debug)] pub struct Attribute { name: String, - namespace: Option, + _namespace: Option, value: String, } diff --git a/src/maven/xml/sax_parser.rs b/src/maven/xml/sax_parser.rs index bbeb1c0..b577a6d 100644 --- a/src/maven/xml/sax_parser.rs +++ b/src/maven/xml/sax_parser.rs @@ -23,6 +23,7 @@ impl<'a> SAXParser<'a> { } fn parse(&mut self) -> anyhow::Result<()> { + self.advance()?; self.expect( "", "Content is not allowed in prolog.", @@ -36,7 +37,9 @@ impl<'a> SAXParser<'a> { while self.position < self.xml.len() { if self.current == '<' { self.advance()?; - if self.current != '/' { + if self.current == '!' { + self.skip_comment()?; + } else if self.current != '/' { self.parse_start_element()?; } else { self.parse_end_element()?; @@ -47,6 +50,29 @@ impl<'a> SAXParser<'a> { Ok(()) } + fn skip_comment(&mut self) -> anyhow::Result<()> { + self.expect("!--", "Expect comment start")?; + let mut c = self.current; + let mut end_in_sight = 0; + while end_in_sight < 3 && self.position < self.xml.len() { + match c { + '-' if end_in_sight < 2 => { + end_in_sight += 1; + } + '>' if end_in_sight == 2 => { + end_in_sight += 1; + } + _ if end_in_sight > 0 => { + end_in_sight -= 0; + } + _ => {} + } + c = self.advance()?; + } + self.skip_whitespace()?; + Ok(()) + } + fn parse_start_element(&mut self) -> anyhow::Result<()> { let name = self.read_until(" />")?; let mut atts = vec![]; @@ -59,26 +85,33 @@ impl<'a> SAXParser<'a> { self.handler.start_element("", name.as_str(), "", atts); self.skip_whitespace()?; + if self.current == '/' { + self.advance()?; + } self.expect_char('>')?; + self.skip_whitespace()?; Ok(()) } fn parse_attribute(&mut self) -> anyhow::Result { let att_name = self.read_until("=")?; self.skip_whitespace()?; + self.expect("=", "Expected =")?; self.expect("\"", "Expected start of attribute value")?; let att_value = self.read_until("\"")?; Ok(Attribute { name: att_name.trim().to_string(), - namespace: Some("".to_string()), + _namespace: Some("".to_string()), value: att_value, }) } fn parse_end_element(&mut self) -> anyhow::Result<()> { + self.advance()?; let name = self.read_until(">")?; self.handler.end_element("", name.as_str(), ""); + self.expect(">", "Expect end of element")?; Ok(()) } @@ -87,6 +120,9 @@ impl<'a> SAXParser<'a> { let mut c = self.current; let until = until.chars().collect::>(); while !until.contains(&c) { + if self.position > self.xml.len() { + return Err(anyhow::anyhow!("End reached while expecting {:?}", until)); + } c = self.advance()?; } Ok(self.xml[start - 1..self.position - 1] @@ -103,33 +139,38 @@ impl<'a> SAXParser<'a> { } fn advance(&mut self) -> anyhow::Result { + if self.position > self.xml.len() { + return Err(anyhow::anyhow!( + "End reached while expecting {:?}", + self.current + )); + } self.position += 1; - self.current = self.xml[self.position - 1]; + self.current = if self.position <= self.xml.len() { + self.xml[self.position - 1] + } else { + '\0' + }; Ok(self.current) } - fn next_char(&mut self) -> anyhow::Result { - if self.position >= self.xml.len() { - Err(anyhow::anyhow!("End reached")) - } else { - Ok(self.xml[self.position + 1]) - } - } - - fn expect(&mut self, header_line: &str, message: &str) -> anyhow::Result<()> { - for c in header_line.chars() { + fn expect(&mut self, expected: &str, message: &str) -> anyhow::Result<()> { + for c in expected.chars() { if !self.expect_char(c)? { return Err(anyhow::anyhow!(message.to_string())); } } - self.advance()?; Ok(()) } fn expect_char(&mut self, expected: char) -> anyhow::Result { - if self.position >= self.xml.len() { + if self.position > self.xml.len() { return Ok(false); } - Ok(self.advance()? == expected) + let same = self.current == expected; + if same { + self.advance()?; + } + Ok(same) } } diff --git a/src/maven/xml/sax_parser_test.rs b/src/maven/xml/sax_parser_test.rs index af41f19..3f73452 100644 --- a/src/maven/xml/sax_parser_test.rs +++ b/src/maven/xml/sax_parser_test.rs @@ -16,6 +16,7 @@ mod tests { #[test] fn test_xml_header() { + initialize(); let test_xml = include_str!("test/header.xml"); let mut testhandler = TestHandler::new(); parse_string(test_xml.to_string(), Box::new(&mut testhandler)) @@ -61,6 +62,20 @@ mod tests { assert!(testhandler.end_element_called); assert!(testhandler.end_document_called); } + + #[test] + fn test_ignore_comment() { + let test_xml = include_str!("test/comment.xml"); + let mut testhandler = TestHandler::new(); + parse_string(test_xml.to_string(), Box::new(&mut testhandler)) + .expect("Failed to parse test xml"); + assert!(testhandler.start_document_called); + assert!(testhandler.start_element_called); + assert!(!testhandler.elements.is_empty()); + assert_eq!(testhandler.elements[0], r#""#); + assert!(testhandler.end_element_called); + assert!(testhandler.end_document_called); + } } #[derive(Debug)] diff --git a/src/maven/xml/test/comment.xml b/src/maven/xml/test/comment.xml new file mode 100644 index 0000000..c500003 --- /dev/null +++ b/src/maven/xml/test/comment.xml @@ -0,0 +1,4 @@ + + + +