use crate::xml::{Attribute, SaxError, SaxHandler}; use std::collections::HashMap; // So I decided to model it after java SAX api, which was a bad choice // it defines a trait, like the java SAXHandler interface // The rusty way to do it would be using a SAXEvent enum with different variants where there are now trait methods. // That would also imply that you go from push (current) to a pull parser, which gives you more control. But hey, who needs that? /// Parses an xml as string and call the SAXHandler functions accordingly // no streaming? Nah. I could do that, but I didn't feel the need, because maven pom.xmls should not be of gigbabyte size // It's not a lot of code, but it handles my pom files well, so far. I have see implementations (xml_oxide) that are way bigger. // It's a basic recursive descent parser. // It handles namespaces (and prefixes) correctly AFAIK // No validation (DTD/XSD) // I probably missed some other XML functionality that I don't think I need. pub fn parse_string(xml: &str, handler: Box<&mut dyn SaxHandler>) -> Result<(), SaxError> { SAXParser::new(xml, handler).parse() } // struct containing the parser state struct SAXParser<'a> { xml: Vec, handler: Box<&'a mut dyn SaxHandler>, position: usize, current_line: usize, current: char, char_buffer: Vec, namespace_stack: Vec<(String, isize)>, prefix_mapping: HashMap, } impl<'a> SAXParser<'a> { /// fn new(xml: &str, handler: Box<&'a mut dyn SaxHandler>) -> Self { Self { xml: xml.chars().collect(), handler, position: 0, current_line: 0, current: '\0', char_buffer: Vec::new(), namespace_stack: Vec::new(), prefix_mapping: HashMap::new(), } } fn parse(&mut self) -> Result<(), SaxError> { self.advance()?; self.expect( "", "Content is not allowed in prolog.", )?; self.skip_whitespace()?; self.handler.start_document(); self.parse_elements() } fn parse_elements(&mut self) -> Result<(), SaxError> { while self.position < self.xml.len() { if self.current == '<' { if !self.char_buffer.is_empty() { self.handler.characters(&self.char_buffer); self.char_buffer.clear(); } self.advance()?; if self.current == '!' { self.skip_comment()?; } else if self.current != '/' { self.parse_start_element()?; } else { self.parse_end_element()?; } } else { self.char_buffer.push(self.current); self.advance()?; } } self.handler.end_document(); Ok(()) } fn skip_comment(&mut self) -> Result<(), SaxError> { self.expect("!--", "Expect comment start")?; let mut c = self.current; let mut end_in_sight = 0; while end_in_sight < 3 && self.position < self.xml.len() { match c { '-' if end_in_sight < 2 => { end_in_sight += 1; } '>' if end_in_sight == 2 => { end_in_sight += 1; } _ if end_in_sight == 2 => { return Err(SaxError::BadCharacter); } _ if end_in_sight > 0 => { end_in_sight = 0; } _ => {} } c = self.advance()?; } self.skip_whitespace()?; Ok(()) } fn parse_start_element(&mut self) -> Result<(), SaxError> { let qname = self.read_until(" \t\n/>")?; let mut atts = vec![]; let mut c = self.current; while c.is_whitespace() { self.skip_whitespace()?; if self.current == '/' { break; } atts.push(self.parse_attribute()?); c = self.advance()?; } let (namespace, lname) = if qname.contains(":") { let tokens = qname.splitn(2, ":").collect::>(); let prefix = tokens[0].to_string(); let name = tokens[1].to_string(); let namespace = self.prefix_mapping.get(&prefix); if let Some(namespace) = namespace { (Some(namespace.to_string()), name) } else { return Err(SaxError::UndeclaredNamespacePrefix(prefix)); } } else if !self.namespace_stack.is_empty() { let (name, count) = self.namespace_stack.pop().unwrap(); self.namespace_stack.push((name.clone(), count + 1)); (Some(name.clone()), qname) } else { (None, qname) }; let qualified_name = if let Some(namespace) = &namespace { &format!("{}:{}", namespace.clone(), &lname) } else { &lname }; self.handler .start_element(namespace.clone(), lname.as_str(), qualified_name, atts); self.skip_whitespace()?; if self.current == '/' { self.advance()?; let namespace = self.pop_namespace(); self.handler .end_element(namespace, lname.as_str(), qualified_name); } self.expect_char('>')?; self.skip_whitespace()?; Ok(()) } fn parse_attribute(&mut self) -> Result { let att_name = self.read_until("=")?; self.skip_whitespace()?; self.expect("=", "Expected =")?; self.skip_whitespace()?; self.expect( r#"""#, &format!( "Expected start of attribute value at line {}. Instead found [{}]", self.current_line, self.current ), )?; let att_value = self.read_until("\"")?; if att_name.starts_with("xmlns:") { let prefix = att_name[6..].to_string(); self.prefix_mapping .insert(prefix.clone(), att_value.to_string()); self.handler.start_prefix_mapping(&prefix, &att_value); } let namespace = if att_name == "xmlns" { self.namespace_stack.push((att_value.clone(), -1)); Some(att_value.clone()) } else { None }; Ok(Attribute { name: att_name.trim().to_string(), namespace, value: att_value, }) } fn parse_end_element(&mut self) -> Result<(), SaxError> { self.advance()?; let name = self.read_until(">")?; let namespace = self.pop_namespace(); self.handler.end_element(namespace, name.as_str(), ""); self.expect(">", "Expect end of element")?; self.skip_whitespace()?; Ok(()) } fn pop_namespace(&mut self) -> Option { let namespace = if !self.namespace_stack.is_empty() { let (name, count) = self.namespace_stack.pop().unwrap(); if count > 0 { self.namespace_stack.push((name.to_string(), count - 1)); Some(name) } else { None } } else { None }; namespace } fn read_until(&mut self, until: &str) -> Result { let start = self.position; let mut c = self.current; let until = until.chars().collect::>(); while !until.contains(&c) { if self.position > self.xml.len() { return Err(SaxError::UnexpectedEof); } c = self.advance()?; } Ok(self.xml[start - 1..self.position - 1] .iter() .collect::()) } fn skip_whitespace(&mut self) -> Result<(), SaxError> { let mut c = self.current; while (c.is_whitespace()) && self.position < self.xml.len() { c = self.advance()?; } Ok(()) } fn advance(&mut self) -> Result { if self.position > self.xml.len() { return Err(SaxError::UnexpectedEof); } self.position += 1; self.current = if self.position <= self.xml.len() { self.xml[self.position - 1] } else { '\0' }; // print!("{}", self.current); if self.current == '\n' { self.current_line += 1; } Ok(self.current) } fn expect(&mut self, expected: &str, message: &str) -> Result<(), SaxError> { for c in expected.chars() { if !self.expect_char(c)? { return Err(SaxError::UnexpectedCharacter(message.to_string())); } } Ok(()) } fn expect_char(&mut self, expected: char) -> Result { if self.position > self.xml.len() { return Ok(false); } let same = self.current == expected; if same { self.advance()?; } Ok(same) } }