//! This file implements the function to read a grammar from a string, //! in the [augmented Backus Naur //! form](https://en.wikipedia.org/wiki/Augmented_Backus–Naur_form). //! See [RFC5234](https://www.rfc-editor.org/rfc/rfc5234) for its //! specifications. //! //! In fact, the rules of ABNF are considered to be too strict. For //! example, only ASCII characters are allowed in the format, even //! within comments. I think any character should be allowed within //! comments, so the implementation here does that. #![allow(dead_code)] use super::*; /// The type of errors encountered during parsing a grammar in the /// ABNF format. #[derive(Debug)] #[non_exhaustive] pub enum Error { /// A rule name without rule body. HangingRuleName(usize), /// The minimum is greater than maximum? MinGreaterThanMax(usize), /// A digit is expected, but is not encountered. InvalidDigit(usize), /// A character that is not allowed. InvalidCharacter(usize), /// A numeric value is not completed. HangingNumeric(usize), /// A repetition is hanging. HangingRepetition(usize), /// A double quote is hanging. HangingDQuote(usize), /// A percentage sign is hanging. HangingPercent(usize), /// An equal sign is hanging. HangingEqualsSign(usize), /// An operation is unsupported. UnsupportedOperation, /// Encounters an unknown node when constructing the regular /// expression. RegexUnknownNode(usize), /// An index is out of bounds when constructing the regular /// expression. RegexIndexOutOfBounds(usize, usize), /// A node is duplicated when constructing the regular expression. RegexDuplicatedNode(usize), /// An edge is duplicated when constructing the regular /// expression. RegexDuplicatedEdge(usize, usize), /// The constructed regular expression graph has no root. RegexNoRoot, /// The constructed regular expression graph has cycles. RegexCycle, /// The stack illegally becomes empty when constructing a regular /// expression. RegexEmptyStack(usize), /// The action stack illegally becomes empty when constructing a /// regular expression. RegexEmptyActionStack(usize), /// An unbalanced parenthesis is encountered when constructing a /// regular expression. RegexUnbalancedParen(usize), /// An invalid repetition specification is encountered when /// constructing a regular expression. RegexInvalidRepeat(usize), /// An invalid comment character is encountered. RegexInvalidComment(usize), /// An invalid numeric value is encoutnered. RegexInvalidNumeric(usize), /// A rule should have an equal sign. MissingEqualSign(usize), /// An unimplemented feature is required. Unimplemented(usize), /// Input-Output error. IOError(std::io::Error), /// The grammar is empty. EmptyGrammar, /// Found an appending rule associated to a non-existent rule. AppendToNothing(usize), /// Found a rule declaration associated to an existing rule. CreateAgain(usize), /// This terminal is not found. UnknownTerminal(String), /// This non-terminal is not found. UnknownNonterminal(String), /// Something invalid has occurred. Invalid, } impl Error { fn translate(self, offset: usize) -> Self { match self { Error::HangingRuleName(offseta) => Error::HangingRuleName(offseta + offset), Error::MinGreaterThanMax(offseta) => Error::MinGreaterThanMax(offseta + offset), Error::InvalidDigit(offseta) => Error::InvalidDigit(offseta + offset), Error::InvalidCharacter(offseta) => Error::InvalidCharacter(offseta + offset), Error::HangingNumeric(offseta) => Error::HangingNumeric(offseta + offset), Error::HangingRepetition(offseta) => Error::HangingRepetition(offseta + offset), Error::HangingDQuote(offseta) => Error::HangingDQuote(offseta + offset), Error::HangingPercent(offseta) => Error::HangingPercent(offseta + offset), Error::HangingEqualsSign(offseta) => Error::HangingEqualsSign(offseta + offset), Error::RegexEmptyStack(offseta) => Error::RegexEmptyStack(offseta + offset), Error::RegexEmptyActionStack(offseta) => Error::RegexEmptyActionStack(offseta + offset), Error::RegexUnbalancedParen(offseta) => Error::RegexUnbalancedParen(offseta + offset), Error::RegexInvalidRepeat(offseta) => Error::RegexInvalidRepeat(offseta + offset), Error::RegexInvalidComment(offseta) => Error::RegexInvalidComment(offseta + offset), Error::RegexInvalidNumeric(offseta) => Error::RegexInvalidNumeric(offseta + offset), Error::MissingEqualSign(offseta) => Error::MissingEqualSign(offseta + offset), Error::Unimplemented(offseta) => Error::Unimplemented(offseta + offset), Error::AppendToNothing(offseta) => Error::AppendToNothing(offseta + offset), Error::CreateAgain(offseta) => Error::CreateAgain(offseta + offset), _ => self, } } } impl std::error::Error for Error {} impl From for Error { fn from(e: std::io::Error) -> Self { Self::IOError(e) } } impl From for Error { fn from(value: graph::error::Error) -> Self { match value { graph::error::Error::IndexOutOfBounds(index, bound) => { Error::RegexIndexOutOfBounds(index, bound) } graph::error::Error::DuplicatedNode(n) => Error::RegexDuplicatedNode(n), graph::error::Error::DuplicatedEdge(from, to) => Error::RegexDuplicatedEdge(from, to), _ => Error::Invalid, } } } impl std::fmt::Display for Error { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Error::HangingRuleName(pos) => write!(f, "a rule name without body at {pos}"), Error::MinGreaterThanMax(pos) => write!( f, "minimum should not be strictly greater than the maximum at {pos}." ), Error::InvalidDigit(pos) => { write!(f, "a digit is expected at {pos}, but is not found.") } Error::InvalidCharacter(pos) => write!(f, "the character at {pos} is not allowed"), Error::HangingNumeric(pos) => write!(f, "a numeric value is hanging at {pos}"), Error::HangingRepetition(pos) => write!(f, "a repetition is hanging at {pos}"), Error::HangingDQuote(pos) => write!(f, "a double quote is hanging at {pos}"), Error::HangingPercent(pos) => write!(f, "a percentage sign is hanging at {pos}"), Error::HangingEqualsSign(pos) => write!(f, "an equal sign is hanging at {pos}"), Error::UnsupportedOperation => write!(f, "an unsupported operation is encountered"), Error::RegexUnknownNode(n) => write!(f, "an unknown node {n} is encountered \ when constructing the regular expression."), Error::RegexIndexOutOfBounds(index, bound) => write!(f, "an index {index} is out of \ bound {bound} when constructing the regular expression."), Error::RegexDuplicatedNode(n) => write!(f, "a node {n} is duplicated \ when constructing the regular expression."), Error::RegexDuplicatedEdge(from, to) => write!(f, "an edge from {from} to {to} is duplicated \ when constructing the regular expression."), Error::RegexNoRoot => write!(f, "the constructed regular expression has no root."), Error::RegexCycle => write!(f, "the constructed regular expression has cycles."), Error::RegexEmptyStack(pos) => write!(f, "the stack illegally becomes empty at {pos} \ when constructing a regular expression."), Error::RegexEmptyActionStack(pos) => write!(f, "the action stack illegally becomes empty at {pos} \ when constructing a regular expression."), Error::RegexUnbalancedParen(pos) => write!(f, "An unbalanced parenthesis is encountered at {pos} \ when constructing a regular expression."), Error::RegexInvalidRepeat(pos) => write!(f, "An invalid repetition specification is encountered \ at {pos} when constructing a regular expression."), Error::RegexInvalidComment(pos) => write!(f, "An invalid character is encountered in a comment \ at {pos}"), Error::RegexInvalidNumeric(pos) => write!(f, "An invalid numeric value is encoutnered at {pos}."), Error::MissingEqualSign(pos) => write!(f, "A rule is missing an equal sign at {pos}"), Error::Unimplemented(pos) => write!(f, "An unimplemented feature is required at {pos}."), Error::IOError(e) => write!(f, "An IO error: {e}"), Error::EmptyGrammar => write!(f, "the grammar is empty"), Error::AppendToNothing(pos) => write!(f, "the rule at {pos} is appending to a non-existing rule."), Error::CreateAgain(pos) => write!(f, "the rule at {pos} is creating a duplicated rule."), Error::UnknownTerminal(name) => write!(f, "the terminal {name} is not found."), Error::UnknownNonterminal(name) => write!(f, "the non-terminal {name} is not found."), Error::Invalid => write!(f, "invalid"), } } } /// Return the number of bytes taken by the characters for which the /// function `f` returns `true`. fn skip_by(s: &str, f: impl Fn(char) -> bool) -> usize { for (index, c) in s.char_indices() { if !f(c) { return index; } } s.len() } mod boolean_fns; pub(crate) use boolean_fns::*; fn skip_c_nl(s: &str) -> Option { if s.is_empty() { return Some(0); } let s_len = s.len(); match s.chars().next().unwrap() { ';' => { let after_not_newline = skip_by(&s[1..], is_not_newline); if 1 + after_not_newline >= s_len { Some(s_len) } else { let s = &s[1 + after_not_newline..]; match s.chars().next().unwrap() { '\n' | '\r' => Some(after_not_newline + 2), _ => { // We skipped anything not newline, so this // character must be newline. unreachable!(); } } } } '\n' | '\r' => Some(1), _ => None, } } fn skip_c_wsp(s: &str) -> usize { let mut currently_skipped = 0usize; let mut s = s; let mut continue_skipping = true; 'skipping_loop: while continue_skipping { continue_skipping = false; let skip_wsp = skip_by(s, is_wsp); if skip_wsp > 0 { if skip_wsp >= s.len() { return currently_skipped + s.len(); } continue_skipping = true; s = &s[skip_wsp..]; currently_skipped += skip_wsp; continue 'skipping_loop; } if let Some(skip_c_nl_size) = skip_c_nl(s) { if skip_c_nl_size >= s.len() { return currently_skipped + s.len(); } continue_skipping = true; s = &s[skip_c_nl_size..]; currently_skipped += skip_c_nl_size; let skip_wsp_after_c_nl = skip_by(s, is_wsp); s = &s[skip_wsp_after_c_nl..]; currently_skipped += skip_wsp_after_c_nl; } } currently_skipped } #[derive(Debug, Eq, PartialEq)] enum RepetitionSpec { Star, Equal(usize), Min(usize), Max(usize), MinMax(usize, usize), } #[derive(Debug)] enum RepetitionAction { /// Ignore the following element. Ignore, /// The following is put in a star. Star, /// The following is put in an optional construct. Optional, /// The following is put in a plus construct. Plus, /// Copy the following some number of times. Copy(usize), /// Copy the following some number of times, and then add a plus /// of that same element. CopyPlus(usize), /// Copy the following some number of times, and then add another /// number of *nested* optional construts of that same element. /// /// # Why nested? /// /// The optional constructs are nested here as a sequence of /// optional constructs is exponentially more ambiguous than the /// nested equivalent: "a?a?a?" is more ambiguous than /// "(a(a(a)?)?)?". For example, the input "a" can have three /// derivations according to "a?a?a?", but has only one derivation /// according to "(a(a(a)?)?)?". CopyOptional(usize, usize), } impl TryFrom> for RepetitionAction { type Error = Error; fn try_from(spec: Option) -> Result { if let Some(r) = spec { match r { RepetitionSpec::Star => Ok(Self::Star), RepetitionSpec::Equal(0) => Ok(Self::Ignore), RepetitionSpec::Equal(n) => Ok(Self::Copy(n)), RepetitionSpec::Min(0) => Ok(Self::Star), RepetitionSpec::Min(1) => Ok(Self::Plus), RepetitionSpec::Min(n) => Ok(Self::CopyPlus(n - 1)), RepetitionSpec::Max(0) => Ok(Self::Ignore), RepetitionSpec::Max(1) => Ok(Self::Optional), RepetitionSpec::Max(n) => Ok(Self::CopyOptional(0, n)), RepetitionSpec::MinMax(0, 0) => Ok(Self::Ignore), RepetitionSpec::MinMax(0, 1) => Ok(Self::Optional), RepetitionSpec::MinMax(n, m) if n > m => Err(Error::MinGreaterThanMax(0)), RepetitionSpec::MinMax(n, m) if n == m => Ok(Self::Copy(n)), RepetitionSpec::MinMax(n, m) => Ok(Self::CopyOptional(n, m - n)), } } else { Ok(Self::Copy(1)) } } } fn parse_repetition_spec(s: &str) -> Option<(usize, RepetitionSpec)> { let mut s = s; let digit_size = skip_by(s, is_decimal_digit); let mut min: Option = None; let mut max: Option = None; if digit_size != 0 { min = Some(s[..digit_size].parse::().unwrap()); } let mut current_index = digit_size; if digit_size == s.len() { if let Some(m) = min { return Some((digit_size, RepetitionSpec::Equal(m))); } else { return None; }; } s = &s[current_index..]; if s.starts_with('*') { if s.len() == 1 { if let Some(m) = min { return Some((current_index + 1, RepetitionSpec::Min(m))); } else { return Some((current_index + 1, RepetitionSpec::Star)); } } s = &s[1..]; current_index += 1; let digit_size = skip_by(s, is_decimal_digit); if digit_size != 0 { max = Some(s[..digit_size].parse::().unwrap()); } current_index += digit_size; match (min, max) { (Some(mi), Some(ma)) => Some((current_index, RepetitionSpec::MinMax(mi, ma))), (Some(m), None) => Some((current_index, RepetitionSpec::Min(m))), (None, Some(m)) => Some((current_index, RepetitionSpec::Max(m))), _ => Some((current_index, RepetitionSpec::Star)), } } else { min.map(|m| (current_index, RepetitionSpec::Equal(m))) } } fn parse_rulename(s: &str) -> Result { let first_char = s.chars().next().unwrap(); if !is_alpha(first_char) { return Ok(0); } let rulename_end = skip_by(s, is_alpha_or_digit_or_hyphen); if rulename_end >= s.len() { // dbg!(); return Err(Error::HangingRuleName(0)); } Ok(rulename_end) } #[derive(Debug, Eq, PartialEq, Hash)] enum IntermediateConstruct { Terminal(String), Nonterminal(String), Range(char, char), } impl Display for IntermediateConstruct { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { IntermediateConstruct::Terminal(t) => write!(f, "t{t}"), IntermediateConstruct::Nonterminal(n) => write!(f, "n{n}"), IntermediateConstruct::Range(min, max) => write!(f, "[{min}-{max}]"), } } } #[derive(Debug, Eq, PartialEq, Copy, Clone)] enum Radix { Binary, Decimal, Hexadecimal, } fn parse_numeric(s: &str, radix: Radix) -> Result<(usize, IntermediateConstruct), Error> { // first read a number let is_radix_digit = match radix { Radix::Binary => is_binary_digit, Radix::Decimal => is_decimal_digit, Radix::Hexadecimal => is_hexadecimal_digit, }; let radix = match radix { Radix::Binary => 2, Radix::Decimal => 10, Radix::Hexadecimal => 16, }; let mut s = s; let mut current_index = 0; let first_digit_size = skip_by(s, is_radix_digit); if first_digit_size == 0 { // dbg!(); return Err(Error::InvalidDigit(current_index)); } let first_number = match std::char::from_u32(u32::from_str_radix(&s[..first_digit_size], radix).unwrap()) { Some(c) => c, None => { // dbg!(); return Err(Error::InvalidCharacter(current_index)); } }; if first_digit_size >= s.len() { return Ok(( current_index + s.len(), IntermediateConstruct::Terminal(first_number.to_string()), )); } s = &s[first_digit_size..]; current_index += first_digit_size; // then read a dot or a dash match s.chars().next().unwrap() { '.' => { s = &s[1..]; current_index += 1; let mut stop_reading = false; let mut chars_vec = vec![first_number]; while !stop_reading { if s.is_empty() { return Err(Error::HangingNumeric(current_index - 1)); } let digit_size = skip_by(s, is_radix_digit); if digit_size == 0 { // dbg!(); return Err(Error::InvalidDigit(current_index)); } let a_number = std::char::from_u32(u32::from_str_radix(&s[..digit_size], radix).unwrap()) .ok_or(Error::InvalidCharacter(current_index))?; chars_vec.push(a_number); s = &s[digit_size..]; current_index += digit_size; if !s.starts_with('.') { stop_reading = true; // println!( // "digit_size = {digit_size} a number = {a_number}, s = {}", // &s[..10] // ); } else { s = &s[1..]; current_index += 1; } } Ok(( current_index, IntermediateConstruct::Terminal(chars_vec.into_iter().collect()), )) } '-' => { s = &s[1..]; current_index += 1; if s.is_empty() { return Err(Error::HangingNumeric(current_index - 1)); } let second_digit_size = skip_by(s, is_radix_digit); if second_digit_size == 0 { println!( "s = {}", s.chars() .take(if s.len() >= 50 { 50 } else { s.len() }) .collect::() ); return Err(Error::InvalidDigit(current_index)); } let _second_number = std::char::from_u32(u32::from_str_radix(&s[..second_digit_size], radix).unwrap()) .ok_or(Error::InvalidCharacter(current_index))?; Err(Error::Unimplemented(current_index - 1)) // Ok(( // current_index + second_digit_size, // IntermediateConstruct::Range(first_number, second_number), // )) } c if is_wsp(c) || c == '\n' || c == '\r' || c == ';' => Ok(( current_index, IntermediateConstruct::Terminal(first_number.to_string()), )), _ => Err(Error::InvalidCharacter(current_index)), } } #[derive(Debug)] struct AlternationConstruct { intermediates: Vec, regex: DefaultRegex, } fn parse_alternation(s: &str, indentation: usize) -> Result<(usize, AlternationConstruct), Error> { let mut stack: Vec = vec![0, 1]; let mut intermediates: Vec = Vec::new(); let mut types: Vec> = vec![RegexType::Or, RegexType::Empty]; let mut edges: Vec> = vec![vec![1], Vec::new()]; fn error_conversion(error: nfa::error::Error) -> Error { match error { nfa::error::Error::UnknownNode(n) => Error::RegexUnknownNode(n), nfa::error::Error::UnsupportedOperation => Error::UnsupportedOperation, nfa::error::Error::NoRoot => Error::RegexNoRoot, nfa::error::Error::Graph(ge) => ge.into(), nfa::error::Error::Cycle => Error::RegexCycle, _ => Error::Invalid, } } let mut s = s; let mut current_index = 0usize; let mut nonterminal_nodes: HashSet = HashSet::new(); if s.is_empty() { let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?; return Ok(( current_index, AlternationConstruct { intermediates, regex, }, )); } // A vector of tuples to record some information for each level of // nesting. // // A tuple (len, action) means that the stack grows by LEN many // tokens at this level, and the action ACTION is pending. let mut repetition_action_stack: Vec<(usize, Option)> = vec![]; while !stack.is_empty() { // First read repetition specifications. let mut repetition_spec = None; if let Some((spec_size, spec)) = parse_repetition_spec(s) { if spec_size >= s.len() { return Err(Error::HangingRepetition(current_index)); } s = &s[spec_size..]; current_index += spec_size; repetition_spec = Some(spec); } let repetition_spec = repetition_spec; // if a rulename match parse_rulename(s) { Ok(0) => {} Ok(size) => { let name = &s[..size].to_string(); intermediates.push(IntermediateConstruct::Nonterminal(name.clone())); let nt_index = intermediates.len() - 1; let action = repetition_spec .try_into() .map_err(|e: Error| e.translate(current_index))?; // println!("rep_action = {action:?}"); let stack_last = stack .iter() .copied() .last() .ok_or(Error::RegexEmptyStack(current_index))?; match action { RepetitionAction::Ignore => {} RepetitionAction::Star => { types.push(RegexType::Kleene); types.push(RegexType::Lit(nt_index)); edges.push(vec![types.len() - 1]); edges.push(Vec::new()); nonterminal_nodes.insert(types.len() - 1); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 2); } RepetitionAction::Optional => { types.append(&mut vec![RegexType::Optional]); types.push(RegexType::Lit(nt_index)); edges.push(vec![types.len() - 1]); edges.push(vec![]); nonterminal_nodes.insert(types.len() - 1); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 2); } RepetitionAction::Plus => { types.append(&mut vec![RegexType::Plus]); types.push(RegexType::Lit(nt_index)); edges.push(vec![types.len() - 1]); edges.push(vec![]); nonterminal_nodes.insert(types.len() - 1); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 2); } RepetitionAction::Copy(n) => { let old_types_len = types.len(); nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .extend((0..n).map(|i| old_types_len + i)); types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); edges.extend(std::iter::repeat_with(Default::default).take(n)); // for _ in 0..n { // types.push(RegexType::Lit(nt_index)); // edges.push(vec![]); // nonterminal_nodes.insert(types.len() - 1); // edges[stack_last].push(types.len() - 1); // } } RepetitionAction::CopyPlus(n) => { let old_types_len = types.len(); nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .extend((0..n).map(|i| old_types_len + i)); types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); edges.extend(std::iter::repeat_with(Default::default).take(n)); // for _ in 0..n { // types.push(RegexType::Lit(nt_index)); // edges.push(vec![]); // edges[stack_last].push(types.len() - 1); // } types.append(&mut vec![RegexType::Plus]); types.push(RegexType::Lit(nt_index)); nonterminal_nodes.insert(types.len() - 1); edges.push(vec![types.len() - 1]); edges.push(vec![]); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 2); } RepetitionAction::CopyOptional(n, m) => { let old_types_len = types.len(); nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .extend((0..n).map(|i| old_types_len + i)); types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); edges.extend(std::iter::repeat_with(Default::default).take(n)); // for _ in 0..n { // types.push(RegexType::Lit(nt_index)); // edges.push(vec![]); // nonterminal_nodes.insert(types.len() - 1); // edges[stack_last].push(types.len() - 1); // } let types_len = types.len(); types.extend( std::iter::repeat([RegexType::Optional, RegexType::Lit(nt_index)]) .take(m) .flatten(), ); edges.extend((0..(m - 1)).flat_map(|i| { [ vec![types_len + 1 + 2 * i, types_len + 2 + 2 * i], Vec::new(), ] })); edges.extend([vec![types.len() - 1], Vec::new()]); nonterminal_nodes.extend((0..m).map(|i| types_len + 2 * i + 1)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types_len); // for _ in 0..m { // types.extend([RegexType::Optional, RegexType::Lit(nt_index)]); // edges.extend([vec![types.len() - 1], Vec::new()]); // nonterminal_nodes.insert(types.len() - 1); // edges[stack_last].push(types.len() - 2); // } } } s = &s[size..]; current_index += size; // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } Err(_) => { intermediates.push(IntermediateConstruct::Nonterminal(s.to_owned())); types.push(RegexType::Lit(intermediates.len() - 1)); edges.push(vec![]); let stack_last = stack .last() .copied() .ok_or(Error::RegexEmptyStack(current_index))?; let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 1); nonterminal_nodes.insert(types.len() - 1); break; } } // match against the first charcter let first_char = s.chars().next().unwrap(); match first_char { '(' => { // group s = &s[1..]; current_index += 1; let action = repetition_spec .try_into() .map_err(|e: Error| e.translate(current_index))?; let stack_last = stack .iter() .copied() .last() .ok_or(Error::RegexEmptyStack(current_index))?; match action { RepetitionAction::Star => { types.extend([RegexType::Kleene, RegexType::Or, RegexType::Empty]); edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 3); repetition_action_stack.push((3, None)); stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); } RepetitionAction::Optional => { types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 3); stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); repetition_action_stack.push((3, None)); } RepetitionAction::Plus => { types.extend([RegexType::Plus, RegexType::Or, RegexType::Empty]); edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 3); stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); repetition_action_stack.push((3, None)); } RepetitionAction::Copy(1) => { types.extend([RegexType::Or, RegexType::Empty]); edges.extend([vec![types.len() - 1], Vec::new()]); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 2); stack.extend([types.len() - 2, types.len() - 1]); repetition_action_stack.push((2, None)); } RepetitionAction::CopyOptional(0, _) => { types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 3); stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); repetition_action_stack.push((3, Some(action))); } _ => { types.extend([RegexType::Or, RegexType::Empty]); edges.extend([vec![types.len() - 1], Vec::new()]); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 2); stack.extend([types.len() - 2, types.len() - 1]); repetition_action_stack.push((2, Some(action))); } } s = &s[1..]; current_index += 1; // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { return Err(Error::RegexUnbalancedParen(current_index)); } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } ')' => { // end group s = &s[1..]; current_index += 1; if repetition_spec.is_some() { return Err(Error::RegexInvalidRepeat(current_index)); } let (stack_growth, repetition_action) = repetition_action_stack .pop() .ok_or(Error::RegexEmptyActionStack(current_index))?; match repetition_action { Some(action) => { if stack.len() <= stack_growth + 1 { dbg!(); return Err(Error::RegexEmptyStack(current_index)); } stack.truncate(stack.len() - stack_growth + 1); // this is not going to fail because of the previous check let stack_last = stack.pop().unwrap(); let stack_second_last = stack.iter().copied().last().unwrap(); match action { RepetitionAction::Ignore => { edges[stack_second_last].pop(); types.truncate(stack_last); edges.truncate(stack_last); // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => { let nodes_cloned = types.iter().skip(stack_last).cloned().collect::>(); let edges_cloned = edges.iter().skip(stack_last).cloned().collect::>(); for _ in 0..(n - 1) { edges[stack_second_last].push(types.len()); // if stack_second_last == 3 && nodes.len() == 3 { // println!("haha"); // } types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); } // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } RepetitionAction::CopyPlus(n) => { let nodes_cloned = types.iter().skip(stack_last).cloned().collect::>(); let edges_cloned = edges.iter().skip(stack_last).cloned().collect::>(); for _ in 0..(n - 1) { edges[stack_second_last].push(types.len()); // if stack_second_last == 3 && nodes.len() == 3 { // println!("haha"); // } types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); } types.push(RegexType::Plus); edges.push(vec![types.len()]); edges[stack_second_last].push(types.len() - 1); // if stack_second_last == 3 && nodes.len() == 4 { // println!("haha"); // } types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } RepetitionAction::CopyOptional(n, m) => { let nodes_cloned = types.iter().skip(stack_last).cloned().collect::>(); let edges_cloned = edges.iter().skip(stack_last).cloned().collect::>(); for _ in 0..(n - 1) { edges[stack_second_last].push(types.len()); // if stack_second_last == 3 && nodes.len() == 3 { // println!("haha"); // } types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); } edges[stack_second_last].push(types.len()); let nodes_cloned_len = nodes_cloned.len(); for _ in 0..(m - 1) { // if stack_second_last == 3 && nodes.len() == 3 { // println!("haha"); // } types.push(RegexType::Optional); edges.push(vec![types.len(), types.len() + nodes_cloned_len]); types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); } types.push(RegexType::Optional); edges.push(vec![types.len()]); types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } RepetitionAction::Star | RepetitionAction::Optional | RepetitionAction::Plus => { // this will not happen here // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } } } None => { if stack.len() <= stack_growth { dbg!(); return Err(Error::RegexEmptyStack(current_index)); } stack.truncate(stack.len() - stack_growth); // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } } } '[' => { // option let action = repetition_spec .try_into() .map_err(|e: Error| e.translate(current_index))?; let stack_last = stack .iter() .copied() .last() .ok_or(Error::RegexEmptyStack(current_index))?; match action { RepetitionAction::Ignore => { types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); edges[stack_last].push(types.len() - 3); stack.push(types.len() - 3); stack.push(types.len() - 2); stack.push(types.len() - 1); repetition_action_stack.push((3, Some(action))); } RepetitionAction::Optional => { types.extend([ RegexType::Optional, RegexType::Optional, RegexType::Or, RegexType::Empty, ]); edges.extend([ vec![types.len() - 3], vec![types.len() - 2], vec![types.len() - 1], Vec::new(), ]); edges[stack_last].push(types.len() - 4); stack.push(types.len() - 4); stack.push(types.len() - 3); stack.push(types.len() - 2); stack.push(types.len() - 1); repetition_action_stack.push((4, None)); } RepetitionAction::Star => { types.extend([ RegexType::Kleene, RegexType::Optional, RegexType::Or, RegexType::Empty, ]); edges.extend([ vec![types.len() - 3], vec![types.len() - 2], vec![types.len() - 1], Vec::new(), ]); edges[stack_last].push(types.len() - 4); stack.push(types.len() - 4); stack.push(types.len() - 3); stack.push(types.len() - 2); stack.push(types.len() - 1); repetition_action_stack.push((4, None)); } RepetitionAction::Plus => { types.extend([ RegexType::Kleene, RegexType::Optional, RegexType::Or, RegexType::Empty, ]); edges.extend([ vec![types.len() - 3], vec![types.len() - 2], vec![types.len() - 1], Vec::new(), ]); edges[stack_last].push(types.len() - 4); stack.push(types.len() - 4); stack.push(types.len() - 3); stack.push(types.len() - 2); stack.push(types.len() - 1); repetition_action_stack.push((4, None)); } RepetitionAction::Copy(1) => { types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); edges[stack_last].push(types.len() - 3); stack.push(types.len() - 3); stack.push(types.len() - 2); stack.push(types.len() - 1); repetition_action_stack.push((3, None)); } _ => { types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); edges[stack_last].push(types.len() - 3); stack.push(types.len() - 3); stack.push(types.len() - 2); stack.push(types.len() - 1); repetition_action_stack.push((3, Some(action))); } } s = &s[1..]; current_index += 1; // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { return Err(Error::RegexUnbalancedParen(current_index)); } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } ']' => { // end option s = &s[1..]; current_index += 1; if repetition_spec.is_some() { return Err(Error::RegexInvalidRepeat(current_index)); } let (stack_growth, repetition_action) = repetition_action_stack .pop() .ok_or(Error::RegexEmptyActionStack(current_index))?; match repetition_action { Some(action) => { if stack.len() <= stack_growth + 1 { return Err(Error::RegexEmptyStack(current_index)); } stack.truncate(stack.len() - stack_growth + 1); // this is not going to fail because of the previous check let stack_last = stack.pop().unwrap(); let stack_second_last = stack.iter().copied().last().unwrap(); match action { RepetitionAction::Ignore => { edges[stack_second_last].pop(); types.truncate(stack_last); edges.truncate(stack_last); // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => { let nodes_cloned = types.iter().skip(stack_last).cloned().collect::>(); let edges_cloned = edges.iter().skip(stack_last).cloned().collect::>(); for _ in 0..(n - 1) { edges[stack_second_last].push(types.len()); // if stack_second_last == 3 && nodes.len() == 3 { // println!("haha"); // } types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); } // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } RepetitionAction::CopyPlus(n) => { let nodes_cloned = types.iter().skip(stack_last).cloned().collect::>(); let edges_cloned = edges.iter().skip(stack_last).cloned().collect::>(); for _ in 0..(n - 1) { edges[stack_second_last].push(types.len()); // if stack_second_last == 3 && nodes.len() == 3 { // println!("haha"); // } types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); } types.push(RegexType::Plus); edges.push(vec![types.len()]); edges[stack_second_last].push(types.len() - 1); // if stack_second_last == 3 && nodes.len() == 4 { // println!("haha"); // } types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } RepetitionAction::CopyOptional(n, m) => { let nodes_cloned = types.iter().skip(stack_last).cloned().collect::>(); let edges_cloned = edges.iter().skip(stack_last).cloned().collect::>(); for _ in 0..(n - 1) { edges[stack_second_last].push(types.len()); // if stack_second_last == 3 && nodes.len() == 3 { // println!("haha"); // } types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); } let nodes_cloned_len = nodes_cloned.len(); for _ in 0..(m - 1) { // if stack_second_last == 3 && nodes.len() == 3 { // println!("haha"); // } types.push(RegexType::Optional); edges.push(vec![types.len(), types.len() + nodes_cloned_len]); types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); } types.push(RegexType::Optional); edges.push(vec![types.len()]); types.extend(nodes_cloned.iter().copied()); let edges_offset = edges.len() - stack_last; edges.extend(edges_cloned.iter().map(|edges| { edges .iter() .map(|des| des + edges_offset) .collect::>() })); // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } RepetitionAction::Star | RepetitionAction::Optional | RepetitionAction::Plus => { // this will not happen here // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } } } None => { if stack.len() <= stack_growth { return Err(Error::RegexEmptyStack(current_index)); } stack.truncate(stack.len() - stack_growth); // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } } } '/' => { // end concatenation s = &s[1..]; current_index += 1; if repetition_spec.is_some() { return Err(Error::RegexInvalidRepeat(current_index)); } if stack.len() <= 1 { return Err(Error::RegexEmptyStack(current_index)); } stack.pop().unwrap(); let stack_last = stack.iter().copied().last().unwrap(); edges[stack_last].push(types.len()); types.push(RegexType::Empty); edges.push(vec![]); stack.push(types.len() - 1); // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } '"' => { // string s = &s[1..]; current_index += 1; let skip_char_value = skip_by(s, is_char_value); if skip_char_value >= s.len() { return Err(Error::HangingDQuote(current_index)); } let fakes = &s[skip_char_value..]; if !fakes.starts_with('"') { return Err(Error::HangingDQuote(current_index)); } let action = repetition_spec .try_into() .map_err(|e: Error| e.translate(current_index))?; let stack_last = stack .iter() .copied() .last() .ok_or(Error::RegexEmptyStack(current_index))?; let slice_value = s[..skip_char_value].to_owned(); intermediates.push(IntermediateConstruct::Terminal(slice_value.clone())); let t_index = intermediates.len() - 1; match action { RepetitionAction::Ignore => {} RepetitionAction::Star => { types.push(RegexType::Kleene); types.push(RegexType::Lit(t_index)); edges.push(vec![types.len() - 1]); edges.push(vec![]); edges[stack_last].push(types.len() - 2); } RepetitionAction::Optional => { types.append(&mut vec![RegexType::Optional]); types.push(RegexType::Lit(t_index)); edges.push(vec![types.len() - 1]); edges.push(vec![]); edges[stack_last].push(types.len() - 2); } RepetitionAction::Plus => { types.append(&mut vec![RegexType::Plus]); types.push(RegexType::Lit(t_index)); edges.push(vec![types.len() - 1]); edges.push(vec![]); edges[stack_last].push(types.len() - 2); } RepetitionAction::Copy(n) => { let types_len = types.len(); types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); edges.extend(std::iter::repeat_with(Default::default).take(n)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .extend((0..n).map(|i| types_len + i)); // for _ in 0..n { // types.push(RegexType::Lit(t_index)); // edges.push(vec![]); // edges[stack_last].push(types.len() - 1); // } } RepetitionAction::CopyPlus(n) => { let types_len = types.len(); types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); edges.extend(std::iter::repeat_with(Default::default).take(n)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .extend((0..n).map(|i| types_len + i)); types.extend([RegexType::Plus, RegexType::Lit(t_index)]); edges.extend([vec![types.len() - 1], Vec::new()]); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 2); } RepetitionAction::CopyOptional(n, m) => { let types_len = types.len(); types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); edges.extend(std::iter::repeat_with(Default::default).take(n)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .extend((0..n).map(|i| types_len + i)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len()); for _ in 0..(m - 1) { types.extend([RegexType::Optional, RegexType::Lit(t_index)]); edges.extend([vec![types.len() - 1, types.len()], Vec::new()]); } types.extend([RegexType::Optional, RegexType::Lit(t_index)]); edges.extend([vec![types.len() - 1], Vec::new()]); } } s = &fakes[1..]; current_index += skip_char_value + 1; // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } '%' => { // numeric value s = &s[1..]; current_index += 1; if s.is_empty() { return Err(Error::HangingPercent(current_index - 1)); } let inner_first_char = s.chars().next().unwrap(); let intermediate_value: IntermediateConstruct; match inner_first_char { 'b' => { s = &s[1..]; current_index += 1; let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Binary) .map_err(|e| e.translate(current_index))?; intermediate_value = parsed_construct; s = &s[parsed_size..]; current_index += parsed_size; } 'd' => { s = &s[1..]; current_index += 1; let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Decimal) .map_err(|e| e.translate(current_index))?; intermediate_value = parsed_construct; s = &s[parsed_size..]; current_index += parsed_size; } 'x' => { s = &s[1..]; current_index += 1; let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Hexadecimal) .map_err(|e| e.translate(current_index))?; // println!("size = {parsed_size}, construct = {parsed_construct:#?}"); intermediate_value = parsed_construct; s = &s[parsed_size..]; current_index += parsed_size; } _ => { return Err(Error::RegexInvalidNumeric(current_index - 1)); } } let action = repetition_spec .try_into() .map_err(|e: Error| e.translate(current_index))?; let stack_last = stack .iter() .copied() .last() .ok_or(Error::RegexEmptyStack(current_index))?; intermediates.push(intermediate_value); let inter_index = intermediates.len() - 1; match action { RepetitionAction::Ignore => {} RepetitionAction::Star => { types.push(RegexType::Kleene); edges.push(vec![types.len()]); edges[stack_last].push(types.len() - 1); types.push(RegexType::Lit(inter_index)); edges.push(vec![]); } RepetitionAction::Optional => { types.push(RegexType::Optional); edges.push(vec![types.len()]); edges[stack_last].push(types.len() - 1); types.push(RegexType::Lit(inter_index)); edges.push(vec![]); } RepetitionAction::Plus => { types.push(RegexType::Plus); edges.push(vec![types.len()]); edges[stack_last].push(types.len() - 1); types.push(RegexType::Lit(inter_index)); edges.push(vec![]); } RepetitionAction::Copy(n) => { types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); edges.extend(std::iter::repeat_with(Default::default).take(n)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 1); // for _ in 0..n { // types.push(RegexType::Lit(inter_index)); // edges.push(vec![]); // edges[stack_last].push(types.len() - 1); // } } RepetitionAction::CopyPlus(n) => { types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); edges.extend(std::iter::repeat_with(Default::default).take(n)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 1); types.push(RegexType::Plus); edges.push(vec![types.len()]); edges[stack_last].push(types.len() - 1); types.push(RegexType::Lit(inter_index)); edges.push(vec![]); } RepetitionAction::CopyOptional(n, m) => { types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); edges.extend(std::iter::repeat_with(Default::default).take(n)); let edges_len = edges.len(); edges .get_mut(stack_last) .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? .push(types.len() - 1); for _ in 0..(m - 1) { types.push(RegexType::Optional); edges.push(vec![types.len(), types.len() + 1]); edges[stack_last].push(types.len() - 1); types.push(RegexType::Lit(inter_index)); edges.push(vec![]); } types.push(RegexType::Optional); edges.push(vec![types.len()]); edges[stack_last].push(types.len() - 1); types.push(RegexType::Lit(inter_index)); edges.push(vec![]); } } // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { break; } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; continue; } '<' => { // prose value return Err(Error::Unimplemented(current_index)); } ';' => { // Possibly extend to the next line if the indentation // of the next line is greater than `indentation`. s = &s[1..]; current_index += 1; let mut in_comment = true; loop { let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp }; let mut after_wsp_and_v_char = skip_by(s, skip_fn); if after_wsp_and_v_char >= s.len() { current_index += s.len(); stack = vec![]; break; } s = &s[after_wsp_and_v_char..]; current_index += after_wsp_and_v_char; let first_char = s.chars().next().unwrap(); let mut skip_again = false; match first_char { '\n' | '\r' => { s = &s[1..]; current_index += 1; skip_again = true; } _ if in_comment => { // dbg!("1815"); return Err(Error::RegexInvalidComment(current_index)); } _ => {} } if skip_again { let skip_wsp = skip_by(s, is_wsp); if skip_wsp >= s.len() { current_index += s.len(); stack = vec![]; break; } after_wsp_and_v_char = skip_wsp; s = &s[skip_wsp..]; current_index += skip_wsp; } let first_char = s.chars().next().unwrap(); match first_char { ';' => { // continue looping s = &s[1..]; current_index += 1; in_comment = true; continue; } '\n' | '\r' => { in_comment = false; s = &s[1..]; current_index += 1; continue; } _ if after_wsp_and_v_char <= indentation => { stack = vec![]; break; } _ => { break; } } } } '\n' | '\r' => { // Possibly extend to the next line if the indentation // of the next line is greater than `indentation`. // println!("s = {}", s.chars().take(10).collect::()); s = &s[1..]; current_index += 1; if s.is_empty() { break; } let mut in_comment = false; loop { let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp }; let mut after_wsp = skip_by(s, skip_fn); if after_wsp >= s.len() { current_index += s.len(); stack = vec![]; break; } s = &s[after_wsp..]; current_index += after_wsp; let first_char = s.chars().next().unwrap(); let mut skip_again = false; match first_char { '\n' | '\r' => { s = &s[1..]; current_index += 1; skip_again = true; } _ if in_comment => { // dbg!("1903"); return Err(Error::RegexInvalidComment(current_index)); } _ => {} } if skip_again { let skip_wsp = skip_by(s, is_wsp); if skip_wsp >= s.len() { current_index += s.len(); stack = vec![]; break; } s = &s[skip_wsp..]; current_index += skip_wsp; after_wsp = skip_wsp; } let first_char = s.chars().next().unwrap(); match first_char { ';' => { // continue looping s = &s[1..]; current_index += 1; in_comment = true; continue; } '\n' | '\r' => { in_comment = false; s = &s[1..]; current_index += 1; continue; } _ if after_wsp <= indentation => { stack = vec![]; break; } _ => { break; } } } } _ => { // invalid println!("s = {}", &s[..(if s.len() > 50 { 50 } else { s.len() })]); return Err(Error::InvalidCharacter(current_index)); } } } let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?; Ok(( current_index, AlternationConstruct { intermediates, regex, }, )) } type RuleConstruct = ( // Name String, // Data Vec, // Rule DefaultRegex, // extra_alternation_or_not bool, ); fn parse_one_rule(s: &str, indentation: usize) -> Result<(usize, RuleConstruct), Error> { if s.is_empty() { return Ok((0, ("".to_owned(), Vec::new(), Default::default(), false))); } let mut s = s; let mut current_index = 0usize; let mut appending_alternation = false; // rule name let rulename_end = parse_rulename(s)?; let rule_name = s[..rulename_end].to_string(); s = &s[rulename_end..]; current_index += rulename_end; // c wsp let skip_c_wsp_size = skip_by(s, is_wsp); if skip_c_wsp_size >= s.len() { // dbg!(); return Err(Error::HangingRuleName(0)); } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; // equal or equal forward slash let next_char = s.chars().next().unwrap(); if next_char != '=' { // println!("s = {}", &s[..50]); // println!("rule_name = {rule_name}"); return Err(Error::MissingEqualSign(current_index + 1)); } s = &s[1..]; current_index += 1; if s.is_empty() { return Err(Error::HangingEqualsSign(current_index - 1)); } let next_char = s.chars().next().unwrap(); if next_char == '/' { appending_alternation = true; s = &s[1..]; current_index += 1; } if s.is_empty() { return Err(Error::HangingEqualsSign(current_index - 1)); } // c wsp let skip_c_wsp_size = skip_c_wsp(s); if skip_c_wsp_size >= s.len() { // dbg!(); return Err(Error::HangingRuleName(0)); } s = &s[skip_c_wsp_size..]; current_index += skip_c_wsp_size; // elements // alternation let (alternation_size, alternation_result) = parse_alternation(s, indentation).map_err(|e| e.translate(current_index))?; let intermediates_data = alternation_result.intermediates; let regex = alternation_result.regex; // println!("size = {alternation_size}, result = {alternation_result:?}"); if alternation_size >= s.len() { s = ""; } else { s = &s[alternation_size..]; } current_index += alternation_size; // c wsp let skip_c_wsp_size = skip_c_wsp(s); if skip_c_wsp_size >= s.len() { s = ""; } else { s = &s[skip_c_wsp_size..]; } current_index += skip_c_wsp_size; // c-nl if let Some(skip_c_nl) = skip_c_nl(s) { current_index += skip_c_nl; } Ok(( current_index, (rule_name, intermediates_data, regex, appending_alternation), )) } impl std::str::FromStr for Grammar { // First skip whitespaces, linefeeds, carraige returns and // semicolons until the first rule appears. The basic indentation // level is set to the indentation level of the first rule's line. // // Then enter identifier reading mode, reading a letter followed // by a sequence of letters, digits and hyphens. // // After skipping white spaces, read an equal sign. // // Then enter alternation reading mode, reading alternations of // concatenations. // // Then enter element reading mode, reading optionally repetition // specification followed by an identifier, or a numeral value, or // a quoted string. type Err = Error; fn from_str(s: &str) -> Result { let mut before_first_rule = true; let mut basic_indentation: usize = 0; let mut current_skipped_index: usize = 0; let mut s = s; while before_first_rule { before_first_rule = false; let after_wsp = skip_by(s, is_wsp); if after_wsp >= s.len() { // The entire string consists of white-spaces and // comments. return Err(Error::EmptyGrammar); } s = &s[after_wsp..]; current_skipped_index += after_wsp; match skip_c_nl(s) { Some(skipped_size) => { if skipped_size >= s.len() { return Err(Error::EmptyGrammar); } // Repeat skipping until we find the first rule. before_first_rule = true; s = &s[skipped_size..]; current_skipped_index += skipped_size; } None => { // Found the first rule. basic_indentation = after_wsp; } } } // TODO: Core rules are not implemented for now. #[allow(unused_macros)] macro_rules! log { () => { let mut file = std::fs::OpenOptions::new() .create(true) .append(true) .open("debug.log") .unwrap(); let _ = file.write_all(format!("{}\n", line!()).as_bytes()); }; } let mut nonterminals_vec: Vec = vec![]; let mut nonterminals_map: HashMap = HashMap::new(); let mut all_rules: HashMap> = Default::default(); let mut terminals_vec: Vec = vec![]; let mut terminals_map: HashMap = HashMap::new(); let mut temp = s; // First collect non-terminals in the order they are defined. loop { let (rule_size, (rule_name, _rule_data, _rule_regex, appending_p)) = parse_one_rule(temp, basic_indentation) .map_err(|e| e.translate(current_skipped_index))?; if rule_size == 0 { break; } if appending_p && nonterminals_map.get(&rule_name).is_none() { return Err(Error::AppendToNothing(current_skipped_index)); } else if !appending_p && nonterminals_map.get(&rule_name).is_some() { return Err(Error::CreateAgain(current_skipped_index)); } if !appending_p { nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len()); nonterminals_vec.push(rule_name); } if rule_size > 0 && rule_size < temp.len() { temp = &temp[rule_size..]; current_skipped_index += rule_size; } else { break; } } let mut temp = s; // Then gather the data about names of terminals and // non-terminals. loop { let (rule_size, (_rule_name, rule_data, _rule_regex, _appending_p)) = parse_one_rule(temp, basic_indentation)?; if rule_size == 0 { break; } for data in rule_data { match data { IntermediateConstruct::Terminal(name) => { if terminals_map.get(&name).is_none() { terminals_map.insert(name.clone(), terminals_vec.len()); terminals_vec.push(name); } } IntermediateConstruct::Nonterminal(name) => { if nonterminals_map.get(&name).is_none() { return Err(Error::UnknownNonterminal(name)); } } IntermediateConstruct::Range(_, _) => unimplemented!(), } } if rule_size > 0 && rule_size < temp.len() { temp = &temp[rule_size..]; current_skipped_index += rule_size; } else { break; } } // Then collect the actual rules. loop { let (rule_size, (rule_name, rule_data, rule_regex, appending_p)) = parse_one_rule(s, basic_indentation)?; // println!("rule name = {rule_name}"); // println!("regexp = {rule_regexp:#?}"); let rule_transform = |old_label: usize| -> Result { if let Some(old_value) = rule_data.get(old_label) { match old_value { IntermediateConstruct::Terminal(name) => { if let Some(index) = terminals_map.get(name) { Ok(TNT::Ter(*index)) } else { dbg!(); Err(Error::UnknownTerminal(name.clone())) } } IntermediateConstruct::Nonterminal(name) => { if let Some(index) = nonterminals_map.get(name) { Ok(TNT::Non(*index)) } else { dbg!(); Err(Error::UnknownNonterminal(name.clone())) } } IntermediateConstruct::Range(_, _) => { dbg!(); Err(Error::UnsupportedOperation) } } } else { dbg!(); Err(Error::RegexIndexOutOfBounds(old_label, rule_data.len())) } }; let transformed_rule = rule_regex.maps_to(rule_transform)?; if rule_size == 0 { break; } if !appending_p { // nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len()); // nonterminals_vec.push(rule_name.clone()); all_rules.insert(rule_name, transformed_rule); } else { all_rules .get_mut(&rule_name) .unwrap() .append(transformed_rule); } if rule_size > 0 && rule_size < s.len() { s = &s[rule_size..]; current_skipped_index += rule_size; } else { break; } } let rules_vec: Vec = nonterminals_vec .iter() .map(|name| { all_rules .get(name) .map(|regex| Rule::new(regex.clone())) .ok_or(Error::UnknownNonterminal(name.clone())) }) .collect::>()?; let terminals: Vec = terminals_vec.into_iter().map(Terminal::new).collect(); let nonterminals: Vec = nonterminals_vec.into_iter().map(Nonterminal).collect(); Ok(Self::new(terminals, nonterminals, rules_vec)) } } #[cfg(test)] mod tests;