From 9a317e56f8a6126583f7d0c431bf878d9b1fe7b1 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Sat, 8 Jul 2023 12:30:21 +0800 Subject: Finished the Emacs binding. Now the binding part is finished. What remains is a bug encountered when planting a fragment to the forest which intersects a packed node, which would lead to invalid forests. This will also cause problem when planting a packed fragment, but until now my testing grammars do not produce packed fragments, so this problem is not encountered yet. I am still figuring out efficient ways to solve this problem. --- grammar/src/abnf.rs | 37 - grammar/src/abnf/boolean_fns.rs | 42 + grammar/src/abnf/mod.rs | 2536 +++++++++++++++++++++++++++++++++++++++ grammar/src/abnf/tests.rs | 291 +++++ grammar/src/lib.rs | 85 +- 5 files changed, 2952 insertions(+), 39 deletions(-) delete mode 100644 grammar/src/abnf.rs create mode 100644 grammar/src/abnf/boolean_fns.rs create mode 100644 grammar/src/abnf/mod.rs create mode 100644 grammar/src/abnf/tests.rs (limited to 'grammar/src') diff --git a/grammar/src/abnf.rs b/grammar/src/abnf.rs deleted file mode 100644 index eb56819..0000000 --- a/grammar/src/abnf.rs +++ /dev/null @@ -1,37 +0,0 @@ -//! This file implements the function to read a grammar from a string, -//! in the [augmented Backus Naur -//! form](https://en.wikipedia.org/wiki/Augmented_Backus–Naur_form). -//! See [RFC5234](https://www.rfc-editor.org/rfc/rfc5234) for its -//! specifications. -//! -//! In fact, the rules of ABNF are considered to be too strict. For -//! example, only ASCII characters are allowed in the format, even -//! within comments. I think any character should be allowed within -//! comments, so the implementation here does that. - -use super::*; - -/// The type of errors encountered during parsing a grammar in the -/// ABNF format. See the function [`abnf_to_grammar`] for the parsing -/// function. -#[derive(Debug, Clone, Copy)] -pub enum Error { - /// Something invalid has occurred. - Invalid, -} - -impl core::fmt::Display for Error { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - match self { - Error::Invalid => write!(f, "invalid"), - } - } -} - -#[allow(unused)] -/// This function converts a string to a grammar. -pub fn abnf_to_grammar(str: impl AsRef) -> Result { - let str = str.as_ref(); - - todo!() -} diff --git a/grammar/src/abnf/boolean_fns.rs b/grammar/src/abnf/boolean_fns.rs new file mode 100644 index 0000000..08f317c --- /dev/null +++ b/grammar/src/abnf/boolean_fns.rs @@ -0,0 +1,42 @@ +//! This file collects some functions that return a boolean value from +//! a character. + +pub(super) fn is_wsp(c: char) -> bool { + c == ' ' || c == '\t' +} + +pub(crate) fn is_v_char(c: char) -> bool { + ('!'..='~').contains(&c) +} + +pub(super) fn is_wsp_or_v_char(c: char) -> bool { + is_wsp(c) || is_v_char(c) +} + +pub(super) fn is_not_newline(c: char) -> bool { + c != '\n' && c != '\r' +} + +pub(super) fn is_alpha(c: char) -> bool { + c.is_ascii_alphabetic() +} + +pub(super) fn is_decimal_digit(c: char) -> bool { + c.is_ascii_digit() +} + +pub(super) fn is_binary_digit(c: char) -> bool { + c == '0' || c == '1' +} + +pub(super) fn is_hexadecimal_digit(c: char) -> bool { + is_decimal_digit(c) || ('A'..='F').contains(&c) || ('a'..='f').contains(&c) +} + +pub(super) fn is_alpha_or_digit_or_hyphen(c: char) -> bool { + is_alpha(c) || is_decimal_digit(c) || c == '-' +} + +pub(super) fn is_char_value(c: char) -> bool { + (' '..='~').contains(&c) && c != '"' +} diff --git a/grammar/src/abnf/mod.rs b/grammar/src/abnf/mod.rs new file mode 100644 index 0000000..f825bef --- /dev/null +++ b/grammar/src/abnf/mod.rs @@ -0,0 +1,2536 @@ +//! This file implements the function to read a grammar from a string, +//! in the [augmented Backus Naur +//! form](https://en.wikipedia.org/wiki/Augmented_Backus–Naur_form). +//! See [RFC5234](https://www.rfc-editor.org/rfc/rfc5234) for its +//! specifications. +//! +//! In fact, the rules of ABNF are considered to be too strict. For +//! example, only ASCII characters are allowed in the format, even +//! within comments. I think any character should be allowed within +//! comments, so the implementation here does that. + +#![allow(dead_code)] + +use super::*; + +/// The type of errors encountered during parsing a grammar in the +/// ABNF format. +#[derive(Debug)] +#[non_exhaustive] +pub enum Error { + /// A rule name without rule body. + HangingRuleName(usize), + /// The minimum is greater than maximum? + MinGreaterThanMax(usize), + /// A digit is expected, but is not encountered. + InvalidDigit(usize), + /// A character that is not allowed. + InvalidCharacter(usize), + /// A numeric value is not completed. + HangingNumeric(usize), + /// A repetition is hanging. + HangingRepetition(usize), + /// A double quote is hanging. + HangingDQuote(usize), + /// A percentage sign is hanging. + HangingPercent(usize), + /// An equal sign is hanging. + HangingEqualsSign(usize), + /// An operation is unsupported. + UnsupportedOperation, + /// Encounters an unknown node when constructing the regular + /// expression. + RegexUnknownNode(usize), + /// An index is out of bounds when constructing the regular + /// expression. + RegexIndexOutOfBounds(usize, usize), + /// A node is duplicated when constructing the regular expression. + RegexDuplicatedNode(usize), + /// An edge is duplicated when constructing the regular + /// expression. + RegexDuplicatedEdge(usize, usize), + /// The constructed regular expression graph has no root. + RegexNoRoot, + /// The constructed regular expression graph has cycles. + RegexCycle, + /// The stack illegally becomes empty when constructing a regular + /// expression. + RegexEmptyStack(usize), + /// The action stack illegally becomes empty when constructing a + /// regular expression. + RegexEmptyActionStack(usize), + /// An unbalanced parenthesis is encountered when constructing a + /// regular expression. + RegexUnbalancedParen(usize), + /// An invalid repetition specification is encountered when + /// constructing a regular expression. + RegexInvalidRepeat(usize), + /// An invalid comment character is encountered. + RegexInvalidComment(usize), + /// An invalid numeric value is encoutnered. + RegexInvalidNumeric(usize), + /// A rule should have an equal sign. + MissingEqualSign(usize), + /// An unimplemented feature is required. + Unimplemented(usize), + /// Input-Output error. + IOError(std::io::Error), + /// The grammar is empty. + EmptyGrammar, + /// Found an appending rule associated to a non-existent rule. + AppendToNothing(usize), + /// Found a rule declaration associated to an existing rule. + CreateAgain(usize), + /// This terminal is not found. + UnknownTerminal(String), + /// This non-terminal is not found. + UnknownNonterminal(String), + /// Something invalid has occurred. + Invalid, +} + +impl std::error::Error for Error {} + +impl From for Error { + fn from(e: std::io::Error) -> Self { + Self::IOError(e) + } +} + +impl From for Error { + fn from(value: graph::error::Error) -> Self { + match value { + graph::error::Error::IndexOutOfBounds(index, bound) => { + Error::RegexIndexOutOfBounds(index, bound) + } + graph::error::Error::DuplicatedNode(n) => Error::RegexDuplicatedNode(n), + graph::error::Error::DuplicatedEdge(from, to) => Error::RegexDuplicatedEdge(from, to), + _ => Error::Invalid, + } + } +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::HangingRuleName(pos) => write!(f, "a rule name without body at {pos}"), + Error::MinGreaterThanMax(pos) => write!( + f, + "minimum should not be strictly greater than the maximum at {pos}." + ), + Error::InvalidDigit(pos) => { + write!(f, "a digit is expected at {pos}, but is not found.") + } + Error::InvalidCharacter(pos) => write!(f, "the character at {pos} is not allowed"), + Error::HangingNumeric(pos) => write!(f, "a numeric value is hanging at {pos}"), + Error::HangingRepetition(pos) => write!(f, "a repetition is hanging at {pos}"), + Error::HangingDQuote(pos) => write!(f, "a double quote is hanging at {pos}"), + Error::HangingPercent(pos) => write!(f, "a percentage sign is hanging at {pos}"), + Error::HangingEqualsSign(pos) => write!(f, "an equal sign is hanging at {pos}"), + Error::UnsupportedOperation => write!(f, "an unsupported operation is encountered"), + Error::RegexUnknownNode(n) => write!(f, "an unknown node {n} is encountered \ + when constructing the regular expression."), + Error::RegexIndexOutOfBounds(index, bound) => write!(f, "an index {index} is out of \ + bound {bound} when constructing the regular expression."), + Error::RegexDuplicatedNode(n) => write!(f, "a node {n} is duplicated \ + when constructing the regular expression."), + Error::RegexDuplicatedEdge(from, to) => write!(f, "an edge from {from} to {to} is duplicated \ + when constructing the regular expression."), + Error::RegexNoRoot => write!(f, "the constructed regular expression has no root."), + Error::RegexCycle => write!(f, "the constructed regular expression has cycles."), + Error::RegexEmptyStack(pos) => write!(f, "the stack illegally becomes empty at {pos} \ + when constructing a regular expression."), + Error::RegexEmptyActionStack(pos) => write!(f, "the action stack illegally becomes empty at {pos} \ + when constructing a regular expression."), + Error::RegexUnbalancedParen(pos) => write!(f, "An unbalanced parenthesis is encountered at {pos} \ + when constructing a regular expression."), + Error::RegexInvalidRepeat(pos) => write!(f, "An invalid repetition specification is encountered \ + at {pos} when constructing a regular expression."), + Error::RegexInvalidComment(pos) => write!(f, "An invalid character is encountered in a comment \ + at {pos}"), + Error::RegexInvalidNumeric(pos) => write!(f, "An invalid numeric value is encoutnered at {pos}."), + Error::MissingEqualSign(pos) => write!(f, "A rule is missing an equal sign at {pos}"), + Error::Unimplemented(pos) => write!(f, "An unimplemented feature is required at {pos}."), + Error::IOError(e) => write!(f, "An IO error: {e}"), + Error::EmptyGrammar => write!(f, "the grammar is empty"), + Error::AppendToNothing(pos) => write!(f, "the rule at {pos} is appending to a non-existing rule."), + Error::CreateAgain(pos) => write!(f, "the rule at {pos} is creating a duplicated rule."), + Error::UnknownTerminal(name) => write!(f, "the terminal {name} is not found."), + Error::UnknownNonterminal(name) => write!(f, "the non-terminal {name} is not found."), + Error::Invalid => write!(f, "invalid"), + } + } +} + +/// Return the number of bytes taken by the characters for which the +/// function `f` returns `true`. +fn skip_by(s: &str, f: impl Fn(char) -> bool) -> usize { + for (index, c) in s.char_indices() { + if !f(c) { + return index; + } + } + + s.len() +} + +mod boolean_fns; + +pub(crate) use boolean_fns::*; + +fn skip_c_nl(s: &str) -> Option { + if s.is_empty() { + return Some(0); + } + + let s_len = s.len(); + + match s.chars().next().unwrap() { + ';' => { + let after_not_newline = skip_by(&s[1..], is_not_newline); + + if 1 + after_not_newline >= s_len { + Some(s_len) + } else { + let s = &s[1 + after_not_newline..]; + + match s.chars().next().unwrap() { + '\n' | '\r' => Some(after_not_newline + 2), + _ => { + // We skipped anything not newline, so this + // character must be newline. + unreachable!(); + } + } + } + } + '\n' | '\r' => Some(1), + _ => None, + } +} + +fn skip_c_wsp(s: &str) -> usize { + let mut currently_skipped = 0usize; + let mut s = s; + + let mut continue_skipping = true; + + 'skipping_loop: while continue_skipping { + continue_skipping = false; + + let skip_wsp = skip_by(s, is_wsp); + + if skip_wsp > 0 { + if skip_wsp >= s.len() { + return currently_skipped + s.len(); + } + + continue_skipping = true; + s = &s[skip_wsp..]; + currently_skipped += skip_wsp; + + continue 'skipping_loop; + } + + if let Some(skip_c_nl_size) = skip_c_nl(s) { + if skip_c_nl_size >= s.len() { + return currently_skipped + s.len(); + } + + continue_skipping = true; + + s = &s[skip_c_nl_size..]; + currently_skipped += skip_c_nl_size; + + let skip_wsp_after_c_nl = skip_by(s, is_wsp); + + s = &s[skip_wsp_after_c_nl..]; + currently_skipped += skip_wsp_after_c_nl; + } + } + + currently_skipped +} + +#[derive(Debug, Eq, PartialEq)] +enum RepetitionSpec { + Star, + Equal(usize), + Min(usize), + Max(usize), + MinMax(usize, usize), +} + +#[derive(Debug)] +enum RepetitionAction { + /// Ignore the following element. + Ignore, + /// The following is put in a star. + Star, + /// The following is put in an optional construct. + Optional, + /// The following is put in a plus construct. + Plus, + /// Copy the following some number of times. + Copy(usize), + /// Copy the following some number of times, and then add a plus + /// of that same element. + CopyPlus(usize), + /// Copy the following some number of times, and then add another + /// number of *nested* optional construts of that same element. + /// + /// # Why nested? + /// + /// The optional constructs are nested here as a sequence of + /// optional constructs is exponentially more ambiguous than the + /// nested equivalent: "a?a?a?" is more ambiguous than + /// "(a(a(a)?)?)?". For example, the input "a" can have three + /// derivations according to "a?a?a?", but has only one derivation + /// according to "(a(a(a)?)?)?". + CopyOptional(usize, usize), +} + +impl TryFrom> for RepetitionAction { + type Error = Error; + + fn try_from(spec: Option) -> Result { + if let Some(r) = spec { + match r { + RepetitionSpec::Star => Ok(Self::Star), + RepetitionSpec::Equal(0) => Ok(Self::Ignore), + RepetitionSpec::Equal(n) => Ok(Self::Copy(n)), + RepetitionSpec::Min(0) => Ok(Self::Star), + RepetitionSpec::Min(1) => Ok(Self::Plus), + RepetitionSpec::Min(n) => Ok(Self::CopyPlus(n - 1)), + RepetitionSpec::Max(0) => Ok(Self::Ignore), + RepetitionSpec::Max(1) => Ok(Self::Optional), + RepetitionSpec::Max(n) => Ok(Self::CopyOptional(0, n)), + RepetitionSpec::MinMax(0, 0) => Ok(Self::Ignore), + RepetitionSpec::MinMax(0, 1) => Ok(Self::Optional), + RepetitionSpec::MinMax(n, m) if n > m => Err(Error::MinGreaterThanMax(0)), + RepetitionSpec::MinMax(n, m) if n == m => Ok(Self::Copy(n)), + RepetitionSpec::MinMax(n, m) => Ok(Self::CopyOptional(n, m - n)), + } + } else { + Ok(Self::Copy(1)) + } + } +} + +fn parse_repetition_spec(s: &str) -> Option<(usize, RepetitionSpec)> { + let mut s = s; + + let digit_size = skip_by(s, is_decimal_digit); + + let mut min: Option = None; + let mut max: Option = None; + + if digit_size != 0 { + min = Some(s[..digit_size].parse::().unwrap()); + } + + let mut current_index = digit_size; + + if digit_size == s.len() { + if let Some(m) = min { + return Some((digit_size, RepetitionSpec::Equal(m))); + } else { + return None; + }; + } + + s = &s[current_index..]; + + if s.starts_with('*') { + if s.len() == 1 { + if let Some(m) = min { + return Some((current_index + 1, RepetitionSpec::Min(m))); + } else { + return Some((current_index + 1, RepetitionSpec::Star)); + } + } + + s = &s[1..]; + current_index += 1; + + let digit_size = skip_by(s, is_decimal_digit); + + if digit_size != 0 { + max = Some(s[..digit_size].parse::().unwrap()); + } + + current_index += digit_size; + + match (min, max) { + (Some(mi), Some(ma)) => Some((current_index, RepetitionSpec::MinMax(mi, ma))), + (Some(m), None) => Some((current_index, RepetitionSpec::Min(m))), + (None, Some(m)) => Some((current_index, RepetitionSpec::Max(m))), + _ => Some((current_index, RepetitionSpec::Star)), + } + } else { + min.map(|m| (current_index, RepetitionSpec::Equal(m))) + } +} + +fn parse_rulename(s: &str) -> Result { + let first_char = s.chars().next().unwrap(); + + if !is_alpha(first_char) { + return Ok(0); + } + + let rulename_end = skip_by(s, is_alpha_or_digit_or_hyphen); + + if rulename_end >= s.len() { + // dbg!(); + return Err(Error::HangingRuleName(0)); + } + + Ok(rulename_end) +} + +#[derive(Debug, Eq, PartialEq, Hash)] +enum IntermediateConstruct { + Terminal(String), + Nonterminal(String), + Range(char, char), +} + +impl Display for IntermediateConstruct { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + IntermediateConstruct::Terminal(t) => write!(f, "t{t}"), + IntermediateConstruct::Nonterminal(n) => write!(f, "n{n}"), + IntermediateConstruct::Range(min, max) => write!(f, "[{min}-{max}]"), + } + } +} + +#[derive(Debug, Eq, PartialEq, Copy, Clone)] +enum Radix { + Binary, + Decimal, + Hexadecimal, +} + +fn parse_numeric(s: &str, radix: Radix) -> Result<(usize, IntermediateConstruct), Error> { + // first read a number + + let is_radix_digit = match radix { + Radix::Binary => is_binary_digit, + Radix::Decimal => is_decimal_digit, + Radix::Hexadecimal => is_hexadecimal_digit, + }; + + let radix = match radix { + Radix::Binary => 2, + Radix::Decimal => 10, + Radix::Hexadecimal => 16, + }; + + let mut s = s; + let mut current_index = 0; + + let first_digit_size = skip_by(s, is_radix_digit); + + if first_digit_size == 0 { + // dbg!(); + return Err(Error::InvalidDigit(current_index)); + } + + let first_number = + match std::char::from_u32(u32::from_str_radix(&s[..first_digit_size], radix).unwrap()) { + Some(c) => c, + None => { + // dbg!(); + return Err(Error::InvalidCharacter(current_index)); + } + }; + + if first_digit_size >= s.len() { + return Ok(( + current_index + s.len(), + IntermediateConstruct::Terminal(first_number.to_string()), + )); + } + + s = &s[first_digit_size..]; + + current_index += first_digit_size; + + // then read a dot or a dash + + match s.chars().next().unwrap() { + '.' => { + s = &s[1..]; + current_index += 1; + + let mut stop_reading = false; + let mut chars_vec = vec![first_number]; + + while !stop_reading { + if s.is_empty() { + return Err(Error::HangingNumeric(current_index - 1)); + } + + let digit_size = skip_by(s, is_radix_digit); + + if digit_size == 0 { + // dbg!(); + return Err(Error::InvalidDigit(current_index)); + } + + let a_number = + std::char::from_u32(u32::from_str_radix(&s[..digit_size], radix).unwrap()) + .ok_or(Error::InvalidCharacter(current_index))?; + + chars_vec.push(a_number); + + s = &s[digit_size..]; + current_index += digit_size; + + if !s.starts_with('.') { + stop_reading = true; + // println!( + // "digit_size = {digit_size} a number = {a_number}, s = {}", + // &s[..10] + // ); + } else { + s = &s[1..]; + current_index += 1; + } + } + + Ok(( + current_index, + IntermediateConstruct::Terminal(chars_vec.into_iter().collect()), + )) + } + '-' => { + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + return Err(Error::HangingNumeric(current_index - 1)); + } + + let second_digit_size = skip_by(s, is_radix_digit); + + if second_digit_size == 0 { + println!( + "s = {}", + s.chars() + .take(if s.len() >= 50 { 50 } else { s.len() }) + .collect::() + ); + return Err(Error::InvalidDigit(current_index)); + } + + let _second_number = + std::char::from_u32(u32::from_str_radix(&s[..second_digit_size], radix).unwrap()) + .ok_or(Error::InvalidCharacter(current_index))?; + + Err(Error::Unimplemented(current_index - 1)) + + // Ok(( + // current_index + second_digit_size, + // IntermediateConstruct::Range(first_number, second_number), + // )) + } + c if is_wsp(c) || c == '\n' || c == '\r' || c == ';' => Ok(( + current_index, + IntermediateConstruct::Terminal(first_number.to_string()), + )), + _ => Err(Error::InvalidCharacter(current_index)), + } +} + +#[derive(Debug)] +struct AlternationConstruct { + intermediates: Vec, + regex: DefaultRegex, +} + +fn parse_alternation(s: &str, indentation: usize) -> Result<(usize, AlternationConstruct), Error> { + let mut stack: Vec = vec![0, 1]; + let mut intermediates: Vec = Vec::new(); + let mut types: Vec> = vec![RegexType::Or, RegexType::Empty]; + let mut edges: Vec> = vec![vec![1], Vec::new()]; + + fn error_conversion(error: nfa::error::Error) -> Error { + match error { + nfa::error::Error::UnknownNode(n) => Error::RegexUnknownNode(n), + nfa::error::Error::UnsupportedOperation => Error::UnsupportedOperation, + nfa::error::Error::NoRoot => Error::RegexNoRoot, + nfa::error::Error::Graph(ge) => ge.into(), + nfa::error::Error::Cycle => Error::RegexCycle, + _ => Error::Invalid, + } + } + + let mut s = s; + let mut current_index = 0usize; + + let mut nonterminal_nodes: HashSet = HashSet::new(); + + if s.is_empty() { + let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?; + + return Ok(( + current_index, + AlternationConstruct { + intermediates, + regex, + }, + )); + } + + // A vector of tuples to record some information for each level of + // nesting. + // + // A tuple (len, action) means that the stack grows by LEN many + // tokens at this level, and the action ACTION is pending. + let mut repetition_action_stack: Vec<(usize, Option)> = vec![]; + + while !stack.is_empty() { + // First read repetition specifications. + let mut repetition_spec = None; + + if let Some((spec_size, spec)) = parse_repetition_spec(s) { + if spec_size >= s.len() { + return Err(Error::HangingRepetition(current_index)); + } + + s = &s[spec_size..]; + current_index += spec_size; + + repetition_spec = Some(spec); + } + + let repetition_spec = repetition_spec; + + // if a rulename + match parse_rulename(s) { + Ok(0) => {} + Ok(size) => { + let name = &s[..size].to_string(); + + intermediates.push(IntermediateConstruct::Nonterminal(name.clone())); + + let nt_index = intermediates.len() - 1; + + let action = repetition_spec.try_into()?; + // println!("rep_action = {action:?}"); + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + match action { + RepetitionAction::Ignore => {} + RepetitionAction::Star => { + types.push(RegexType::Kleene); + types.push(RegexType::Lit(nt_index)); + + edges.push(vec![types.len() - 1]); + edges.push(Vec::new()); + + nonterminal_nodes.insert(types.len() - 1); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::Optional => { + types.append(&mut vec![RegexType::Optional]); + types.push(RegexType::Lit(nt_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + nonterminal_nodes.insert(types.len() - 1); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::Plus => { + types.append(&mut vec![RegexType::Plus]); + types.push(RegexType::Lit(nt_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + nonterminal_nodes.insert(types.len() - 1); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::Copy(n) => { + let old_types_len = types.len(); + + nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| old_types_len + i)); + + types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + // for _ in 0..n { + // types.push(RegexType::Lit(nt_index)); + // edges.push(vec![]); + + // nonterminal_nodes.insert(types.len() - 1); + + // edges[stack_last].push(types.len() - 1); + // } + } + RepetitionAction::CopyPlus(n) => { + let old_types_len = types.len(); + + nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| old_types_len + i)); + + types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + // for _ in 0..n { + // types.push(RegexType::Lit(nt_index)); + // edges.push(vec![]); + + // edges[stack_last].push(types.len() - 1); + // } + + types.append(&mut vec![RegexType::Plus]); + types.push(RegexType::Lit(nt_index)); + + nonterminal_nodes.insert(types.len() - 1); + + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::CopyOptional(n, m) => { + let old_types_len = types.len(); + + nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| old_types_len + i)); + + types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + // for _ in 0..n { + // types.push(RegexType::Lit(nt_index)); + // edges.push(vec![]); + + // nonterminal_nodes.insert(types.len() - 1); + + // edges[stack_last].push(types.len() - 1); + // } + + let types_len = types.len(); + + types.extend( + std::iter::repeat([RegexType::Optional, RegexType::Lit(nt_index)]) + .take(m) + .flatten(), + ); + + edges.extend((0..(m - 1)).flat_map(|i| { + [ + vec![types_len + 1 + 2 * i, types_len + 2 + 2 * i], + Vec::new(), + ] + })); + + edges.extend([vec![types.len() - 1], Vec::new()]); + + nonterminal_nodes.extend((0..m).map(|i| types_len + 2 * i + 1)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types_len); + + // for _ in 0..m { + // types.extend([RegexType::Optional, RegexType::Lit(nt_index)]); + // edges.extend([vec![types.len() - 1], Vec::new()]); + + // nonterminal_nodes.insert(types.len() - 1); + + // edges[stack_last].push(types.len() - 2); + // } + } + } + + s = &s[size..]; + current_index += size; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + Err(_) => { + intermediates.push(IntermediateConstruct::Nonterminal(s.to_owned())); + + types.push(RegexType::Lit(intermediates.len() - 1)); + edges.push(vec![]); + + let stack_last = stack + .last() + .copied() + .ok_or(Error::RegexEmptyStack(current_index))?; + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 1); + nonterminal_nodes.insert(types.len() - 1); + + break; + } + } + + // match against the first charcter + + let first_char = s.chars().next().unwrap(); + + match first_char { + '(' => { + // group + + s = &s[1..]; + current_index += 1; + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + match action { + RepetitionAction::Star => { + types.extend([RegexType::Kleene, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + repetition_action_stack.push((3, None)); + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + } + RepetitionAction::Optional => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + repetition_action_stack.push((3, None)); + } + RepetitionAction::Plus => { + types.extend([RegexType::Plus, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + repetition_action_stack.push((3, None)); + } + RepetitionAction::Copy(1) => { + types.extend([RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + + stack.extend([types.len() - 2, types.len() - 1]); + repetition_action_stack.push((2, None)); + } + RepetitionAction::CopyOptional(0, _) => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + + repetition_action_stack.push((3, Some(action))); + } + _ => { + types.extend([RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + + stack.extend([types.len() - 2, types.len() - 1]); + + repetition_action_stack.push((2, Some(action))); + } + } + + s = &s[1..]; + current_index += 1; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + return Err(Error::RegexUnbalancedParen(current_index)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + ')' => { + // end group + + s = &s[1..]; + current_index += 1; + + if repetition_spec.is_some() { + return Err(Error::RegexInvalidRepeat(current_index)); + } + + let (stack_growth, repetition_action) = repetition_action_stack + .pop() + .ok_or(Error::RegexEmptyActionStack(current_index))?; + + match repetition_action { + Some(action) => { + if stack.len() <= stack_growth + 1 { + dbg!(); + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth + 1); + + // this is not going to fail because of the previous check + let stack_last = stack.pop().unwrap(); + let stack_second_last = stack.iter().copied().last().unwrap(); + + match action { + RepetitionAction::Ignore => { + edges[stack_second_last].pop(); + + types.truncate(stack_last); + edges.truncate(stack_last); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + } + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyPlus(n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + } + + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_second_last].push(types.len() - 1); + + // if stack_second_last == 3 && nodes.len() == 4 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyOptional(n, m) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + } + + edges[stack_second_last].push(types.len()); + + let nodes_cloned_len = nodes_cloned.len(); + + for _ in 0..(m - 1) { + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.push(RegexType::Optional); + edges.push(vec![types.len(), types.len() + nodes_cloned_len]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + } + + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Star + | RepetitionAction::Optional + | RepetitionAction::Plus => { + // this will not happen here + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + None => { + if stack.len() <= stack_growth { + dbg!(); + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + '[' => { + // option + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + match action { + RepetitionAction::Ignore => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + edges[stack_last].push(types.len() - 3); + + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((3, Some(action))); + } + RepetitionAction::Optional => { + types.extend([ + RegexType::Optional, + RegexType::Optional, + RegexType::Or, + RegexType::Empty, + ]); + edges.extend([ + vec![types.len() - 3], + vec![types.len() - 2], + vec![types.len() - 1], + Vec::new(), + ]); + + edges[stack_last].push(types.len() - 4); + + stack.push(types.len() - 4); + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((4, None)); + } + RepetitionAction::Star => { + types.extend([ + RegexType::Kleene, + RegexType::Optional, + RegexType::Or, + RegexType::Empty, + ]); + edges.extend([ + vec![types.len() - 3], + vec![types.len() - 2], + vec![types.len() - 1], + Vec::new(), + ]); + + edges[stack_last].push(types.len() - 4); + + stack.push(types.len() - 4); + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((4, None)); + } + RepetitionAction::Plus => { + types.extend([ + RegexType::Kleene, + RegexType::Optional, + RegexType::Or, + RegexType::Empty, + ]); + edges.extend([ + vec![types.len() - 3], + vec![types.len() - 2], + vec![types.len() - 1], + Vec::new(), + ]); + + edges[stack_last].push(types.len() - 4); + + stack.push(types.len() - 4); + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((4, None)); + } + RepetitionAction::Copy(1) => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + edges[stack_last].push(types.len() - 3); + + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((3, None)); + } + _ => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + edges[stack_last].push(types.len() - 3); + + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((3, Some(action))); + } + } + + s = &s[1..]; + current_index += 1; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + return Err(Error::RegexUnbalancedParen(current_index)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + ']' => { + // end option + + s = &s[1..]; + current_index += 1; + + if repetition_spec.is_some() { + return Err(Error::RegexInvalidRepeat(current_index)); + } + + let (stack_growth, repetition_action) = repetition_action_stack + .pop() + .ok_or(Error::RegexEmptyActionStack(current_index))?; + + match repetition_action { + Some(action) => { + if stack.len() <= stack_growth + 1 { + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth + 1); + + // this is not going to fail because of the previous check + let stack_last = stack.pop().unwrap(); + let stack_second_last = stack.iter().copied().last().unwrap(); + + match action { + RepetitionAction::Ignore => { + edges[stack_second_last].pop(); + + types.truncate(stack_last); + edges.truncate(stack_last); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + } + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyPlus(n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + } + + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_second_last].push(types.len() - 1); + + // if stack_second_last == 3 && nodes.len() == 4 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyOptional(n, m) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + } + + let nodes_cloned_len = nodes_cloned.len(); + + for _ in 0..(m - 1) { + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.push(RegexType::Optional); + edges.push(vec![types.len(), types.len() + nodes_cloned_len]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + } + + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Star + | RepetitionAction::Optional + | RepetitionAction::Plus => { + // this will not happen here + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + None => { + if stack.len() <= stack_growth { + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + '/' => { + // end concatenation + + s = &s[1..]; + current_index += 1; + + if repetition_spec.is_some() { + return Err(Error::RegexInvalidRepeat(current_index)); + } + + if stack.len() <= 1 { + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.pop().unwrap(); + + let stack_last = stack.iter().copied().last().unwrap(); + + edges[stack_last].push(types.len()); + + types.push(RegexType::Empty); + edges.push(vec![]); + + stack.push(types.len() - 1); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + '"' => { + // string + + s = &s[1..]; + current_index += 1; + + let skip_char_value = skip_by(s, is_char_value); + + if skip_char_value >= s.len() { + return Err(Error::HangingDQuote(current_index)); + } + + let fakes = &s[skip_char_value..]; + + if !fakes.starts_with('"') { + return Err(Error::HangingDQuote(current_index)); + } + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + let slice_value = s[..skip_char_value].to_owned(); + + intermediates.push(IntermediateConstruct::Terminal(slice_value.clone())); + + let t_index = intermediates.len() - 1; + + match action { + RepetitionAction::Ignore => {} + RepetitionAction::Star => { + types.push(RegexType::Kleene); + types.push(RegexType::Lit(t_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + edges[stack_last].push(types.len() - 2); + } + RepetitionAction::Optional => { + types.append(&mut vec![RegexType::Optional]); + types.push(RegexType::Lit(t_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + edges[stack_last].push(types.len() - 2); + } + RepetitionAction::Plus => { + types.append(&mut vec![RegexType::Plus]); + types.push(RegexType::Lit(t_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + edges[stack_last].push(types.len() - 2); + } + RepetitionAction::Copy(n) => { + let types_len = types.len(); + + types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| types_len + i)); + + // for _ in 0..n { + // types.push(RegexType::Lit(t_index)); + // edges.push(vec![]); + + // edges[stack_last].push(types.len() - 1); + // } + } + RepetitionAction::CopyPlus(n) => { + let types_len = types.len(); + + types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| types_len + i)); + + types.extend([RegexType::Plus, RegexType::Lit(t_index)]); + edges.extend([vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::CopyOptional(n, m) => { + let types_len = types.len(); + + types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len()); + + for _ in 0..(m - 1) { + types.extend([RegexType::Optional, RegexType::Lit(t_index)]); + edges.extend([vec![types.len() - 1, types.len()], Vec::new()]); + } + + types.extend([RegexType::Optional, RegexType::Lit(t_index)]); + edges.extend([vec![types.len() - 1], Vec::new()]); + } + } + + s = &fakes[1..]; + current_index += skip_char_value + 1; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + '%' => { + // numeric value + + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + return Err(Error::HangingPercent(current_index - 1)); + } + + let inner_first_char = s.chars().next().unwrap(); + + let intermediate_value: IntermediateConstruct; + + match inner_first_char { + 'b' => { + s = &s[1..]; + current_index += 1; + + let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Binary)?; + + intermediate_value = parsed_construct; + + s = &s[parsed_size..]; + current_index += parsed_size; + } + 'd' => { + s = &s[1..]; + current_index += 1; + + let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Decimal)?; + + intermediate_value = parsed_construct; + + s = &s[parsed_size..]; + current_index += parsed_size; + } + 'x' => { + s = &s[1..]; + current_index += 1; + + let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Hexadecimal)?; + // println!("size = {parsed_size}, construct = {parsed_construct:#?}"); + + intermediate_value = parsed_construct; + + s = &s[parsed_size..]; + current_index += parsed_size; + } + _ => { + return Err(Error::RegexInvalidNumeric(current_index - 1)); + } + } + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + intermediates.push(intermediate_value); + + let inter_index = intermediates.len() - 1; + + match action { + RepetitionAction::Ignore => {} + RepetitionAction::Star => { + types.push(RegexType::Kleene); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::Optional => { + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::Plus => { + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::Copy(n) => { + types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); + + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 1); + + // for _ in 0..n { + // types.push(RegexType::Lit(inter_index)); + // edges.push(vec![]); + // edges[stack_last].push(types.len() - 1); + // } + } + RepetitionAction::CopyPlus(n) => { + types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); + + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 1); + + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::CopyOptional(n, m) => { + types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); + + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 1); + + for _ in 0..(m - 1) { + types.push(RegexType::Optional); + edges.push(vec![types.len(), types.len() + 1]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + } + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + '<' => { + // prose value + return Err(Error::Unimplemented(current_index)); + } + ';' => { + // Possibly extend to the next line if the indentation + // of the next line is greater than `indentation`. + + s = &s[1..]; + current_index += 1; + + let mut in_comment = true; + + loop { + let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp }; + + let mut after_wsp_and_v_char = skip_by(s, skip_fn); + + if after_wsp_and_v_char >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + s = &s[after_wsp_and_v_char..]; + current_index += after_wsp_and_v_char; + + let first_char = s.chars().next().unwrap(); + + let mut skip_again = false; + + match first_char { + '\n' | '\r' => { + s = &s[1..]; + current_index += 1; + skip_again = true; + } + _ if in_comment => { + // dbg!("1815"); + return Err(Error::RegexInvalidComment(current_index)); + } + _ => {} + } + + if skip_again { + let skip_wsp = skip_by(s, is_wsp); + + if skip_wsp >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + after_wsp_and_v_char = skip_wsp; + + s = &s[skip_wsp..]; + current_index += skip_wsp; + } + + let first_char = s.chars().next().unwrap(); + + match first_char { + ';' => { + // continue looping + s = &s[1..]; + current_index += 1; + in_comment = true; + + continue; + } + '\n' | '\r' => { + in_comment = false; + s = &s[1..]; + current_index += 1; + + continue; + } + _ if after_wsp_and_v_char <= indentation => { + stack = vec![]; + break; + } + _ => { + break; + } + } + } + } + '\n' | '\r' => { + // Possibly extend to the next line if the indentation + // of the next line is greater than `indentation`. + // println!("s = {}", s.chars().take(10).collect::()); + + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + break; + } + + let mut in_comment = false; + + loop { + let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp }; + + let mut after_wsp = skip_by(s, skip_fn); + + if after_wsp >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + s = &s[after_wsp..]; + current_index += after_wsp; + + let first_char = s.chars().next().unwrap(); + + let mut skip_again = false; + + match first_char { + '\n' | '\r' => { + s = &s[1..]; + current_index += 1; + skip_again = true; + } + _ if in_comment => { + // dbg!("1903"); + return Err(Error::RegexInvalidComment(current_index)); + } + _ => {} + } + + if skip_again { + let skip_wsp = skip_by(s, is_wsp); + + if skip_wsp >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + s = &s[skip_wsp..]; + current_index += skip_wsp; + + after_wsp = skip_wsp; + } + + let first_char = s.chars().next().unwrap(); + + match first_char { + ';' => { + // continue looping + s = &s[1..]; + current_index += 1; + in_comment = true; + + continue; + } + '\n' | '\r' => { + in_comment = false; + s = &s[1..]; + current_index += 1; + + continue; + } + _ if after_wsp <= indentation => { + stack = vec![]; + break; + } + _ => { + break; + } + } + } + } + _ => { + // invalid + println!("s = {}", &s[..(if s.len() > 50 { 50 } else { s.len() })]); + return Err(Error::InvalidCharacter(current_index)); + } + } + } + + let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?; + + Ok(( + current_index, + AlternationConstruct { + intermediates, + regex, + }, + )) +} + +type RuleConstruct = ( + // Name + String, + // Data + Vec, + // Rule + DefaultRegex, + // extra_alternation_or_not + bool, +); + +fn parse_one_rule(s: &str, indentation: usize) -> Result<(usize, RuleConstruct), Error> { + if s.is_empty() { + return Ok((0, ("".to_owned(), Vec::new(), Default::default(), false))); + } + + let mut s = s; + + let mut current_index = 0usize; + + let mut appending_alternation = false; + + // rule name + + let rulename_end = parse_rulename(s)?; + let rule_name = s[..rulename_end].to_string(); + s = &s[rulename_end..]; + current_index += rulename_end; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + // dbg!(); + return Err(Error::HangingRuleName(0)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + // equal or equal forward slash + + let next_char = s.chars().next().unwrap(); + + if next_char != '=' { + // println!("s = {}", &s[..50]); + // println!("rule_name = {rule_name}"); + return Err(Error::MissingEqualSign(current_index + 1)); + } + + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + return Err(Error::HangingEqualsSign(current_index - 1)); + } + + let next_char = s.chars().next().unwrap(); + + if next_char == '/' { + appending_alternation = true; + s = &s[1..]; + current_index += 1; + } + + if s.is_empty() { + return Err(Error::HangingEqualsSign(current_index - 1)); + } + + // c wsp + + let skip_c_wsp_size = skip_c_wsp(s); + + if skip_c_wsp_size >= s.len() { + // dbg!(); + return Err(Error::HangingRuleName(0)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + // elements + + // alternation + + let (alternation_size, alternation_result) = parse_alternation(s, indentation)?; + + let intermediates_data = alternation_result.intermediates; + let regex = alternation_result.regex; + + // println!("size = {alternation_size}, result = {alternation_result:?}"); + + if alternation_size >= s.len() { + s = ""; + } else { + s = &s[alternation_size..]; + } + + current_index += alternation_size; + + // c wsp + + let skip_c_wsp_size = skip_c_wsp(s); + + if skip_c_wsp_size >= s.len() { + s = ""; + } else { + s = &s[skip_c_wsp_size..]; + } + + current_index += skip_c_wsp_size; + + // c-nl + + if let Some(skip_c_nl) = skip_c_nl(s) { + current_index += skip_c_nl; + } + + Ok(( + current_index, + (rule_name, intermediates_data, regex, appending_alternation), + )) +} + +impl std::str::FromStr for Grammar { + // First skip whitespaces, linefeeds, carraige returns and + // semicolons until the first rule appears. The basic indentation + // level is set to the indentation level of the first rule's line. + // + // Then enter identifier reading mode, reading a letter followed + // by a sequence of letters, digits and hyphens. + // + // After skipping white spaces, read an equal sign. + // + // Then enter alternation reading mode, reading alternations of + // concatenations. + // + // Then enter element reading mode, reading optionally repetition + // specification followed by an identifier, or a numeral value, or + // a quoted string. + + type Err = Error; + + fn from_str(s: &str) -> Result { + let mut before_first_rule = true; + let mut basic_indentation: usize = 0; + let mut current_skipped_index: usize = 0; + + let mut s = s; + + while before_first_rule { + before_first_rule = false; + + let after_wsp = skip_by(s, is_wsp); + + if after_wsp >= s.len() { + // The entire string consists of white-spaces and + // comments. + return Err(Error::EmptyGrammar); + } + + s = &s[after_wsp..]; + current_skipped_index += after_wsp; + + match skip_c_nl(s) { + Some(skipped_size) => { + if skipped_size >= s.len() { + return Err(Error::EmptyGrammar); + } + + // Repeat skipping until we find the first rule. + before_first_rule = true; + + s = &s[skipped_size..]; + current_skipped_index += skipped_size; + } + None => { + // Found the first rule. + basic_indentation = after_wsp; + } + } + } + + // TODO: Core rules are not implemented for now. + + #[allow(unused_macros)] + macro_rules! log { + () => { + let mut file = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open("debug.log") + .unwrap(); + + let _ = file.write_all(format!("{}\n", line!()).as_bytes()); + }; + } + + let mut nonterminals_vec: Vec = vec![]; + let mut nonterminals_map: HashMap = HashMap::new(); + + let mut all_rules: HashMap> = Default::default(); + + let mut terminals_vec: Vec = vec![]; + let mut terminals_map: HashMap = HashMap::new(); + + let mut temp = s; + + // First collect non-terminals in the order they are defined. + loop { + let (rule_size, (rule_name, _rule_data, _rule_regex, appending_p)) = + parse_one_rule(temp, basic_indentation)?; + + if rule_size == 0 { + break; + } + + if appending_p && nonterminals_map.get(&rule_name).is_none() { + return Err(Error::AppendToNothing(current_skipped_index)); + } else if !appending_p && nonterminals_map.get(&rule_name).is_some() { + return Err(Error::CreateAgain(current_skipped_index)); + } + + nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len()); + nonterminals_vec.push(rule_name); + + if rule_size > 0 && rule_size < temp.len() { + temp = &temp[rule_size..]; + current_skipped_index += rule_size; + } else { + break; + } + } + + let mut temp = s; + + // Then gather the data about names of terminals and + // non-terminals. + loop { + let (rule_size, (_rule_name, rule_data, _rule_regex, _appending_p)) = + parse_one_rule(temp, basic_indentation)?; + + if rule_size == 0 { + break; + } + + for data in rule_data { + match data { + IntermediateConstruct::Terminal(name) => { + if terminals_map.get(&name).is_none() { + terminals_map.insert(name.clone(), terminals_vec.len()); + terminals_vec.push(name); + } + } + IntermediateConstruct::Nonterminal(name) => { + if nonterminals_map.get(&name).is_none() { + return Err(Error::UnknownNonterminal(name)); + } + } + IntermediateConstruct::Range(_, _) => unimplemented!(), + } + } + + if rule_size > 0 && rule_size < temp.len() { + temp = &temp[rule_size..]; + current_skipped_index += rule_size; + } else { + break; + } + } + + // Then collect the actual rules. + loop { + let (rule_size, (rule_name, rule_data, rule_regex, appending_p)) = + parse_one_rule(s, basic_indentation)?; + // println!("rule name = {rule_name}"); + // println!("regexp = {rule_regexp:#?}"); + + let rule_transform = |old_label: usize| -> Result { + if let Some(old_value) = rule_data.get(old_label) { + match old_value { + IntermediateConstruct::Terminal(name) => { + if let Some(index) = terminals_map.get(name) { + Ok(TNT::Ter(*index)) + } else { + dbg!(); + Err(Error::UnknownTerminal(name.clone())) + } + } + IntermediateConstruct::Nonterminal(name) => { + if let Some(index) = nonterminals_map.get(name) { + Ok(TNT::Non(*index)) + } else { + dbg!(); + Err(Error::UnknownNonterminal(name.clone())) + } + } + IntermediateConstruct::Range(_, _) => { + dbg!(); + Err(Error::UnsupportedOperation) + } + } + } else { + dbg!(); + Err(Error::RegexIndexOutOfBounds(old_label, rule_data.len())) + } + }; + + let transformed_rule = rule_regex.maps_to(rule_transform)?; + + if rule_size == 0 { + break; + } + + if !appending_p { + nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len()); + nonterminals_vec.push(rule_name.clone()); + + all_rules.insert(rule_name, transformed_rule); + } else { + all_rules + .get_mut(&rule_name) + .unwrap() + .append(transformed_rule); + } + + if rule_size > 0 && rule_size < s.len() { + s = &s[rule_size..]; + current_skipped_index += rule_size; + } else { + break; + } + } + + let rules_vec: Vec = nonterminals_vec + .iter() + .map(|name| { + all_rules + .get(name) + .map(|regex| Rule::new(regex.clone())) + .ok_or(Error::UnknownNonterminal(name.clone())) + }) + .collect::>()?; + + let terminals: Vec = terminals_vec.into_iter().map(Terminal::new).collect(); + let nonterminals: Vec = + nonterminals_vec.into_iter().map(Nonterminal).collect(); + + Ok(Self::new(terminals, nonterminals, rules_vec)) + } +} + +#[cfg(test)] +mod tests; diff --git a/grammar/src/abnf/tests.rs b/grammar/src/abnf/tests.rs new file mode 100644 index 0000000..67dfaaa --- /dev/null +++ b/grammar/src/abnf/tests.rs @@ -0,0 +1,291 @@ +//! This module contains unit tests for the abnf grammar parser. + +use super::*; + +#[test] +fn test_skip_by() -> Result<(), Box> { + let string = std::iter::repeat('a') + .take(10) + .chain(std::iter::repeat('哈').take(10)) + .chain(" \t\n".chars()) + .collect::(); + + let mut s: &str = string.as_str(); + + dbg!(&s); + + let alpha_len = skip_by(s, is_alpha); + + assert_eq!(alpha_len, 10); + + s = &s[alpha_len..]; + + dbg!(&s); + + let not_newline_len = skip_by(s, is_not_newline); + + // 哈 takes 3 bytes, so the count should be 30 plus two bytes for + // the space and the tab. + assert_eq!(not_newline_len, 32); + + Ok(()) +} + +#[test] +fn test_skip_c_nl() -> Result<(), Box> { + let string = std::iter::repeat(' ') + .take(10) + .chain(std::iter::repeat('哈').take(10)) + .chain(" \t\n".chars()) + .collect::(); + + let s = string.as_str(); + + dbg!(s); + + let c_nl = skip_c_nl(s); + + assert!(c_nl.is_none()); + + let new_string = std::iter::once(';') + .chain(string.chars()) + .chain(string.chars()) + .collect::(); + + let s = new_string.as_str(); + + dbg!(s); + + let c_nl = skip_c_nl(s); + + assert_eq!(c_nl, Some(string.len() + 1)); + + let c_nl = skip_c_nl("\n"); + + assert_eq!(c_nl, Some(1)); + + Ok(()) +} + +#[test] +fn test_skip_c_wsp() -> Result<(), Box> { + let mut string: String = std::iter::repeat(' ') + .take(10) + .chain(std::iter::repeat('\t').take(10)) + .collect(); + + string.extend( + std::iter::once(';') + .chain( + std::iter::repeat(std::iter::once('\n').chain(std::iter::once(';'))) + .take(10) + .flatten(), + ) + .chain(std::iter::once('\n')) + .chain(std::iter::repeat('a').take(10)), + ); + + let string = string; + + dbg!(&string); + + let skip_c_wsp_size = skip_c_wsp(&string); + + assert_eq!(skip_c_wsp_size, string.len() - 10); + + Ok(()) +} + +#[test] +fn test_parse_repetition() -> Result<(), Box> { + let strs = ["*A", "0*A", "*0A", "2*A", "*2A", "5*10A", "A"]; + + let results = strs.map(parse_repetition_spec); + + dbg!(&results); + + let answers = [ + Some((1, RepetitionSpec::Star)), + Some((2, RepetitionSpec::Min(0))), + Some((2, RepetitionSpec::Max(0))), + Some((2, RepetitionSpec::Min(2))), + Some((2, RepetitionSpec::Max(2))), + Some((4, RepetitionSpec::MinMax(5, 10))), + None, + ]; + + for (result, answer) in results.iter().zip(answers.iter()) { + assert_eq!(result, answer); + } + + Ok(()) +} + +use super::Error; + +#[test] +fn test_parse_rulename() -> Result<(), Box> { + let s = "NONTERMINAL"; + + assert!(matches!(parse_rulename(s), Err(Error::HangingRuleName(0)))); + + let s = " NONTERMINAL = "; + + assert!(matches!(parse_rulename(s), Ok(0))); + + let s = "NONTERMINAL = "; + + assert!(matches!(parse_rulename(s), Ok(11))); + + Ok(()) +} + +#[test] +fn test_parse_numeric() -> Result<(), Box> { + let s = "haha"; + + assert!(matches!( + parse_numeric(s, Radix::Binary), + Err(Error::InvalidDigit(0)) + )); + + let s = "124"; + + assert!(matches!( + parse_numeric(s, Radix::Binary), + Err(Error::InvalidCharacter(1)) + )); + + let s = "32"; + + assert!(matches!( + parse_numeric(s, Radix::Decimal), + Ok((2, IntermediateConstruct::Terminal(string))) if string == " ".to_string() + )); + + let s = "32.65.66.66.65"; + + assert!(matches!( + parse_numeric(s, Radix::Decimal), + Ok((14, IntermediateConstruct::Terminal(string))) if string == " ABBA".to_string() + )); + + let s = "32-"; + + assert!(matches!( + parse_numeric(s, Radix::Decimal), + Err(Error::HangingNumeric(2)) + )); + + let s = "32-124"; + + assert!(matches!( + parse_numeric(s, Radix::Decimal), + Ok((6, IntermediateConstruct::Range(from, to))) if from == ' ' && to == '|', + )); + + Ok(()) +} + +#[test] +fn test_parse_alternation() -> Result<(), Box> { + let s = "0*10nonterminal\n"; + + let (len, alt) = parse_alternation(s, 0)?; + + assert_eq!(len, s.len()); + + let regex = alt.regex; + + println!("regex = {regex}"); + + assert_eq!( + format!("{regex}"), + "((0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?)?)" + ); + + let s = "0*10nonterminal ( *nonterminal nonterminal / [ nonterminal ] 1*5nonterminal )\n"; + + let (len, alt) = parse_alternation(s, 0)?; + + assert_eq!(len, s.len()); + + let regex = alt.regex; + + println!("regex = {regex}"); + + assert_eq!( + format!("{regex}"), + "((0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?)?((1)*2|((3))?4(4(4(4(4)?)?)?)?))" + ); + + let s = "5nonterminal\n"; + + let (len, alt) = parse_alternation(s, 0)?; + + assert_eq!(len, s.len()); + + let regex = alt.regex; + + println!("regex = {regex}"); + + assert_eq!(format!("{regex}"), "(00000)"); + + Ok(()) +} + +#[test] +fn test_parse_onerule() -> Result<(), Box> { + let s = "start = 1*10nonterminal [ nonterminal ] / 1*nonterminal\n"; + + let (len, (name, data, regex, extra_p)) = parse_one_rule(s, 0)?; + + println!("name = {name}"); + print!("data = "); + + for (index, non_terminal) in data.iter().enumerate() { + print!("({index}, {non_terminal})"); + } + + println!(); + + println!("regex = {regex}"); + + println!("extra_p = {extra_p}"); + + assert_eq!(len, s.len()); + + assert_eq!(name, "start".to_string()); + + let answers = ["nnonterminal", "nnonterminal", "nnonterminal"]; + + assert_eq!(data.len(), answers.len()); + + for (tnt, answer) in data.into_iter().zip(answers) { + assert_eq!(format!("{tnt}"), answer.to_string()); + } + + assert_eq!( + format!("{regex}"), + "(0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?((1))?|(2)+)".to_string() + ); + + assert!(!extra_p); + + Ok(()) +} + +#[test] +fn test_parse_grammar() -> Result<(), Box> { + let grammar_file_name = "abnf grammars/test.abnf"; + + let file_content = std::fs::read_to_string(grammar_file_name)?; + + let grammar: Grammar = file_content.parse()?; + + println!("print grammar:"); + + println!("{grammar}"); + + Ok(()) +} diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs index 11cb161..ea1299f 100644 --- a/grammar/src/lib.rs +++ b/grammar/src/lib.rs @@ -93,7 +93,7 @@ impl Display for TNT { } /// Errors related to grammar operations. -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Clone)] #[non_exhaustive] pub enum Error { /// The operation requires the grammar to be after a certain @@ -101,6 +101,8 @@ pub enum Error { WrongState(GrammarState, GrammarState), /// The first component is the index, and the second the bound. IndexOutOfBounds(usize, usize), + /// The given name of a terminal or a non-terminal is unknown. + UnknownTNTName(String), /// Fail to build the N-th regular expression, due to the /// ParseError. BuildFail(usize, ParseError), @@ -123,6 +125,11 @@ impl Display for Error { "Failed to build the {n}-th regular expression due to error: {pe}" ), Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), + Error::UnknownTNTName(name) => write!( + f, + "the name {name} is unknown \ + for a terminal or a non-terminal." + ), Error::WrongState(current, threshold) => { write!(f, "require state {threshold}, but in state {current}") } @@ -158,6 +165,12 @@ impl Rule { pub fn len(&self) -> usize { self.regex.len() } + + /// Wrap a regular expression into a rule. + #[inline] + pub fn new(regex: DefaultRegex) -> Self { + Self { regex } + } } /// The state of Grammar. @@ -190,6 +203,37 @@ impl Display for GrammarState { } } +/// This enum represents the name of either a terminal or a +/// non-terminal. +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub enum TNTName { + /// The name of a terminal. + Ter(String), + /// The name of a non-terminal. + Non(String), +} + +impl TNTName { + /// Construct the name of a terminal or a non-terminal. + #[inline] + pub fn new(name: String, terminal_p: bool) -> Self { + if terminal_p { + Self::Ter(name) + } else { + Self::Non(name) + } + } + + /// Return the underlying name. + #[inline] + pub fn name(&self) -> &str { + match self { + TNTName::Ter(name) => name, + TNTName::Non(name) => name, + } + } +} + /// The type of a grammar. #[derive(Debug, Clone, Default)] pub struct Grammar { @@ -197,6 +241,8 @@ pub struct Grammar { ter: Vec, /// A list of non-terminals. non: Vec, + /// A map from the names of terminals or non-terminals to TNT. + tnt_map: HashMap, /// A list of rules. /// /// The length of the list must match that of the list of @@ -286,6 +332,22 @@ impl Grammar { let expansion_map = Default::default(); let reduction_map = Default::default(); + let mut tnt_map: HashMap = Default::default(); + + for (index, ter_element) in ter.iter().enumerate() { + tnt_map.insert( + TNTName::new(ter_element.name().to_string(), true), + TNT::Ter(index), + ); + } + + for (index, non_element) in non.iter().enumerate() { + tnt_map.insert( + TNTName::new(non_element.name().to_string(), false), + TNT::Non(index), + ); + } + // NOTE: We cannot calculate accumulators here, as we want the // accumulators of the regular expression of the left-closure, // not of the original one. @@ -294,6 +356,7 @@ impl Grammar { Self { ter, non, + tnt_map, rules, firsts, first_nodes, @@ -304,6 +367,16 @@ impl Grammar { } } + /// Convert from the name of a terminal or a non-terminal to a + /// struct TNT. + #[inline] + pub fn name_to_tnt(&self, name: &TNTName) -> Result { + self.tnt_map + .get(name) + .copied() + .ok_or_else(|| Error::UnknownTNTName(name.name().to_string())) + } + /// Return the name of a terminal or a non-terminal. pub fn name_of_tnt(&self, tnt: TNT) -> Result { match tnt { @@ -313,6 +386,14 @@ impl Grammar { .get(t) .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))? .name() + .chars() + .map(|c| if crate::abnf::is_v_char(c) { + c.to_string() + } else { + format!("{:#x}", c as usize) + }) + .collect::>() + .join("") )), TNT::Non(n) => Ok(format!( "N{}", @@ -822,7 +903,7 @@ impl Display for Grammar { f, "{}", rule.regex.to_string_with(|tnt| format!( - "({})", + " {} ", self.name_of_tnt(tnt) .unwrap_or_else(|_| format!("Unknown {tnt:?}")) ))? -- cgit v1.2.3-18-g5258