diff options
author | JSDurand <mmemmew@gmail.com> | 2023-07-08 12:30:21 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-07-08 12:31:13 +0800 |
commit | 9a317e56f8a6126583f7d0c431bf878d9b1fe7b1 (patch) | |
tree | 7bb6004196b38446a5ab0cb3a0ab642d35f113e9 /grammar/src/abnf/mod.rs | |
parent | 691f969eb104fa3d4c2a1667693fd0382eb9d6b5 (diff) |
Finished the Emacs binding.
Now the binding part is finished.
What remains is a bug encountered when planting a fragment to the
forest which intersects a packed node, which would lead to invalid
forests. This will also cause problem when planting a packed
fragment, but until now my testing grammars do not produce packed
fragments, so this problem is not encountered yet.
I am still figuring out efficient ways to solve this problem.
Diffstat (limited to 'grammar/src/abnf/mod.rs')
-rw-r--r-- | grammar/src/abnf/mod.rs | 2536 |
1 files changed, 2536 insertions, 0 deletions
diff --git a/grammar/src/abnf/mod.rs b/grammar/src/abnf/mod.rs new file mode 100644 index 0000000..f825bef --- /dev/null +++ b/grammar/src/abnf/mod.rs @@ -0,0 +1,2536 @@ +//! This file implements the function to read a grammar from a string, +//! in the [augmented Backus Naur +//! form](https://en.wikipedia.org/wiki/Augmented_Backus–Naur_form). +//! See [RFC5234](https://www.rfc-editor.org/rfc/rfc5234) for its +//! specifications. +//! +//! In fact, the rules of ABNF are considered to be too strict. For +//! example, only ASCII characters are allowed in the format, even +//! within comments. I think any character should be allowed within +//! comments, so the implementation here does that. + +#![allow(dead_code)] + +use super::*; + +/// The type of errors encountered during parsing a grammar in the +/// ABNF format. +#[derive(Debug)] +#[non_exhaustive] +pub enum Error { + /// A rule name without rule body. + HangingRuleName(usize), + /// The minimum is greater than maximum? + MinGreaterThanMax(usize), + /// A digit is expected, but is not encountered. + InvalidDigit(usize), + /// A character that is not allowed. + InvalidCharacter(usize), + /// A numeric value is not completed. + HangingNumeric(usize), + /// A repetition is hanging. + HangingRepetition(usize), + /// A double quote is hanging. + HangingDQuote(usize), + /// A percentage sign is hanging. + HangingPercent(usize), + /// An equal sign is hanging. + HangingEqualsSign(usize), + /// An operation is unsupported. + UnsupportedOperation, + /// Encounters an unknown node when constructing the regular + /// expression. + RegexUnknownNode(usize), + /// An index is out of bounds when constructing the regular + /// expression. + RegexIndexOutOfBounds(usize, usize), + /// A node is duplicated when constructing the regular expression. + RegexDuplicatedNode(usize), + /// An edge is duplicated when constructing the regular + /// expression. + RegexDuplicatedEdge(usize, usize), + /// The constructed regular expression graph has no root. + RegexNoRoot, + /// The constructed regular expression graph has cycles. + RegexCycle, + /// The stack illegally becomes empty when constructing a regular + /// expression. + RegexEmptyStack(usize), + /// The action stack illegally becomes empty when constructing a + /// regular expression. + RegexEmptyActionStack(usize), + /// An unbalanced parenthesis is encountered when constructing a + /// regular expression. + RegexUnbalancedParen(usize), + /// An invalid repetition specification is encountered when + /// constructing a regular expression. + RegexInvalidRepeat(usize), + /// An invalid comment character is encountered. + RegexInvalidComment(usize), + /// An invalid numeric value is encoutnered. + RegexInvalidNumeric(usize), + /// A rule should have an equal sign. + MissingEqualSign(usize), + /// An unimplemented feature is required. + Unimplemented(usize), + /// Input-Output error. + IOError(std::io::Error), + /// The grammar is empty. + EmptyGrammar, + /// Found an appending rule associated to a non-existent rule. + AppendToNothing(usize), + /// Found a rule declaration associated to an existing rule. + CreateAgain(usize), + /// This terminal is not found. + UnknownTerminal(String), + /// This non-terminal is not found. + UnknownNonterminal(String), + /// Something invalid has occurred. + Invalid, +} + +impl std::error::Error for Error {} + +impl From<std::io::Error> for Error { + fn from(e: std::io::Error) -> Self { + Self::IOError(e) + } +} + +impl From<graph::error::Error> for Error { + fn from(value: graph::error::Error) -> Self { + match value { + graph::error::Error::IndexOutOfBounds(index, bound) => { + Error::RegexIndexOutOfBounds(index, bound) + } + graph::error::Error::DuplicatedNode(n) => Error::RegexDuplicatedNode(n), + graph::error::Error::DuplicatedEdge(from, to) => Error::RegexDuplicatedEdge(from, to), + _ => Error::Invalid, + } + } +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::HangingRuleName(pos) => write!(f, "a rule name without body at {pos}"), + Error::MinGreaterThanMax(pos) => write!( + f, + "minimum should not be strictly greater than the maximum at {pos}." + ), + Error::InvalidDigit(pos) => { + write!(f, "a digit is expected at {pos}, but is not found.") + } + Error::InvalidCharacter(pos) => write!(f, "the character at {pos} is not allowed"), + Error::HangingNumeric(pos) => write!(f, "a numeric value is hanging at {pos}"), + Error::HangingRepetition(pos) => write!(f, "a repetition is hanging at {pos}"), + Error::HangingDQuote(pos) => write!(f, "a double quote is hanging at {pos}"), + Error::HangingPercent(pos) => write!(f, "a percentage sign is hanging at {pos}"), + Error::HangingEqualsSign(pos) => write!(f, "an equal sign is hanging at {pos}"), + Error::UnsupportedOperation => write!(f, "an unsupported operation is encountered"), + Error::RegexUnknownNode(n) => write!(f, "an unknown node {n} is encountered \ + when constructing the regular expression."), + Error::RegexIndexOutOfBounds(index, bound) => write!(f, "an index {index} is out of \ + bound {bound} when constructing the regular expression."), + Error::RegexDuplicatedNode(n) => write!(f, "a node {n} is duplicated \ + when constructing the regular expression."), + Error::RegexDuplicatedEdge(from, to) => write!(f, "an edge from {from} to {to} is duplicated \ + when constructing the regular expression."), + Error::RegexNoRoot => write!(f, "the constructed regular expression has no root."), + Error::RegexCycle => write!(f, "the constructed regular expression has cycles."), + Error::RegexEmptyStack(pos) => write!(f, "the stack illegally becomes empty at {pos} \ + when constructing a regular expression."), + Error::RegexEmptyActionStack(pos) => write!(f, "the action stack illegally becomes empty at {pos} \ + when constructing a regular expression."), + Error::RegexUnbalancedParen(pos) => write!(f, "An unbalanced parenthesis is encountered at {pos} \ + when constructing a regular expression."), + Error::RegexInvalidRepeat(pos) => write!(f, "An invalid repetition specification is encountered \ + at {pos} when constructing a regular expression."), + Error::RegexInvalidComment(pos) => write!(f, "An invalid character is encountered in a comment \ + at {pos}"), + Error::RegexInvalidNumeric(pos) => write!(f, "An invalid numeric value is encoutnered at {pos}."), + Error::MissingEqualSign(pos) => write!(f, "A rule is missing an equal sign at {pos}"), + Error::Unimplemented(pos) => write!(f, "An unimplemented feature is required at {pos}."), + Error::IOError(e) => write!(f, "An IO error: {e}"), + Error::EmptyGrammar => write!(f, "the grammar is empty"), + Error::AppendToNothing(pos) => write!(f, "the rule at {pos} is appending to a non-existing rule."), + Error::CreateAgain(pos) => write!(f, "the rule at {pos} is creating a duplicated rule."), + Error::UnknownTerminal(name) => write!(f, "the terminal {name} is not found."), + Error::UnknownNonterminal(name) => write!(f, "the non-terminal {name} is not found."), + Error::Invalid => write!(f, "invalid"), + } + } +} + +/// Return the number of bytes taken by the characters for which the +/// function `f` returns `true`. +fn skip_by(s: &str, f: impl Fn(char) -> bool) -> usize { + for (index, c) in s.char_indices() { + if !f(c) { + return index; + } + } + + s.len() +} + +mod boolean_fns; + +pub(crate) use boolean_fns::*; + +fn skip_c_nl(s: &str) -> Option<usize> { + if s.is_empty() { + return Some(0); + } + + let s_len = s.len(); + + match s.chars().next().unwrap() { + ';' => { + let after_not_newline = skip_by(&s[1..], is_not_newline); + + if 1 + after_not_newline >= s_len { + Some(s_len) + } else { + let s = &s[1 + after_not_newline..]; + + match s.chars().next().unwrap() { + '\n' | '\r' => Some(after_not_newline + 2), + _ => { + // We skipped anything not newline, so this + // character must be newline. + unreachable!(); + } + } + } + } + '\n' | '\r' => Some(1), + _ => None, + } +} + +fn skip_c_wsp(s: &str) -> usize { + let mut currently_skipped = 0usize; + let mut s = s; + + let mut continue_skipping = true; + + 'skipping_loop: while continue_skipping { + continue_skipping = false; + + let skip_wsp = skip_by(s, is_wsp); + + if skip_wsp > 0 { + if skip_wsp >= s.len() { + return currently_skipped + s.len(); + } + + continue_skipping = true; + s = &s[skip_wsp..]; + currently_skipped += skip_wsp; + + continue 'skipping_loop; + } + + if let Some(skip_c_nl_size) = skip_c_nl(s) { + if skip_c_nl_size >= s.len() { + return currently_skipped + s.len(); + } + + continue_skipping = true; + + s = &s[skip_c_nl_size..]; + currently_skipped += skip_c_nl_size; + + let skip_wsp_after_c_nl = skip_by(s, is_wsp); + + s = &s[skip_wsp_after_c_nl..]; + currently_skipped += skip_wsp_after_c_nl; + } + } + + currently_skipped +} + +#[derive(Debug, Eq, PartialEq)] +enum RepetitionSpec { + Star, + Equal(usize), + Min(usize), + Max(usize), + MinMax(usize, usize), +} + +#[derive(Debug)] +enum RepetitionAction { + /// Ignore the following element. + Ignore, + /// The following is put in a star. + Star, + /// The following is put in an optional construct. + Optional, + /// The following is put in a plus construct. + Plus, + /// Copy the following some number of times. + Copy(usize), + /// Copy the following some number of times, and then add a plus + /// of that same element. + CopyPlus(usize), + /// Copy the following some number of times, and then add another + /// number of *nested* optional construts of that same element. + /// + /// # Why nested? + /// + /// The optional constructs are nested here as a sequence of + /// optional constructs is exponentially more ambiguous than the + /// nested equivalent: "a?a?a?" is more ambiguous than + /// "(a(a(a)?)?)?". For example, the input "a" can have three + /// derivations according to "a?a?a?", but has only one derivation + /// according to "(a(a(a)?)?)?". + CopyOptional(usize, usize), +} + +impl TryFrom<Option<RepetitionSpec>> for RepetitionAction { + type Error = Error; + + fn try_from(spec: Option<RepetitionSpec>) -> Result<Self, Self::Error> { + if let Some(r) = spec { + match r { + RepetitionSpec::Star => Ok(Self::Star), + RepetitionSpec::Equal(0) => Ok(Self::Ignore), + RepetitionSpec::Equal(n) => Ok(Self::Copy(n)), + RepetitionSpec::Min(0) => Ok(Self::Star), + RepetitionSpec::Min(1) => Ok(Self::Plus), + RepetitionSpec::Min(n) => Ok(Self::CopyPlus(n - 1)), + RepetitionSpec::Max(0) => Ok(Self::Ignore), + RepetitionSpec::Max(1) => Ok(Self::Optional), + RepetitionSpec::Max(n) => Ok(Self::CopyOptional(0, n)), + RepetitionSpec::MinMax(0, 0) => Ok(Self::Ignore), + RepetitionSpec::MinMax(0, 1) => Ok(Self::Optional), + RepetitionSpec::MinMax(n, m) if n > m => Err(Error::MinGreaterThanMax(0)), + RepetitionSpec::MinMax(n, m) if n == m => Ok(Self::Copy(n)), + RepetitionSpec::MinMax(n, m) => Ok(Self::CopyOptional(n, m - n)), + } + } else { + Ok(Self::Copy(1)) + } + } +} + +fn parse_repetition_spec(s: &str) -> Option<(usize, RepetitionSpec)> { + let mut s = s; + + let digit_size = skip_by(s, is_decimal_digit); + + let mut min: Option<usize> = None; + let mut max: Option<usize> = None; + + if digit_size != 0 { + min = Some(s[..digit_size].parse::<usize>().unwrap()); + } + + let mut current_index = digit_size; + + if digit_size == s.len() { + if let Some(m) = min { + return Some((digit_size, RepetitionSpec::Equal(m))); + } else { + return None; + }; + } + + s = &s[current_index..]; + + if s.starts_with('*') { + if s.len() == 1 { + if let Some(m) = min { + return Some((current_index + 1, RepetitionSpec::Min(m))); + } else { + return Some((current_index + 1, RepetitionSpec::Star)); + } + } + + s = &s[1..]; + current_index += 1; + + let digit_size = skip_by(s, is_decimal_digit); + + if digit_size != 0 { + max = Some(s[..digit_size].parse::<usize>().unwrap()); + } + + current_index += digit_size; + + match (min, max) { + (Some(mi), Some(ma)) => Some((current_index, RepetitionSpec::MinMax(mi, ma))), + (Some(m), None) => Some((current_index, RepetitionSpec::Min(m))), + (None, Some(m)) => Some((current_index, RepetitionSpec::Max(m))), + _ => Some((current_index, RepetitionSpec::Star)), + } + } else { + min.map(|m| (current_index, RepetitionSpec::Equal(m))) + } +} + +fn parse_rulename(s: &str) -> Result<usize, Error> { + let first_char = s.chars().next().unwrap(); + + if !is_alpha(first_char) { + return Ok(0); + } + + let rulename_end = skip_by(s, is_alpha_or_digit_or_hyphen); + + if rulename_end >= s.len() { + // dbg!(); + return Err(Error::HangingRuleName(0)); + } + + Ok(rulename_end) +} + +#[derive(Debug, Eq, PartialEq, Hash)] +enum IntermediateConstruct { + Terminal(String), + Nonterminal(String), + Range(char, char), +} + +impl Display for IntermediateConstruct { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + IntermediateConstruct::Terminal(t) => write!(f, "t{t}"), + IntermediateConstruct::Nonterminal(n) => write!(f, "n{n}"), + IntermediateConstruct::Range(min, max) => write!(f, "[{min}-{max}]"), + } + } +} + +#[derive(Debug, Eq, PartialEq, Copy, Clone)] +enum Radix { + Binary, + Decimal, + Hexadecimal, +} + +fn parse_numeric(s: &str, radix: Radix) -> Result<(usize, IntermediateConstruct), Error> { + // first read a number + + let is_radix_digit = match radix { + Radix::Binary => is_binary_digit, + Radix::Decimal => is_decimal_digit, + Radix::Hexadecimal => is_hexadecimal_digit, + }; + + let radix = match radix { + Radix::Binary => 2, + Radix::Decimal => 10, + Radix::Hexadecimal => 16, + }; + + let mut s = s; + let mut current_index = 0; + + let first_digit_size = skip_by(s, is_radix_digit); + + if first_digit_size == 0 { + // dbg!(); + return Err(Error::InvalidDigit(current_index)); + } + + let first_number = + match std::char::from_u32(u32::from_str_radix(&s[..first_digit_size], radix).unwrap()) { + Some(c) => c, + None => { + // dbg!(); + return Err(Error::InvalidCharacter(current_index)); + } + }; + + if first_digit_size >= s.len() { + return Ok(( + current_index + s.len(), + IntermediateConstruct::Terminal(first_number.to_string()), + )); + } + + s = &s[first_digit_size..]; + + current_index += first_digit_size; + + // then read a dot or a dash + + match s.chars().next().unwrap() { + '.' => { + s = &s[1..]; + current_index += 1; + + let mut stop_reading = false; + let mut chars_vec = vec![first_number]; + + while !stop_reading { + if s.is_empty() { + return Err(Error::HangingNumeric(current_index - 1)); + } + + let digit_size = skip_by(s, is_radix_digit); + + if digit_size == 0 { + // dbg!(); + return Err(Error::InvalidDigit(current_index)); + } + + let a_number = + std::char::from_u32(u32::from_str_radix(&s[..digit_size], radix).unwrap()) + .ok_or(Error::InvalidCharacter(current_index))?; + + chars_vec.push(a_number); + + s = &s[digit_size..]; + current_index += digit_size; + + if !s.starts_with('.') { + stop_reading = true; + // println!( + // "digit_size = {digit_size} a number = {a_number}, s = {}", + // &s[..10] + // ); + } else { + s = &s[1..]; + current_index += 1; + } + } + + Ok(( + current_index, + IntermediateConstruct::Terminal(chars_vec.into_iter().collect()), + )) + } + '-' => { + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + return Err(Error::HangingNumeric(current_index - 1)); + } + + let second_digit_size = skip_by(s, is_radix_digit); + + if second_digit_size == 0 { + println!( + "s = {}", + s.chars() + .take(if s.len() >= 50 { 50 } else { s.len() }) + .collect::<String>() + ); + return Err(Error::InvalidDigit(current_index)); + } + + let _second_number = + std::char::from_u32(u32::from_str_radix(&s[..second_digit_size], radix).unwrap()) + .ok_or(Error::InvalidCharacter(current_index))?; + + Err(Error::Unimplemented(current_index - 1)) + + // Ok(( + // current_index + second_digit_size, + // IntermediateConstruct::Range(first_number, second_number), + // )) + } + c if is_wsp(c) || c == '\n' || c == '\r' || c == ';' => Ok(( + current_index, + IntermediateConstruct::Terminal(first_number.to_string()), + )), + _ => Err(Error::InvalidCharacter(current_index)), + } +} + +#[derive(Debug)] +struct AlternationConstruct { + intermediates: Vec<IntermediateConstruct>, + regex: DefaultRegex<usize>, +} + +fn parse_alternation(s: &str, indentation: usize) -> Result<(usize, AlternationConstruct), Error> { + let mut stack: Vec<usize> = vec![0, 1]; + let mut intermediates: Vec<IntermediateConstruct> = Vec::new(); + let mut types: Vec<RegexType<usize>> = vec![RegexType::Or, RegexType::Empty]; + let mut edges: Vec<Vec<usize>> = vec![vec![1], Vec::new()]; + + fn error_conversion(error: nfa::error::Error) -> Error { + match error { + nfa::error::Error::UnknownNode(n) => Error::RegexUnknownNode(n), + nfa::error::Error::UnsupportedOperation => Error::UnsupportedOperation, + nfa::error::Error::NoRoot => Error::RegexNoRoot, + nfa::error::Error::Graph(ge) => ge.into(), + nfa::error::Error::Cycle => Error::RegexCycle, + _ => Error::Invalid, + } + } + + let mut s = s; + let mut current_index = 0usize; + + let mut nonterminal_nodes: HashSet<usize> = HashSet::new(); + + if s.is_empty() { + let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?; + + return Ok(( + current_index, + AlternationConstruct { + intermediates, + regex, + }, + )); + } + + // A vector of tuples to record some information for each level of + // nesting. + // + // A tuple (len, action) means that the stack grows by LEN many + // tokens at this level, and the action ACTION is pending. + let mut repetition_action_stack: Vec<(usize, Option<RepetitionAction>)> = vec![]; + + while !stack.is_empty() { + // First read repetition specifications. + let mut repetition_spec = None; + + if let Some((spec_size, spec)) = parse_repetition_spec(s) { + if spec_size >= s.len() { + return Err(Error::HangingRepetition(current_index)); + } + + s = &s[spec_size..]; + current_index += spec_size; + + repetition_spec = Some(spec); + } + + let repetition_spec = repetition_spec; + + // if a rulename + match parse_rulename(s) { + Ok(0) => {} + Ok(size) => { + let name = &s[..size].to_string(); + + intermediates.push(IntermediateConstruct::Nonterminal(name.clone())); + + let nt_index = intermediates.len() - 1; + + let action = repetition_spec.try_into()?; + // println!("rep_action = {action:?}"); + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + match action { + RepetitionAction::Ignore => {} + RepetitionAction::Star => { + types.push(RegexType::Kleene); + types.push(RegexType::Lit(nt_index)); + + edges.push(vec![types.len() - 1]); + edges.push(Vec::new()); + + nonterminal_nodes.insert(types.len() - 1); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::Optional => { + types.append(&mut vec![RegexType::Optional]); + types.push(RegexType::Lit(nt_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + nonterminal_nodes.insert(types.len() - 1); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::Plus => { + types.append(&mut vec![RegexType::Plus]); + types.push(RegexType::Lit(nt_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + nonterminal_nodes.insert(types.len() - 1); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::Copy(n) => { + let old_types_len = types.len(); + + nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| old_types_len + i)); + + types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + // for _ in 0..n { + // types.push(RegexType::Lit(nt_index)); + // edges.push(vec![]); + + // nonterminal_nodes.insert(types.len() - 1); + + // edges[stack_last].push(types.len() - 1); + // } + } + RepetitionAction::CopyPlus(n) => { + let old_types_len = types.len(); + + nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| old_types_len + i)); + + types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + // for _ in 0..n { + // types.push(RegexType::Lit(nt_index)); + // edges.push(vec![]); + + // edges[stack_last].push(types.len() - 1); + // } + + types.append(&mut vec![RegexType::Plus]); + types.push(RegexType::Lit(nt_index)); + + nonterminal_nodes.insert(types.len() - 1); + + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::CopyOptional(n, m) => { + let old_types_len = types.len(); + + nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| old_types_len + i)); + + types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + // for _ in 0..n { + // types.push(RegexType::Lit(nt_index)); + // edges.push(vec![]); + + // nonterminal_nodes.insert(types.len() - 1); + + // edges[stack_last].push(types.len() - 1); + // } + + let types_len = types.len(); + + types.extend( + std::iter::repeat([RegexType::Optional, RegexType::Lit(nt_index)]) + .take(m) + .flatten(), + ); + + edges.extend((0..(m - 1)).flat_map(|i| { + [ + vec![types_len + 1 + 2 * i, types_len + 2 + 2 * i], + Vec::new(), + ] + })); + + edges.extend([vec![types.len() - 1], Vec::new()]); + + nonterminal_nodes.extend((0..m).map(|i| types_len + 2 * i + 1)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types_len); + + // for _ in 0..m { + // types.extend([RegexType::Optional, RegexType::Lit(nt_index)]); + // edges.extend([vec![types.len() - 1], Vec::new()]); + + // nonterminal_nodes.insert(types.len() - 1); + + // edges[stack_last].push(types.len() - 2); + // } + } + } + + s = &s[size..]; + current_index += size; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + Err(_) => { + intermediates.push(IntermediateConstruct::Nonterminal(s.to_owned())); + + types.push(RegexType::Lit(intermediates.len() - 1)); + edges.push(vec![]); + + let stack_last = stack + .last() + .copied() + .ok_or(Error::RegexEmptyStack(current_index))?; + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 1); + nonterminal_nodes.insert(types.len() - 1); + + break; + } + } + + // match against the first charcter + + let first_char = s.chars().next().unwrap(); + + match first_char { + '(' => { + // group + + s = &s[1..]; + current_index += 1; + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + match action { + RepetitionAction::Star => { + types.extend([RegexType::Kleene, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + repetition_action_stack.push((3, None)); + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + } + RepetitionAction::Optional => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + repetition_action_stack.push((3, None)); + } + RepetitionAction::Plus => { + types.extend([RegexType::Plus, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + repetition_action_stack.push((3, None)); + } + RepetitionAction::Copy(1) => { + types.extend([RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + + stack.extend([types.len() - 2, types.len() - 1]); + repetition_action_stack.push((2, None)); + } + RepetitionAction::CopyOptional(0, _) => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + + repetition_action_stack.push((3, Some(action))); + } + _ => { + types.extend([RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + + stack.extend([types.len() - 2, types.len() - 1]); + + repetition_action_stack.push((2, Some(action))); + } + } + + s = &s[1..]; + current_index += 1; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + return Err(Error::RegexUnbalancedParen(current_index)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + ')' => { + // end group + + s = &s[1..]; + current_index += 1; + + if repetition_spec.is_some() { + return Err(Error::RegexInvalidRepeat(current_index)); + } + + let (stack_growth, repetition_action) = repetition_action_stack + .pop() + .ok_or(Error::RegexEmptyActionStack(current_index))?; + + match repetition_action { + Some(action) => { + if stack.len() <= stack_growth + 1 { + dbg!(); + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth + 1); + + // this is not going to fail because of the previous check + let stack_last = stack.pop().unwrap(); + let stack_second_last = stack.iter().copied().last().unwrap(); + + match action { + RepetitionAction::Ignore => { + edges[stack_second_last].pop(); + + types.truncate(stack_last); + edges.truncate(stack_last); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyPlus(n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_second_last].push(types.len() - 1); + + // if stack_second_last == 3 && nodes.len() == 4 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyOptional(n, m) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + edges[stack_second_last].push(types.len()); + + let nodes_cloned_len = nodes_cloned.len(); + + for _ in 0..(m - 1) { + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.push(RegexType::Optional); + edges.push(vec![types.len(), types.len() + nodes_cloned_len]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Star + | RepetitionAction::Optional + | RepetitionAction::Plus => { + // this will not happen here + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + None => { + if stack.len() <= stack_growth { + dbg!(); + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + '[' => { + // option + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + match action { + RepetitionAction::Ignore => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + edges[stack_last].push(types.len() - 3); + + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((3, Some(action))); + } + RepetitionAction::Optional => { + types.extend([ + RegexType::Optional, + RegexType::Optional, + RegexType::Or, + RegexType::Empty, + ]); + edges.extend([ + vec![types.len() - 3], + vec![types.len() - 2], + vec![types.len() - 1], + Vec::new(), + ]); + + edges[stack_last].push(types.len() - 4); + + stack.push(types.len() - 4); + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((4, None)); + } + RepetitionAction::Star => { + types.extend([ + RegexType::Kleene, + RegexType::Optional, + RegexType::Or, + RegexType::Empty, + ]); + edges.extend([ + vec![types.len() - 3], + vec![types.len() - 2], + vec![types.len() - 1], + Vec::new(), + ]); + + edges[stack_last].push(types.len() - 4); + + stack.push(types.len() - 4); + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((4, None)); + } + RepetitionAction::Plus => { + types.extend([ + RegexType::Kleene, + RegexType::Optional, + RegexType::Or, + RegexType::Empty, + ]); + edges.extend([ + vec![types.len() - 3], + vec![types.len() - 2], + vec![types.len() - 1], + Vec::new(), + ]); + + edges[stack_last].push(types.len() - 4); + + stack.push(types.len() - 4); + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((4, None)); + } + RepetitionAction::Copy(1) => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + edges[stack_last].push(types.len() - 3); + + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((3, None)); + } + _ => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + edges[stack_last].push(types.len() - 3); + + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((3, Some(action))); + } + } + + s = &s[1..]; + current_index += 1; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + return Err(Error::RegexUnbalancedParen(current_index)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + ']' => { + // end option + + s = &s[1..]; + current_index += 1; + + if repetition_spec.is_some() { + return Err(Error::RegexInvalidRepeat(current_index)); + } + + let (stack_growth, repetition_action) = repetition_action_stack + .pop() + .ok_or(Error::RegexEmptyActionStack(current_index))?; + + match repetition_action { + Some(action) => { + if stack.len() <= stack_growth + 1 { + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth + 1); + + // this is not going to fail because of the previous check + let stack_last = stack.pop().unwrap(); + let stack_second_last = stack.iter().copied().last().unwrap(); + + match action { + RepetitionAction::Ignore => { + edges[stack_second_last].pop(); + + types.truncate(stack_last); + edges.truncate(stack_last); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyPlus(n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_second_last].push(types.len() - 1); + + // if stack_second_last == 3 && nodes.len() == 4 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyOptional(n, m) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + let nodes_cloned_len = nodes_cloned.len(); + + for _ in 0..(m - 1) { + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.push(RegexType::Optional); + edges.push(vec![types.len(), types.len() + nodes_cloned_len]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Star + | RepetitionAction::Optional + | RepetitionAction::Plus => { + // this will not happen here + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + None => { + if stack.len() <= stack_growth { + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + '/' => { + // end concatenation + + s = &s[1..]; + current_index += 1; + + if repetition_spec.is_some() { + return Err(Error::RegexInvalidRepeat(current_index)); + } + + if stack.len() <= 1 { + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.pop().unwrap(); + + let stack_last = stack.iter().copied().last().unwrap(); + + edges[stack_last].push(types.len()); + + types.push(RegexType::Empty); + edges.push(vec![]); + + stack.push(types.len() - 1); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + '"' => { + // string + + s = &s[1..]; + current_index += 1; + + let skip_char_value = skip_by(s, is_char_value); + + if skip_char_value >= s.len() { + return Err(Error::HangingDQuote(current_index)); + } + + let fakes = &s[skip_char_value..]; + + if !fakes.starts_with('"') { + return Err(Error::HangingDQuote(current_index)); + } + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + let slice_value = s[..skip_char_value].to_owned(); + + intermediates.push(IntermediateConstruct::Terminal(slice_value.clone())); + + let t_index = intermediates.len() - 1; + + match action { + RepetitionAction::Ignore => {} + RepetitionAction::Star => { + types.push(RegexType::Kleene); + types.push(RegexType::Lit(t_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + edges[stack_last].push(types.len() - 2); + } + RepetitionAction::Optional => { + types.append(&mut vec![RegexType::Optional]); + types.push(RegexType::Lit(t_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + edges[stack_last].push(types.len() - 2); + } + RepetitionAction::Plus => { + types.append(&mut vec![RegexType::Plus]); + types.push(RegexType::Lit(t_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + edges[stack_last].push(types.len() - 2); + } + RepetitionAction::Copy(n) => { + let types_len = types.len(); + + types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| types_len + i)); + + // for _ in 0..n { + // types.push(RegexType::Lit(t_index)); + // edges.push(vec![]); + + // edges[stack_last].push(types.len() - 1); + // } + } + RepetitionAction::CopyPlus(n) => { + let types_len = types.len(); + + types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| types_len + i)); + + types.extend([RegexType::Plus, RegexType::Lit(t_index)]); + edges.extend([vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::CopyOptional(n, m) => { + let types_len = types.len(); + + types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len()); + + for _ in 0..(m - 1) { + types.extend([RegexType::Optional, RegexType::Lit(t_index)]); + edges.extend([vec![types.len() - 1, types.len()], Vec::new()]); + } + + types.extend([RegexType::Optional, RegexType::Lit(t_index)]); + edges.extend([vec![types.len() - 1], Vec::new()]); + } + } + + s = &fakes[1..]; + current_index += skip_char_value + 1; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + '%' => { + // numeric value + + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + return Err(Error::HangingPercent(current_index - 1)); + } + + let inner_first_char = s.chars().next().unwrap(); + + let intermediate_value: IntermediateConstruct; + + match inner_first_char { + 'b' => { + s = &s[1..]; + current_index += 1; + + let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Binary)?; + + intermediate_value = parsed_construct; + + s = &s[parsed_size..]; + current_index += parsed_size; + } + 'd' => { + s = &s[1..]; + current_index += 1; + + let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Decimal)?; + + intermediate_value = parsed_construct; + + s = &s[parsed_size..]; + current_index += parsed_size; + } + 'x' => { + s = &s[1..]; + current_index += 1; + + let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Hexadecimal)?; + // println!("size = {parsed_size}, construct = {parsed_construct:#?}"); + + intermediate_value = parsed_construct; + + s = &s[parsed_size..]; + current_index += parsed_size; + } + _ => { + return Err(Error::RegexInvalidNumeric(current_index - 1)); + } + } + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + intermediates.push(intermediate_value); + + let inter_index = intermediates.len() - 1; + + match action { + RepetitionAction::Ignore => {} + RepetitionAction::Star => { + types.push(RegexType::Kleene); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::Optional => { + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::Plus => { + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::Copy(n) => { + types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); + + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 1); + + // for _ in 0..n { + // types.push(RegexType::Lit(inter_index)); + // edges.push(vec![]); + // edges[stack_last].push(types.len() - 1); + // } + } + RepetitionAction::CopyPlus(n) => { + types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); + + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 1); + + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::CopyOptional(n, m) => { + types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); + + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 1); + + for _ in 0..(m - 1) { + types.push(RegexType::Optional); + edges.push(vec![types.len(), types.len() + 1]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + } + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + '<' => { + // prose value + return Err(Error::Unimplemented(current_index)); + } + ';' => { + // Possibly extend to the next line if the indentation + // of the next line is greater than `indentation`. + + s = &s[1..]; + current_index += 1; + + let mut in_comment = true; + + loop { + let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp }; + + let mut after_wsp_and_v_char = skip_by(s, skip_fn); + + if after_wsp_and_v_char >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + s = &s[after_wsp_and_v_char..]; + current_index += after_wsp_and_v_char; + + let first_char = s.chars().next().unwrap(); + + let mut skip_again = false; + + match first_char { + '\n' | '\r' => { + s = &s[1..]; + current_index += 1; + skip_again = true; + } + _ if in_comment => { + // dbg!("1815"); + return Err(Error::RegexInvalidComment(current_index)); + } + _ => {} + } + + if skip_again { + let skip_wsp = skip_by(s, is_wsp); + + if skip_wsp >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + after_wsp_and_v_char = skip_wsp; + + s = &s[skip_wsp..]; + current_index += skip_wsp; + } + + let first_char = s.chars().next().unwrap(); + + match first_char { + ';' => { + // continue looping + s = &s[1..]; + current_index += 1; + in_comment = true; + + continue; + } + '\n' | '\r' => { + in_comment = false; + s = &s[1..]; + current_index += 1; + + continue; + } + _ if after_wsp_and_v_char <= indentation => { + stack = vec![]; + break; + } + _ => { + break; + } + } + } + } + '\n' | '\r' => { + // Possibly extend to the next line if the indentation + // of the next line is greater than `indentation`. + // println!("s = {}", s.chars().take(10).collect::<String>()); + + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + break; + } + + let mut in_comment = false; + + loop { + let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp }; + + let mut after_wsp = skip_by(s, skip_fn); + + if after_wsp >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + s = &s[after_wsp..]; + current_index += after_wsp; + + let first_char = s.chars().next().unwrap(); + + let mut skip_again = false; + + match first_char { + '\n' | '\r' => { + s = &s[1..]; + current_index += 1; + skip_again = true; + } + _ if in_comment => { + // dbg!("1903"); + return Err(Error::RegexInvalidComment(current_index)); + } + _ => {} + } + + if skip_again { + let skip_wsp = skip_by(s, is_wsp); + + if skip_wsp >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + s = &s[skip_wsp..]; + current_index += skip_wsp; + + after_wsp = skip_wsp; + } + + let first_char = s.chars().next().unwrap(); + + match first_char { + ';' => { + // continue looping + s = &s[1..]; + current_index += 1; + in_comment = true; + + continue; + } + '\n' | '\r' => { + in_comment = false; + s = &s[1..]; + current_index += 1; + + continue; + } + _ if after_wsp <= indentation => { + stack = vec![]; + break; + } + _ => { + break; + } + } + } + } + _ => { + // invalid + println!("s = {}", &s[..(if s.len() > 50 { 50 } else { s.len() })]); + return Err(Error::InvalidCharacter(current_index)); + } + } + } + + let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?; + + Ok(( + current_index, + AlternationConstruct { + intermediates, + regex, + }, + )) +} + +type RuleConstruct = ( + // Name + String, + // Data + Vec<IntermediateConstruct>, + // Rule + DefaultRegex<usize>, + // extra_alternation_or_not + bool, +); + +fn parse_one_rule(s: &str, indentation: usize) -> Result<(usize, RuleConstruct), Error> { + if s.is_empty() { + return Ok((0, ("".to_owned(), Vec::new(), Default::default(), false))); + } + + let mut s = s; + + let mut current_index = 0usize; + + let mut appending_alternation = false; + + // rule name + + let rulename_end = parse_rulename(s)?; + let rule_name = s[..rulename_end].to_string(); + s = &s[rulename_end..]; + current_index += rulename_end; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + // dbg!(); + return Err(Error::HangingRuleName(0)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + // equal or equal forward slash + + let next_char = s.chars().next().unwrap(); + + if next_char != '=' { + // println!("s = {}", &s[..50]); + // println!("rule_name = {rule_name}"); + return Err(Error::MissingEqualSign(current_index + 1)); + } + + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + return Err(Error::HangingEqualsSign(current_index - 1)); + } + + let next_char = s.chars().next().unwrap(); + + if next_char == '/' { + appending_alternation = true; + s = &s[1..]; + current_index += 1; + } + + if s.is_empty() { + return Err(Error::HangingEqualsSign(current_index - 1)); + } + + // c wsp + + let skip_c_wsp_size = skip_c_wsp(s); + + if skip_c_wsp_size >= s.len() { + // dbg!(); + return Err(Error::HangingRuleName(0)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + // elements + + // alternation + + let (alternation_size, alternation_result) = parse_alternation(s, indentation)?; + + let intermediates_data = alternation_result.intermediates; + let regex = alternation_result.regex; + + // println!("size = {alternation_size}, result = {alternation_result:?}"); + + if alternation_size >= s.len() { + s = ""; + } else { + s = &s[alternation_size..]; + } + + current_index += alternation_size; + + // c wsp + + let skip_c_wsp_size = skip_c_wsp(s); + + if skip_c_wsp_size >= s.len() { + s = ""; + } else { + s = &s[skip_c_wsp_size..]; + } + + current_index += skip_c_wsp_size; + + // c-nl + + if let Some(skip_c_nl) = skip_c_nl(s) { + current_index += skip_c_nl; + } + + Ok(( + current_index, + (rule_name, intermediates_data, regex, appending_alternation), + )) +} + +impl std::str::FromStr for Grammar { + // First skip whitespaces, linefeeds, carraige returns and + // semicolons until the first rule appears. The basic indentation + // level is set to the indentation level of the first rule's line. + // + // Then enter identifier reading mode, reading a letter followed + // by a sequence of letters, digits and hyphens. + // + // After skipping white spaces, read an equal sign. + // + // Then enter alternation reading mode, reading alternations of + // concatenations. + // + // Then enter element reading mode, reading optionally repetition + // specification followed by an identifier, or a numeral value, or + // a quoted string. + + type Err = Error; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + let mut before_first_rule = true; + let mut basic_indentation: usize = 0; + let mut current_skipped_index: usize = 0; + + let mut s = s; + + while before_first_rule { + before_first_rule = false; + + let after_wsp = skip_by(s, is_wsp); + + if after_wsp >= s.len() { + // The entire string consists of white-spaces and + // comments. + return Err(Error::EmptyGrammar); + } + + s = &s[after_wsp..]; + current_skipped_index += after_wsp; + + match skip_c_nl(s) { + Some(skipped_size) => { + if skipped_size >= s.len() { + return Err(Error::EmptyGrammar); + } + + // Repeat skipping until we find the first rule. + before_first_rule = true; + + s = &s[skipped_size..]; + current_skipped_index += skipped_size; + } + None => { + // Found the first rule. + basic_indentation = after_wsp; + } + } + } + + // TODO: Core rules are not implemented for now. + + #[allow(unused_macros)] + macro_rules! log { + () => { + let mut file = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open("debug.log") + .unwrap(); + + let _ = file.write_all(format!("{}\n", line!()).as_bytes()); + }; + } + + let mut nonterminals_vec: Vec<String> = vec![]; + let mut nonterminals_map: HashMap<String, usize> = HashMap::new(); + + let mut all_rules: HashMap<String, DefaultRegex<TNT>> = Default::default(); + + let mut terminals_vec: Vec<String> = vec![]; + let mut terminals_map: HashMap<String, usize> = HashMap::new(); + + let mut temp = s; + + // First collect non-terminals in the order they are defined. + loop { + let (rule_size, (rule_name, _rule_data, _rule_regex, appending_p)) = + parse_one_rule(temp, basic_indentation)?; + + if rule_size == 0 { + break; + } + + if appending_p && nonterminals_map.get(&rule_name).is_none() { + return Err(Error::AppendToNothing(current_skipped_index)); + } else if !appending_p && nonterminals_map.get(&rule_name).is_some() { + return Err(Error::CreateAgain(current_skipped_index)); + } + + nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len()); + nonterminals_vec.push(rule_name); + + if rule_size > 0 && rule_size < temp.len() { + temp = &temp[rule_size..]; + current_skipped_index += rule_size; + } else { + break; + } + } + + let mut temp = s; + + // Then gather the data about names of terminals and + // non-terminals. + loop { + let (rule_size, (_rule_name, rule_data, _rule_regex, _appending_p)) = + parse_one_rule(temp, basic_indentation)?; + + if rule_size == 0 { + break; + } + + for data in rule_data { + match data { + IntermediateConstruct::Terminal(name) => { + if terminals_map.get(&name).is_none() { + terminals_map.insert(name.clone(), terminals_vec.len()); + terminals_vec.push(name); + } + } + IntermediateConstruct::Nonterminal(name) => { + if nonterminals_map.get(&name).is_none() { + return Err(Error::UnknownNonterminal(name)); + } + } + IntermediateConstruct::Range(_, _) => unimplemented!(), + } + } + + if rule_size > 0 && rule_size < temp.len() { + temp = &temp[rule_size..]; + current_skipped_index += rule_size; + } else { + break; + } + } + + // Then collect the actual rules. + loop { + let (rule_size, (rule_name, rule_data, rule_regex, appending_p)) = + parse_one_rule(s, basic_indentation)?; + // println!("rule name = {rule_name}"); + // println!("regexp = {rule_regexp:#?}"); + + let rule_transform = |old_label: usize| -> Result<TNT, Error> { + if let Some(old_value) = rule_data.get(old_label) { + match old_value { + IntermediateConstruct::Terminal(name) => { + if let Some(index) = terminals_map.get(name) { + Ok(TNT::Ter(*index)) + } else { + dbg!(); + Err(Error::UnknownTerminal(name.clone())) + } + } + IntermediateConstruct::Nonterminal(name) => { + if let Some(index) = nonterminals_map.get(name) { + Ok(TNT::Non(*index)) + } else { + dbg!(); + Err(Error::UnknownNonterminal(name.clone())) + } + } + IntermediateConstruct::Range(_, _) => { + dbg!(); + Err(Error::UnsupportedOperation) + } + } + } else { + dbg!(); + Err(Error::RegexIndexOutOfBounds(old_label, rule_data.len())) + } + }; + + let transformed_rule = rule_regex.maps_to(rule_transform)?; + + if rule_size == 0 { + break; + } + + if !appending_p { + nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len()); + nonterminals_vec.push(rule_name.clone()); + + all_rules.insert(rule_name, transformed_rule); + } else { + all_rules + .get_mut(&rule_name) + .unwrap() + .append(transformed_rule); + } + + if rule_size > 0 && rule_size < s.len() { + s = &s[rule_size..]; + current_skipped_index += rule_size; + } else { + break; + } + } + + let rules_vec: Vec<Rule> = nonterminals_vec + .iter() + .map(|name| { + all_rules + .get(name) + .map(|regex| Rule::new(regex.clone())) + .ok_or(Error::UnknownNonterminal(name.clone())) + }) + .collect::<Result<_, _>>()?; + + let terminals: Vec<Terminal> = terminals_vec.into_iter().map(Terminal::new).collect(); + let nonterminals: Vec<Nonterminal> = + nonterminals_vec.into_iter().map(Nonterminal).collect(); + + Ok(Self::new(terminals, nonterminals, rules_vec)) + } +} + +#[cfg(test)] +mod tests; |