summaryrefslogtreecommitdiff
path: root/grammar
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2023-07-08 12:30:21 +0800
committerJSDurand <mmemmew@gmail.com>2023-07-08 12:31:13 +0800
commit9a317e56f8a6126583f7d0c431bf878d9b1fe7b1 (patch)
tree7bb6004196b38446a5ab0cb3a0ab642d35f113e9 /grammar
parent691f969eb104fa3d4c2a1667693fd0382eb9d6b5 (diff)
Finished the Emacs binding.
Now the binding part is finished. What remains is a bug encountered when planting a fragment to the forest which intersects a packed node, which would lead to invalid forests. This will also cause problem when planting a packed fragment, but until now my testing grammars do not produce packed fragments, so this problem is not encountered yet. I am still figuring out efficient ways to solve this problem.
Diffstat (limited to 'grammar')
-rw-r--r--grammar/src/abnf.rs37
-rw-r--r--grammar/src/abnf/boolean_fns.rs42
-rw-r--r--grammar/src/abnf/mod.rs2536
-rw-r--r--grammar/src/abnf/tests.rs291
-rw-r--r--grammar/src/lib.rs85
5 files changed, 2952 insertions, 39 deletions
diff --git a/grammar/src/abnf.rs b/grammar/src/abnf.rs
deleted file mode 100644
index eb56819..0000000
--- a/grammar/src/abnf.rs
+++ /dev/null
@@ -1,37 +0,0 @@
-//! This file implements the function to read a grammar from a string,
-//! in the [augmented Backus Naur
-//! form](https://en.wikipedia.org/wiki/Augmented_Backus–Naur_form).
-//! See [RFC5234](https://www.rfc-editor.org/rfc/rfc5234) for its
-//! specifications.
-//!
-//! In fact, the rules of ABNF are considered to be too strict. For
-//! example, only ASCII characters are allowed in the format, even
-//! within comments. I think any character should be allowed within
-//! comments, so the implementation here does that.
-
-use super::*;
-
-/// The type of errors encountered during parsing a grammar in the
-/// ABNF format. See the function [`abnf_to_grammar`] for the parsing
-/// function.
-#[derive(Debug, Clone, Copy)]
-pub enum Error {
- /// Something invalid has occurred.
- Invalid,
-}
-
-impl core::fmt::Display for Error {
- fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
- match self {
- Error::Invalid => write!(f, "invalid"),
- }
- }
-}
-
-#[allow(unused)]
-/// This function converts a string to a grammar.
-pub fn abnf_to_grammar(str: impl AsRef<str>) -> Result<Grammar, Error> {
- let str = str.as_ref();
-
- todo!()
-}
diff --git a/grammar/src/abnf/boolean_fns.rs b/grammar/src/abnf/boolean_fns.rs
new file mode 100644
index 0000000..08f317c
--- /dev/null
+++ b/grammar/src/abnf/boolean_fns.rs
@@ -0,0 +1,42 @@
+//! This file collects some functions that return a boolean value from
+//! a character.
+
+pub(super) fn is_wsp(c: char) -> bool {
+ c == ' ' || c == '\t'
+}
+
+pub(crate) fn is_v_char(c: char) -> bool {
+ ('!'..='~').contains(&c)
+}
+
+pub(super) fn is_wsp_or_v_char(c: char) -> bool {
+ is_wsp(c) || is_v_char(c)
+}
+
+pub(super) fn is_not_newline(c: char) -> bool {
+ c != '\n' && c != '\r'
+}
+
+pub(super) fn is_alpha(c: char) -> bool {
+ c.is_ascii_alphabetic()
+}
+
+pub(super) fn is_decimal_digit(c: char) -> bool {
+ c.is_ascii_digit()
+}
+
+pub(super) fn is_binary_digit(c: char) -> bool {
+ c == '0' || c == '1'
+}
+
+pub(super) fn is_hexadecimal_digit(c: char) -> bool {
+ is_decimal_digit(c) || ('A'..='F').contains(&c) || ('a'..='f').contains(&c)
+}
+
+pub(super) fn is_alpha_or_digit_or_hyphen(c: char) -> bool {
+ is_alpha(c) || is_decimal_digit(c) || c == '-'
+}
+
+pub(super) fn is_char_value(c: char) -> bool {
+ (' '..='~').contains(&c) && c != '"'
+}
diff --git a/grammar/src/abnf/mod.rs b/grammar/src/abnf/mod.rs
new file mode 100644
index 0000000..f825bef
--- /dev/null
+++ b/grammar/src/abnf/mod.rs
@@ -0,0 +1,2536 @@
+//! This file implements the function to read a grammar from a string,
+//! in the [augmented Backus Naur
+//! form](https://en.wikipedia.org/wiki/Augmented_Backus–Naur_form).
+//! See [RFC5234](https://www.rfc-editor.org/rfc/rfc5234) for its
+//! specifications.
+//!
+//! In fact, the rules of ABNF are considered to be too strict. For
+//! example, only ASCII characters are allowed in the format, even
+//! within comments. I think any character should be allowed within
+//! comments, so the implementation here does that.
+
+#![allow(dead_code)]
+
+use super::*;
+
+/// The type of errors encountered during parsing a grammar in the
+/// ABNF format.
+#[derive(Debug)]
+#[non_exhaustive]
+pub enum Error {
+ /// A rule name without rule body.
+ HangingRuleName(usize),
+ /// The minimum is greater than maximum?
+ MinGreaterThanMax(usize),
+ /// A digit is expected, but is not encountered.
+ InvalidDigit(usize),
+ /// A character that is not allowed.
+ InvalidCharacter(usize),
+ /// A numeric value is not completed.
+ HangingNumeric(usize),
+ /// A repetition is hanging.
+ HangingRepetition(usize),
+ /// A double quote is hanging.
+ HangingDQuote(usize),
+ /// A percentage sign is hanging.
+ HangingPercent(usize),
+ /// An equal sign is hanging.
+ HangingEqualsSign(usize),
+ /// An operation is unsupported.
+ UnsupportedOperation,
+ /// Encounters an unknown node when constructing the regular
+ /// expression.
+ RegexUnknownNode(usize),
+ /// An index is out of bounds when constructing the regular
+ /// expression.
+ RegexIndexOutOfBounds(usize, usize),
+ /// A node is duplicated when constructing the regular expression.
+ RegexDuplicatedNode(usize),
+ /// An edge is duplicated when constructing the regular
+ /// expression.
+ RegexDuplicatedEdge(usize, usize),
+ /// The constructed regular expression graph has no root.
+ RegexNoRoot,
+ /// The constructed regular expression graph has cycles.
+ RegexCycle,
+ /// The stack illegally becomes empty when constructing a regular
+ /// expression.
+ RegexEmptyStack(usize),
+ /// The action stack illegally becomes empty when constructing a
+ /// regular expression.
+ RegexEmptyActionStack(usize),
+ /// An unbalanced parenthesis is encountered when constructing a
+ /// regular expression.
+ RegexUnbalancedParen(usize),
+ /// An invalid repetition specification is encountered when
+ /// constructing a regular expression.
+ RegexInvalidRepeat(usize),
+ /// An invalid comment character is encountered.
+ RegexInvalidComment(usize),
+ /// An invalid numeric value is encoutnered.
+ RegexInvalidNumeric(usize),
+ /// A rule should have an equal sign.
+ MissingEqualSign(usize),
+ /// An unimplemented feature is required.
+ Unimplemented(usize),
+ /// Input-Output error.
+ IOError(std::io::Error),
+ /// The grammar is empty.
+ EmptyGrammar,
+ /// Found an appending rule associated to a non-existent rule.
+ AppendToNothing(usize),
+ /// Found a rule declaration associated to an existing rule.
+ CreateAgain(usize),
+ /// This terminal is not found.
+ UnknownTerminal(String),
+ /// This non-terminal is not found.
+ UnknownNonterminal(String),
+ /// Something invalid has occurred.
+ Invalid,
+}
+
+impl std::error::Error for Error {}
+
+impl From<std::io::Error> for Error {
+ fn from(e: std::io::Error) -> Self {
+ Self::IOError(e)
+ }
+}
+
+impl From<graph::error::Error> for Error {
+ fn from(value: graph::error::Error) -> Self {
+ match value {
+ graph::error::Error::IndexOutOfBounds(index, bound) => {
+ Error::RegexIndexOutOfBounds(index, bound)
+ }
+ graph::error::Error::DuplicatedNode(n) => Error::RegexDuplicatedNode(n),
+ graph::error::Error::DuplicatedEdge(from, to) => Error::RegexDuplicatedEdge(from, to),
+ _ => Error::Invalid,
+ }
+ }
+}
+
+impl std::fmt::Display for Error {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Error::HangingRuleName(pos) => write!(f, "a rule name without body at {pos}"),
+ Error::MinGreaterThanMax(pos) => write!(
+ f,
+ "minimum should not be strictly greater than the maximum at {pos}."
+ ),
+ Error::InvalidDigit(pos) => {
+ write!(f, "a digit is expected at {pos}, but is not found.")
+ }
+ Error::InvalidCharacter(pos) => write!(f, "the character at {pos} is not allowed"),
+ Error::HangingNumeric(pos) => write!(f, "a numeric value is hanging at {pos}"),
+ Error::HangingRepetition(pos) => write!(f, "a repetition is hanging at {pos}"),
+ Error::HangingDQuote(pos) => write!(f, "a double quote is hanging at {pos}"),
+ Error::HangingPercent(pos) => write!(f, "a percentage sign is hanging at {pos}"),
+ Error::HangingEqualsSign(pos) => write!(f, "an equal sign is hanging at {pos}"),
+ Error::UnsupportedOperation => write!(f, "an unsupported operation is encountered"),
+ Error::RegexUnknownNode(n) => write!(f, "an unknown node {n} is encountered \
+ when constructing the regular expression."),
+ Error::RegexIndexOutOfBounds(index, bound) => write!(f, "an index {index} is out of \
+ bound {bound} when constructing the regular expression."),
+ Error::RegexDuplicatedNode(n) => write!(f, "a node {n} is duplicated \
+ when constructing the regular expression."),
+ Error::RegexDuplicatedEdge(from, to) => write!(f, "an edge from {from} to {to} is duplicated \
+ when constructing the regular expression."),
+ Error::RegexNoRoot => write!(f, "the constructed regular expression has no root."),
+ Error::RegexCycle => write!(f, "the constructed regular expression has cycles."),
+ Error::RegexEmptyStack(pos) => write!(f, "the stack illegally becomes empty at {pos} \
+ when constructing a regular expression."),
+ Error::RegexEmptyActionStack(pos) => write!(f, "the action stack illegally becomes empty at {pos} \
+ when constructing a regular expression."),
+ Error::RegexUnbalancedParen(pos) => write!(f, "An unbalanced parenthesis is encountered at {pos} \
+ when constructing a regular expression."),
+ Error::RegexInvalidRepeat(pos) => write!(f, "An invalid repetition specification is encountered \
+ at {pos} when constructing a regular expression."),
+ Error::RegexInvalidComment(pos) => write!(f, "An invalid character is encountered in a comment \
+ at {pos}"),
+ Error::RegexInvalidNumeric(pos) => write!(f, "An invalid numeric value is encoutnered at {pos}."),
+ Error::MissingEqualSign(pos) => write!(f, "A rule is missing an equal sign at {pos}"),
+ Error::Unimplemented(pos) => write!(f, "An unimplemented feature is required at {pos}."),
+ Error::IOError(e) => write!(f, "An IO error: {e}"),
+ Error::EmptyGrammar => write!(f, "the grammar is empty"),
+ Error::AppendToNothing(pos) => write!(f, "the rule at {pos} is appending to a non-existing rule."),
+ Error::CreateAgain(pos) => write!(f, "the rule at {pos} is creating a duplicated rule."),
+ Error::UnknownTerminal(name) => write!(f, "the terminal {name} is not found."),
+ Error::UnknownNonterminal(name) => write!(f, "the non-terminal {name} is not found."),
+ Error::Invalid => write!(f, "invalid"),
+ }
+ }
+}
+
+/// Return the number of bytes taken by the characters for which the
+/// function `f` returns `true`.
+fn skip_by(s: &str, f: impl Fn(char) -> bool) -> usize {
+ for (index, c) in s.char_indices() {
+ if !f(c) {
+ return index;
+ }
+ }
+
+ s.len()
+}
+
+mod boolean_fns;
+
+pub(crate) use boolean_fns::*;
+
+fn skip_c_nl(s: &str) -> Option<usize> {
+ if s.is_empty() {
+ return Some(0);
+ }
+
+ let s_len = s.len();
+
+ match s.chars().next().unwrap() {
+ ';' => {
+ let after_not_newline = skip_by(&s[1..], is_not_newline);
+
+ if 1 + after_not_newline >= s_len {
+ Some(s_len)
+ } else {
+ let s = &s[1 + after_not_newline..];
+
+ match s.chars().next().unwrap() {
+ '\n' | '\r' => Some(after_not_newline + 2),
+ _ => {
+ // We skipped anything not newline, so this
+ // character must be newline.
+ unreachable!();
+ }
+ }
+ }
+ }
+ '\n' | '\r' => Some(1),
+ _ => None,
+ }
+}
+
+fn skip_c_wsp(s: &str) -> usize {
+ let mut currently_skipped = 0usize;
+ let mut s = s;
+
+ let mut continue_skipping = true;
+
+ 'skipping_loop: while continue_skipping {
+ continue_skipping = false;
+
+ let skip_wsp = skip_by(s, is_wsp);
+
+ if skip_wsp > 0 {
+ if skip_wsp >= s.len() {
+ return currently_skipped + s.len();
+ }
+
+ continue_skipping = true;
+ s = &s[skip_wsp..];
+ currently_skipped += skip_wsp;
+
+ continue 'skipping_loop;
+ }
+
+ if let Some(skip_c_nl_size) = skip_c_nl(s) {
+ if skip_c_nl_size >= s.len() {
+ return currently_skipped + s.len();
+ }
+
+ continue_skipping = true;
+
+ s = &s[skip_c_nl_size..];
+ currently_skipped += skip_c_nl_size;
+
+ let skip_wsp_after_c_nl = skip_by(s, is_wsp);
+
+ s = &s[skip_wsp_after_c_nl..];
+ currently_skipped += skip_wsp_after_c_nl;
+ }
+ }
+
+ currently_skipped
+}
+
+#[derive(Debug, Eq, PartialEq)]
+enum RepetitionSpec {
+ Star,
+ Equal(usize),
+ Min(usize),
+ Max(usize),
+ MinMax(usize, usize),
+}
+
+#[derive(Debug)]
+enum RepetitionAction {
+ /// Ignore the following element.
+ Ignore,
+ /// The following is put in a star.
+ Star,
+ /// The following is put in an optional construct.
+ Optional,
+ /// The following is put in a plus construct.
+ Plus,
+ /// Copy the following some number of times.
+ Copy(usize),
+ /// Copy the following some number of times, and then add a plus
+ /// of that same element.
+ CopyPlus(usize),
+ /// Copy the following some number of times, and then add another
+ /// number of *nested* optional construts of that same element.
+ ///
+ /// # Why nested?
+ ///
+ /// The optional constructs are nested here as a sequence of
+ /// optional constructs is exponentially more ambiguous than the
+ /// nested equivalent: "a?a?a?" is more ambiguous than
+ /// "(a(a(a)?)?)?". For example, the input "a" can have three
+ /// derivations according to "a?a?a?", but has only one derivation
+ /// according to "(a(a(a)?)?)?".
+ CopyOptional(usize, usize),
+}
+
+impl TryFrom<Option<RepetitionSpec>> for RepetitionAction {
+ type Error = Error;
+
+ fn try_from(spec: Option<RepetitionSpec>) -> Result<Self, Self::Error> {
+ if let Some(r) = spec {
+ match r {
+ RepetitionSpec::Star => Ok(Self::Star),
+ RepetitionSpec::Equal(0) => Ok(Self::Ignore),
+ RepetitionSpec::Equal(n) => Ok(Self::Copy(n)),
+ RepetitionSpec::Min(0) => Ok(Self::Star),
+ RepetitionSpec::Min(1) => Ok(Self::Plus),
+ RepetitionSpec::Min(n) => Ok(Self::CopyPlus(n - 1)),
+ RepetitionSpec::Max(0) => Ok(Self::Ignore),
+ RepetitionSpec::Max(1) => Ok(Self::Optional),
+ RepetitionSpec::Max(n) => Ok(Self::CopyOptional(0, n)),
+ RepetitionSpec::MinMax(0, 0) => Ok(Self::Ignore),
+ RepetitionSpec::MinMax(0, 1) => Ok(Self::Optional),
+ RepetitionSpec::MinMax(n, m) if n > m => Err(Error::MinGreaterThanMax(0)),
+ RepetitionSpec::MinMax(n, m) if n == m => Ok(Self::Copy(n)),
+ RepetitionSpec::MinMax(n, m) => Ok(Self::CopyOptional(n, m - n)),
+ }
+ } else {
+ Ok(Self::Copy(1))
+ }
+ }
+}
+
+fn parse_repetition_spec(s: &str) -> Option<(usize, RepetitionSpec)> {
+ let mut s = s;
+
+ let digit_size = skip_by(s, is_decimal_digit);
+
+ let mut min: Option<usize> = None;
+ let mut max: Option<usize> = None;
+
+ if digit_size != 0 {
+ min = Some(s[..digit_size].parse::<usize>().unwrap());
+ }
+
+ let mut current_index = digit_size;
+
+ if digit_size == s.len() {
+ if let Some(m) = min {
+ return Some((digit_size, RepetitionSpec::Equal(m)));
+ } else {
+ return None;
+ };
+ }
+
+ s = &s[current_index..];
+
+ if s.starts_with('*') {
+ if s.len() == 1 {
+ if let Some(m) = min {
+ return Some((current_index + 1, RepetitionSpec::Min(m)));
+ } else {
+ return Some((current_index + 1, RepetitionSpec::Star));
+ }
+ }
+
+ s = &s[1..];
+ current_index += 1;
+
+ let digit_size = skip_by(s, is_decimal_digit);
+
+ if digit_size != 0 {
+ max = Some(s[..digit_size].parse::<usize>().unwrap());
+ }
+
+ current_index += digit_size;
+
+ match (min, max) {
+ (Some(mi), Some(ma)) => Some((current_index, RepetitionSpec::MinMax(mi, ma))),
+ (Some(m), None) => Some((current_index, RepetitionSpec::Min(m))),
+ (None, Some(m)) => Some((current_index, RepetitionSpec::Max(m))),
+ _ => Some((current_index, RepetitionSpec::Star)),
+ }
+ } else {
+ min.map(|m| (current_index, RepetitionSpec::Equal(m)))
+ }
+}
+
+fn parse_rulename(s: &str) -> Result<usize, Error> {
+ let first_char = s.chars().next().unwrap();
+
+ if !is_alpha(first_char) {
+ return Ok(0);
+ }
+
+ let rulename_end = skip_by(s, is_alpha_or_digit_or_hyphen);
+
+ if rulename_end >= s.len() {
+ // dbg!();
+ return Err(Error::HangingRuleName(0));
+ }
+
+ Ok(rulename_end)
+}
+
+#[derive(Debug, Eq, PartialEq, Hash)]
+enum IntermediateConstruct {
+ Terminal(String),
+ Nonterminal(String),
+ Range(char, char),
+}
+
+impl Display for IntermediateConstruct {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ IntermediateConstruct::Terminal(t) => write!(f, "t{t}"),
+ IntermediateConstruct::Nonterminal(n) => write!(f, "n{n}"),
+ IntermediateConstruct::Range(min, max) => write!(f, "[{min}-{max}]"),
+ }
+ }
+}
+
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+enum Radix {
+ Binary,
+ Decimal,
+ Hexadecimal,
+}
+
+fn parse_numeric(s: &str, radix: Radix) -> Result<(usize, IntermediateConstruct), Error> {
+ // first read a number
+
+ let is_radix_digit = match radix {
+ Radix::Binary => is_binary_digit,
+ Radix::Decimal => is_decimal_digit,
+ Radix::Hexadecimal => is_hexadecimal_digit,
+ };
+
+ let radix = match radix {
+ Radix::Binary => 2,
+ Radix::Decimal => 10,
+ Radix::Hexadecimal => 16,
+ };
+
+ let mut s = s;
+ let mut current_index = 0;
+
+ let first_digit_size = skip_by(s, is_radix_digit);
+
+ if first_digit_size == 0 {
+ // dbg!();
+ return Err(Error::InvalidDigit(current_index));
+ }
+
+ let first_number =
+ match std::char::from_u32(u32::from_str_radix(&s[..first_digit_size], radix).unwrap()) {
+ Some(c) => c,
+ None => {
+ // dbg!();
+ return Err(Error::InvalidCharacter(current_index));
+ }
+ };
+
+ if first_digit_size >= s.len() {
+ return Ok((
+ current_index + s.len(),
+ IntermediateConstruct::Terminal(first_number.to_string()),
+ ));
+ }
+
+ s = &s[first_digit_size..];
+
+ current_index += first_digit_size;
+
+ // then read a dot or a dash
+
+ match s.chars().next().unwrap() {
+ '.' => {
+ s = &s[1..];
+ current_index += 1;
+
+ let mut stop_reading = false;
+ let mut chars_vec = vec![first_number];
+
+ while !stop_reading {
+ if s.is_empty() {
+ return Err(Error::HangingNumeric(current_index - 1));
+ }
+
+ let digit_size = skip_by(s, is_radix_digit);
+
+ if digit_size == 0 {
+ // dbg!();
+ return Err(Error::InvalidDigit(current_index));
+ }
+
+ let a_number =
+ std::char::from_u32(u32::from_str_radix(&s[..digit_size], radix).unwrap())
+ .ok_or(Error::InvalidCharacter(current_index))?;
+
+ chars_vec.push(a_number);
+
+ s = &s[digit_size..];
+ current_index += digit_size;
+
+ if !s.starts_with('.') {
+ stop_reading = true;
+ // println!(
+ // "digit_size = {digit_size} a number = {a_number}, s = {}",
+ // &s[..10]
+ // );
+ } else {
+ s = &s[1..];
+ current_index += 1;
+ }
+ }
+
+ Ok((
+ current_index,
+ IntermediateConstruct::Terminal(chars_vec.into_iter().collect()),
+ ))
+ }
+ '-' => {
+ s = &s[1..];
+ current_index += 1;
+
+ if s.is_empty() {
+ return Err(Error::HangingNumeric(current_index - 1));
+ }
+
+ let second_digit_size = skip_by(s, is_radix_digit);
+
+ if second_digit_size == 0 {
+ println!(
+ "s = {}",
+ s.chars()
+ .take(if s.len() >= 50 { 50 } else { s.len() })
+ .collect::<String>()
+ );
+ return Err(Error::InvalidDigit(current_index));
+ }
+
+ let _second_number =
+ std::char::from_u32(u32::from_str_radix(&s[..second_digit_size], radix).unwrap())
+ .ok_or(Error::InvalidCharacter(current_index))?;
+
+ Err(Error::Unimplemented(current_index - 1))
+
+ // Ok((
+ // current_index + second_digit_size,
+ // IntermediateConstruct::Range(first_number, second_number),
+ // ))
+ }
+ c if is_wsp(c) || c == '\n' || c == '\r' || c == ';' => Ok((
+ current_index,
+ IntermediateConstruct::Terminal(first_number.to_string()),
+ )),
+ _ => Err(Error::InvalidCharacter(current_index)),
+ }
+}
+
+#[derive(Debug)]
+struct AlternationConstruct {
+ intermediates: Vec<IntermediateConstruct>,
+ regex: DefaultRegex<usize>,
+}
+
+fn parse_alternation(s: &str, indentation: usize) -> Result<(usize, AlternationConstruct), Error> {
+ let mut stack: Vec<usize> = vec![0, 1];
+ let mut intermediates: Vec<IntermediateConstruct> = Vec::new();
+ let mut types: Vec<RegexType<usize>> = vec![RegexType::Or, RegexType::Empty];
+ let mut edges: Vec<Vec<usize>> = vec![vec![1], Vec::new()];
+
+ fn error_conversion(error: nfa::error::Error) -> Error {
+ match error {
+ nfa::error::Error::UnknownNode(n) => Error::RegexUnknownNode(n),
+ nfa::error::Error::UnsupportedOperation => Error::UnsupportedOperation,
+ nfa::error::Error::NoRoot => Error::RegexNoRoot,
+ nfa::error::Error::Graph(ge) => ge.into(),
+ nfa::error::Error::Cycle => Error::RegexCycle,
+ _ => Error::Invalid,
+ }
+ }
+
+ let mut s = s;
+ let mut current_index = 0usize;
+
+ let mut nonterminal_nodes: HashSet<usize> = HashSet::new();
+
+ if s.is_empty() {
+ let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?;
+
+ return Ok((
+ current_index,
+ AlternationConstruct {
+ intermediates,
+ regex,
+ },
+ ));
+ }
+
+ // A vector of tuples to record some information for each level of
+ // nesting.
+ //
+ // A tuple (len, action) means that the stack grows by LEN many
+ // tokens at this level, and the action ACTION is pending.
+ let mut repetition_action_stack: Vec<(usize, Option<RepetitionAction>)> = vec![];
+
+ while !stack.is_empty() {
+ // First read repetition specifications.
+ let mut repetition_spec = None;
+
+ if let Some((spec_size, spec)) = parse_repetition_spec(s) {
+ if spec_size >= s.len() {
+ return Err(Error::HangingRepetition(current_index));
+ }
+
+ s = &s[spec_size..];
+ current_index += spec_size;
+
+ repetition_spec = Some(spec);
+ }
+
+ let repetition_spec = repetition_spec;
+
+ // if a rulename
+ match parse_rulename(s) {
+ Ok(0) => {}
+ Ok(size) => {
+ let name = &s[..size].to_string();
+
+ intermediates.push(IntermediateConstruct::Nonterminal(name.clone()));
+
+ let nt_index = intermediates.len() - 1;
+
+ let action = repetition_spec.try_into()?;
+ // println!("rep_action = {action:?}");
+
+ let stack_last = stack
+ .iter()
+ .copied()
+ .last()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ match action {
+ RepetitionAction::Ignore => {}
+ RepetitionAction::Star => {
+ types.push(RegexType::Kleene);
+ types.push(RegexType::Lit(nt_index));
+
+ edges.push(vec![types.len() - 1]);
+ edges.push(Vec::new());
+
+ nonterminal_nodes.insert(types.len() - 1);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+ }
+ RepetitionAction::Optional => {
+ types.append(&mut vec![RegexType::Optional]);
+ types.push(RegexType::Lit(nt_index));
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ nonterminal_nodes.insert(types.len() - 1);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+ }
+ RepetitionAction::Plus => {
+ types.append(&mut vec![RegexType::Plus]);
+ types.push(RegexType::Lit(nt_index));
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ nonterminal_nodes.insert(types.len() - 1);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+ }
+ RepetitionAction::Copy(n) => {
+ let old_types_len = types.len();
+
+ nonterminal_nodes.extend((0..n).map(|i| old_types_len + i));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| old_types_len + i));
+
+ types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ // for _ in 0..n {
+ // types.push(RegexType::Lit(nt_index));
+ // edges.push(vec![]);
+
+ // nonterminal_nodes.insert(types.len() - 1);
+
+ // edges[stack_last].push(types.len() - 1);
+ // }
+ }
+ RepetitionAction::CopyPlus(n) => {
+ let old_types_len = types.len();
+
+ nonterminal_nodes.extend((0..n).map(|i| old_types_len + i));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| old_types_len + i));
+
+ types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ // for _ in 0..n {
+ // types.push(RegexType::Lit(nt_index));
+ // edges.push(vec![]);
+
+ // edges[stack_last].push(types.len() - 1);
+ // }
+
+ types.append(&mut vec![RegexType::Plus]);
+ types.push(RegexType::Lit(nt_index));
+
+ nonterminal_nodes.insert(types.len() - 1);
+
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+ }
+ RepetitionAction::CopyOptional(n, m) => {
+ let old_types_len = types.len();
+
+ nonterminal_nodes.extend((0..n).map(|i| old_types_len + i));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| old_types_len + i));
+
+ types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ // for _ in 0..n {
+ // types.push(RegexType::Lit(nt_index));
+ // edges.push(vec![]);
+
+ // nonterminal_nodes.insert(types.len() - 1);
+
+ // edges[stack_last].push(types.len() - 1);
+ // }
+
+ let types_len = types.len();
+
+ types.extend(
+ std::iter::repeat([RegexType::Optional, RegexType::Lit(nt_index)])
+ .take(m)
+ .flatten(),
+ );
+
+ edges.extend((0..(m - 1)).flat_map(|i| {
+ [
+ vec![types_len + 1 + 2 * i, types_len + 2 + 2 * i],
+ Vec::new(),
+ ]
+ }));
+
+ edges.extend([vec![types.len() - 1], Vec::new()]);
+
+ nonterminal_nodes.extend((0..m).map(|i| types_len + 2 * i + 1));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types_len);
+
+ // for _ in 0..m {
+ // types.extend([RegexType::Optional, RegexType::Lit(nt_index)]);
+ // edges.extend([vec![types.len() - 1], Vec::new()]);
+
+ // nonterminal_nodes.insert(types.len() - 1);
+
+ // edges[stack_last].push(types.len() - 2);
+ // }
+ }
+ }
+
+ s = &s[size..];
+ current_index += size;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ Err(_) => {
+ intermediates.push(IntermediateConstruct::Nonterminal(s.to_owned()));
+
+ types.push(RegexType::Lit(intermediates.len() - 1));
+ edges.push(vec![]);
+
+ let stack_last = stack
+ .last()
+ .copied()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 1);
+ nonterminal_nodes.insert(types.len() - 1);
+
+ break;
+ }
+ }
+
+ // match against the first charcter
+
+ let first_char = s.chars().next().unwrap();
+
+ match first_char {
+ '(' => {
+ // group
+
+ s = &s[1..];
+ current_index += 1;
+
+ let action = repetition_spec.try_into()?;
+
+ let stack_last = stack
+ .iter()
+ .copied()
+ .last()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ match action {
+ RepetitionAction::Star => {
+ types.extend([RegexType::Kleene, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 3);
+
+ repetition_action_stack.push((3, None));
+ stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]);
+ }
+ RepetitionAction::Optional => {
+ types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 3);
+
+ stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]);
+ repetition_action_stack.push((3, None));
+ }
+ RepetitionAction::Plus => {
+ types.extend([RegexType::Plus, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 3);
+
+ stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]);
+ repetition_action_stack.push((3, None));
+ }
+ RepetitionAction::Copy(1) => {
+ types.extend([RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+
+ stack.extend([types.len() - 2, types.len() - 1]);
+ repetition_action_stack.push((2, None));
+ }
+ RepetitionAction::CopyOptional(0, _) => {
+ types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 3);
+
+ stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]);
+
+ repetition_action_stack.push((3, Some(action)));
+ }
+ _ => {
+ types.extend([RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+
+ stack.extend([types.len() - 2, types.len() - 1]);
+
+ repetition_action_stack.push((2, Some(action)));
+ }
+ }
+
+ s = &s[1..];
+ current_index += 1;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ return Err(Error::RegexUnbalancedParen(current_index));
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ ')' => {
+ // end group
+
+ s = &s[1..];
+ current_index += 1;
+
+ if repetition_spec.is_some() {
+ return Err(Error::RegexInvalidRepeat(current_index));
+ }
+
+ let (stack_growth, repetition_action) = repetition_action_stack
+ .pop()
+ .ok_or(Error::RegexEmptyActionStack(current_index))?;
+
+ match repetition_action {
+ Some(action) => {
+ if stack.len() <= stack_growth + 1 {
+ dbg!();
+ return Err(Error::RegexEmptyStack(current_index));
+ }
+
+ stack.truncate(stack.len() - stack_growth + 1);
+
+ // this is not going to fail because of the previous check
+ let stack_last = stack.pop().unwrap();
+ let stack_second_last = stack.iter().copied().last().unwrap();
+
+ match action {
+ RepetitionAction::Ignore => {
+ edges[stack_second_last].pop();
+
+ types.truncate(stack_last);
+ edges.truncate(stack_last);
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::CopyPlus(n) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ types.push(RegexType::Plus);
+ edges.push(vec![types.len()]);
+
+ edges[stack_second_last].push(types.len() - 1);
+
+ // if stack_second_last == 3 && nodes.len() == 4 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::CopyOptional(n, m) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ edges[stack_second_last].push(types.len());
+
+ let nodes_cloned_len = nodes_cloned.len();
+
+ for _ in 0..(m - 1) {
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len(), types.len() + nodes_cloned_len]);
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len()]);
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::Star
+ | RepetitionAction::Optional
+ | RepetitionAction::Plus => {
+ // this will not happen here
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ }
+ }
+ None => {
+ if stack.len() <= stack_growth {
+ dbg!();
+ return Err(Error::RegexEmptyStack(current_index));
+ }
+
+ stack.truncate(stack.len() - stack_growth);
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ }
+ }
+ '[' => {
+ // option
+
+ let action = repetition_spec.try_into()?;
+
+ let stack_last = stack
+ .iter()
+ .copied()
+ .last()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ match action {
+ RepetitionAction::Ignore => {
+ types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ edges[stack_last].push(types.len() - 3);
+
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((3, Some(action)));
+ }
+ RepetitionAction::Optional => {
+ types.extend([
+ RegexType::Optional,
+ RegexType::Optional,
+ RegexType::Or,
+ RegexType::Empty,
+ ]);
+ edges.extend([
+ vec![types.len() - 3],
+ vec![types.len() - 2],
+ vec![types.len() - 1],
+ Vec::new(),
+ ]);
+
+ edges[stack_last].push(types.len() - 4);
+
+ stack.push(types.len() - 4);
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((4, None));
+ }
+ RepetitionAction::Star => {
+ types.extend([
+ RegexType::Kleene,
+ RegexType::Optional,
+ RegexType::Or,
+ RegexType::Empty,
+ ]);
+ edges.extend([
+ vec![types.len() - 3],
+ vec![types.len() - 2],
+ vec![types.len() - 1],
+ Vec::new(),
+ ]);
+
+ edges[stack_last].push(types.len() - 4);
+
+ stack.push(types.len() - 4);
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((4, None));
+ }
+ RepetitionAction::Plus => {
+ types.extend([
+ RegexType::Kleene,
+ RegexType::Optional,
+ RegexType::Or,
+ RegexType::Empty,
+ ]);
+ edges.extend([
+ vec![types.len() - 3],
+ vec![types.len() - 2],
+ vec![types.len() - 1],
+ Vec::new(),
+ ]);
+
+ edges[stack_last].push(types.len() - 4);
+
+ stack.push(types.len() - 4);
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((4, None));
+ }
+ RepetitionAction::Copy(1) => {
+ types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ edges[stack_last].push(types.len() - 3);
+
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((3, None));
+ }
+ _ => {
+ types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ edges[stack_last].push(types.len() - 3);
+
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((3, Some(action)));
+ }
+ }
+
+ s = &s[1..];
+ current_index += 1;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ return Err(Error::RegexUnbalancedParen(current_index));
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ ']' => {
+ // end option
+
+ s = &s[1..];
+ current_index += 1;
+
+ if repetition_spec.is_some() {
+ return Err(Error::RegexInvalidRepeat(current_index));
+ }
+
+ let (stack_growth, repetition_action) = repetition_action_stack
+ .pop()
+ .ok_or(Error::RegexEmptyActionStack(current_index))?;
+
+ match repetition_action {
+ Some(action) => {
+ if stack.len() <= stack_growth + 1 {
+ return Err(Error::RegexEmptyStack(current_index));
+ }
+
+ stack.truncate(stack.len() - stack_growth + 1);
+
+ // this is not going to fail because of the previous check
+ let stack_last = stack.pop().unwrap();
+ let stack_second_last = stack.iter().copied().last().unwrap();
+
+ match action {
+ RepetitionAction::Ignore => {
+ edges[stack_second_last].pop();
+
+ types.truncate(stack_last);
+ edges.truncate(stack_last);
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::CopyPlus(n) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ types.push(RegexType::Plus);
+ edges.push(vec![types.len()]);
+
+ edges[stack_second_last].push(types.len() - 1);
+
+ // if stack_second_last == 3 && nodes.len() == 4 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::CopyOptional(n, m) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ let nodes_cloned_len = nodes_cloned.len();
+
+ for _ in 0..(m - 1) {
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len(), types.len() + nodes_cloned_len]);
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len()]);
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::Star
+ | RepetitionAction::Optional
+ | RepetitionAction::Plus => {
+ // this will not happen here
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ }
+ }
+ None => {
+ if stack.len() <= stack_growth {
+ return Err(Error::RegexEmptyStack(current_index));
+ }
+
+ stack.truncate(stack.len() - stack_growth);
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ }
+ }
+ '/' => {
+ // end concatenation
+
+ s = &s[1..];
+ current_index += 1;
+
+ if repetition_spec.is_some() {
+ return Err(Error::RegexInvalidRepeat(current_index));
+ }
+
+ if stack.len() <= 1 {
+ return Err(Error::RegexEmptyStack(current_index));
+ }
+
+ stack.pop().unwrap();
+
+ let stack_last = stack.iter().copied().last().unwrap();
+
+ edges[stack_last].push(types.len());
+
+ types.push(RegexType::Empty);
+ edges.push(vec![]);
+
+ stack.push(types.len() - 1);
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ '"' => {
+ // string
+
+ s = &s[1..];
+ current_index += 1;
+
+ let skip_char_value = skip_by(s, is_char_value);
+
+ if skip_char_value >= s.len() {
+ return Err(Error::HangingDQuote(current_index));
+ }
+
+ let fakes = &s[skip_char_value..];
+
+ if !fakes.starts_with('"') {
+ return Err(Error::HangingDQuote(current_index));
+ }
+
+ let action = repetition_spec.try_into()?;
+
+ let stack_last = stack
+ .iter()
+ .copied()
+ .last()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ let slice_value = s[..skip_char_value].to_owned();
+
+ intermediates.push(IntermediateConstruct::Terminal(slice_value.clone()));
+
+ let t_index = intermediates.len() - 1;
+
+ match action {
+ RepetitionAction::Ignore => {}
+ RepetitionAction::Star => {
+ types.push(RegexType::Kleene);
+ types.push(RegexType::Lit(t_index));
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ edges[stack_last].push(types.len() - 2);
+ }
+ RepetitionAction::Optional => {
+ types.append(&mut vec![RegexType::Optional]);
+ types.push(RegexType::Lit(t_index));
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ edges[stack_last].push(types.len() - 2);
+ }
+ RepetitionAction::Plus => {
+ types.append(&mut vec![RegexType::Plus]);
+ types.push(RegexType::Lit(t_index));
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ edges[stack_last].push(types.len() - 2);
+ }
+ RepetitionAction::Copy(n) => {
+ let types_len = types.len();
+
+ types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| types_len + i));
+
+ // for _ in 0..n {
+ // types.push(RegexType::Lit(t_index));
+ // edges.push(vec![]);
+
+ // edges[stack_last].push(types.len() - 1);
+ // }
+ }
+ RepetitionAction::CopyPlus(n) => {
+ let types_len = types.len();
+
+ types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| types_len + i));
+
+ types.extend([RegexType::Plus, RegexType::Lit(t_index)]);
+ edges.extend([vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+ }
+ RepetitionAction::CopyOptional(n, m) => {
+ let types_len = types.len();
+
+ types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| types_len + i));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len());
+
+ for _ in 0..(m - 1) {
+ types.extend([RegexType::Optional, RegexType::Lit(t_index)]);
+ edges.extend([vec![types.len() - 1, types.len()], Vec::new()]);
+ }
+
+ types.extend([RegexType::Optional, RegexType::Lit(t_index)]);
+ edges.extend([vec![types.len() - 1], Vec::new()]);
+ }
+ }
+
+ s = &fakes[1..];
+ current_index += skip_char_value + 1;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ '%' => {
+ // numeric value
+
+ s = &s[1..];
+ current_index += 1;
+
+ if s.is_empty() {
+ return Err(Error::HangingPercent(current_index - 1));
+ }
+
+ let inner_first_char = s.chars().next().unwrap();
+
+ let intermediate_value: IntermediateConstruct;
+
+ match inner_first_char {
+ 'b' => {
+ s = &s[1..];
+ current_index += 1;
+
+ let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Binary)?;
+
+ intermediate_value = parsed_construct;
+
+ s = &s[parsed_size..];
+ current_index += parsed_size;
+ }
+ 'd' => {
+ s = &s[1..];
+ current_index += 1;
+
+ let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Decimal)?;
+
+ intermediate_value = parsed_construct;
+
+ s = &s[parsed_size..];
+ current_index += parsed_size;
+ }
+ 'x' => {
+ s = &s[1..];
+ current_index += 1;
+
+ let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Hexadecimal)?;
+ // println!("size = {parsed_size}, construct = {parsed_construct:#?}");
+
+ intermediate_value = parsed_construct;
+
+ s = &s[parsed_size..];
+ current_index += parsed_size;
+ }
+ _ => {
+ return Err(Error::RegexInvalidNumeric(current_index - 1));
+ }
+ }
+
+ let action = repetition_spec.try_into()?;
+
+ let stack_last = stack
+ .iter()
+ .copied()
+ .last()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ intermediates.push(intermediate_value);
+
+ let inter_index = intermediates.len() - 1;
+
+ match action {
+ RepetitionAction::Ignore => {}
+ RepetitionAction::Star => {
+ types.push(RegexType::Kleene);
+ edges.push(vec![types.len()]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+ RepetitionAction::Optional => {
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len()]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+ RepetitionAction::Plus => {
+ types.push(RegexType::Plus);
+ edges.push(vec![types.len()]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+ RepetitionAction::Copy(n) => {
+ types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit));
+
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 1);
+
+ // for _ in 0..n {
+ // types.push(RegexType::Lit(inter_index));
+ // edges.push(vec![]);
+ // edges[stack_last].push(types.len() - 1);
+ // }
+ }
+ RepetitionAction::CopyPlus(n) => {
+ types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit));
+
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 1);
+
+ types.push(RegexType::Plus);
+ edges.push(vec![types.len()]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+ RepetitionAction::CopyOptional(n, m) => {
+ types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit));
+
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 1);
+
+ for _ in 0..(m - 1) {
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len(), types.len() + 1]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len()]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+ }
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ '<' => {
+ // prose value
+ return Err(Error::Unimplemented(current_index));
+ }
+ ';' => {
+ // Possibly extend to the next line if the indentation
+ // of the next line is greater than `indentation`.
+
+ s = &s[1..];
+ current_index += 1;
+
+ let mut in_comment = true;
+
+ loop {
+ let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp };
+
+ let mut after_wsp_and_v_char = skip_by(s, skip_fn);
+
+ if after_wsp_and_v_char >= s.len() {
+ current_index += s.len();
+ stack = vec![];
+ break;
+ }
+
+ s = &s[after_wsp_and_v_char..];
+ current_index += after_wsp_and_v_char;
+
+ let first_char = s.chars().next().unwrap();
+
+ let mut skip_again = false;
+
+ match first_char {
+ '\n' | '\r' => {
+ s = &s[1..];
+ current_index += 1;
+ skip_again = true;
+ }
+ _ if in_comment => {
+ // dbg!("1815");
+ return Err(Error::RegexInvalidComment(current_index));
+ }
+ _ => {}
+ }
+
+ if skip_again {
+ let skip_wsp = skip_by(s, is_wsp);
+
+ if skip_wsp >= s.len() {
+ current_index += s.len();
+ stack = vec![];
+ break;
+ }
+
+ after_wsp_and_v_char = skip_wsp;
+
+ s = &s[skip_wsp..];
+ current_index += skip_wsp;
+ }
+
+ let first_char = s.chars().next().unwrap();
+
+ match first_char {
+ ';' => {
+ // continue looping
+ s = &s[1..];
+ current_index += 1;
+ in_comment = true;
+
+ continue;
+ }
+ '\n' | '\r' => {
+ in_comment = false;
+ s = &s[1..];
+ current_index += 1;
+
+ continue;
+ }
+ _ if after_wsp_and_v_char <= indentation => {
+ stack = vec![];
+ break;
+ }
+ _ => {
+ break;
+ }
+ }
+ }
+ }
+ '\n' | '\r' => {
+ // Possibly extend to the next line if the indentation
+ // of the next line is greater than `indentation`.
+ // println!("s = {}", s.chars().take(10).collect::<String>());
+
+ s = &s[1..];
+ current_index += 1;
+
+ if s.is_empty() {
+ break;
+ }
+
+ let mut in_comment = false;
+
+ loop {
+ let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp };
+
+ let mut after_wsp = skip_by(s, skip_fn);
+
+ if after_wsp >= s.len() {
+ current_index += s.len();
+ stack = vec![];
+ break;
+ }
+
+ s = &s[after_wsp..];
+ current_index += after_wsp;
+
+ let first_char = s.chars().next().unwrap();
+
+ let mut skip_again = false;
+
+ match first_char {
+ '\n' | '\r' => {
+ s = &s[1..];
+ current_index += 1;
+ skip_again = true;
+ }
+ _ if in_comment => {
+ // dbg!("1903");
+ return Err(Error::RegexInvalidComment(current_index));
+ }
+ _ => {}
+ }
+
+ if skip_again {
+ let skip_wsp = skip_by(s, is_wsp);
+
+ if skip_wsp >= s.len() {
+ current_index += s.len();
+ stack = vec![];
+ break;
+ }
+
+ s = &s[skip_wsp..];
+ current_index += skip_wsp;
+
+ after_wsp = skip_wsp;
+ }
+
+ let first_char = s.chars().next().unwrap();
+
+ match first_char {
+ ';' => {
+ // continue looping
+ s = &s[1..];
+ current_index += 1;
+ in_comment = true;
+
+ continue;
+ }
+ '\n' | '\r' => {
+ in_comment = false;
+ s = &s[1..];
+ current_index += 1;
+
+ continue;
+ }
+ _ if after_wsp <= indentation => {
+ stack = vec![];
+ break;
+ }
+ _ => {
+ break;
+ }
+ }
+ }
+ }
+ _ => {
+ // invalid
+ println!("s = {}", &s[..(if s.len() > 50 { 50 } else { s.len() })]);
+ return Err(Error::InvalidCharacter(current_index));
+ }
+ }
+ }
+
+ let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?;
+
+ Ok((
+ current_index,
+ AlternationConstruct {
+ intermediates,
+ regex,
+ },
+ ))
+}
+
+type RuleConstruct = (
+ // Name
+ String,
+ // Data
+ Vec<IntermediateConstruct>,
+ // Rule
+ DefaultRegex<usize>,
+ // extra_alternation_or_not
+ bool,
+);
+
+fn parse_one_rule(s: &str, indentation: usize) -> Result<(usize, RuleConstruct), Error> {
+ if s.is_empty() {
+ return Ok((0, ("".to_owned(), Vec::new(), Default::default(), false)));
+ }
+
+ let mut s = s;
+
+ let mut current_index = 0usize;
+
+ let mut appending_alternation = false;
+
+ // rule name
+
+ let rulename_end = parse_rulename(s)?;
+ let rule_name = s[..rulename_end].to_string();
+ s = &s[rulename_end..];
+ current_index += rulename_end;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ // dbg!();
+ return Err(Error::HangingRuleName(0));
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ // equal or equal forward slash
+
+ let next_char = s.chars().next().unwrap();
+
+ if next_char != '=' {
+ // println!("s = {}", &s[..50]);
+ // println!("rule_name = {rule_name}");
+ return Err(Error::MissingEqualSign(current_index + 1));
+ }
+
+ s = &s[1..];
+ current_index += 1;
+
+ if s.is_empty() {
+ return Err(Error::HangingEqualsSign(current_index - 1));
+ }
+
+ let next_char = s.chars().next().unwrap();
+
+ if next_char == '/' {
+ appending_alternation = true;
+ s = &s[1..];
+ current_index += 1;
+ }
+
+ if s.is_empty() {
+ return Err(Error::HangingEqualsSign(current_index - 1));
+ }
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_c_wsp(s);
+
+ if skip_c_wsp_size >= s.len() {
+ // dbg!();
+ return Err(Error::HangingRuleName(0));
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ // elements
+
+ // alternation
+
+ let (alternation_size, alternation_result) = parse_alternation(s, indentation)?;
+
+ let intermediates_data = alternation_result.intermediates;
+ let regex = alternation_result.regex;
+
+ // println!("size = {alternation_size}, result = {alternation_result:?}");
+
+ if alternation_size >= s.len() {
+ s = "";
+ } else {
+ s = &s[alternation_size..];
+ }
+
+ current_index += alternation_size;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_c_wsp(s);
+
+ if skip_c_wsp_size >= s.len() {
+ s = "";
+ } else {
+ s = &s[skip_c_wsp_size..];
+ }
+
+ current_index += skip_c_wsp_size;
+
+ // c-nl
+
+ if let Some(skip_c_nl) = skip_c_nl(s) {
+ current_index += skip_c_nl;
+ }
+
+ Ok((
+ current_index,
+ (rule_name, intermediates_data, regex, appending_alternation),
+ ))
+}
+
+impl std::str::FromStr for Grammar {
+ // First skip whitespaces, linefeeds, carraige returns and
+ // semicolons until the first rule appears. The basic indentation
+ // level is set to the indentation level of the first rule's line.
+ //
+ // Then enter identifier reading mode, reading a letter followed
+ // by a sequence of letters, digits and hyphens.
+ //
+ // After skipping white spaces, read an equal sign.
+ //
+ // Then enter alternation reading mode, reading alternations of
+ // concatenations.
+ //
+ // Then enter element reading mode, reading optionally repetition
+ // specification followed by an identifier, or a numeral value, or
+ // a quoted string.
+
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
+ let mut before_first_rule = true;
+ let mut basic_indentation: usize = 0;
+ let mut current_skipped_index: usize = 0;
+
+ let mut s = s;
+
+ while before_first_rule {
+ before_first_rule = false;
+
+ let after_wsp = skip_by(s, is_wsp);
+
+ if after_wsp >= s.len() {
+ // The entire string consists of white-spaces and
+ // comments.
+ return Err(Error::EmptyGrammar);
+ }
+
+ s = &s[after_wsp..];
+ current_skipped_index += after_wsp;
+
+ match skip_c_nl(s) {
+ Some(skipped_size) => {
+ if skipped_size >= s.len() {
+ return Err(Error::EmptyGrammar);
+ }
+
+ // Repeat skipping until we find the first rule.
+ before_first_rule = true;
+
+ s = &s[skipped_size..];
+ current_skipped_index += skipped_size;
+ }
+ None => {
+ // Found the first rule.
+ basic_indentation = after_wsp;
+ }
+ }
+ }
+
+ // TODO: Core rules are not implemented for now.
+
+ #[allow(unused_macros)]
+ macro_rules! log {
+ () => {
+ let mut file = std::fs::OpenOptions::new()
+ .create(true)
+ .append(true)
+ .open("debug.log")
+ .unwrap();
+
+ let _ = file.write_all(format!("{}\n", line!()).as_bytes());
+ };
+ }
+
+ let mut nonterminals_vec: Vec<String> = vec![];
+ let mut nonterminals_map: HashMap<String, usize> = HashMap::new();
+
+ let mut all_rules: HashMap<String, DefaultRegex<TNT>> = Default::default();
+
+ let mut terminals_vec: Vec<String> = vec![];
+ let mut terminals_map: HashMap<String, usize> = HashMap::new();
+
+ let mut temp = s;
+
+ // First collect non-terminals in the order they are defined.
+ loop {
+ let (rule_size, (rule_name, _rule_data, _rule_regex, appending_p)) =
+ parse_one_rule(temp, basic_indentation)?;
+
+ if rule_size == 0 {
+ break;
+ }
+
+ if appending_p && nonterminals_map.get(&rule_name).is_none() {
+ return Err(Error::AppendToNothing(current_skipped_index));
+ } else if !appending_p && nonterminals_map.get(&rule_name).is_some() {
+ return Err(Error::CreateAgain(current_skipped_index));
+ }
+
+ nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len());
+ nonterminals_vec.push(rule_name);
+
+ if rule_size > 0 && rule_size < temp.len() {
+ temp = &temp[rule_size..];
+ current_skipped_index += rule_size;
+ } else {
+ break;
+ }
+ }
+
+ let mut temp = s;
+
+ // Then gather the data about names of terminals and
+ // non-terminals.
+ loop {
+ let (rule_size, (_rule_name, rule_data, _rule_regex, _appending_p)) =
+ parse_one_rule(temp, basic_indentation)?;
+
+ if rule_size == 0 {
+ break;
+ }
+
+ for data in rule_data {
+ match data {
+ IntermediateConstruct::Terminal(name) => {
+ if terminals_map.get(&name).is_none() {
+ terminals_map.insert(name.clone(), terminals_vec.len());
+ terminals_vec.push(name);
+ }
+ }
+ IntermediateConstruct::Nonterminal(name) => {
+ if nonterminals_map.get(&name).is_none() {
+ return Err(Error::UnknownNonterminal(name));
+ }
+ }
+ IntermediateConstruct::Range(_, _) => unimplemented!(),
+ }
+ }
+
+ if rule_size > 0 && rule_size < temp.len() {
+ temp = &temp[rule_size..];
+ current_skipped_index += rule_size;
+ } else {
+ break;
+ }
+ }
+
+ // Then collect the actual rules.
+ loop {
+ let (rule_size, (rule_name, rule_data, rule_regex, appending_p)) =
+ parse_one_rule(s, basic_indentation)?;
+ // println!("rule name = {rule_name}");
+ // println!("regexp = {rule_regexp:#?}");
+
+ let rule_transform = |old_label: usize| -> Result<TNT, Error> {
+ if let Some(old_value) = rule_data.get(old_label) {
+ match old_value {
+ IntermediateConstruct::Terminal(name) => {
+ if let Some(index) = terminals_map.get(name) {
+ Ok(TNT::Ter(*index))
+ } else {
+ dbg!();
+ Err(Error::UnknownTerminal(name.clone()))
+ }
+ }
+ IntermediateConstruct::Nonterminal(name) => {
+ if let Some(index) = nonterminals_map.get(name) {
+ Ok(TNT::Non(*index))
+ } else {
+ dbg!();
+ Err(Error::UnknownNonterminal(name.clone()))
+ }
+ }
+ IntermediateConstruct::Range(_, _) => {
+ dbg!();
+ Err(Error::UnsupportedOperation)
+ }
+ }
+ } else {
+ dbg!();
+ Err(Error::RegexIndexOutOfBounds(old_label, rule_data.len()))
+ }
+ };
+
+ let transformed_rule = rule_regex.maps_to(rule_transform)?;
+
+ if rule_size == 0 {
+ break;
+ }
+
+ if !appending_p {
+ nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len());
+ nonterminals_vec.push(rule_name.clone());
+
+ all_rules.insert(rule_name, transformed_rule);
+ } else {
+ all_rules
+ .get_mut(&rule_name)
+ .unwrap()
+ .append(transformed_rule);
+ }
+
+ if rule_size > 0 && rule_size < s.len() {
+ s = &s[rule_size..];
+ current_skipped_index += rule_size;
+ } else {
+ break;
+ }
+ }
+
+ let rules_vec: Vec<Rule> = nonterminals_vec
+ .iter()
+ .map(|name| {
+ all_rules
+ .get(name)
+ .map(|regex| Rule::new(regex.clone()))
+ .ok_or(Error::UnknownNonterminal(name.clone()))
+ })
+ .collect::<Result<_, _>>()?;
+
+ let terminals: Vec<Terminal> = terminals_vec.into_iter().map(Terminal::new).collect();
+ let nonterminals: Vec<Nonterminal> =
+ nonterminals_vec.into_iter().map(Nonterminal).collect();
+
+ Ok(Self::new(terminals, nonterminals, rules_vec))
+ }
+}
+
+#[cfg(test)]
+mod tests;
diff --git a/grammar/src/abnf/tests.rs b/grammar/src/abnf/tests.rs
new file mode 100644
index 0000000..67dfaaa
--- /dev/null
+++ b/grammar/src/abnf/tests.rs
@@ -0,0 +1,291 @@
+//! This module contains unit tests for the abnf grammar parser.
+
+use super::*;
+
+#[test]
+fn test_skip_by() -> Result<(), Box<dyn std::error::Error>> {
+ let string = std::iter::repeat('a')
+ .take(10)
+ .chain(std::iter::repeat('哈').take(10))
+ .chain(" \t\n".chars())
+ .collect::<String>();
+
+ let mut s: &str = string.as_str();
+
+ dbg!(&s);
+
+ let alpha_len = skip_by(s, is_alpha);
+
+ assert_eq!(alpha_len, 10);
+
+ s = &s[alpha_len..];
+
+ dbg!(&s);
+
+ let not_newline_len = skip_by(s, is_not_newline);
+
+ // 哈 takes 3 bytes, so the count should be 30 plus two bytes for
+ // the space and the tab.
+ assert_eq!(not_newline_len, 32);
+
+ Ok(())
+}
+
+#[test]
+fn test_skip_c_nl() -> Result<(), Box<dyn std::error::Error>> {
+ let string = std::iter::repeat(' ')
+ .take(10)
+ .chain(std::iter::repeat('哈').take(10))
+ .chain(" \t\n".chars())
+ .collect::<String>();
+
+ let s = string.as_str();
+
+ dbg!(s);
+
+ let c_nl = skip_c_nl(s);
+
+ assert!(c_nl.is_none());
+
+ let new_string = std::iter::once(';')
+ .chain(string.chars())
+ .chain(string.chars())
+ .collect::<String>();
+
+ let s = new_string.as_str();
+
+ dbg!(s);
+
+ let c_nl = skip_c_nl(s);
+
+ assert_eq!(c_nl, Some(string.len() + 1));
+
+ let c_nl = skip_c_nl("\n");
+
+ assert_eq!(c_nl, Some(1));
+
+ Ok(())
+}
+
+#[test]
+fn test_skip_c_wsp() -> Result<(), Box<dyn std::error::Error>> {
+ let mut string: String = std::iter::repeat(' ')
+ .take(10)
+ .chain(std::iter::repeat('\t').take(10))
+ .collect();
+
+ string.extend(
+ std::iter::once(';')
+ .chain(
+ std::iter::repeat(std::iter::once('\n').chain(std::iter::once(';')))
+ .take(10)
+ .flatten(),
+ )
+ .chain(std::iter::once('\n'))
+ .chain(std::iter::repeat('a').take(10)),
+ );
+
+ let string = string;
+
+ dbg!(&string);
+
+ let skip_c_wsp_size = skip_c_wsp(&string);
+
+ assert_eq!(skip_c_wsp_size, string.len() - 10);
+
+ Ok(())
+}
+
+#[test]
+fn test_parse_repetition() -> Result<(), Box<dyn std::error::Error>> {
+ let strs = ["*A", "0*A", "*0A", "2*A", "*2A", "5*10A", "A"];
+
+ let results = strs.map(parse_repetition_spec);
+
+ dbg!(&results);
+
+ let answers = [
+ Some((1, RepetitionSpec::Star)),
+ Some((2, RepetitionSpec::Min(0))),
+ Some((2, RepetitionSpec::Max(0))),
+ Some((2, RepetitionSpec::Min(2))),
+ Some((2, RepetitionSpec::Max(2))),
+ Some((4, RepetitionSpec::MinMax(5, 10))),
+ None,
+ ];
+
+ for (result, answer) in results.iter().zip(answers.iter()) {
+ assert_eq!(result, answer);
+ }
+
+ Ok(())
+}
+
+use super::Error;
+
+#[test]
+fn test_parse_rulename() -> Result<(), Box<dyn std::error::Error>> {
+ let s = "NONTERMINAL";
+
+ assert!(matches!(parse_rulename(s), Err(Error::HangingRuleName(0))));
+
+ let s = " NONTERMINAL = ";
+
+ assert!(matches!(parse_rulename(s), Ok(0)));
+
+ let s = "NONTERMINAL = ";
+
+ assert!(matches!(parse_rulename(s), Ok(11)));
+
+ Ok(())
+}
+
+#[test]
+fn test_parse_numeric() -> Result<(), Box<dyn std::error::Error>> {
+ let s = "haha";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Binary),
+ Err(Error::InvalidDigit(0))
+ ));
+
+ let s = "124";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Binary),
+ Err(Error::InvalidCharacter(1))
+ ));
+
+ let s = "32";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Decimal),
+ Ok((2, IntermediateConstruct::Terminal(string))) if string == " ".to_string()
+ ));
+
+ let s = "32.65.66.66.65";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Decimal),
+ Ok((14, IntermediateConstruct::Terminal(string))) if string == " ABBA".to_string()
+ ));
+
+ let s = "32-";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Decimal),
+ Err(Error::HangingNumeric(2))
+ ));
+
+ let s = "32-124";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Decimal),
+ Ok((6, IntermediateConstruct::Range(from, to))) if from == ' ' && to == '|',
+ ));
+
+ Ok(())
+}
+
+#[test]
+fn test_parse_alternation() -> Result<(), Box<dyn std::error::Error>> {
+ let s = "0*10nonterminal\n";
+
+ let (len, alt) = parse_alternation(s, 0)?;
+
+ assert_eq!(len, s.len());
+
+ let regex = alt.regex;
+
+ println!("regex = {regex}");
+
+ assert_eq!(
+ format!("{regex}"),
+ "((0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?)?)"
+ );
+
+ let s = "0*10nonterminal ( *nonterminal nonterminal / [ nonterminal ] 1*5nonterminal )\n";
+
+ let (len, alt) = parse_alternation(s, 0)?;
+
+ assert_eq!(len, s.len());
+
+ let regex = alt.regex;
+
+ println!("regex = {regex}");
+
+ assert_eq!(
+ format!("{regex}"),
+ "((0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?)?((1)*2|((3))?4(4(4(4(4)?)?)?)?))"
+ );
+
+ let s = "5nonterminal\n";
+
+ let (len, alt) = parse_alternation(s, 0)?;
+
+ assert_eq!(len, s.len());
+
+ let regex = alt.regex;
+
+ println!("regex = {regex}");
+
+ assert_eq!(format!("{regex}"), "(00000)");
+
+ Ok(())
+}
+
+#[test]
+fn test_parse_onerule() -> Result<(), Box<dyn std::error::Error>> {
+ let s = "start = 1*10nonterminal [ nonterminal ] / 1*nonterminal\n";
+
+ let (len, (name, data, regex, extra_p)) = parse_one_rule(s, 0)?;
+
+ println!("name = {name}");
+ print!("data = ");
+
+ for (index, non_terminal) in data.iter().enumerate() {
+ print!("({index}, {non_terminal})");
+ }
+
+ println!();
+
+ println!("regex = {regex}");
+
+ println!("extra_p = {extra_p}");
+
+ assert_eq!(len, s.len());
+
+ assert_eq!(name, "start".to_string());
+
+ let answers = ["nnonterminal", "nnonterminal", "nnonterminal"];
+
+ assert_eq!(data.len(), answers.len());
+
+ for (tnt, answer) in data.into_iter().zip(answers) {
+ assert_eq!(format!("{tnt}"), answer.to_string());
+ }
+
+ assert_eq!(
+ format!("{regex}"),
+ "(0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?((1))?|(2)+)".to_string()
+ );
+
+ assert!(!extra_p);
+
+ Ok(())
+}
+
+#[test]
+fn test_parse_grammar() -> Result<(), Box<dyn std::error::Error>> {
+ let grammar_file_name = "abnf grammars/test.abnf";
+
+ let file_content = std::fs::read_to_string(grammar_file_name)?;
+
+ let grammar: Grammar = file_content.parse()?;
+
+ println!("print grammar:");
+
+ println!("{grammar}");
+
+ Ok(())
+}
diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs
index 11cb161..ea1299f 100644
--- a/grammar/src/lib.rs
+++ b/grammar/src/lib.rs
@@ -93,7 +93,7 @@ impl Display for TNT {
}
/// Errors related to grammar operations.
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum Error {
/// The operation requires the grammar to be after a certain
@@ -101,6 +101,8 @@ pub enum Error {
WrongState(GrammarState, GrammarState),
/// The first component is the index, and the second the bound.
IndexOutOfBounds(usize, usize),
+ /// The given name of a terminal or a non-terminal is unknown.
+ UnknownTNTName(String),
/// Fail to build the N-th regular expression, due to the
/// ParseError.
BuildFail(usize, ParseError),
@@ -123,6 +125,11 @@ impl Display for Error {
"Failed to build the {n}-th regular expression due to error: {pe}"
),
Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"),
+ Error::UnknownTNTName(name) => write!(
+ f,
+ "the name {name} is unknown \
+ for a terminal or a non-terminal."
+ ),
Error::WrongState(current, threshold) => {
write!(f, "require state {threshold}, but in state {current}")
}
@@ -158,6 +165,12 @@ impl Rule {
pub fn len(&self) -> usize {
self.regex.len()
}
+
+ /// Wrap a regular expression into a rule.
+ #[inline]
+ pub fn new(regex: DefaultRegex<TNT>) -> Self {
+ Self { regex }
+ }
}
/// The state of Grammar.
@@ -190,6 +203,37 @@ impl Display for GrammarState {
}
}
+/// This enum represents the name of either a terminal or a
+/// non-terminal.
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
+pub enum TNTName {
+ /// The name of a terminal.
+ Ter(String),
+ /// The name of a non-terminal.
+ Non(String),
+}
+
+impl TNTName {
+ /// Construct the name of a terminal or a non-terminal.
+ #[inline]
+ pub fn new(name: String, terminal_p: bool) -> Self {
+ if terminal_p {
+ Self::Ter(name)
+ } else {
+ Self::Non(name)
+ }
+ }
+
+ /// Return the underlying name.
+ #[inline]
+ pub fn name(&self) -> &str {
+ match self {
+ TNTName::Ter(name) => name,
+ TNTName::Non(name) => name,
+ }
+ }
+}
+
/// The type of a grammar.
#[derive(Debug, Clone, Default)]
pub struct Grammar {
@@ -197,6 +241,8 @@ pub struct Grammar {
ter: Vec<Terminal>,
/// A list of non-terminals.
non: Vec<Nonterminal>,
+ /// A map from the names of terminals or non-terminals to TNT.
+ tnt_map: HashMap<TNTName, TNT>,
/// A list of rules.
///
/// The length of the list must match that of the list of
@@ -286,6 +332,22 @@ impl Grammar {
let expansion_map = Default::default();
let reduction_map = Default::default();
+ let mut tnt_map: HashMap<TNTName, TNT> = Default::default();
+
+ for (index, ter_element) in ter.iter().enumerate() {
+ tnt_map.insert(
+ TNTName::new(ter_element.name().to_string(), true),
+ TNT::Ter(index),
+ );
+ }
+
+ for (index, non_element) in non.iter().enumerate() {
+ tnt_map.insert(
+ TNTName::new(non_element.name().to_string(), false),
+ TNT::Non(index),
+ );
+ }
+
// NOTE: We cannot calculate accumulators here, as we want the
// accumulators of the regular expression of the left-closure,
// not of the original one.
@@ -294,6 +356,7 @@ impl Grammar {
Self {
ter,
non,
+ tnt_map,
rules,
firsts,
first_nodes,
@@ -304,6 +367,16 @@ impl Grammar {
}
}
+ /// Convert from the name of a terminal or a non-terminal to a
+ /// struct TNT.
+ #[inline]
+ pub fn name_to_tnt(&self, name: &TNTName) -> Result<TNT, Error> {
+ self.tnt_map
+ .get(name)
+ .copied()
+ .ok_or_else(|| Error::UnknownTNTName(name.name().to_string()))
+ }
+
/// Return the name of a terminal or a non-terminal.
pub fn name_of_tnt(&self, tnt: TNT) -> Result<String, Error> {
match tnt {
@@ -313,6 +386,14 @@ impl Grammar {
.get(t)
.ok_or(Error::IndexOutOfBounds(t, self.ter.len()))?
.name()
+ .chars()
+ .map(|c| if crate::abnf::is_v_char(c) {
+ c.to_string()
+ } else {
+ format!("{:#x}", c as usize)
+ })
+ .collect::<Vec<_>>()
+ .join("")
)),
TNT::Non(n) => Ok(format!(
"N{}",
@@ -822,7 +903,7 @@ impl Display for Grammar {
f,
"{}",
rule.regex.to_string_with(|tnt| format!(
- "({})",
+ " {} ",
self.name_of_tnt(tnt)
.unwrap_or_else(|_| format!("Unknown {tnt:?}"))
))?