structural change: separate crates out

I moved the functionality that is not strictly core into separate crates, so that the whole package becomes more modular and it becomes easier to try other parsing algorithms in the future. Also, I have to figure out the forests before finishing the core chain-rule algorithm, as the part about forests directly affects the labels of the grammars. From my experience in writing the previous version, it is asking for trouble to change the label type dramatically at a later point: too many places need to be changed. Thus I decided to figure out the rough part of the forests first. Actually, I only have to figure out how to attach forest fragments to the edges of the underlying atomic languages; the more complex parts of putting forests together can be left to the recorders, which is my vision of assembling semi-ring values during the chain-rule machine. It should be relatively easy to produce forest fragments from grammars, since we are just trying to extract some information from the grammar, not to manipulate that information in some complicated way. We do have to perform some manipulations in the process, though, in order to make sure that the nulling and epsilon-removal processes do not invalidate these fragments.
author: JSDurand <mmemmew@gmail.com> 2023-01-03 23:44:02 +0800
committer: JSDurand <mmemmew@gmail.com> 2023-01-03 23:44:02 +0800
commit: bdbd4b4dc21af09711c97d3f903877443199af06 (patch)
tree: c6a9602f72ee1f6fd7fd3f64b8679a4de50a0159 /grammar/src/test_grammar_helper.rs
parent: 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (diff)
1 files changed, 368 insertions, 0 deletions
diff --git a/grammar/src/test_grammar_helper.rs b/grammar/src/test_grammar_helper.rs
new file mode 100644
index 0000000..c236952
--- /dev/null
+++ b/grammar/src/test_grammar_helper.rs
@@ -0,0 +1,368 @@
+//! This module provides some grammars for testing.
+
+use super::*;
+use nfa::{
+    default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType},
+    DesRec,
+};
+use std::fmt::Write;
+
+/// Compute the first sets of `grammar`, then build and return its
+/// left-closure.
+///
+/// # Errors
+///
+/// An error from either step is returned boxed.
+pub fn new_closure_regex(
+    grammar: &mut Grammar,
+) -> Result<Vec<DefaultRegex<TNT>>, Box<dyn std::error::Error>> {
+    grammar.compute_firsts()?;
+
+    let closure = grammar.left_closure()?;
+
+    Ok(closure)
+}
+
+/// Read the leading run of ASCII letters from `chars` as a name,
+/// adding one to `len` per letter.  The character ending the run is
+/// taken from `chars` as well, but is not counted in `len`.
+fn scan_name(chars: &mut std::str::Chars<'_>, len: &mut usize) -> Result<String, ParseError> {
+    let mut name = String::new();
+
+    for c in chars.by_ref() {
+        if !c.is_ascii_alphabetic() {
+            break;
+        }
+
+        *len += 1;
+        write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?;
+    }
+
+    Ok(name)
+}
+
+/// Scan the next token of `input` for the regular expression parser.
+///
+/// Spaces before the token are skipped.  `*`, `+`, `?`, `|`, `(` and
+/// `)` are operators; `T` or `N` followed by ASCII letters names a
+/// terminal or a non-terminal, which is looked up in `parser`.
+///
+/// Returns `Ok(None)` if `input` has nothing but spaces.  Otherwise
+/// returns the number of characters scanned (spaces included), the
+/// type of the token, and the direction paired with it.
+///
+/// # Errors
+///
+/// Returns `ParseError::InvalidCharacter` for a character starting
+/// no token, or, carrying the `T` / `N` prefix, for an unknown name.
+fn scan_tnt(
+    parser: &DefaultRegParser<TNT>,
+    input: &str,
+) -> Result<Option<(usize, RegexType<TNT>, ParseDirection)>, ParseError> {
+    use ParseDirection::*;
+    use RegexType::*;
+    use TNT::*;
+
+    let mut chars = input.chars();
+
+    // Counts the token character itself; spaces and name letters are
+    // added as they are read.
+    let mut len = 1;
+
+    while let Some(first) = chars.next() {
+        match first {
+            // ignore whitespaces
+            ' ' => len += 1,
+            '*' => return Ok(Some((len, Kleene, Right))),
+            '+' => return Ok(Some((len, Plus, Right))),
+            '?' => return Ok(Some((len, Optional, Right))),
+            '|' => return Ok(Some((len, Empty, Up))),
+            '(' => return Ok(Some((len, Or, Down))),
+            ')' => return Ok(Some((len, Paren, Up))),
+            'T' | 'N' => {
+                // Terminals and non-terminals differ only in the
+                // table that is queried and in the variant built.
+                let is_terminal = first == 'T';
+                let name = scan_name(&mut chars, &mut len)?;
+                let index = parser
+                    .query(&name, is_terminal)
+                    .ok_or(ParseError::InvalidCharacter(first))?;
+                let tnt = if is_terminal { Ter(index) } else { Non(index) };
+
+                return Ok(Some((len, Lit(tnt), Right)));
+            }
+            _ => return Err(ParseError::InvalidCharacter(first)),
+        }
+    }
+
+    Ok(None)
+}
+
+/// Build a small test grammar with the terminals `a` and `b` and the
+/// non-terminals `start` and `end`.
+///
+/// Its two rules are `Ta*Tb+Nend+` and `Nstart?Nend*`, written in
+/// the notation scanned by `scan_tnt`.
+///
+/// # Errors
+///
+/// Returns an error if a rule fails to parse.
+#[allow(dead_code)]
+pub fn new_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
+    let mut regex_parser: DefaultRegParser<TNT> = Default::default();
+
+    for name in ["a", "b"] {
+        regex_parser.add_tnt(name, true);
+    }
+
+    for name in ["start", "end"] {
+        regex_parser.add_tnt(name, false);
+    }
+
+    // Every rule is parsed the same way, so share the steps.
+    let parse_rule = |source: &str| -> Result<Rule, Box<dyn std::error::Error>> {
+        let parsed = regex_parser.parse(source, Box::new(scan_tnt), true)?;
+
+        Ok(Rule {
+            regex: parsed.ok_or(ParseError::Invalid)?.0,
+        })
+    };
+
+    let rules = vec![parse_rule("Ta*Tb+Nend+")?, parse_rule("Nstart?Nend*")?];
+
+    let ter = vec![
+        Terminal::new(String::from("a")),
+        Terminal::new(String::from("b")),
+    ];
+    let non = vec![
+        Nonterminal(String::from("start")),
+        Nonterminal(String::from("end")),
+    ];
+
+    Ok(Grammar::new(ter, non, rules))
+}
+
+/// Return a grammar that might serve as the grammar for my notes,
+/// somehow.
+///
+/// The terminals are `NL`, `SP`, `CON`, `STAR`, `NOTE`, `PRICE` and
+/// `DIGIT`; the non-terminals are `document`, `item`, `header`,
+/// `title`, `note`, `note-content` and `price`.  Seven rules are
+/// given, as many as there are non-terminals, written in the notation
+/// scanned by `scan_tnt`.
+///
+/// # Errors
+///
+/// Returns an error if one of the rules fails to parse.
+#[allow(dead_code)]
+pub fn new_notes_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
+    let ter = vec![
+        Terminal::new("NL".to_owned()),
+        Terminal::new("SP".to_owned()),
+        Terminal::new("CON".to_owned()),
+        Terminal::new("STAR".to_owned()),
+        Terminal::new("NOTE".to_owned()),
+        Terminal::new("PRICE".to_owned()),
+        Terminal::new("DIGIT".to_owned()),
+    ];
+    let non = vec![
+        Nonterminal("document".to_owned()),
+        Nonterminal("item".to_owned()),
+        Nonterminal("header".to_owned()),
+        Nonterminal("title".to_owned()),
+        Nonterminal("note".to_owned()),
+        Nonterminal("note-content".to_owned()),
+        Nonterminal("price".to_owned()),
+    ];
+
+    let mut regex_parser: DefaultRegParser<TNT> = Default::default();
+
+    // The parser only knows the names registered here, so use the
+    // same terminal names as in `ter`, in the same order.
+    for name in ["NL", "SP", "CON", "STAR", "NOTE", "PRICE", "DIGIT"] {
+        regex_parser.add_tnt(name, true);
+    }
+
+    // `scan_tnt` only reads ASCII letters in a name, so the
+    // non-terminal `note-content` is registered without its hyphen.
+    for name in [
+        "document",
+        "item",
+        "header",
+        "title",
+        "note",
+        "notecontent",
+        "price",
+    ] {
+        regex_parser.add_tnt(name, false);
+    }
+
+    // Every rule is parsed the same way, so share the steps.
+    let parse_rule = |source: &str| -> Result<Rule, Box<dyn std::error::Error>> {
+        let parsed = regex_parser.parse(source, Box::new(scan_tnt), true)?;
+
+        Ok(Rule {
+            regex: parsed.ok_or(ParseError::Invalid)?.0,
+        })
+    };
+
+    let rules = vec![
+        parse_rule("Nitem+")?,
+        parse_rule("Nheader Nprice?Nnote?")?,
+        parse_rule("TSTAR?TSP Ntitle TNL (TSP|TNL)*")?,
+        parse_rule("TCON+")?,
+        parse_rule("TNOTE Nnotecontent TNL (TSP|TNL)*")?,
+        parse_rule("TCON+")?,
+        parse_rule("TPRICE TSP TDIGIT+ TNL(TSP | TNL)+")?,
+    ];
+
+    Ok(Grammar::new(ter, non, rules))
+}
+
+/// Build a test grammar that can express parentheses.
+///
+/// The terminals are `LEFT`, `RIGHT` and `A`; the non-terminals are
+/// `start` and `content`.  The two rules are written in the notation
+/// scanned by `scan_tnt`.
+///
+/// # Errors
+///
+/// Returns an error if a rule fails to parse.
+#[allow(dead_code)]
+pub fn new_paren_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
+    let mut regex_parser: DefaultRegParser<TNT> = Default::default();
+
+    for name in ["LEFT", "RIGHT", "A"] {
+        regex_parser.add_tnt(name, true);
+    }
+
+    for name in ["start", "content"] {
+        regex_parser.add_tnt(name, false);
+    }
+
+    // Every rule is parsed the same way, so share the steps.
+    let parse_rule = |source: &str| -> Result<Rule, Box<dyn std::error::Error>> {
+        let parsed = regex_parser.parse(source, Box::new(scan_tnt), true)?;
+
+        Ok(Rule {
+            regex: parsed.ok_or(ParseError::Invalid)?.0,
+        })
+    };
+
+    let rules = vec![
+        parse_rule("TLEFT Nstart TRIGHT | Ncontent Nstart | ")?,
+        parse_rule("TA +")?,
+    ];
+
+    let ter = vec![
+        Terminal::new(String::from("LEFT")),
+        Terminal::new(String::from("RIGHT")),
+        Terminal::new(String::from("A")),
+    ];
+    let non = vec![
+        Nonterminal(String::from("start")),
+        Nonterminal(String::from("content")),
+    ];
+
+    Ok(Grammar::new(ter, non, rules))
+}
+
+/// Build a test grammar that is left recursive.
+///
+/// The terminals are `B` and `C`; the non-terminals are `start`, `S`
+/// and `A`.  The three rules are written in the notation scanned by
+/// `scan_tnt`.
+///
+/// # Errors
+///
+/// Returns an error if a rule fails to parse.
+#[allow(dead_code)]
+pub fn new_left_recursive_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
+    let mut regex_parser: DefaultRegParser<TNT> = Default::default();
+
+    for name in ["B", "C"] {
+        regex_parser.add_tnt(name, true);
+    }
+
+    for name in ["start", "S", "A"] {
+        regex_parser.add_tnt(name, false);
+    }
+
+    // Every rule is parsed the same way, so share the steps.
+    let parse_rule = |source: &str| -> Result<Rule, Box<dyn std::error::Error>> {
+        let parsed = regex_parser.parse(source, Box::new(scan_tnt), true)?;
+
+        Ok(Rule {
+            regex: parsed.ok_or(ParseError::Invalid)?.0,
+        })
+    };
+
+    let rules = vec![
+        parse_rule("NA NS TC")?,
+        parse_rule("TB | Nstart")?,
+        parse_rule("()")?,
+    ];
+
+    let ter = vec![
+        Terminal::new(String::from("B")),
+        Terminal::new(String::from("C")),
+    ];
+    let non = vec![
+        Nonterminal(String::from("start")),
+        Nonterminal(String::from("S")),
+        Nonterminal(String::from("A")),
+    ];
+
+    Ok(Grammar::new(ter, non, rules))
+}
+
+/// Build a test grammar that is right recursive.
+///
+/// The terminals are `B` and `C`; the non-terminals are `start`, `S`
+/// and `A`.  The three rules are written in the notation scanned by
+/// `scan_tnt`.
+///
+/// # Errors
+///
+/// Returns an error if a rule fails to parse.
+#[allow(dead_code)]
+pub fn new_right_recursive_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
+    let mut regex_parser: DefaultRegParser<TNT> = Default::default();
+
+    for name in ["B", "C"] {
+        regex_parser.add_tnt(name, true);
+    }
+
+    for name in ["start", "S", "A"] {
+        regex_parser.add_tnt(name, false);
+    }
+
+    // Every rule is parsed the same way, so share the steps.
+    let parse_rule = |source: &str| -> Result<Rule, Box<dyn std::error::Error>> {
+        let parsed = regex_parser.parse(source, Box::new(scan_tnt), true)?;
+
+        Ok(Rule {
+            regex: parsed.ok_or(ParseError::Invalid)?.0,
+        })
+    };
+
+    let rules = vec![
+        parse_rule("NS TC NA|TB Nstart")?,
+        parse_rule("TB")?,
+        parse_rule("NA|")?,
+    ];
+
+    let ter = vec![
+        Terminal::new(String::from("B")),
+        Terminal::new(String::from("C")),
+    ];
+    let non = vec![
+        Nonterminal(String::from("start")),
+        Nonterminal(String::from("S")),
+        Nonterminal(String::from("A")),
+    ];
+
+    Ok(Grammar::new(ter, non, rules))
+}
+// TODO: more grammars
author	JSDurand <mmemmew@gmail.com>	2023-01-03 23:44:02 +0800
committer	JSDurand <mmemmew@gmail.com>	2023-01-03 23:44:02 +0800
commit	bdbd4b4dc21af09711c97d3f903877443199af06 (patch)
tree	c6a9602f72ee1f6fd7fd3f64b8679a4de50a0159 /grammar/src/test_grammar_helper.rs
parent	8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (diff)