summaryrefslogtreecommitdiff
path: root/grammar/src
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2023-01-20 13:48:26 +0800
committerJSDurand <mmemmew@gmail.com>2023-01-20 13:48:26 +0800
commit18d7955b7d84c00467ede38baae53f4ce1fb6908 (patch)
tree97d0746b82816a21d980636e50f8cdbeb804b518 /grammar/src
parent8f8d3d1a3c276be4be2e5d2e767ada564c47279a (diff)
chain: a prototype is added.
I have an ostensibly working prototype now. Further tests are needed to make sure that the algorithm meets the time complexity requirement, though.
Diffstat (limited to 'grammar/src')
-rw-r--r--grammar/src/label.rs124
-rw-r--r--grammar/src/left_closure.rs6
-rw-r--r--grammar/src/lib.rs83
-rw-r--r--grammar/src/test_grammar_helper.rs128
4 files changed, 332 insertions, 9 deletions
diff --git a/grammar/src/label.rs b/grammar/src/label.rs
new file mode 100644
index 0000000..58eaddc
--- /dev/null
+++ b/grammar/src/label.rs
@@ -0,0 +1,124 @@
+//! This file implements a type of labels that could be used as the
+//! labels of a parse forest.
+
+use super::*;
+
+/// The actual label of a grammar label.
+#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)]
+pub enum GrammarLabelType {
+ /// A terminal or a non-terminal.
+ TNT(TNT),
+ /// A rule position.
+ Rule(usize),
+}
+
+impl GrammarLabelType {
+ /// Return the name of this label with the help of the associated
+ /// grammar.
+ pub fn name(&self, grammar: &Grammar) -> Result<String, Error> {
+ match self {
+ Self::TNT(tnt) => grammar.name_of_tnt(*tnt),
+ Self::Rule(pos) => grammar.rule_pos_to_string(*pos),
+ }
+ }
+}
+
+/// The label to be used in a forest.
+#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)]
+pub struct GrammarLabel {
+ /// The actual label.
+ label: GrammarLabelType,
+ /// The start in the input that this label correponds to.
+ start: usize,
+ /// The end in the input that this label correponds to.
+ end: Option<usize>,
+ /// A node in the forest might be a packed node.
+ packed_p: bool,
+}
+
+impl core::fmt::Display for GrammarLabel {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ // Simply displaying this without the help of a grammar is not
+ // of much help, so we just use the debug method to cheat,
+ // haha.
+
+ write!(f, "{:?}", self)
+ }
+}
+
+impl GrammarLabel {
+ /// Construct a new label.
+ pub fn new(label: GrammarLabelType, start: usize) -> Self {
+ let end = None;
+ let packed_p = false;
+
+ Self {
+ label,
+ start,
+ end,
+ packed_p,
+ }
+ }
+
+ /// Return the end in the input.
+ pub fn end(&self) -> Option<usize> {
+ self.end
+ }
+
+ /// Return the start in the input.
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ /// Return the actual label.
+ pub fn label(&self) -> GrammarLabelType {
+ self.label
+ }
+
+ /// Update the end.
+ pub fn set_end(&mut self, end: usize) {
+ self.end = Some(end);
+ }
+
+ /// Check whether the node is a packed node.
+ pub fn is_packed(&self) -> bool {
+ self.packed_p
+ }
+
+ /// Update the packed status.
+ pub fn set_packed_p(&mut self, packed_p: bool) {
+ self.packed_p = packed_p;
+ }
+
+ /// Return a string description with the help of the associated
+ /// grammar.
+ pub fn to_string(&self, grammar: &Grammar) -> Result<String, Error> {
+ // REVIEW: It needs at least 34 bytes, so we just double it.
+ // Of course we can also calculate the length exactly, but
+ // this can be postponed till later.
+ let mut s = String::with_capacity(68);
+ s.push_str("a ");
+
+ if self.is_packed() {
+ s.push_str("packed ");
+ } else {
+ s.push_str("normal ");
+ }
+
+ s.push_str("node labelled ");
+
+ s.push_str(&self.label().name(grammar)?);
+
+ s.push_str(" from ");
+
+ s.push_str(&format!("{} ", self.start()));
+
+ if let Some(end) = self.end() {
+ s.push_str(&format!("to {end}"));
+ } else {
+ s.push_str("onwards");
+ }
+
+ Ok(s)
+ }
+}
diff --git a/grammar/src/left_closure.rs b/grammar/src/left_closure.rs
index 1630881..39461f0 100644
--- a/grammar/src/left_closure.rs
+++ b/grammar/src/left_closure.rs
@@ -114,10 +114,10 @@ impl Grammar {
local_result.push(Or);
builder.add_vertex();
- local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap())));
- let non_lit_index = builder.add_vertex();
+ // local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap())));
+ // let non_lit_index = builder.add_vertex();
- builder.add_edge(0, non_lit_index, ()).unwrap();
+ // builder.add_edge(0, non_lit_index, ()).unwrap();
// If this non-terminal is nullable, add an empty variant.
if self.is_nullable(n)? {
diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs
index 297cb66..50a2b9b 100644
--- a/grammar/src/lib.rs
+++ b/grammar/src/lib.rs
@@ -320,11 +320,39 @@ impl Grammar {
self.accumulators.last().copied().unwrap_or(0)
}
+ /// Return an element of the accumulator.
+ #[inline]
+ pub fn nth_accumulator(&self, n: usize) -> Result<usize, Error> {
+ self.accumulators
+ .get(n)
+ .copied()
+ .ok_or_else(|| Error::IndexOutOfBounds(n, self.total()))
+ }
+
+ /// Return the index of the rules a rule position belongs to.
+ #[inline]
+ pub fn get_rule_num(&self, pos: usize) -> Result<usize, Error> {
+ let mut result = None;
+
+ for (index, accumulator) in self.accumulators.iter().copied().skip(1).enumerate() {
+ if accumulator > pos {
+ result = Some(index);
+ break;
+ }
+ }
+
+ if let Some(n) = result {
+ Ok(n)
+ } else {
+ Err(Error::IndexOutOfBounds(pos, self.total()))
+ }
+ }
+
/// Query if a position is the starting position of a
/// non-terminal. If it is, return the non-terminal, else return
/// `None` .
#[inline]
- pub fn is_nt_start_in_nfa_p(&self, pos: usize) -> Option<usize> {
+ pub fn get_nt_start_in_nfa(&self, pos: usize) -> Option<usize> {
for (index, accumulator) in self.accumulators.iter().copied().enumerate() {
let shifted_accumulator = accumulator << 1;
@@ -456,7 +484,7 @@ impl Grammar {
}
}
- Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| &v[..]))
+ Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| v.as_ref()))
}
// REVIEW: Do we have a better way to record expansion information
@@ -483,7 +511,11 @@ impl Grammar {
&mut self,
two_edges: TwoEdges<LabelType<TNT>>,
) -> LabelType<TNT> {
+ #[cfg(debug_assertions)]
let (first_source, first_target, first_label) = two_edges.first_edge();
+ #[cfg(not(debug_assertions))]
+ let (first_source, _, first_label) = two_edges.first_edge();
+
let (second_source, second_target, second_label) = two_edges.second_edge();
#[cfg(debug_assertions)]
@@ -501,11 +533,11 @@ impl Grammar {
// that is to say, the source of the second edge is the
// starting edge of a non-terminal.
- let mut left_p = first_label.get_left_p() || second_label.get_left_p();
+ let mut left_p = first_label.is_left_p() || second_label.is_left_p();
// Record left-linear expansion information.
- if let Some(second_nt) = self.is_nt_start_in_nfa_p(second_source) {
+ if let Some(second_nt) = self.get_nt_start_in_nfa(second_source) {
left_p = true;
if !self
@@ -554,12 +586,55 @@ impl Grammar {
.ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
.iter())
}
+
+ /// Return a string describing a rule position.
+ pub fn rule_pos_to_string(&self, pos: usize) -> Result<String, Error> {
+ let rule_num = {
+ let mut result = None;
+
+ for (index, accumulator) in self.accumulators.iter().copied().skip(1).enumerate() {
+ if accumulator > pos {
+ result = Some(index);
+ break;
+ }
+ }
+
+ if let Some(n) = result {
+ n
+ } else {
+ return Err(Error::IndexOutOfBounds(pos, self.total()));
+ }
+ };
+
+ assert!(rule_num < self.rules.len());
+
+ let display_tnt = |tnt| self.name_of_tnt(tnt).unwrap_or_else(|e| format!("{e}"));
+
+ Ok(self
+ .rules
+ .get(rule_num)
+ .unwrap()
+ .regex
+ .to_string_with_dot(
+ display_tnt,
+ if rule_num == 0 {
+ pos
+ } else {
+ pos - self.accumulators.get(rule_num - 1).copied().unwrap()
+ },
+ )
+ .unwrap())
+ }
}
pub mod first_set;
pub mod left_closure;
+pub mod label;
+
+pub use label::{GrammarLabel, GrammarLabelType};
+
impl Display for Grammar {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
assert_eq!(self.non.len(), self.rules.len());
diff --git a/grammar/src/test_grammar_helper.rs b/grammar/src/test_grammar_helper.rs
index 89f9844..984eb50 100644
--- a/grammar/src/test_grammar_helper.rs
+++ b/grammar/src/test_grammar_helper.rs
@@ -45,7 +45,7 @@ fn scan_tnt(
'T' => {
let mut name = String::new();
- while let Some(c) = chars.next() {
+ for c in chars.by_ref() {
if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) {
len += 1;
write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?;
@@ -63,7 +63,7 @@ fn scan_tnt(
'N' => {
let mut name = String::new();
- while let Some(c) = chars.next() {
+ for c in chars.by_ref() {
if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) {
len += 1;
write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?;
@@ -125,6 +125,130 @@ pub fn new_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
}
/// Return a grammar that might serve as the grammar for my notes,
+/// somehow, without regular expressions.
+#[allow(dead_code)]
+pub fn new_notes_grammar_no_regexp() -> Result<Grammar, Box<dyn std::error::Error>> {
+ let ter = vec![
+ Terminal::new("NL".to_owned()),
+ Terminal::new("SP".to_owned()),
+ Terminal::new("CON".to_owned()),
+ Terminal::new("STAR".to_owned()),
+ Terminal::new("NOTE".to_owned()),
+ Terminal::new("PRICE".to_owned()),
+ Terminal::new("DIGIT".to_owned()),
+ ];
+ let non = vec![
+ Nonterminal("document".to_owned()),
+ Nonterminal("item".to_owned()),
+ Nonterminal("header".to_owned()),
+ Nonterminal("title".to_owned()),
+ Nonterminal("note".to_owned()),
+ Nonterminal("note-content".to_owned()),
+ Nonterminal("price".to_owned()),
+ Nonterminal("ending".to_owned()),
+ Nonterminal("digits".to_owned()),
+ ];
+
+ let mut regex_parser: DefaultRegParser<TNT> = Default::default();
+
+ regex_parser.add_tnt("NL", true);
+ regex_parser.add_tnt("SP", true);
+ regex_parser.add_tnt("CON", true);
+ regex_parser.add_tnt("STAR", true);
+ regex_parser.add_tnt("note", true);
+ regex_parser.add_tnt("price", true);
+ regex_parser.add_tnt("DIGIT", true);
+ regex_parser.add_tnt("document", false);
+ regex_parser.add_tnt("item", false);
+ regex_parser.add_tnt("header", false);
+ regex_parser.add_tnt("title", false);
+ regex_parser.add_tnt("note", false);
+ regex_parser.add_tnt("notecontent", false);
+ regex_parser.add_tnt("price", false);
+ regex_parser.add_tnt("ending", false);
+ regex_parser.add_tnt("digits", false);
+
+ let regex_parser = regex_parser;
+
+ let rule1 = Rule {
+ regex: regex_parser
+ .parse("Nitem | Nitem Ndocument", Box::new(scan_tnt), true)?
+ .ok_or(ParseError::Invalid)?
+ .0,
+ };
+
+ let rule2 = Rule {
+ regex: regex_parser
+ .parse(
+ "Nheader | Nheader Nprice | Nheader Nnote | Nheader Nprice Nnote",
+ Box::new(scan_tnt),
+ true,
+ )?
+ .ok_or(ParseError::Invalid)?
+ .0,
+ };
+
+ let rule3 = Rule {
+ regex: regex_parser
+ .parse(
+ "TSP Ntitle TNL Nending | TSTAR TSP Ntitle TNL Nending",
+ Box::new(scan_tnt),
+ true,
+ )?
+ .ok_or(ParseError::Invalid)?
+ .0,
+ };
+
+ let rule4 = Rule {
+ regex: regex_parser
+ .parse("TCON | TCON Ntitle", Box::new(scan_tnt), true)?
+ .ok_or(ParseError::Invalid)?
+ .0,
+ };
+
+ let rule5 = Rule {
+ regex: regex_parser
+ .parse("Tnote Nnotecontent TNL Nending", Box::new(scan_tnt), true)?
+ .ok_or(ParseError::Invalid)?
+ .0,
+ };
+
+ let rule6 = Rule {
+ regex: regex_parser
+ .parse("TCON | TCON Nnotecontent", Box::new(scan_tnt), true)?
+ .ok_or(ParseError::Invalid)?
+ .0,
+ };
+
+ let rule7 = Rule {
+ regex: regex_parser
+ .parse("Tprice TSP Ndigits TNL Nending", Box::new(scan_tnt), true)?
+ .ok_or(ParseError::Invalid)?
+ .0,
+ };
+
+ let rule8 = Rule {
+ regex: regex_parser
+ .parse("TNL Nending | TSP Nending | ()", Box::new(scan_tnt), true)?
+ .ok_or(ParseError::Invalid)?
+ .0,
+ };
+
+ let rule9 = Rule {
+ regex: regex_parser
+ .parse("TDIGIT | TDIGIT Ndigits", Box::new(scan_tnt), true)?
+ .ok_or(ParseError::Invalid)?
+ .0,
+ };
+
+ let rules = vec![
+ rule1, rule2, rule3, rule4, rule5, rule6, rule7, rule8, rule9,
+ ];
+
+ Ok(Grammar::new(ter, non, rules))
+}
+
+/// Return a grammar that might serve as the grammar for my notes,
/// somehow.
#[allow(dead_code)]
pub fn new_notes_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {