From 18d7955b7d84c00467ede38baae53f4ce1fb6908 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 20 Jan 2023 13:48:26 +0800 Subject: chain: a prototype is added. I have an ostensibly working prototype now. Further tests are needed to make sure that the algorithm meets the time complexity requirement, though. --- grammar/src/label.rs | 124 +++++++++++++++++++++++++++++++++++ grammar/src/left_closure.rs | 6 +- grammar/src/lib.rs | 83 ++++++++++++++++++++++-- grammar/src/test_grammar_helper.rs | 128 ++++++++++++++++++++++++++++++++++++- 4 files changed, 332 insertions(+), 9 deletions(-) create mode 100644 grammar/src/label.rs (limited to 'grammar/src') diff --git a/grammar/src/label.rs b/grammar/src/label.rs new file mode 100644 index 0000000..58eaddc --- /dev/null +++ b/grammar/src/label.rs @@ -0,0 +1,124 @@ +//! This file implements a type of labels that could be used as the +//! labels of a parse forest. + +use super::*; + +/// The actual label of a grammar label. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub enum GrammarLabelType { + /// A terminal or a non-terminal. + TNT(TNT), + /// A rule position. + Rule(usize), +} + +impl GrammarLabelType { + /// Return the name of this label with the help of the associated + /// grammar. + pub fn name(&self, grammar: &Grammar) -> Result { + match self { + Self::TNT(tnt) => grammar.name_of_tnt(*tnt), + Self::Rule(pos) => grammar.rule_pos_to_string(*pos), + } + } +} + +/// The label to be used in a forest. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct GrammarLabel { + /// The actual label. + label: GrammarLabelType, + /// The start in the input that this label correponds to. + start: usize, + /// The end in the input that this label correponds to. + end: Option, + /// A node in the forest might be a packed node. + packed_p: bool, +} + +impl core::fmt::Display for GrammarLabel { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + // Simply displaying this without the help of a grammar is not + // of much help, so we just use the debug method to cheat, + // haha. + + write!(f, "{:?}", self) + } +} + +impl GrammarLabel { + /// Construct a new label. + pub fn new(label: GrammarLabelType, start: usize) -> Self { + let end = None; + let packed_p = false; + + Self { + label, + start, + end, + packed_p, + } + } + + /// Return the end in the input. + pub fn end(&self) -> Option { + self.end + } + + /// Return the start in the input. + pub fn start(&self) -> usize { + self.start + } + + /// Return the actual label. + pub fn label(&self) -> GrammarLabelType { + self.label + } + + /// Update the end. + pub fn set_end(&mut self, end: usize) { + self.end = Some(end); + } + + /// Check whether the node is a packed node. + pub fn is_packed(&self) -> bool { + self.packed_p + } + + /// Update the packed status. + pub fn set_packed_p(&mut self, packed_p: bool) { + self.packed_p = packed_p; + } + + /// Return a string description with the help of the associated + /// grammar. + pub fn to_string(&self, grammar: &Grammar) -> Result { + // REVIEW: It needs at least 34 bytes, so we just double it. + // Of course we can also calculate the length exactly, but + // this can be postponed till later. + let mut s = String::with_capacity(68); + s.push_str("a "); + + if self.is_packed() { + s.push_str("packed "); + } else { + s.push_str("normal "); + } + + s.push_str("node labelled "); + + s.push_str(&self.label().name(grammar)?); + + s.push_str(" from "); + + s.push_str(&format!("{} ", self.start())); + + if let Some(end) = self.end() { + s.push_str(&format!("to {end}")); + } else { + s.push_str("onwards"); + } + + Ok(s) + } +} diff --git a/grammar/src/left_closure.rs b/grammar/src/left_closure.rs index 1630881..39461f0 100644 --- a/grammar/src/left_closure.rs +++ b/grammar/src/left_closure.rs @@ -114,10 +114,10 @@ impl Grammar { local_result.push(Or); builder.add_vertex(); - local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); - let non_lit_index = builder.add_vertex(); + // local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); + // let non_lit_index = builder.add_vertex(); - builder.add_edge(0, non_lit_index, ()).unwrap(); + // builder.add_edge(0, non_lit_index, ()).unwrap(); // If this non-terminal is nullable, add an empty variant. if self.is_nullable(n)? { diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs index 297cb66..50a2b9b 100644 --- a/grammar/src/lib.rs +++ b/grammar/src/lib.rs @@ -320,11 +320,39 @@ impl Grammar { self.accumulators.last().copied().unwrap_or(0) } + /// Return an element of the accumulator. + #[inline] + pub fn nth_accumulator(&self, n: usize) -> Result { + self.accumulators + .get(n) + .copied() + .ok_or_else(|| Error::IndexOutOfBounds(n, self.total())) + } + + /// Return the index of the rules a rule position belongs to. + #[inline] + pub fn get_rule_num(&self, pos: usize) -> Result { + let mut result = None; + + for (index, accumulator) in self.accumulators.iter().copied().skip(1).enumerate() { + if accumulator > pos { + result = Some(index); + break; + } + } + + if let Some(n) = result { + Ok(n) + } else { + Err(Error::IndexOutOfBounds(pos, self.total())) + } + } + /// Query if a position is the starting position of a /// non-terminal. If it is, return the non-terminal, else return /// `None` . #[inline] - pub fn is_nt_start_in_nfa_p(&self, pos: usize) -> Option { + pub fn get_nt_start_in_nfa(&self, pos: usize) -> Option { for (index, accumulator) in self.accumulators.iter().copied().enumerate() { let shifted_accumulator = accumulator << 1; @@ -456,7 +484,7 @@ impl Grammar { } } - Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| &v[..])) + Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| v.as_ref())) } // REVIEW: Do we have a better way to record expansion information @@ -483,7 +511,11 @@ impl Grammar { &mut self, two_edges: TwoEdges>, ) -> LabelType { + #[cfg(debug_assertions)] let (first_source, first_target, first_label) = two_edges.first_edge(); + #[cfg(not(debug_assertions))] + let (first_source, _, first_label) = two_edges.first_edge(); + let (second_source, second_target, second_label) = two_edges.second_edge(); #[cfg(debug_assertions)] @@ -501,11 +533,11 @@ impl Grammar { // that is to say, the source of the second edge is the // starting edge of a non-terminal. - let mut left_p = first_label.get_left_p() || second_label.get_left_p(); + let mut left_p = first_label.is_left_p() || second_label.is_left_p(); // Record left-linear expansion information. - if let Some(second_nt) = self.is_nt_start_in_nfa_p(second_source) { + if let Some(second_nt) = self.get_nt_start_in_nfa(second_source) { left_p = true; if !self @@ -554,12 +586,55 @@ impl Grammar { .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? .iter()) } + + /// Return a string describing a rule position. + pub fn rule_pos_to_string(&self, pos: usize) -> Result { + let rule_num = { + let mut result = None; + + for (index, accumulator) in self.accumulators.iter().copied().skip(1).enumerate() { + if accumulator > pos { + result = Some(index); + break; + } + } + + if let Some(n) = result { + n + } else { + return Err(Error::IndexOutOfBounds(pos, self.total())); + } + }; + + assert!(rule_num < self.rules.len()); + + let display_tnt = |tnt| self.name_of_tnt(tnt).unwrap_or_else(|e| format!("{e}")); + + Ok(self + .rules + .get(rule_num) + .unwrap() + .regex + .to_string_with_dot( + display_tnt, + if rule_num == 0 { + pos + } else { + pos - self.accumulators.get(rule_num - 1).copied().unwrap() + }, + ) + .unwrap()) + } } pub mod first_set; pub mod left_closure; +pub mod label; + +pub use label::{GrammarLabel, GrammarLabelType}; + impl Display for Grammar { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { assert_eq!(self.non.len(), self.rules.len()); diff --git a/grammar/src/test_grammar_helper.rs b/grammar/src/test_grammar_helper.rs index 89f9844..984eb50 100644 --- a/grammar/src/test_grammar_helper.rs +++ b/grammar/src/test_grammar_helper.rs @@ -45,7 +45,7 @@ fn scan_tnt( 'T' => { let mut name = String::new(); - while let Some(c) = chars.next() { + for c in chars.by_ref() { if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) { len += 1; write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; @@ -63,7 +63,7 @@ fn scan_tnt( 'N' => { let mut name = String::new(); - while let Some(c) = chars.next() { + for c in chars.by_ref() { if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) { len += 1; write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; @@ -124,6 +124,130 @@ pub fn new_grammar() -> Result> { Ok(Grammar::new(ter, non, rules)) } +/// Return a grammar that might serve as the grammar for my notes, +/// somehow, without regular expressions. +#[allow(dead_code)] +pub fn new_notes_grammar_no_regexp() -> Result> { + let ter = vec![ + Terminal::new("NL".to_owned()), + Terminal::new("SP".to_owned()), + Terminal::new("CON".to_owned()), + Terminal::new("STAR".to_owned()), + Terminal::new("NOTE".to_owned()), + Terminal::new("PRICE".to_owned()), + Terminal::new("DIGIT".to_owned()), + ]; + let non = vec![ + Nonterminal("document".to_owned()), + Nonterminal("item".to_owned()), + Nonterminal("header".to_owned()), + Nonterminal("title".to_owned()), + Nonterminal("note".to_owned()), + Nonterminal("note-content".to_owned()), + Nonterminal("price".to_owned()), + Nonterminal("ending".to_owned()), + Nonterminal("digits".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser = Default::default(); + + regex_parser.add_tnt("NL", true); + regex_parser.add_tnt("SP", true); + regex_parser.add_tnt("CON", true); + regex_parser.add_tnt("STAR", true); + regex_parser.add_tnt("note", true); + regex_parser.add_tnt("price", true); + regex_parser.add_tnt("DIGIT", true); + regex_parser.add_tnt("document", false); + regex_parser.add_tnt("item", false); + regex_parser.add_tnt("header", false); + regex_parser.add_tnt("title", false); + regex_parser.add_tnt("note", false); + regex_parser.add_tnt("notecontent", false); + regex_parser.add_tnt("price", false); + regex_parser.add_tnt("ending", false); + regex_parser.add_tnt("digits", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("Nitem | Nitem Ndocument", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse( + "Nheader | Nheader Nprice | Nheader Nnote | Nheader Nprice Nnote", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse( + "TSP Ntitle TNL Nending | TSTAR TSP Ntitle TNL Nending", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule4 = Rule { + regex: regex_parser + .parse("TCON | TCON Ntitle", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule5 = Rule { + regex: regex_parser + .parse("Tnote Nnotecontent TNL Nending", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule6 = Rule { + regex: regex_parser + .parse("TCON | TCON Nnotecontent", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule7 = Rule { + regex: regex_parser + .parse("Tprice TSP Ndigits TNL Nending", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule8 = Rule { + regex: regex_parser + .parse("TNL Nending | TSP Nending | ()", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule9 = Rule { + regex: regex_parser + .parse("TDIGIT | TDIGIT Ndigits", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![ + rule1, rule2, rule3, rule4, rule5, rule6, rule7, rule8, rule9, + ]; + + Ok(Grammar::new(ter, non, rules)) +} + /// Return a grammar that might serve as the grammar for my notes, /// somehow. #[allow(dead_code)] -- cgit v1.2.3-18-g5258