From 1a3d346f413325ed37848a6b2526e8e729269833 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Wed, 11 Jan 2023 23:47:26 +0800 Subject: Record left-linear expansion and forest format Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine". --- nfa/src/default/nfa.rs | 210 ++++++++++++++++++++++++------------------------- nfa/src/lib.rs | 155 +++++++++++++++++++++++++++++++++--- 2 files changed, 245 insertions(+), 120 deletions(-) (limited to 'nfa') diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index e642218..a23c056 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -5,7 +5,10 @@ use graph::{ LabelExtGraph, LabelGraph, }; -use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC}; +use crate::{ + default::regex::RegexType, error::Error, DOption, LabelType, Nfa, NfaLabel, Regex, SoC, + TwoEdges, +}; use core::fmt::{Debug, Display}; @@ -123,8 +126,6 @@ impl LabelExtGraph for DefaultNFA { } } -// TODO: Define a type for the labels of DefaultNFA. - impl Nfa for DefaultNFA { type FromRegex = DefaultNFA; @@ -134,7 +135,7 @@ impl Nfa for DefaultNFA { regexps: &[impl Regex>], sub_pred: impl Fn(T) -> Result, Error>, default: Option, - ) -> Result>, Error> { + ) -> Result>, Error> { let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum(); if total_regexps_len == 0 { @@ -144,18 +145,16 @@ impl Nfa for DefaultNFA { // We reserve two more rooms for the default edge. let nfa_len = total_regexps_len * 2 + 2; - let mut builder: DLGBuilder> = Builder::with_capacity(nfa_len); - - // Since DOption implements Copy when T does, we can use a - // variable to hold the empty label to avoid repetitions. - let empty_label: DOption = Default::default(); + let mut builder: DLGBuilder> = Builder::with_capacity(nfa_len); for _ in 0..nfa_len { builder.add_vertex(); } + let default = LabelType::new(DOption(default), total_regexps_len, false); + // add a default edge - builder.add_edge(nfa_len - 2, nfa_len - 1, DOption(default))?; + builder.add_edge(nfa_len - 2, nfa_len - 1, default)?; let accumulators: Vec = { let mut result = Vec::with_capacity(regexps.len()); @@ -206,6 +205,10 @@ impl Nfa for DefaultNFA { stack.push(root); while let Some(top_index) = stack.pop() { + let false_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, false); + + let true_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, true); + let top_label = regex.vertex_label(top_index)?; let nfa_start = offset + 2 * top_index; @@ -213,7 +216,7 @@ impl Nfa for DefaultNFA { match top_label { RegexType::Kleene => { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; let mut source = nfa_start; @@ -224,14 +227,14 @@ impl Nfa for DefaultNFA { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; - builder.add_edge(nfa_end, nfa_start, empty_label)?; + builder.add_edge(nfa_end, nfa_start, false_label)?; } } RegexType::Plus => { @@ -244,20 +247,20 @@ impl Nfa for DefaultNFA { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; - builder.add_edge(nfa_end, nfa_start, empty_label)?; + builder.add_edge(nfa_end, nfa_start, false_label)?; } else { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Optional => { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; let mut source = nfa_start; @@ -268,12 +271,12 @@ impl Nfa for DefaultNFA { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; } } RegexType::Or => { @@ -284,11 +287,11 @@ impl Nfa for DefaultNFA { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(nfa_start, child_start, empty_label)?; - builder.add_edge(child_end, nfa_end, empty_label)?; + builder.add_edge(nfa_start, child_start, false_label)?; + builder.add_edge(child_end, nfa_end, false_label)?; } } else { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Paren => { @@ -305,14 +308,14 @@ impl Nfa for DefaultNFA { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; } else { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Lit(value) => { @@ -327,13 +330,34 @@ impl Nfa for DefaultNFA { let sub_nfa_start = get_offset_safe!(sub_non); let sub_nfa_end = sub_nfa_start + 1; - builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; - builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; + // We also need a label for the + // original label and non-left-linear + // expansion. + builder.add_edge( + nfa_start, + nfa_end, + NfaLabel::new( + DOption(Some(value)), + (offset >> 1) + top_index, + false, + ), + )?; + + builder.add_edge(nfa_start, sub_nfa_start, true_label)?; + builder.add_edge(sub_nfa_end, nfa_end, false_label)?; } SoC::Carry(new_value) => { // a terminal - builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?; + builder.add_edge( + nfa_start, + nfa_end, + NfaLabel::new( + DOption(Some(new_value)), + (offset >> 1) + top_index, + false, + ), + )?; } } } @@ -346,56 +370,7 @@ impl Nfa for DefaultNFA { Ok(DefaultNFA { graph }) } - fn remove_epsilon bool>(&mut self, f: F) -> Result<(), Error> { - let mut builder = self.graph.builder_ref(); - - let mut updated = true; - - let nodes_len = self.nodes_len(); - - while updated { - updated = false; - - let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); - - for source in builder.nodes() { - for (label, target_iter) in builder.labels_of(source)? { - if f(*label) { - // empty label found - for target in target_iter { - for (label, children_iter) in builder.labels_of(target)? { - for child in children_iter { - if !builder.has_edge_label(source, label, child)? { - updated = true; - - to_add.push((source, child, *label)); - } - } - } - } - } - } - } - - for (source, target, label) in to_add.into_iter() { - builder.add_edge(source, target, label)?; - } - } - - // Then remove all epsilon edges - - let sources_and_targets: Vec<_> = builder.edges().collect(); - - for (source, target) in sources_and_targets.into_iter() { - builder.remove_edge(source, target, &f)?; - } - - self.graph = builder.build(); - - Ok(()) - } - - fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error> { + fn remove_dead(&mut self, mut reserve: impl FnMut(usize) -> bool) -> Result<(), Error> { let mut builder = self.graph.builder_ref(); let mut changed = true; @@ -439,48 +414,69 @@ impl Nfa for DefaultNFA { Ok(()) } - // REVIEW: Combine nulling and remove_epsilon into the same - // method, or not. + fn closure( + &mut self, + mut predicate: impl FnMut(T) -> bool, + remove_after_p: bool, + mut transform: impl FnMut(TwoEdges) -> T, + mut remove_predicate: impl FnMut(T) -> bool, + ) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); - fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { let mut updated = true; - let mut builder = self.graph.builder_ref(); + + let nodes_len = self.nodes_len(); while updated { updated = false; - let mut nullable = false; - - let mut to_add = Vec::new(); - - for (source, target) in builder.edges() { - for label in builder.edge_label(source, target)? { - if f(label) { - nullable = true; + // Just a rough estimate + let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); - break; - } - } + // REVIEW: I do not like nested if statements, but I do + // not know how to do this otherwise. - if nullable { - for (label, child_iter) in builder.labels_of(target)? { - for child in child_iter { - if !builder.has_edge_label(source, label, child)? { - updated = true; + for source in builder.nodes() { + for (first_label, target_iter) in builder.labels_of(source)? { + if predicate(*first_label) { + // a null label found + for target in target_iter { + for (second_label, children_iter) in builder.labels_of(target)? { + for child in children_iter { + let new_label = transform(TwoEdges( + source, + target, + child, + *first_label, + *second_label, + )); + + if !builder.has_edge_label(source, &new_label, child)? { + updated = true; - to_add.push((source, child, *label)); + to_add.push((source, child, new_label)); + } + } } } } } } - for (source, child, label) in to_add { - builder.add_edge(source, child, label)?; + for (source, target, label) in to_add.into_iter() { + builder.add_edge(source, target, label)?; + } + } + + // Then remove all nullable edges if demanded + + if remove_after_p { + for (source, target) in builder.edges().collect::>().into_iter() { + builder.remove_edge(source, target, &mut remove_predicate)?; } } - self.graph.replace_by_builder(builder); + self.graph = builder.build(); Ok(()) } @@ -532,7 +528,7 @@ mod test_to_nfa { Ok( DefaultRegParser::::parse(&parser, &input_string, Box::new(test_scanner), true)? - .unwrap_or(Default::default()) + .unwrap_or_default() .0, ) } @@ -543,13 +539,13 @@ mod test_to_nfa { println!("regex = {regex}"); - let nfa: DefaultNFA> = DefaultNFA::to_nfa( + let _nfa: DefaultNFA<_> = DefaultNFA::to_nfa( &[regex], |label| Ok(SoC::Carry(label)), Some(char::default()), )?; - nfa.print_viz("nfa.gv")?; + // nfa.print_viz("nfa.gv")?; Ok(()) } diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs index 31bd69a..c1906e1 100644 --- a/nfa/src/lib.rs +++ b/nfa/src/lib.rs @@ -60,8 +60,12 @@ pub trait Regex: Graph + Display + Clone { /// Since `Option` does not inherit the `Display` from `T`, we wrap /// it to provide an automatic implementation of `Display`. +/// +/// # Convert to `Option` +/// +/// One can dereference a `DOption` to obtain an `Option`. #[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub struct DOption(Option); +pub struct DOption(pub Option); impl Default for DOption { fn default() -> Self { @@ -117,17 +121,104 @@ pub enum SoC { Carry(T), } +/// This type records some information that is obtained from the +/// process of converting a regular expression to a nondeterministic +/// finite automaton. +#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash, Default)] +pub struct NfaLabel { + /// A terminal or a non-terminal. + value: T, + /// The corresponding position in the rules. + moved: usize, + /// Whether this comes from left-linear expansion. + left_p: bool, +} + +impl NfaLabel { + /// Construct a new label. + #[inline] + pub fn new(value: T, moved: usize, left_p: bool) -> Self { + Self { + value, + moved, + left_p, + } + } + + /// Retrieve the value from the label. + #[inline] + pub fn get_value(&self) -> T { + self.value + } + /// Retrieve the moved position from the label. + #[inline] + pub fn get_moved(&self) -> usize { + self.moved + } + /// Retrieve whether or not the label comes from left-linear + /// expansions. + #[inline] + pub fn get_left_p(&self) -> bool { + self.left_p + } +} + +impl Display for NfaLabel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "edge {} at {}{}", self.value, self.moved, { + if self.left_p { + ", by left" + } else { + "" + } + }) + } +} + +/// A convenient alias of the information of two edges. +/// +/// If the tuple is (a, b, c, la, lb), then the first edge goes from a +/// to b, is labelled la, and the second edge goes from b to c, and is +/// labelled by lb. +#[derive(Debug, Clone, Copy)] +pub struct TwoEdges(usize, usize, usize, T, T); + +impl TwoEdges { + /// Extract the first edge. + pub fn first_edge(&self) -> (usize, usize, T) { + (self.0, self.1, self.3) + } + + /// Extract the second edge. + pub fn second_edge(&self) -> (usize, usize, T) { + (self.1, self.2, self.4) + } +} + +/// The type of nondeterministic finite automata that is obtained from +/// a regular expression, via the method [`to_nfa`][Nfa::to_nfa]. +pub type LabelType = NfaLabel>; + /// The expected behvaiour of a nondeterministic finite automaton. /// /// Every NFA is a special labelled graph. pub trait Nfa: LabelExtGraph { - // TODO: This trait should have a type for the labels. + /// When we convert a regular expression to a nondeterministic + /// finite automaton, the label should be optional, so we use a + /// different type for the result type. + type FromRegex; + // FIXME: This should not be needed. /// Remove all empty transitions from the nondeterministic finite /// automaton. - fn remove_epsilon(&mut self, f: F) -> Result<(), Error> + #[inline] + fn remove_epsilon(&mut self, _f: F) -> Result<(), Error> where - F: Fn(T) -> bool; + F: FnMut(T) -> bool, + { + // self.closure(f, true, |two_edges| two_edges.second_edge().2) + unimplemented!("deprecated") + } /// Return a state-minimal NFA equivalent with the original one. /// @@ -174,11 +265,6 @@ pub trait Nfa: LabelExtGraph { Ok(result) } - /// When we convert a regular expression to a nondeterministic - /// finite automaton, the label should be optional, so we use a - /// different type for the result type. - type FromRegex; - /// Build a nondeterministic finite automaton out of a set /// `regexps` of regular expressions. /// @@ -194,10 +280,9 @@ pub trait Nfa: LabelExtGraph { /// basically. fn to_nfa( regexps: &[impl Regex>], - // TODO: The transformation should produce more information. sub_pred: impl Fn(T) -> Result, Error>, default: Option, - ) -> Result>, Error>; + ) -> Result>, Error>; /// Remove all dead states from the nondeterministic finite /// automaton. @@ -211,8 +296,9 @@ pub trait Nfa: LabelExtGraph { /// out of the dead nodes. Then these nodes are considered gone /// from the graph, and we don't need to re-index the existing /// edges. We can call this "a poor man's removal of nodes". - fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error>; + fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), Error>; + // FIXME: This should not be needed. /// For each edge from A to B whose edge is considered nullable by /// a function `f`, and for every edge from B to C, add an edge /// from A to C. @@ -220,7 +306,50 @@ pub trait Nfa: LabelExtGraph { /// This is needed specifically by the algorithm to produce a set /// of atomic languages that represent "null-closed" derived /// languages. - fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error>; + #[inline] + fn nulling(&mut self, _f: impl FnMut(T) -> bool) -> Result<(), Error> { + // self.closure(f, false, |two_edges| two_edges.second_edge().2) + unimplemented!("deprecated") + } + + /// Return the *closure* of the nondeterministic finite automaton. + /// + /// # Definition + /// + /// The closure of a nondeterministic finite automaton N is + /// defined as the unique *minimal* nondeterministic finite + /// automaton N+ that can be obtained by adjoining some edges to N + /// such that if there are edges a -> b and b -> c, and if the + /// edge a -> b is deemed as *nullable* by some function, then + /// there is an edge a -> c, where the minimality is the + /// minimality of the set of edges: if there is another such + /// nondeterministic finite automaton M satisfying the above + /// property, then the set of edges of N+ is a subset of the set + /// of edges of M. + /// + /// # Remove edges afterwards + /// + /// If `remove_after_p` is true, remove all those nullable edges. + /// + /// # Transformation of labels + /// + /// We can apply a transformer to labels, to be able to combine + /// labels in non-trivial ways. If one just wants the *new* label + /// that is the label of the edge from b to c in the above + /// definition, one can use the function that sends `two_edges` to + /// `two_edges.second_edge().2`. + /// + /// # Error + /// + /// The function should emit errors if the edges of the + /// nondeterministic finite automaton point to invalid nodes. + fn closure( + &mut self, + predicate: impl FnMut(T) -> bool, + remove_after_p: bool, + transform: impl FnMut(TwoEdges) -> T, + remove_predicate: impl FnMut(T) -> bool, + ) -> Result<(), Error>; } pub mod default; -- cgit v1.2.3-18-g5258