From 1a3d346f413325ed37848a6b2526e8e729269833 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Wed, 11 Jan 2023 23:47:26 +0800 Subject: Record left-linear expansion and forest format Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine". --- nfa/src/default/nfa.rs | 210 ++++++++++++++++++++++++------------------------- 1 file changed, 103 insertions(+), 107 deletions(-) (limited to 'nfa/src/default/nfa.rs') diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index e642218..a23c056 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -5,7 +5,10 @@ use graph::{ LabelExtGraph, LabelGraph, }; -use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC}; +use crate::{ + default::regex::RegexType, error::Error, DOption, LabelType, Nfa, NfaLabel, Regex, SoC, + TwoEdges, +}; use core::fmt::{Debug, Display}; @@ -123,8 +126,6 @@ impl LabelExtGraph for DefaultNFA { } } -// TODO: Define a type for the labels of DefaultNFA. - impl Nfa for DefaultNFA { type FromRegex = DefaultNFA; @@ -134,7 +135,7 @@ impl Nfa for DefaultNFA { regexps: &[impl Regex>], sub_pred: impl Fn(T) -> Result, Error>, default: Option, - ) -> Result>, Error> { + ) -> Result>, Error> { let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum(); if total_regexps_len == 0 { @@ -144,18 +145,16 @@ impl Nfa for DefaultNFA { // We reserve two more rooms for the default edge. let nfa_len = total_regexps_len * 2 + 2; - let mut builder: DLGBuilder> = Builder::with_capacity(nfa_len); - - // Since DOption implements Copy when T does, we can use a - // variable to hold the empty label to avoid repetitions. - let empty_label: DOption = Default::default(); + let mut builder: DLGBuilder> = Builder::with_capacity(nfa_len); for _ in 0..nfa_len { builder.add_vertex(); } + let default = LabelType::new(DOption(default), total_regexps_len, false); + // add a default edge - builder.add_edge(nfa_len - 2, nfa_len - 1, DOption(default))?; + builder.add_edge(nfa_len - 2, nfa_len - 1, default)?; let accumulators: Vec = { let mut result = Vec::with_capacity(regexps.len()); @@ -206,6 +205,10 @@ impl Nfa for DefaultNFA { stack.push(root); while let Some(top_index) = stack.pop() { + let false_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, false); + + let true_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, true); + let top_label = regex.vertex_label(top_index)?; let nfa_start = offset + 2 * top_index; @@ -213,7 +216,7 @@ impl Nfa for DefaultNFA { match top_label { RegexType::Kleene => { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; let mut source = nfa_start; @@ -224,14 +227,14 @@ impl Nfa for DefaultNFA { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; - builder.add_edge(nfa_end, nfa_start, empty_label)?; + builder.add_edge(nfa_end, nfa_start, false_label)?; } } RegexType::Plus => { @@ -244,20 +247,20 @@ impl Nfa for DefaultNFA { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; - builder.add_edge(nfa_end, nfa_start, empty_label)?; + builder.add_edge(nfa_end, nfa_start, false_label)?; } else { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Optional => { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; let mut source = nfa_start; @@ -268,12 +271,12 @@ impl Nfa for DefaultNFA { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; } } RegexType::Or => { @@ -284,11 +287,11 @@ impl Nfa for DefaultNFA { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(nfa_start, child_start, empty_label)?; - builder.add_edge(child_end, nfa_end, empty_label)?; + builder.add_edge(nfa_start, child_start, false_label)?; + builder.add_edge(child_end, nfa_end, false_label)?; } } else { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Paren => { @@ -305,14 +308,14 @@ impl Nfa for DefaultNFA { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; } else { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Lit(value) => { @@ -327,13 +330,34 @@ impl Nfa for DefaultNFA { let sub_nfa_start = get_offset_safe!(sub_non); let sub_nfa_end = sub_nfa_start + 1; - builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; - builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; + // We also need a label for the + // original label and non-left-linear + // expansion. + builder.add_edge( + nfa_start, + nfa_end, + NfaLabel::new( + DOption(Some(value)), + (offset >> 1) + top_index, + false, + ), + )?; + + builder.add_edge(nfa_start, sub_nfa_start, true_label)?; + builder.add_edge(sub_nfa_end, nfa_end, false_label)?; } SoC::Carry(new_value) => { // a terminal - builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?; + builder.add_edge( + nfa_start, + nfa_end, + NfaLabel::new( + DOption(Some(new_value)), + (offset >> 1) + top_index, + false, + ), + )?; } } } @@ -346,56 +370,7 @@ impl Nfa for DefaultNFA { Ok(DefaultNFA { graph }) } - fn remove_epsilon bool>(&mut self, f: F) -> Result<(), Error> { - let mut builder = self.graph.builder_ref(); - - let mut updated = true; - - let nodes_len = self.nodes_len(); - - while updated { - updated = false; - - let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); - - for source in builder.nodes() { - for (label, target_iter) in builder.labels_of(source)? { - if f(*label) { - // empty label found - for target in target_iter { - for (label, children_iter) in builder.labels_of(target)? { - for child in children_iter { - if !builder.has_edge_label(source, label, child)? { - updated = true; - - to_add.push((source, child, *label)); - } - } - } - } - } - } - } - - for (source, target, label) in to_add.into_iter() { - builder.add_edge(source, target, label)?; - } - } - - // Then remove all epsilon edges - - let sources_and_targets: Vec<_> = builder.edges().collect(); - - for (source, target) in sources_and_targets.into_iter() { - builder.remove_edge(source, target, &f)?; - } - - self.graph = builder.build(); - - Ok(()) - } - - fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error> { + fn remove_dead(&mut self, mut reserve: impl FnMut(usize) -> bool) -> Result<(), Error> { let mut builder = self.graph.builder_ref(); let mut changed = true; @@ -439,48 +414,69 @@ impl Nfa for DefaultNFA { Ok(()) } - // REVIEW: Combine nulling and remove_epsilon into the same - // method, or not. + fn closure( + &mut self, + mut predicate: impl FnMut(T) -> bool, + remove_after_p: bool, + mut transform: impl FnMut(TwoEdges) -> T, + mut remove_predicate: impl FnMut(T) -> bool, + ) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); - fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { let mut updated = true; - let mut builder = self.graph.builder_ref(); + + let nodes_len = self.nodes_len(); while updated { updated = false; - let mut nullable = false; - - let mut to_add = Vec::new(); - - for (source, target) in builder.edges() { - for label in builder.edge_label(source, target)? { - if f(label) { - nullable = true; + // Just a rough estimate + let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); - break; - } - } + // REVIEW: I do not like nested if statements, but I do + // not know how to do this otherwise. - if nullable { - for (label, child_iter) in builder.labels_of(target)? { - for child in child_iter { - if !builder.has_edge_label(source, label, child)? { - updated = true; + for source in builder.nodes() { + for (first_label, target_iter) in builder.labels_of(source)? { + if predicate(*first_label) { + // a null label found + for target in target_iter { + for (second_label, children_iter) in builder.labels_of(target)? { + for child in children_iter { + let new_label = transform(TwoEdges( + source, + target, + child, + *first_label, + *second_label, + )); + + if !builder.has_edge_label(source, &new_label, child)? { + updated = true; - to_add.push((source, child, *label)); + to_add.push((source, child, new_label)); + } + } } } } } } - for (source, child, label) in to_add { - builder.add_edge(source, child, label)?; + for (source, target, label) in to_add.into_iter() { + builder.add_edge(source, target, label)?; + } + } + + // Then remove all nullable edges if demanded + + if remove_after_p { + for (source, target) in builder.edges().collect::>().into_iter() { + builder.remove_edge(source, target, &mut remove_predicate)?; } } - self.graph.replace_by_builder(builder); + self.graph = builder.build(); Ok(()) } @@ -532,7 +528,7 @@ mod test_to_nfa { Ok( DefaultRegParser::::parse(&parser, &input_string, Box::new(test_scanner), true)? - .unwrap_or(Default::default()) + .unwrap_or_default() .0, ) } @@ -543,13 +539,13 @@ mod test_to_nfa { println!("regex = {regex}"); - let nfa: DefaultNFA> = DefaultNFA::to_nfa( + let _nfa: DefaultNFA<_> = DefaultNFA::to_nfa( &[regex], |label| Ok(SoC::Carry(label)), Some(char::default()), )?; - nfa.print_viz("nfa.gv")?; + // nfa.print_viz("nfa.gv")?; Ok(()) } -- cgit v1.2.3-18-g5258