From 1a3d346f413325ed37848a6b2526e8e729269833 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Wed, 11 Jan 2023 23:47:26 +0800 Subject: Record left-linear expansion and forest format Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine". --- chain/src/atom/default.rs | 140 ++++++++++++++++++++++++++-------------------- 1 file changed, 80 insertions(+), 60 deletions(-) (limited to 'chain/src/atom/default.rs') diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs index 72989b3..90133f4 100644 --- a/chain/src/atom/default.rs +++ b/chain/src/atom/default.rs @@ -4,7 +4,10 @@ use super::*; use grammar::Grammar; use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; -use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex}; +use nfa::{ + default::{nfa::DefaultNFA, regex::DefaultRegex}, + LabelType, +}; use core::fmt::Display; use std::collections::BTreeMap as Map; @@ -35,7 +38,7 @@ type VirtualMap = Map; #[derive(Debug, Clone, Default)] pub struct DefaultAtom { grammar: Grammar, - nfa: DefaultNFA>, + nfa: DefaultNFA>, // NOTE: This is mostly for printing and debugging regexp: Vec>, virtual_nodes: VirtualMap, @@ -95,7 +98,7 @@ impl Display for DefaultAtom { // LabelGraph, in order to implement Nfa. impl Graph for DefaultAtom { - type Iter<'b> = > as Graph>::Iter<'b> + type Iter<'b> = > as Graph>::Iter<'b> where Self: 'b; @@ -130,23 +133,23 @@ impl Graph for DefaultAtom { } } -impl LabelGraph> for DefaultAtom { - type Iter<'b> = > as LabelGraph>>::Iter<'b> +impl LabelGraph> for DefaultAtom { + type Iter<'b> = > as LabelGraph>>::Iter<'b> where Self: 'b; - type LabelIter<'b> = > as LabelGraph>>::LabelIter<'b> + type LabelIter<'b> = > as LabelGraph>>::LabelIter<'b> where Self: 'b, DOption: 'b; - type EdgeLabelIter<'a> = > as LabelGraph>>::EdgeLabelIter<'a> + type EdgeLabelIter<'a> = > as LabelGraph>>::EdgeLabelIter<'a> where Self: 'a, DOption: 'a; #[inline] - fn vertex_label(&self, node_id: usize) -> Result>, GraphError> { + fn vertex_label(&self, node_id: usize) -> Result>, GraphError> { self.nfa.vertex_label(node_id) } @@ -163,8 +166,8 @@ impl LabelGraph> for DefaultAtom { fn find_children_with_label( &self, node_id: usize, - label: &DOption, - ) -> Result<>>::Iter<'_>, GraphError> { + label: &LabelType, + ) -> Result<>>::Iter<'_>, GraphError> { self.nfa.find_children_with_label(node_id, label) } @@ -177,39 +180,31 @@ impl LabelGraph> for DefaultAtom { fn has_edge_label( &self, node_id: usize, - label: &DOption, + label: &LabelType, target: usize, ) -> Result { self.nfa.has_edge_label(node_id, label, target) } } -impl LabelExtGraph> for DefaultAtom { +impl LabelExtGraph> for DefaultAtom { #[inline] fn extend( &mut self, - edges: impl IntoIterator, usize)>, + edges: impl IntoIterator, usize)>, ) -> Result { self.nfa.extend(edges) } } -impl Nfa> for DefaultAtom { - #[inline] - fn remove_epsilon(&mut self, f: F) -> Result<(), nfa::error::Error> - where - F: Fn(DOption) -> bool, - { - self.nfa.remove_epsilon(f) - } - +impl Nfa> for DefaultAtom { type FromRegex = (); #[inline] fn to_nfa( - _regexps: &[impl nfa::Regex>>], - _sub_pred: impl Fn(DOption) -> Result>, nfa::error::Error>, - _default: Option>, + _regexps: &[impl nfa::Regex>>], + _sub_pred: impl Fn(LabelType) -> Result>, nfa::error::Error>, + _default: Option>, ) -> Result>>, nfa::error::Error> { // NOTE: We cannot construct an atom from a set of regular // languages alone. So it is appropriate to panic here, if @@ -218,13 +213,20 @@ impl Nfa> for DefaultAtom { } #[inline] - fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> { + fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), nfa::error::Error> { self.nfa.remove_dead(reserve) } #[inline] - fn nulling(&mut self, f: impl Fn(DOption) -> bool) -> Result<(), nfa::error::Error> { - self.nfa.nulling(f) + fn closure( + &mut self, + predicate: impl FnMut(LabelType) -> bool, + remove_after_p: bool, + transform: impl FnMut(nfa::TwoEdges>) -> LabelType, + remove_predicate: impl FnMut(LabelType) -> bool, + ) -> Result<(), nfa::error::Error> { + self.nfa + .closure(predicate, remove_after_p, transform, remove_predicate) } } @@ -237,46 +239,56 @@ impl DefaultAtom { let mut nfa = grammar.left_closure_to_nfa(®exp)?; - use std::collections::HashSet; + use std::collections::{HashMap, HashSet}; let accumulators: Vec = { let mut result = Vec::with_capacity(regexp.len() + 1); result.push(0); for regex in regexp.iter() { + // Calling `unwrap` here is safe as `result` is always + // non-empty. result.push(regex.nodes_len() * 2 + result.last().unwrap()); } - result.into_iter().collect() + result }; let accumulators_set: HashSet = accumulators.iter().copied().collect(); - nfa.nulling(|label| { - if let Some(label) = *label { - match label { - TNT::Ter(_) => false, - // Panics if a non-terminal references an invalid node - // here. - TNT::Non(n) => grammar.is_nullable(n).unwrap(), + let nullables: HashSet = (0..grammar.non_num()) + .filter(|n| matches!(grammar.is_nullable(*n), Ok(true))) + .collect(); + + // Perform nulling and remove_epsilon at the same time. + nfa.closure( + |label| { + if let Some(label) = *label.get_value() { + matches!(label, TNT::Non(n) if nullables.contains(&n)) + } else { + true } - } else { - true - } - })?; - nfa.remove_epsilon(|label| label.is_none())?; + }, + true, + |two_edges| grammar.transform_label_null_epsilon(two_edges), + |label| label.get_value().is_none(), + )?; + nfa.remove_dead(|node| accumulators_set.contains(&node))?; - // now add the virtual nodes + // Now add the virtual nodes. let mut virtual_nodes: VirtualMap = Default::default(); let nt_num = grammar.non_num(); assert!(nt_num <= accumulators.len()); - // Convert an error telling us that an index is out of bounds. - // - // Panics if the error is not of the expected kind. + /// Convert an error telling us that an index is out of bounds. + /// + /// # Panics + /// + /// The function panics if the error is not of the expected + /// kind. fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError { match ge { GraphError::IndexOutOfBounds(index, bound) => { @@ -287,24 +299,34 @@ impl DefaultAtom { } for nt in 0..nt_num { - let children: std::collections::HashMap, Vec<_>> = nfa - // this is safe because of the above assertion. + let children: std::collections::HashMap<_, _> = nfa + // This is safe because of the above assertion. .labels_of(*accumulators.get(nt).unwrap()) .map_err(index_out_of_bounds_conversion)? - .map(|(label, target_iter)| (*label, target_iter.collect())) + .map(|(label, target_iter)| (*label, target_iter)) .collect(); - for (label, children_vec) in children.into_iter() { - if let Some(TNT::Ter(t)) = *label { - let new_index = nfa - .extend(children_vec.into_iter().map(|target| (label, target))) - .map_err(index_out_of_bounds_conversion)?; + let mut terminals_map: HashMap, usize)>> = + HashMap::new(); - let virtual_node = VirtualNode::new(nt, t); - - virtual_nodes.insert(virtual_node, new_index); + for (label, children_iter) in children.into_iter() { + if let Some(TNT::Ter(t)) = *label.get_value() { + terminals_map + .entry(t) + .or_insert_with(|| HashSet::with_capacity(children_iter.len())) + .extend(children_iter.map(|target| (label, target))); } } + + for (t, set) in terminals_map.into_iter() { + let new_index = nfa + .extend(set.into_iter()) + .map_err(index_out_of_bounds_conversion)?; + + let virtual_node = VirtualNode::new(nt, t); + + virtual_nodes.insert(virtual_node, new_index); + } } Ok(Self { @@ -335,8 +357,6 @@ impl Atom for DefaultAtom { } fn empty(&self) -> usize { - assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2); - - self.nfa.nodes_len() - 2 + self.grammar.total() << 1 } } -- cgit v1.2.3-18-g5258