diff options
author | JSDurand <mmemmew@gmail.com> | 2023-01-11 23:47:26 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-01-11 23:47:26 +0800 |
commit | 1a3d346f413325ed37848a6b2526e8e729269833 (patch) | |
tree | ab8812f8094d096c68aee53cf516e986cc9a273a /chain/src/atom | |
parent | f27d604d93ce583d4404e1874664e08382ea2f00 (diff) |
Record left-linear expansion and forest format
Now the grammar will record the left-linear expansions when generating
the nondeterministic finite automaton frmo its rules, and will record
whether an edge in the nondeterministic finite automaton comes from a
left-linear expansion. The latter is needed because while performing
a chain-rule derivation, we do not need the left-linear expanded
derivations in the "first layer". This might well have been the root
cause of the bad performance of the previous version of this package.
Also I have figured out how to properly generate and handle parse
forests while manipulating the "chain-rule machine".
Diffstat (limited to 'chain/src/atom')
-rw-r--r-- | chain/src/atom/default.rs | 140 | ||||
-rw-r--r-- | chain/src/atom/mod.rs | 4 |
2 files changed, 82 insertions, 62 deletions
diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs index 72989b3..90133f4 100644 --- a/chain/src/atom/default.rs +++ b/chain/src/atom/default.rs @@ -4,7 +4,10 @@ use super::*; use grammar::Grammar; use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; -use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex}; +use nfa::{ + default::{nfa::DefaultNFA, regex::DefaultRegex}, + LabelType, +}; use core::fmt::Display; use std::collections::BTreeMap as Map; @@ -35,7 +38,7 @@ type VirtualMap = Map<VirtualNode, usize>; #[derive(Debug, Clone, Default)] pub struct DefaultAtom { grammar: Grammar, - nfa: DefaultNFA<DOption<TNT>>, + nfa: DefaultNFA<LabelType<TNT>>, // NOTE: This is mostly for printing and debugging regexp: Vec<DefaultRegex<TNT>>, virtual_nodes: VirtualMap, @@ -95,7 +98,7 @@ impl Display for DefaultAtom { // LabelGraph, in order to implement Nfa. impl Graph for DefaultAtom { - type Iter<'b> = <DefaultNFA<DOption<TNT>> as Graph>::Iter<'b> + type Iter<'b> = <DefaultNFA<LabelType<TNT>> as Graph>::Iter<'b> where Self: 'b; @@ -130,23 +133,23 @@ impl Graph for DefaultAtom { } } -impl LabelGraph<DOption<TNT>> for DefaultAtom { - type Iter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::Iter<'b> +impl LabelGraph<LabelType<TNT>> for DefaultAtom { + type Iter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::Iter<'b> where Self: 'b; - type LabelIter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::LabelIter<'b> + type LabelIter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::LabelIter<'b> where Self: 'b, DOption<TNT>: 'b; - type EdgeLabelIter<'a> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::EdgeLabelIter<'a> + type EdgeLabelIter<'a> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::EdgeLabelIter<'a> where Self: 'a, DOption<TNT>: 'a; #[inline] - fn vertex_label(&self, node_id: usize) -> Result<Option<DOption<TNT>>, GraphError> { + fn vertex_label(&self, node_id: usize) -> Result<Option<LabelType<TNT>>, GraphError> { self.nfa.vertex_label(node_id) } @@ -163,8 +166,8 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom { fn find_children_with_label( &self, node_id: usize, - label: &DOption<TNT>, - ) -> Result<<Self as LabelGraph<DOption<TNT>>>::Iter<'_>, GraphError> { + label: &LabelType<TNT>, + ) -> Result<<Self as LabelGraph<LabelType<TNT>>>::Iter<'_>, GraphError> { self.nfa.find_children_with_label(node_id, label) } @@ -177,39 +180,31 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom { fn has_edge_label( &self, node_id: usize, - label: &DOption<TNT>, + label: &LabelType<TNT>, target: usize, ) -> Result<bool, GraphError> { self.nfa.has_edge_label(node_id, label, target) } } -impl LabelExtGraph<DOption<TNT>> for DefaultAtom { +impl LabelExtGraph<LabelType<TNT>> for DefaultAtom { #[inline] fn extend( &mut self, - edges: impl IntoIterator<Item = (DOption<TNT>, usize)>, + edges: impl IntoIterator<Item = (LabelType<TNT>, usize)>, ) -> Result<usize, GraphError> { self.nfa.extend(edges) } } -impl Nfa<DOption<TNT>> for DefaultAtom { - #[inline] - fn remove_epsilon<F>(&mut self, f: F) -> Result<(), nfa::error::Error> - where - F: Fn(DOption<TNT>) -> bool, - { - self.nfa.remove_epsilon(f) - } - +impl Nfa<LabelType<TNT>> for DefaultAtom { type FromRegex<S: graph::GraphLabel + std::fmt::Display + Default> = (); #[inline] fn to_nfa( - _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<DOption<TNT>>>], - _sub_pred: impl Fn(DOption<TNT>) -> Result<nfa::SoC<DOption<TNT>>, nfa::error::Error>, - _default: Option<DOption<TNT>>, + _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<LabelType<TNT>>>], + _sub_pred: impl Fn(LabelType<TNT>) -> Result<nfa::SoC<LabelType<TNT>>, nfa::error::Error>, + _default: Option<LabelType<TNT>>, ) -> Result<Self::FromRegex<DOption<DOption<TNT>>>, nfa::error::Error> { // NOTE: We cannot construct an atom from a set of regular // languages alone. So it is appropriate to panic here, if @@ -218,13 +213,20 @@ impl Nfa<DOption<TNT>> for DefaultAtom { } #[inline] - fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> { + fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), nfa::error::Error> { self.nfa.remove_dead(reserve) } #[inline] - fn nulling(&mut self, f: impl Fn(DOption<TNT>) -> bool) -> Result<(), nfa::error::Error> { - self.nfa.nulling(f) + fn closure( + &mut self, + predicate: impl FnMut(LabelType<TNT>) -> bool, + remove_after_p: bool, + transform: impl FnMut(nfa::TwoEdges<LabelType<TNT>>) -> LabelType<TNT>, + remove_predicate: impl FnMut(LabelType<TNT>) -> bool, + ) -> Result<(), nfa::error::Error> { + self.nfa + .closure(predicate, remove_after_p, transform, remove_predicate) } } @@ -237,46 +239,56 @@ impl DefaultAtom { let mut nfa = grammar.left_closure_to_nfa(®exp)?; - use std::collections::HashSet; + use std::collections::{HashMap, HashSet}; let accumulators: Vec<usize> = { let mut result = Vec::with_capacity(regexp.len() + 1); result.push(0); for regex in regexp.iter() { + // Calling `unwrap` here is safe as `result` is always + // non-empty. result.push(regex.nodes_len() * 2 + result.last().unwrap()); } - result.into_iter().collect() + result }; let accumulators_set: HashSet<usize> = accumulators.iter().copied().collect(); - nfa.nulling(|label| { - if let Some(label) = *label { - match label { - TNT::Ter(_) => false, - // Panics if a non-terminal references an invalid node - // here. - TNT::Non(n) => grammar.is_nullable(n).unwrap(), + let nullables: HashSet<usize> = (0..grammar.non_num()) + .filter(|n| matches!(grammar.is_nullable(*n), Ok(true))) + .collect(); + + // Perform nulling and remove_epsilon at the same time. + nfa.closure( + |label| { + if let Some(label) = *label.get_value() { + matches!(label, TNT::Non(n) if nullables.contains(&n)) + } else { + true } - } else { - true - } - })?; - nfa.remove_epsilon(|label| label.is_none())?; + }, + true, + |two_edges| grammar.transform_label_null_epsilon(two_edges), + |label| label.get_value().is_none(), + )?; + nfa.remove_dead(|node| accumulators_set.contains(&node))?; - // now add the virtual nodes + // Now add the virtual nodes. let mut virtual_nodes: VirtualMap = Default::default(); let nt_num = grammar.non_num(); assert!(nt_num <= accumulators.len()); - // Convert an error telling us that an index is out of bounds. - // - // Panics if the error is not of the expected kind. + /// Convert an error telling us that an index is out of bounds. + /// + /// # Panics + /// + /// The function panics if the error is not of the expected + /// kind. fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError { match ge { GraphError::IndexOutOfBounds(index, bound) => { @@ -287,24 +299,34 @@ impl DefaultAtom { } for nt in 0..nt_num { - let children: std::collections::HashMap<DOption<TNT>, Vec<_>> = nfa - // this is safe because of the above assertion. + let children: std::collections::HashMap<_, _> = nfa + // This is safe because of the above assertion. .labels_of(*accumulators.get(nt).unwrap()) .map_err(index_out_of_bounds_conversion)? - .map(|(label, target_iter)| (*label, target_iter.collect())) + .map(|(label, target_iter)| (*label, target_iter)) .collect(); - for (label, children_vec) in children.into_iter() { - if let Some(TNT::Ter(t)) = *label { - let new_index = nfa - .extend(children_vec.into_iter().map(|target| (label, target))) - .map_err(index_out_of_bounds_conversion)?; + let mut terminals_map: HashMap<usize, HashSet<(LabelType<TNT>, usize)>> = + HashMap::new(); - let virtual_node = VirtualNode::new(nt, t); - - virtual_nodes.insert(virtual_node, new_index); + for (label, children_iter) in children.into_iter() { + if let Some(TNT::Ter(t)) = *label.get_value() { + terminals_map + .entry(t) + .or_insert_with(|| HashSet::with_capacity(children_iter.len())) + .extend(children_iter.map(|target| (label, target))); } } + + for (t, set) in terminals_map.into_iter() { + let new_index = nfa + .extend(set.into_iter()) + .map_err(index_out_of_bounds_conversion)?; + + let virtual_node = VirtualNode::new(nt, t); + + virtual_nodes.insert(virtual_node, new_index); + } } Ok(Self { @@ -335,8 +357,6 @@ impl Atom for DefaultAtom { } fn empty(&self) -> usize { - assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2); - - self.nfa.nodes_len() - 2 + self.grammar.total() << 1 } } diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs index 084acca..065640b 100644 --- a/chain/src/atom/mod.rs +++ b/chain/src/atom/mod.rs @@ -7,10 +7,10 @@ //! I have better ideas in the future. use grammar::{Error as GrammarError, TNT}; -use nfa::{DOption, Nfa}; +use nfa::{DOption, LabelType, Nfa}; /// The expected behaviours of an atomic language. -pub trait Atom: Nfa<DOption<TNT>> { +pub trait Atom: Nfa<LabelType<TNT>> { /// Return the index of a node representing the derivative of the /// left-linear null closure of `nt` with respect to `t`. fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError>; |