diff options
author | JSDurand <mmemmew@gmail.com> | 2023-01-11 23:47:26 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-01-11 23:47:26 +0800 |
commit | 1a3d346f413325ed37848a6b2526e8e729269833 (patch) | |
tree | ab8812f8094d096c68aee53cf516e986cc9a273a /chain | |
parent | f27d604d93ce583d4404e1874664e08382ea2f00 (diff) |
Record left-linear expansion and forest format
Now the grammar will record the left-linear expansions when generating
the nondeterministic finite automaton frmo its rules, and will record
whether an edge in the nondeterministic finite automaton comes from a
left-linear expansion. The latter is needed because while performing
a chain-rule derivation, we do not need the left-linear expanded
derivations in the "first layer". This might well have been the root
cause of the bad performance of the previous version of this package.
Also I have figured out how to properly generate and handle parse
forests while manipulating the "chain-rule machine".
Diffstat (limited to 'chain')
-rw-r--r-- | chain/src/atom/default.rs | 140 | ||||
-rw-r--r-- | chain/src/atom/mod.rs | 4 | ||||
-rw-r--r-- | chain/src/plan.org | 52 |
3 files changed, 107 insertions, 89 deletions
diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs index 72989b3..90133f4 100644 --- a/chain/src/atom/default.rs +++ b/chain/src/atom/default.rs @@ -4,7 +4,10 @@ use super::*; use grammar::Grammar; use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; -use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex}; +use nfa::{ + default::{nfa::DefaultNFA, regex::DefaultRegex}, + LabelType, +}; use core::fmt::Display; use std::collections::BTreeMap as Map; @@ -35,7 +38,7 @@ type VirtualMap = Map<VirtualNode, usize>; #[derive(Debug, Clone, Default)] pub struct DefaultAtom { grammar: Grammar, - nfa: DefaultNFA<DOption<TNT>>, + nfa: DefaultNFA<LabelType<TNT>>, // NOTE: This is mostly for printing and debugging regexp: Vec<DefaultRegex<TNT>>, virtual_nodes: VirtualMap, @@ -95,7 +98,7 @@ impl Display for DefaultAtom { // LabelGraph, in order to implement Nfa. impl Graph for DefaultAtom { - type Iter<'b> = <DefaultNFA<DOption<TNT>> as Graph>::Iter<'b> + type Iter<'b> = <DefaultNFA<LabelType<TNT>> as Graph>::Iter<'b> where Self: 'b; @@ -130,23 +133,23 @@ impl Graph for DefaultAtom { } } -impl LabelGraph<DOption<TNT>> for DefaultAtom { - type Iter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::Iter<'b> +impl LabelGraph<LabelType<TNT>> for DefaultAtom { + type Iter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::Iter<'b> where Self: 'b; - type LabelIter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::LabelIter<'b> + type LabelIter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::LabelIter<'b> where Self: 'b, DOption<TNT>: 'b; - type EdgeLabelIter<'a> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::EdgeLabelIter<'a> + type EdgeLabelIter<'a> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::EdgeLabelIter<'a> where Self: 'a, DOption<TNT>: 'a; #[inline] - fn vertex_label(&self, node_id: usize) -> Result<Option<DOption<TNT>>, GraphError> { + fn vertex_label(&self, node_id: usize) -> Result<Option<LabelType<TNT>>, GraphError> { self.nfa.vertex_label(node_id) } @@ -163,8 +166,8 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom { fn find_children_with_label( &self, node_id: usize, - label: &DOption<TNT>, - ) -> Result<<Self as LabelGraph<DOption<TNT>>>::Iter<'_>, GraphError> { + label: &LabelType<TNT>, + ) -> Result<<Self as LabelGraph<LabelType<TNT>>>::Iter<'_>, GraphError> { self.nfa.find_children_with_label(node_id, label) } @@ -177,39 +180,31 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom { fn has_edge_label( &self, node_id: usize, - label: &DOption<TNT>, + label: &LabelType<TNT>, target: usize, ) -> Result<bool, GraphError> { self.nfa.has_edge_label(node_id, label, target) } } -impl LabelExtGraph<DOption<TNT>> for DefaultAtom { +impl LabelExtGraph<LabelType<TNT>> for DefaultAtom { #[inline] fn extend( &mut self, - edges: impl IntoIterator<Item = (DOption<TNT>, usize)>, + edges: impl IntoIterator<Item = (LabelType<TNT>, usize)>, ) -> Result<usize, GraphError> { self.nfa.extend(edges) } } -impl Nfa<DOption<TNT>> for DefaultAtom { - #[inline] - fn remove_epsilon<F>(&mut self, f: F) -> Result<(), nfa::error::Error> - where - F: Fn(DOption<TNT>) -> bool, - { - self.nfa.remove_epsilon(f) - } - +impl Nfa<LabelType<TNT>> for DefaultAtom { type FromRegex<S: graph::GraphLabel + std::fmt::Display + Default> = (); #[inline] fn to_nfa( - _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<DOption<TNT>>>], - _sub_pred: impl Fn(DOption<TNT>) -> Result<nfa::SoC<DOption<TNT>>, nfa::error::Error>, - _default: Option<DOption<TNT>>, + _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<LabelType<TNT>>>], + _sub_pred: impl Fn(LabelType<TNT>) -> Result<nfa::SoC<LabelType<TNT>>, nfa::error::Error>, + _default: Option<LabelType<TNT>>, ) -> Result<Self::FromRegex<DOption<DOption<TNT>>>, nfa::error::Error> { // NOTE: We cannot construct an atom from a set of regular // languages alone. So it is appropriate to panic here, if @@ -218,13 +213,20 @@ impl Nfa<DOption<TNT>> for DefaultAtom { } #[inline] - fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> { + fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), nfa::error::Error> { self.nfa.remove_dead(reserve) } #[inline] - fn nulling(&mut self, f: impl Fn(DOption<TNT>) -> bool) -> Result<(), nfa::error::Error> { - self.nfa.nulling(f) + fn closure( + &mut self, + predicate: impl FnMut(LabelType<TNT>) -> bool, + remove_after_p: bool, + transform: impl FnMut(nfa::TwoEdges<LabelType<TNT>>) -> LabelType<TNT>, + remove_predicate: impl FnMut(LabelType<TNT>) -> bool, + ) -> Result<(), nfa::error::Error> { + self.nfa + .closure(predicate, remove_after_p, transform, remove_predicate) } } @@ -237,46 +239,56 @@ impl DefaultAtom { let mut nfa = grammar.left_closure_to_nfa(®exp)?; - use std::collections::HashSet; + use std::collections::{HashMap, HashSet}; let accumulators: Vec<usize> = { let mut result = Vec::with_capacity(regexp.len() + 1); result.push(0); for regex in regexp.iter() { + // Calling `unwrap` here is safe as `result` is always + // non-empty. result.push(regex.nodes_len() * 2 + result.last().unwrap()); } - result.into_iter().collect() + result }; let accumulators_set: HashSet<usize> = accumulators.iter().copied().collect(); - nfa.nulling(|label| { - if let Some(label) = *label { - match label { - TNT::Ter(_) => false, - // Panics if a non-terminal references an invalid node - // here. - TNT::Non(n) => grammar.is_nullable(n).unwrap(), + let nullables: HashSet<usize> = (0..grammar.non_num()) + .filter(|n| matches!(grammar.is_nullable(*n), Ok(true))) + .collect(); + + // Perform nulling and remove_epsilon at the same time. + nfa.closure( + |label| { + if let Some(label) = *label.get_value() { + matches!(label, TNT::Non(n) if nullables.contains(&n)) + } else { + true } - } else { - true - } - })?; - nfa.remove_epsilon(|label| label.is_none())?; + }, + true, + |two_edges| grammar.transform_label_null_epsilon(two_edges), + |label| label.get_value().is_none(), + )?; + nfa.remove_dead(|node| accumulators_set.contains(&node))?; - // now add the virtual nodes + // Now add the virtual nodes. let mut virtual_nodes: VirtualMap = Default::default(); let nt_num = grammar.non_num(); assert!(nt_num <= accumulators.len()); - // Convert an error telling us that an index is out of bounds. - // - // Panics if the error is not of the expected kind. + /// Convert an error telling us that an index is out of bounds. + /// + /// # Panics + /// + /// The function panics if the error is not of the expected + /// kind. fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError { match ge { GraphError::IndexOutOfBounds(index, bound) => { @@ -287,24 +299,34 @@ impl DefaultAtom { } for nt in 0..nt_num { - let children: std::collections::HashMap<DOption<TNT>, Vec<_>> = nfa - // this is safe because of the above assertion. + let children: std::collections::HashMap<_, _> = nfa + // This is safe because of the above assertion. .labels_of(*accumulators.get(nt).unwrap()) .map_err(index_out_of_bounds_conversion)? - .map(|(label, target_iter)| (*label, target_iter.collect())) + .map(|(label, target_iter)| (*label, target_iter)) .collect(); - for (label, children_vec) in children.into_iter() { - if let Some(TNT::Ter(t)) = *label { - let new_index = nfa - .extend(children_vec.into_iter().map(|target| (label, target))) - .map_err(index_out_of_bounds_conversion)?; + let mut terminals_map: HashMap<usize, HashSet<(LabelType<TNT>, usize)>> = + HashMap::new(); - let virtual_node = VirtualNode::new(nt, t); - - virtual_nodes.insert(virtual_node, new_index); + for (label, children_iter) in children.into_iter() { + if let Some(TNT::Ter(t)) = *label.get_value() { + terminals_map + .entry(t) + .or_insert_with(|| HashSet::with_capacity(children_iter.len())) + .extend(children_iter.map(|target| (label, target))); } } + + for (t, set) in terminals_map.into_iter() { + let new_index = nfa + .extend(set.into_iter()) + .map_err(index_out_of_bounds_conversion)?; + + let virtual_node = VirtualNode::new(nt, t); + + virtual_nodes.insert(virtual_node, new_index); + } } Ok(Self { @@ -335,8 +357,6 @@ impl Atom for DefaultAtom { } fn empty(&self) -> usize { - assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2); - - self.nfa.nodes_len() - 2 + self.grammar.total() << 1 } } diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs index 084acca..065640b 100644 --- a/chain/src/atom/mod.rs +++ b/chain/src/atom/mod.rs @@ -7,10 +7,10 @@ //! I have better ideas in the future. use grammar::{Error as GrammarError, TNT}; -use nfa::{DOption, Nfa}; +use nfa::{DOption, LabelType, Nfa}; /// The expected behaviours of an atomic language. -pub trait Atom: Nfa<DOption<TNT>> { +pub trait Atom: Nfa<LabelType<TNT>> { /// Return the index of a node representing the derivative of the /// left-linear null closure of `nt` with respect to `t`. fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError>; diff --git a/chain/src/plan.org b/chain/src/plan.org index bbd6683..b708413 100644 --- a/chain/src/plan.org +++ b/chain/src/plan.org @@ -2,7 +2,7 @@ #+AUTHOR: Durand #+DATE: <2022-11-18 Ven 19:57> -* Things to do [5/10] +* Things to do [6/10] - [X] Implement builders for graphs - [X] Find sets of the first terminals for each non-terminal, in the @@ -38,18 +38,30 @@ finite automata. * [X] Test the removal of dead states, where a state is dead if and only if no other states have an edge into that state. -- [-] Refactor [2/8] +- [X] Refactor [7/7] + [X] Implement a data type for graphs with labels on vertices and edges, but do not need to index edges by labels, which can index vertices by labels instead. * [X] Test it. + [X] Implement a builder that borrows a graph mutably. * [X] Test it. - + [ ] Implement a data type for graphs in which every node knows its + + [X] Implement a data type for graphs in which every node knows its parents, and every node has a unique label but edges have no labels. - * [ ] Test it. - + [ ] We need to record the expansions of those "virtual nodes". + * [X] Test it. + + [X] When we pull in some regular language because of the + left-linear expansion, we need to mark that branch as coming from + left-linear expansions. This is necessary because we cannot + follow a left-linearly expanded branch when we take the derivative + of a language. We only expand left-linearly when we try to access + the atomic languages [s]^{(t)}. + + [X] An edge of the NFA should carry a label that can be more + informative than solely a terminal or a non-terminal. + + [X] Add a mechanism for a grammar to attach labels to edges of NFA + which are not necessarily Copy-able, and which should be stored in a + separate array, such that the labels on edges of NFA point to the + elements of the array. + + [X] We need to record the expansions of those "virtual nodes". That is, the virtual nodes represent atomic languages such as [s]^{(t)} where s is a non-terminal and t is a terminal. To be more specific, it represents the derivative of the left-linear closure @@ -59,32 +71,21 @@ alone, without consuming any inputs, by expanding according to the grammar rules. This means that we need to know which non-terminals were expanded in order to get to a state in [s]^{(t)}. - + [ ] Each edge in the chain-rule machine needs to be labelled also - with a position in the forest. This perfectly solves the problem - of those "plugs"! - + [ ] When we pull in some regular language because of the - left-linear expansion, we need to mark that branch as coming from - left-linear expansions. This is necessary because we cannot - follow a left-linearly expanded branch when we take the derivative - of a language. We only expand left-linearly when we try to access - the atomic languages [s]^{(t)}. We can mark by returning a set of - nodes which are the beginnings of left-linearly expanded branches. - + [ ] An edge of the NFA should carry a label that can be more - informative than solely a terminal or a non-terminal. - + [ ] Add a mechanism for a grammar to attach labels to edges of NFA - which are not necessarily Copy-able, and which should be stored in a - separate array, such that the labels on edges of NFA point to the - elements of the array. + * [X] Test + * [X] Test more - [X] Atom [3/3] + [X] Define the Atom trait. + [X] Implement virtual nodes for each derivative of each atom: The lack of this step might be the root cause of the failure of the previous version of this project. + [X] Test atoms -- [-] Implement languages. [1/3] +- [-] Implement languages. [1/4] + [X] First define a trait with the expected behaviours. + [ ] Then implement them as doubly labelled graphs. - + [ ] Thenimplement finding derivatives by use of the chain rule. + + [ ] Then implement finding derivatives by use of the chain rule. + + [ ] Each edge in the chain-rule machine needs to be labelled also + with a position in the forest. This perfectly solves the problem + of those "plugs"! - [-] Implement forests [2/3] + [X] Design a format of forests. This should be the most crucial thing to do, in order to have a better understanding of the whole @@ -92,10 +93,7 @@ the binarised shared packed parsed forests, that reflects the regular expressions in the grammar equations. + [X] Define a trait with the expected behaviours. - + [-] Implement them using adjacency map: we only need one label per - edge, and we do not wish to exclude duplicate edges, and we do not - need to index edges by the labels. All in all, we implement them - using a vector of hashmaps. + + [-] Implement them as parents-knowing graphs. - [ ] Implement semiring. [0/5] + [ ] Define the trait. + [ ] Implement the boolean semiring. |