From 1a3d346f413325ed37848a6b2526e8e729269833 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Wed, 11 Jan 2023 23:47:26 +0800 Subject: Record left-linear expansion and forest format Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine". --- chain/src/atom/default.rs | 140 ++++++++++++++++++++++++++-------------------- chain/src/atom/mod.rs | 4 +- chain/src/plan.org | 52 +++++++++-------- 3 files changed, 107 insertions(+), 89 deletions(-) (limited to 'chain') diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs index 72989b3..90133f4 100644 --- a/chain/src/atom/default.rs +++ b/chain/src/atom/default.rs @@ -4,7 +4,10 @@ use super::*; use grammar::Grammar; use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; -use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex}; +use nfa::{ + default::{nfa::DefaultNFA, regex::DefaultRegex}, + LabelType, +}; use core::fmt::Display; use std::collections::BTreeMap as Map; @@ -35,7 +38,7 @@ type VirtualMap = Map; #[derive(Debug, Clone, Default)] pub struct DefaultAtom { grammar: Grammar, - nfa: DefaultNFA>, + nfa: DefaultNFA>, // NOTE: This is mostly for printing and debugging regexp: Vec>, virtual_nodes: VirtualMap, @@ -95,7 +98,7 @@ impl Display for DefaultAtom { // LabelGraph, in order to implement Nfa. impl Graph for DefaultAtom { - type Iter<'b> = > as Graph>::Iter<'b> + type Iter<'b> = > as Graph>::Iter<'b> where Self: 'b; @@ -130,23 +133,23 @@ impl Graph for DefaultAtom { } } -impl LabelGraph> for DefaultAtom { - type Iter<'b> = > as LabelGraph>>::Iter<'b> +impl LabelGraph> for DefaultAtom { + type Iter<'b> = > as LabelGraph>>::Iter<'b> where Self: 'b; - type LabelIter<'b> = > as LabelGraph>>::LabelIter<'b> + type LabelIter<'b> = > as LabelGraph>>::LabelIter<'b> where Self: 'b, DOption: 'b; - type EdgeLabelIter<'a> = > as LabelGraph>>::EdgeLabelIter<'a> + type EdgeLabelIter<'a> = > as LabelGraph>>::EdgeLabelIter<'a> where Self: 'a, DOption: 'a; #[inline] - fn vertex_label(&self, node_id: usize) -> Result>, GraphError> { + fn vertex_label(&self, node_id: usize) -> Result>, GraphError> { self.nfa.vertex_label(node_id) } @@ -163,8 +166,8 @@ impl LabelGraph> for DefaultAtom { fn find_children_with_label( &self, node_id: usize, - label: &DOption, - ) -> Result<>>::Iter<'_>, GraphError> { + label: &LabelType, + ) -> Result<>>::Iter<'_>, GraphError> { self.nfa.find_children_with_label(node_id, label) } @@ -177,39 +180,31 @@ impl LabelGraph> for DefaultAtom { fn has_edge_label( &self, node_id: usize, - label: &DOption, + label: &LabelType, target: usize, ) -> Result { self.nfa.has_edge_label(node_id, label, target) } } -impl LabelExtGraph> for DefaultAtom { +impl LabelExtGraph> for DefaultAtom { #[inline] fn extend( &mut self, - edges: impl IntoIterator, usize)>, + edges: impl IntoIterator, usize)>, ) -> Result { self.nfa.extend(edges) } } -impl Nfa> for DefaultAtom { - #[inline] - fn remove_epsilon(&mut self, f: F) -> Result<(), nfa::error::Error> - where - F: Fn(DOption) -> bool, - { - self.nfa.remove_epsilon(f) - } - +impl Nfa> for DefaultAtom { type FromRegex = (); #[inline] fn to_nfa( - _regexps: &[impl nfa::Regex>>], - _sub_pred: impl Fn(DOption) -> Result>, nfa::error::Error>, - _default: Option>, + _regexps: &[impl nfa::Regex>>], + _sub_pred: impl Fn(LabelType) -> Result>, nfa::error::Error>, + _default: Option>, ) -> Result>>, nfa::error::Error> { // NOTE: We cannot construct an atom from a set of regular // languages alone. So it is appropriate to panic here, if @@ -218,13 +213,20 @@ impl Nfa> for DefaultAtom { } #[inline] - fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> { + fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), nfa::error::Error> { self.nfa.remove_dead(reserve) } #[inline] - fn nulling(&mut self, f: impl Fn(DOption) -> bool) -> Result<(), nfa::error::Error> { - self.nfa.nulling(f) + fn closure( + &mut self, + predicate: impl FnMut(LabelType) -> bool, + remove_after_p: bool, + transform: impl FnMut(nfa::TwoEdges>) -> LabelType, + remove_predicate: impl FnMut(LabelType) -> bool, + ) -> Result<(), nfa::error::Error> { + self.nfa + .closure(predicate, remove_after_p, transform, remove_predicate) } } @@ -237,46 +239,56 @@ impl DefaultAtom { let mut nfa = grammar.left_closure_to_nfa(®exp)?; - use std::collections::HashSet; + use std::collections::{HashMap, HashSet}; let accumulators: Vec = { let mut result = Vec::with_capacity(regexp.len() + 1); result.push(0); for regex in regexp.iter() { + // Calling `unwrap` here is safe as `result` is always + // non-empty. result.push(regex.nodes_len() * 2 + result.last().unwrap()); } - result.into_iter().collect() + result }; let accumulators_set: HashSet = accumulators.iter().copied().collect(); - nfa.nulling(|label| { - if let Some(label) = *label { - match label { - TNT::Ter(_) => false, - // Panics if a non-terminal references an invalid node - // here. - TNT::Non(n) => grammar.is_nullable(n).unwrap(), + let nullables: HashSet = (0..grammar.non_num()) + .filter(|n| matches!(grammar.is_nullable(*n), Ok(true))) + .collect(); + + // Perform nulling and remove_epsilon at the same time. + nfa.closure( + |label| { + if let Some(label) = *label.get_value() { + matches!(label, TNT::Non(n) if nullables.contains(&n)) + } else { + true } - } else { - true - } - })?; - nfa.remove_epsilon(|label| label.is_none())?; + }, + true, + |two_edges| grammar.transform_label_null_epsilon(two_edges), + |label| label.get_value().is_none(), + )?; + nfa.remove_dead(|node| accumulators_set.contains(&node))?; - // now add the virtual nodes + // Now add the virtual nodes. let mut virtual_nodes: VirtualMap = Default::default(); let nt_num = grammar.non_num(); assert!(nt_num <= accumulators.len()); - // Convert an error telling us that an index is out of bounds. - // - // Panics if the error is not of the expected kind. + /// Convert an error telling us that an index is out of bounds. + /// + /// # Panics + /// + /// The function panics if the error is not of the expected + /// kind. fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError { match ge { GraphError::IndexOutOfBounds(index, bound) => { @@ -287,24 +299,34 @@ impl DefaultAtom { } for nt in 0..nt_num { - let children: std::collections::HashMap, Vec<_>> = nfa - // this is safe because of the above assertion. + let children: std::collections::HashMap<_, _> = nfa + // This is safe because of the above assertion. .labels_of(*accumulators.get(nt).unwrap()) .map_err(index_out_of_bounds_conversion)? - .map(|(label, target_iter)| (*label, target_iter.collect())) + .map(|(label, target_iter)| (*label, target_iter)) .collect(); - for (label, children_vec) in children.into_iter() { - if let Some(TNT::Ter(t)) = *label { - let new_index = nfa - .extend(children_vec.into_iter().map(|target| (label, target))) - .map_err(index_out_of_bounds_conversion)?; + let mut terminals_map: HashMap, usize)>> = + HashMap::new(); - let virtual_node = VirtualNode::new(nt, t); - - virtual_nodes.insert(virtual_node, new_index); + for (label, children_iter) in children.into_iter() { + if let Some(TNT::Ter(t)) = *label.get_value() { + terminals_map + .entry(t) + .or_insert_with(|| HashSet::with_capacity(children_iter.len())) + .extend(children_iter.map(|target| (label, target))); } } + + for (t, set) in terminals_map.into_iter() { + let new_index = nfa + .extend(set.into_iter()) + .map_err(index_out_of_bounds_conversion)?; + + let virtual_node = VirtualNode::new(nt, t); + + virtual_nodes.insert(virtual_node, new_index); + } } Ok(Self { @@ -335,8 +357,6 @@ impl Atom for DefaultAtom { } fn empty(&self) -> usize { - assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2); - - self.nfa.nodes_len() - 2 + self.grammar.total() << 1 } } diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs index 084acca..065640b 100644 --- a/chain/src/atom/mod.rs +++ b/chain/src/atom/mod.rs @@ -7,10 +7,10 @@ //! I have better ideas in the future. use grammar::{Error as GrammarError, TNT}; -use nfa::{DOption, Nfa}; +use nfa::{DOption, LabelType, Nfa}; /// The expected behaviours of an atomic language. -pub trait Atom: Nfa> { +pub trait Atom: Nfa> { /// Return the index of a node representing the derivative of the /// left-linear null closure of `nt` with respect to `t`. fn atom(&self, nt: usize, t: usize) -> Result, GrammarError>; diff --git a/chain/src/plan.org b/chain/src/plan.org index bbd6683..b708413 100644 --- a/chain/src/plan.org +++ b/chain/src/plan.org @@ -2,7 +2,7 @@ #+AUTHOR: Durand #+DATE: <2022-11-18 Ven 19:57> -* Things to do [5/10] +* Things to do [6/10] - [X] Implement builders for graphs - [X] Find sets of the first terminals for each non-terminal, in the @@ -38,18 +38,30 @@ finite automata. * [X] Test the removal of dead states, where a state is dead if and only if no other states have an edge into that state. -- [-] Refactor [2/8] +- [X] Refactor [7/7] + [X] Implement a data type for graphs with labels on vertices and edges, but do not need to index edges by labels, which can index vertices by labels instead. * [X] Test it. + [X] Implement a builder that borrows a graph mutably. * [X] Test it. - + [ ] Implement a data type for graphs in which every node knows its + + [X] Implement a data type for graphs in which every node knows its parents, and every node has a unique label but edges have no labels. - * [ ] Test it. - + [ ] We need to record the expansions of those "virtual nodes". + * [X] Test it. + + [X] When we pull in some regular language because of the + left-linear expansion, we need to mark that branch as coming from + left-linear expansions. This is necessary because we cannot + follow a left-linearly expanded branch when we take the derivative + of a language. We only expand left-linearly when we try to access + the atomic languages [s]^{(t)}. + + [X] An edge of the NFA should carry a label that can be more + informative than solely a terminal or a non-terminal. + + [X] Add a mechanism for a grammar to attach labels to edges of NFA + which are not necessarily Copy-able, and which should be stored in a + separate array, such that the labels on edges of NFA point to the + elements of the array. + + [X] We need to record the expansions of those "virtual nodes". That is, the virtual nodes represent atomic languages such as [s]^{(t)} where s is a non-terminal and t is a terminal. To be more specific, it represents the derivative of the left-linear closure @@ -59,32 +71,21 @@ alone, without consuming any inputs, by expanding according to the grammar rules. This means that we need to know which non-terminals were expanded in order to get to a state in [s]^{(t)}. - + [ ] Each edge in the chain-rule machine needs to be labelled also - with a position in the forest. This perfectly solves the problem - of those "plugs"! - + [ ] When we pull in some regular language because of the - left-linear expansion, we need to mark that branch as coming from - left-linear expansions. This is necessary because we cannot - follow a left-linearly expanded branch when we take the derivative - of a language. We only expand left-linearly when we try to access - the atomic languages [s]^{(t)}. We can mark by returning a set of - nodes which are the beginnings of left-linearly expanded branches. - + [ ] An edge of the NFA should carry a label that can be more - informative than solely a terminal or a non-terminal. - + [ ] Add a mechanism for a grammar to attach labels to edges of NFA - which are not necessarily Copy-able, and which should be stored in a - separate array, such that the labels on edges of NFA point to the - elements of the array. + * [X] Test + * [X] Test more - [X] Atom [3/3] + [X] Define the Atom trait. + [X] Implement virtual nodes for each derivative of each atom: The lack of this step might be the root cause of the failure of the previous version of this project. + [X] Test atoms -- [-] Implement languages. [1/3] +- [-] Implement languages. [1/4] + [X] First define a trait with the expected behaviours. + [ ] Then implement them as doubly labelled graphs. - + [ ] Thenimplement finding derivatives by use of the chain rule. + + [ ] Then implement finding derivatives by use of the chain rule. + + [ ] Each edge in the chain-rule machine needs to be labelled also + with a position in the forest. This perfectly solves the problem + of those "plugs"! - [-] Implement forests [2/3] + [X] Design a format of forests. This should be the most crucial thing to do, in order to have a better understanding of the whole @@ -92,10 +93,7 @@ the binarised shared packed parsed forests, that reflects the regular expressions in the grammar equations. + [X] Define a trait with the expected behaviours. - + [-] Implement them using adjacency map: we only need one label per - edge, and we do not wish to exclude duplicate edges, and we do not - need to index edges by the labels. All in all, we implement them - using a vector of hashmaps. + + [-] Implement them as parents-knowing graphs. - [ ] Implement semiring. [0/5] + [ ] Define the trait. + [ ] Implement the boolean semiring. -- cgit v1.2.3-18-g5258