author    JSDurand <mmemmew@gmail.com>  2023-01-11 23:47:26 +0800
committer JSDurand <mmemmew@gmail.com>  2023-01-11 23:47:26 +0800
commit    1a3d346f413325ed37848a6b2526e8e729269833 (patch)
tree      ab8812f8094d096c68aee53cf516e986cc9a273a /chain
parent    f27d604d93ce583d4404e1874664e08382ea2f00 (diff)
Record left-linear expansion and forest format
Now the grammar records the left-linear expansions when generating the nondeterministic finite automaton from its rules, and records whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because, while performing a chain-rule derivation, we do not need the left-linearly expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also, I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine".
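
To make the idea concrete, here is a minimal, hypothetical sketch of such an edge label: it pairs the optional TNT value with a flag recording whether the edge was produced by a left-linear expansion, so that the first layer of a chain-rule derivation can skip those edges. The names below are illustrative assumptions and are not the actual definition of `LabelType<TNT>` in the `nfa` crate.

/// Hypothetical stand-in for the label carried by edges of the NFA.
/// The names here are illustrative only and are not taken from the
/// actual `nfa` crate.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ExpansionLabel<T> {
    /// The underlying terminal or non-terminal, if any; `None`
    /// stands for an epsilon edge.
    value: Option<T>,
    /// Whether this edge comes from a left-linear expansion.
    left_linear_expanded: bool,
}

impl<T> ExpansionLabel<T> {
    pub fn new(value: Option<T>, left_linear_expanded: bool) -> Self {
        Self {
            value,
            left_linear_expanded,
        }
    }

    /// The wrapped value, in the spirit of the `get_value` calls in
    /// the diff below.
    pub fn get_value(&self) -> &Option<T> {
        &self.value
    }

    /// Edges that come from left-linear expansions are not followed
    /// in the "first layer" of a chain-rule derivation.
    pub fn usable_in_first_layer(&self) -> bool {
        !self.left_linear_expanded
    }
}

In this sketch a derivation step would check `usable_in_first_layer` before following an edge in the first layer; the real crate expresses the same distinction through `LabelType<TNT>`, whose `get_value` method is what the diff below calls on each label.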
Diffstat (limited to 'chain')
-rw-r--r--  chain/src/atom/default.rs | 140
-rw-r--r--  chain/src/atom/mod.rs     |   4
-rw-r--r--  chain/src/plan.org        |  52
3 files changed, 107 insertions(+), 89 deletions(-)
diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs
index 72989b3..90133f4 100644
--- a/chain/src/atom/default.rs
+++ b/chain/src/atom/default.rs
@@ -4,7 +4,10 @@
use super::*;
use grammar::Grammar;
use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph};
-use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex};
+use nfa::{
+ default::{nfa::DefaultNFA, regex::DefaultRegex},
+ LabelType,
+};
use core::fmt::Display;
use std::collections::BTreeMap as Map;
@@ -35,7 +38,7 @@ type VirtualMap = Map<VirtualNode, usize>;
#[derive(Debug, Clone, Default)]
pub struct DefaultAtom {
grammar: Grammar,
- nfa: DefaultNFA<DOption<TNT>>,
+ nfa: DefaultNFA<LabelType<TNT>>,
// NOTE: This is mostly for printing and debugging
regexp: Vec<DefaultRegex<TNT>>,
virtual_nodes: VirtualMap,
@@ -95,7 +98,7 @@ impl Display for DefaultAtom {
// LabelGraph, in order to implement Nfa.
impl Graph for DefaultAtom {
- type Iter<'b> = <DefaultNFA<DOption<TNT>> as Graph>::Iter<'b>
+ type Iter<'b> = <DefaultNFA<LabelType<TNT>> as Graph>::Iter<'b>
where
Self: 'b;
@@ -130,23 +133,23 @@ impl Graph for DefaultAtom {
}
}
-impl LabelGraph<DOption<TNT>> for DefaultAtom {
- type Iter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::Iter<'b>
+impl LabelGraph<LabelType<TNT>> for DefaultAtom {
+ type Iter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::Iter<'b>
where
Self: 'b;
- type LabelIter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::LabelIter<'b>
+ type LabelIter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::LabelIter<'b>
where
Self: 'b,
DOption<TNT>: 'b;
- type EdgeLabelIter<'a> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::EdgeLabelIter<'a>
+ type EdgeLabelIter<'a> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::EdgeLabelIter<'a>
where
Self: 'a,
DOption<TNT>: 'a;
#[inline]
- fn vertex_label(&self, node_id: usize) -> Result<Option<DOption<TNT>>, GraphError> {
+ fn vertex_label(&self, node_id: usize) -> Result<Option<LabelType<TNT>>, GraphError> {
self.nfa.vertex_label(node_id)
}
@@ -163,8 +166,8 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom {
fn find_children_with_label(
&self,
node_id: usize,
- label: &DOption<TNT>,
- ) -> Result<<Self as LabelGraph<DOption<TNT>>>::Iter<'_>, GraphError> {
+ label: &LabelType<TNT>,
+ ) -> Result<<Self as LabelGraph<LabelType<TNT>>>::Iter<'_>, GraphError> {
self.nfa.find_children_with_label(node_id, label)
}
@@ -177,39 +180,31 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom {
fn has_edge_label(
&self,
node_id: usize,
- label: &DOption<TNT>,
+ label: &LabelType<TNT>,
target: usize,
) -> Result<bool, GraphError> {
self.nfa.has_edge_label(node_id, label, target)
}
}
-impl LabelExtGraph<DOption<TNT>> for DefaultAtom {
+impl LabelExtGraph<LabelType<TNT>> for DefaultAtom {
#[inline]
fn extend(
&mut self,
- edges: impl IntoIterator<Item = (DOption<TNT>, usize)>,
+ edges: impl IntoIterator<Item = (LabelType<TNT>, usize)>,
) -> Result<usize, GraphError> {
self.nfa.extend(edges)
}
}
-impl Nfa<DOption<TNT>> for DefaultAtom {
- #[inline]
- fn remove_epsilon<F>(&mut self, f: F) -> Result<(), nfa::error::Error>
- where
- F: Fn(DOption<TNT>) -> bool,
- {
- self.nfa.remove_epsilon(f)
- }
-
+impl Nfa<LabelType<TNT>> for DefaultAtom {
type FromRegex<S: graph::GraphLabel + std::fmt::Display + Default> = ();
#[inline]
fn to_nfa(
- _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<DOption<TNT>>>],
- _sub_pred: impl Fn(DOption<TNT>) -> Result<nfa::SoC<DOption<TNT>>, nfa::error::Error>,
- _default: Option<DOption<TNT>>,
+ _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<LabelType<TNT>>>],
+ _sub_pred: impl Fn(LabelType<TNT>) -> Result<nfa::SoC<LabelType<TNT>>, nfa::error::Error>,
+ _default: Option<LabelType<TNT>>,
) -> Result<Self::FromRegex<DOption<DOption<TNT>>>, nfa::error::Error> {
// NOTE: We cannot construct an atom from a set of regular
// languages alone. So it is appropriate to panic here, if
@@ -218,13 +213,20 @@ impl Nfa<DOption<TNT>> for DefaultAtom {
}
#[inline]
- fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> {
+ fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), nfa::error::Error> {
self.nfa.remove_dead(reserve)
}
#[inline]
- fn nulling(&mut self, f: impl Fn(DOption<TNT>) -> bool) -> Result<(), nfa::error::Error> {
- self.nfa.nulling(f)
+ fn closure(
+ &mut self,
+ predicate: impl FnMut(LabelType<TNT>) -> bool,
+ remove_after_p: bool,
+ transform: impl FnMut(nfa::TwoEdges<LabelType<TNT>>) -> LabelType<TNT>,
+ remove_predicate: impl FnMut(LabelType<TNT>) -> bool,
+ ) -> Result<(), nfa::error::Error> {
+ self.nfa
+ .closure(predicate, remove_after_p, transform, remove_predicate)
}
}
@@ -237,46 +239,56 @@ impl DefaultAtom {
let mut nfa = grammar.left_closure_to_nfa(&regexp)?;
- use std::collections::HashSet;
+ use std::collections::{HashMap, HashSet};
let accumulators: Vec<usize> = {
let mut result = Vec::with_capacity(regexp.len() + 1);
result.push(0);
for regex in regexp.iter() {
+ // Calling `unwrap` here is safe as `result` is always
+ // non-empty.
result.push(regex.nodes_len() * 2 + result.last().unwrap());
}
- result.into_iter().collect()
+ result
};
let accumulators_set: HashSet<usize> = accumulators.iter().copied().collect();
- nfa.nulling(|label| {
- if let Some(label) = *label {
- match label {
- TNT::Ter(_) => false,
- // Panics if a non-terminal references an invalid node
- // here.
- TNT::Non(n) => grammar.is_nullable(n).unwrap(),
+ let nullables: HashSet<usize> = (0..grammar.non_num())
+ .filter(|n| matches!(grammar.is_nullable(*n), Ok(true)))
+ .collect();
+
+ // Perform nulling and remove_epsilon at the same time.
+ nfa.closure(
+ |label| {
+ if let Some(label) = *label.get_value() {
+ matches!(label, TNT::Non(n) if nullables.contains(&n))
+ } else {
+ true
}
- } else {
- true
- }
- })?;
- nfa.remove_epsilon(|label| label.is_none())?;
+ },
+ true,
+ |two_edges| grammar.transform_label_null_epsilon(two_edges),
+ |label| label.get_value().is_none(),
+ )?;
+
nfa.remove_dead(|node| accumulators_set.contains(&node))?;
- // now add the virtual nodes
+ // Now add the virtual nodes.
let mut virtual_nodes: VirtualMap = Default::default();
let nt_num = grammar.non_num();
assert!(nt_num <= accumulators.len());
- // Convert an error telling us that an index is out of bounds.
- //
- // Panics if the error is not of the expected kind.
+ /// Convert an error telling us that an index is out of bounds.
+ ///
+ /// # Panics
+ ///
+ /// The function panics if the error is not of the expected
+ /// kind.
fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError {
match ge {
GraphError::IndexOutOfBounds(index, bound) => {
@@ -287,24 +299,34 @@ impl DefaultAtom {
}
for nt in 0..nt_num {
- let children: std::collections::HashMap<DOption<TNT>, Vec<_>> = nfa
- // this is safe because of the above assertion.
+ let children: std::collections::HashMap<_, _> = nfa
+ // This is safe because of the above assertion.
.labels_of(*accumulators.get(nt).unwrap())
.map_err(index_out_of_bounds_conversion)?
- .map(|(label, target_iter)| (*label, target_iter.collect()))
+ .map(|(label, target_iter)| (*label, target_iter))
.collect();
- for (label, children_vec) in children.into_iter() {
- if let Some(TNT::Ter(t)) = *label {
- let new_index = nfa
- .extend(children_vec.into_iter().map(|target| (label, target)))
- .map_err(index_out_of_bounds_conversion)?;
+ let mut terminals_map: HashMap<usize, HashSet<(LabelType<TNT>, usize)>> =
+ HashMap::new();
- let virtual_node = VirtualNode::new(nt, t);
-
- virtual_nodes.insert(virtual_node, new_index);
+ for (label, children_iter) in children.into_iter() {
+ if let Some(TNT::Ter(t)) = *label.get_value() {
+ terminals_map
+ .entry(t)
+ .or_insert_with(|| HashSet::with_capacity(children_iter.len()))
+ .extend(children_iter.map(|target| (label, target)));
}
}
+
+ for (t, set) in terminals_map.into_iter() {
+ let new_index = nfa
+ .extend(set.into_iter())
+ .map_err(index_out_of_bounds_conversion)?;
+
+ let virtual_node = VirtualNode::new(nt, t);
+
+ virtual_nodes.insert(virtual_node, new_index);
+ }
}
Ok(Self {
@@ -335,8 +357,6 @@ impl Atom for DefaultAtom {
}
fn empty(&self) -> usize {
- assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2);
-
- self.nfa.nodes_len() - 2
+ self.grammar.total() << 1
}
}
diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs
index 084acca..065640b 100644
--- a/chain/src/atom/mod.rs
+++ b/chain/src/atom/mod.rs
@@ -7,10 +7,10 @@
//! I have better ideas in the future.
use grammar::{Error as GrammarError, TNT};
-use nfa::{DOption, Nfa};
+use nfa::{DOption, LabelType, Nfa};
/// The expected behaviours of an atomic language.
-pub trait Atom: Nfa<DOption<TNT>> {
+pub trait Atom: Nfa<LabelType<TNT>> {
/// Return the index of a node representing the derivative of the
/// left-linear null closure of `nt` with respect to `t`.
fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError>;
diff --git a/chain/src/plan.org b/chain/src/plan.org
index bbd6683..b708413 100644
--- a/chain/src/plan.org
+++ b/chain/src/plan.org
@@ -2,7 +2,7 @@
#+AUTHOR: Durand
#+DATE: <2022-11-18 Ven 19:57>
-* Things to do [5/10]
+* Things to do [6/10]
- [X] Implement builders for graphs
- [X] Find sets of the first terminals for each non-terminal, in the
@@ -38,18 +38,30 @@
finite automata.
* [X] Test the removal of dead states, where a state is dead if
and only if no other states have an edge into that state.
-- [-] Refactor [2/8]
+- [X] Refactor [7/7]
+ [X] Implement a data type for graphs with labels on vertices and
edges, but do not need to index edges by labels, which can index
vertices by labels instead.
* [X] Test it.
+ [X] Implement a builder that borrows a graph mutably.
* [X] Test it.
- + [ ] Implement a data type for graphs in which every node knows its
+ + [X] Implement a data type for graphs in which every node knows its
parents, and every node has a unique label but edges have no
labels.
- * [ ] Test it.
- + [ ] We need to record the expansions of those "virtual nodes".
+ * [X] Test it.
+ + [X] When we pull in some regular language because of the
+ left-linear expansion, we need to mark that branch as coming from
+ left-linear expansions. This is necessary because we cannot
+ follow a left-linearly expanded branch when we take the derivative
+ of a language. We only expand left-linearly when we try to access
+ the atomic languages [s]^{(t)}.
+ + [X] An edge of the NFA should carry a label that can be more
+ informative than solely a terminal or a non-terminal.
+ + [X] Add a mechanism for a grammar to attach labels to edges of NFA
+ which are not necessarily Copy-able, and which should be stored in a
+ separate array, such that the labels on edges of NFA point to the
+ elements of the array.
+ + [X] We need to record the expansions of those "virtual nodes".
That is, the virtual nodes represent atomic languages such as
[s]^{(t)} where s is a non-terminal and t is a terminal. To be more
specific, it represents the derivative of the left-linear closure
@@ -59,32 +71,21 @@
alone, without consuming any inputs, by expanding according to the
grammar rules. This means that we need to know which
non-terminals were expanded in order to get to a state in [s]^{(t)}.
- + [ ] Each edge in the chain-rule machine needs to be labelled also
- with a position in the forest. This perfectly solves the problem
- of those "plugs"!
- + [ ] When we pull in some regular language because of the
- left-linear expansion, we need to mark that branch as coming from
- left-linear expansions. This is necessary because we cannot
- follow a left-linearly expanded branch when we take the derivative
- of a language. We only expand left-linearly when we try to access
- the atomic languages [s]^{(t)}. We can mark by returning a set of
- nodes which are the beginnings of left-linearly expanded branches.
- + [ ] An edge of the NFA should carry a label that can be more
- informative than solely a terminal or a non-terminal.
- + [ ] Add a mechanism for a grammar to attach labels to edges of NFA
- which are not necessarily Copy-able, and which should be stored in a
- separate array, such that the labels on edges of NFA point to the
- elements of the array.
+ * [X] Test
+ * [X] Test more
- [X] Atom [3/3]
+ [X] Define the Atom trait.
+ [X] Implement virtual nodes for each derivative of each atom: The
lack of this step might be the root cause of the failure of the
previous version of this project.
+ [X] Test atoms
-- [-] Implement languages. [1/3]
+- [-] Implement languages. [1/4]
+ [X] First define a trait with the expected behaviours.
+ [ ] Then implement them as doubly labelled graphs.
- + [ ] Thenimplement finding derivatives by use of the chain rule.
+ + [ ] Then implement finding derivatives by use of the chain rule.
+ + [ ] Each edge in the chain-rule machine needs to be labelled also
+ with a position in the forest. This perfectly solves the problem
+ of those "plugs"!
- [-] Implement forests [2/3]
+ [X] Design a format of forests. This should be the most crucial
thing to do, in order to have a better understanding of the whole
@@ -92,10 +93,7 @@
the binarised shared packed parsed forests, that reflects the
regular expressions in the grammar equations.
+ [X] Define a trait with the expected behaviours.
- + [-] Implement them using adjacency map: we only need one label per
- edge, and we do not wish to exclude duplicate edges, and we do not
- need to index edges by the labels. All in all, we implement them
- using a vector of hashmaps.
+ + [-] Implement them as parents-knowing graphs.
- [ ] Implement semiring. [0/5]
+ [ ] Define the trait.
+ [ ] Implement the boolean semiring.