diff options
author | JSDurand <mmemmew@gmail.com> | 2023-01-20 13:48:26 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-01-20 13:48:26 +0800 |
commit | 18d7955b7d84c00467ede38baae53f4ce1fb6908 (patch) | |
tree | 97d0746b82816a21d980636e50f8cdbeb804b518 | |
parent | 8f8d3d1a3c276be4be2e5d2e767ada564c47279a (diff) |
chain: a prototype is added.
I have an ostensibly working prototype now.
Further tests are needed to make sure that the algorithm meets the
time complexity requirement, though.
-rw-r--r-- | ChangeLog | 10 | ||||
-rw-r--r-- | chain/Cargo.toml | 4 | ||||
-rw-r--r-- | chain/src/atom/default.rs | 214 | ||||
-rw-r--r-- | chain/src/atom/mod.rs | 9 | ||||
-rw-r--r-- | chain/src/default.rs | 784 | ||||
-rw-r--r-- | chain/src/lib.rs | 194 | ||||
-rw-r--r-- | chain/src/plan.org | 10 | ||||
-rw-r--r-- | forest/src/default.rs | 78 | ||||
-rw-r--r-- | forest/src/design.org | 3 | ||||
-rw-r--r-- | forest/src/lib.rs | 3 | ||||
-rw-r--r-- | grammar/src/label.rs | 124 | ||||
-rw-r--r-- | grammar/src/left_closure.rs | 6 | ||||
-rw-r--r-- | grammar/src/lib.rs | 83 | ||||
-rw-r--r-- | grammar/src/test_grammar_helper.rs | 128 | ||||
-rw-r--r-- | graph/src/error.rs | 3 | ||||
-rw-r--r-- | graph/src/labelled/binary.rs | 44 | ||||
-rw-r--r-- | graph/src/labelled/double.rs | 14 | ||||
-rw-r--r-- | graph/src/lib.rs | 10 | ||||
-rw-r--r-- | graph_macro/src/lib.rs | 9 | ||||
-rw-r--r-- | nfa/src/default/nfa.rs | 2 | ||||
-rw-r--r-- | nfa/src/default/regex.rs | 141 | ||||
-rw-r--r-- | nfa/src/lib.rs | 4 |
22 files changed, 1770 insertions, 107 deletions
@@ -1,3 +1,13 @@ +2023-01-20 Jean Sévère Durand <durand@jsdurand.xyz> + + * chain: A prototype is added, and passes some tests. But I am + still testing if its performance meets the time complexity + requirement. + +2023-01-13 Jean Sévère Durand <durand@jsdurand.xyz> + + * forest: A prototype is completed, and passes some tests. + 2022-11-15 Jean Sévère Durand <durand@jsdurand.xyz> * nfa: Stop worrying about monadic anamorphisms. diff --git a/chain/Cargo.toml b/chain/Cargo.toml index 265954c..c55586d 100644 --- a/chain/Cargo.toml +++ b/chain/Cargo.toml @@ -5,6 +5,10 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[features] +default = [] +test-print-viz = [] + [dependencies] nfa = { path = "../nfa" } graph = { path = "../graph" } diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs index 90133f4..0dc24c3 100644 --- a/chain/src/atom/default.rs +++ b/chain/src/atom/default.rs @@ -6,7 +6,7 @@ use grammar::Grammar; use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; use nfa::{ default::{nfa::DefaultNFA, regex::DefaultRegex}, - LabelType, + LabelType, NfaLabel, }; use core::fmt::Display; @@ -39,11 +39,55 @@ type VirtualMap = Map<VirtualNode, usize>; pub struct DefaultAtom { grammar: Grammar, nfa: DefaultNFA<LabelType<TNT>>, + accepting_vec: Vec<bool>, // NOTE: This is mostly for printing and debugging regexp: Vec<DefaultRegex<TNT>>, virtual_nodes: VirtualMap, } +impl DefaultAtom { + /// Return the string description of a rule position. + pub fn rule_pos_string(&self, pos: usize) -> Result<String, Box<dyn std::error::Error>> { + let rule_num = self.grammar.get_rule_num(pos)?; + + assert!(rule_num < self.grammar.non_num()); + + let display_tnt = |tnt| self.name_of_tnt(tnt).unwrap_or_else(|e| format!("{e}")); + + Ok(self.regexp.get(rule_num).unwrap().to_string_with_dot( + display_tnt, + if rule_num == 0 { + pos + } else { + pos - self.grammar.nth_accumulator(rule_num - 1)? + }, + )?) + } + + /// Print the underlying NFA. + pub fn print_nfa<S: AsRef<str>>(&self, filename: S) -> Result<(), std::io::Error> { + self.nfa.print_viz(filename.as_ref())?; + + let nullables: Vec<_> = self + .accepting_vec + .iter() + .enumerate() + .filter_map(|(index, pred)| if *pred { Some(index) } else { None }) + .collect(); + + if !nullables.is_empty() { + println!("nullables: {nullables:?}"); + } + + println!("printing virtual nodes:"); + for (vn, node) in self.virtual_nodes.iter() { + println!("[{}]^{{({})}}: {}", vn.s, vn.t, node); + } + + Ok(()) + } +} + impl Display for DefaultAtom { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let grammar = &self.grammar; @@ -260,15 +304,67 @@ impl DefaultAtom { .filter(|n| matches!(grammar.is_nullable(*n), Ok(true))) .collect(); + // Now record accepting information. + + let nfa_len = nfa.nodes_len(); + + let label_is_nullable = |label: NfaLabel<DOption<TNT>>| { + if let Some(label) = *label.get_value() { + matches!(label, TNT::Non(n) if nullables.contains(&n)) + } else { + true + } + }; + + let mut accepting_vec: Vec<bool> = std::iter::repeat(false).take(nfa_len).collect(); + + for nfa_start in accumulators.iter().copied().take(regexp.len()) { + *accepting_vec.get_mut(nfa_start + 1).unwrap() = true; + } + + // The last is always accepting. + *accepting_vec.get_mut(nfa_len - 1).unwrap() = true; + + let mut updated = true; + + while updated { + updated = false; + + for node in nfa.nodes() { + // skip those that do not need to be updated + if *accepting_vec + .get(node) + .ok_or(GrammarError::IndexOutOfBounds(node, nfa_len))? + { + continue; + } + + 'label_loop: for (label, target_iter) in nfa + .labels_of(node) + .map_err(|_| GrammarError::IndexOutOfBounds(node, nfa_len))? + { + if label_is_nullable(*label) { + for target in target_iter { + if *accepting_vec + .get(target) + .ok_or(GrammarError::IndexOutOfBounds(target, nfa_len))? + { + // accepting_vec[node] must have been + // false, as we checked above + *accepting_vec.get_mut(node).unwrap() = true; + updated = true; + + break 'label_loop; + } + } + } + } + } + } + // Perform nulling and remove_epsilon at the same time. nfa.closure( - |label| { - if let Some(label) = *label.get_value() { - matches!(label, TNT::Non(n) if nullables.contains(&n)) - } else { - true - } - }, + label_is_nullable, true, |two_edges| grammar.transform_label_null_epsilon(two_edges), |label| label.get_value().is_none(), @@ -298,6 +394,8 @@ impl DefaultAtom { } } + // dbg!(&accumulators); + for nt in 0..nt_num { let children: std::collections::HashMap<_, _> = nfa // This is safe because of the above assertion. @@ -306,23 +404,92 @@ impl DefaultAtom { .map(|(label, target_iter)| (*label, target_iter)) .collect(); - let mut terminals_map: HashMap<usize, HashSet<(LabelType<TNT>, usize)>> = + let mut terminals_map: HashMap<usize, (HashSet<(LabelType<TNT>, usize)>, bool)> = HashMap::new(); for (label, children_iter) in children.into_iter() { if let Some(TNT::Ter(t)) = *label.get_value() { - terminals_map - .entry(t) - .or_insert_with(|| HashSet::with_capacity(children_iter.len())) - .extend(children_iter.map(|target| (label, target))); + let estimated_len = { + let mut result = 0; + + for child in children_iter.clone() { + result += nfa.degree(child).map_err(index_out_of_bounds_conversion)?; + } + + result + }; + + let mut accepting = false; + + for child in children_iter { + accepting = accepting + || *accepting_vec.get(child).ok_or_else(|| { + GrammarError::IndexOutOfBounds(child, accepting_vec.len()) + })?; + + if nt == 3 + && nfa.degree(child).map_err(index_out_of_bounds_conversion)? == 0 + { + println!("accepting = {accepting}"); + } + + if let Some((_, old_accepting)) = terminals_map.get_mut(&t) { + *old_accepting = *old_accepting || accepting; + } else { + terminals_map + .insert(t, (HashSet::with_capacity(estimated_len), accepting)); + } + + for (child_label, child_children_iter) in nfa + .labels_of(child) + .map_err(index_out_of_bounds_conversion)? + { + // We checked this is safe above. + let (set, _) = terminals_map.get_mut(&t).unwrap(); + + set.extend(child_children_iter.map(|target| (*child_label, target))); + } + } } } - for (t, set) in terminals_map.into_iter() { + if nt == 3 { + println!("map = {terminals_map:?}"); + } + + for (t, (set, accepting)) in terminals_map.into_iter() { let new_index = nfa .extend(set.into_iter()) .map_err(index_out_of_bounds_conversion)?; + if accepting_vec.get(new_index).is_none() { + #[cfg(debug_assertions)] + assert_eq!(new_index, accepting_vec.len()); + + // let mut updated = false; + // let nfa_len = nfa.nodes_len(); + + // 'label_loop: for (label, target_iter) in nfa + // .labels_of(new_index) + // .map_err(|_| GrammarError::IndexOutOfBounds(new_index, nfa_len))? + // { + // if label_is_nullable(*label) { + // for target in target_iter { + // if *accepting_vec + // .get(target) + // .ok_or(GrammarError::IndexOutOfBounds(target, nfa_len))? + // { + // updated = true; + + // break 'label_loop; + // } + // } + // } + // } + + accepting_vec.push(accepting); + } + let virtual_node = VirtualNode::new(nt, t); virtual_nodes.insert(virtual_node, new_index); @@ -334,6 +501,7 @@ impl DefaultAtom { nfa, regexp, virtual_nodes, + accepting_vec, }) } } @@ -343,6 +511,14 @@ fn query(map: &VirtualMap, nt: usize, t: usize) -> Option<usize> { map.get(&VirtualNode::new(nt, t)).copied() } +impl std::ops::Deref for DefaultAtom { + type Target = Grammar; + + fn deref(&self) -> &Self::Target { + &self.grammar + } +} + impl Atom for DefaultAtom { fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError> { if nt >= self.grammar.non_num() { @@ -359,4 +535,14 @@ impl Atom for DefaultAtom { fn empty(&self) -> usize { self.grammar.total() << 1 } + + fn is_accepting(&self, node_id: usize) -> Result<bool, GrammarError> { + self.accepting_vec + .get(node_id) + .copied() + .ok_or(GrammarError::IndexOutOfBounds( + node_id, + self.accepting_vec.len(), + )) + } } diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs index 065640b..398edd2 100644 --- a/chain/src/atom/mod.rs +++ b/chain/src/atom/mod.rs @@ -6,17 +6,22 @@ //! Because this way I can easily substitute other implementations if //! I have better ideas in the future. -use grammar::{Error as GrammarError, TNT}; +use grammar::{Error as GrammarError, Grammar, TNT}; use nfa::{DOption, LabelType, Nfa}; +use std::ops::Deref; + /// The expected behaviours of an atomic language. -pub trait Atom: Nfa<LabelType<TNT>> { +pub trait Atom: Nfa<LabelType<TNT>> + Deref<Target = Grammar> { /// Return the index of a node representing the derivative of the /// left-linear null closure of `nt` with respect to `t`. fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError>; /// Return the index of the empty state. fn empty(&self) -> usize; + + /// Tell whether a node is accepting. + fn is_accepting(&self, node_id: usize) -> Result<bool, GrammarError>; } pub mod default; diff --git a/chain/src/default.rs b/chain/src/default.rs index e04be9f..697b997 100644 --- a/chain/src/default.rs +++ b/chain/src/default.rs @@ -6,18 +6,47 @@ //! modular design makes that easy. use super::*; +use crate::atom::{Atom, DefaultAtom}; use core::fmt::Display; +use forest::{default::DefaultForest, Forest}; +use grammar::{Error as GrammarError, GrammarLabel, GrammarLabelType, TNT}; +#[allow(unused_imports)] +use graph::{ + labelled::DLGBuilder, Builder, DLGraph, Graph, LabelExtGraph, LabelGraph, ParentsGraph, +}; + +use std::collections::{HashMap as Map, TryReserveError}; /// The errors related to taking derivatives by chain rule. +#[non_exhaustive] #[derive(Debug)] pub enum Error { + /// General error for indices out of bounds. + IndexOutOfBounds(usize, usize), + /// The forest encounters a duplicate node, for some reason. + DuplicateNode(usize), + /// The chain rule machine encounters a duplicate edge, for some + /// reason. + DuplicateEdge(usize, usize), + /// A node has no labels while it is required to have one. + NodeNoLabel(usize), + /// Reserving memory fails. + CannotReserve(TryReserveError), /// An invalid situation happens. Invalid, } impl Display for Error { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { + Self::IndexOutOfBounds(index, bound) => write!(f, "index {index} out of bound {bound}"), + Self::DuplicateNode(n) => write!(f, "the forest has a node {n} with a duplicate label"), + Self::DuplicateEdge(source, target) => write!( + f, + "the forest has a duplicate edge from {source} to {target}" + ), + Self::NodeNoLabel(n) => write!(f, "node {n} has no labels while it should have one"), + Self::CannotReserve(e) => write!(f, "cannot reserve memory: {e}"), Self::Invalid => write!(f, "invalid"), } } @@ -25,22 +54,761 @@ impl Display for Error { impl std::error::Error for Error {} +impl From<GError> for Error { + fn from(value: GError) -> Self { + match value { + GError::IndexOutOfBounds(index, bound) => Self::IndexOutOfBounds(index, bound), + GError::DuplicatedNode(n) => Self::DuplicateNode(n), + GError::DuplicatedEdge(source, target) => Self::DuplicateEdge(source, target), + _ => Self::Invalid, + } + } +} + +impl From<ForestError> for Error { + fn from(e: ForestError) -> Self { + match e { + ForestError::IndexOutOfBounds(index, bound) => Error::IndexOutOfBounds(index, bound), + ForestError::DuplicatedNode(n) => Error::DuplicateNode(n), + ForestError::InvalidGraphError(ge) => ge.into(), + ForestError::NodeNoLabel(n) => Error::NodeNoLabel(n), + } + } +} + +impl From<TryReserveError> for Error { + fn from(value: TryReserveError) -> Self { + Self::CannotReserve(value) + } +} + +/// The type of an index into an element in [`DerIter`]. +#[derive(Debug, Copy, Clone)] +enum DerIterIndex { + Single(usize), + Map(usize), +} + +impl Default for DerIterIndex { + fn default() -> Self { + Self::Map(0) + } +} + +/// A complex type used for storing values of edges with two layers. +type SecondTypeValue = (Parent, bool, Vec<(Edge, usize)>); + +/// An iterator of TwoLayers. +#[derive(Debug, Default)] +pub struct DerIter { + /// Stores edges of only one layer. + singles: Vec<(Edge, usize)>, + /// Stores edges with two layers. They are grouped by their + /// labels of the second layer. + /// + /// The values are tuples (forest_source, accepting, edges), where + /// the edges are the grouped edges of the first layer and the + /// destination. + seconds: Map<usize, SecondTypeValue>, + /// We want to iterate the elements of the map, for which purpose + /// we need an array. Since hashmaps provide no arrays, we keep + /// an array of keys for iteration purposes. + second_array: Vec<usize>, + /// The index of the current element, either in `second_array` or + /// in `singles` . + index: DerIterIndex, +} + +impl DerIter { + fn add_second_layer( + &mut self, + label: usize, + forest_source: Parent, + accepting: bool, + edges: Vec<(Edge, usize)>, + ) { + if let Some((_, _, vec)) = self.seconds.get_mut(&label) { + vec.extend(edges); + } else { + self.seconds + .insert(label, (forest_source, accepting, edges)); + + self.second_array.push(label); + } + } +} + +impl Iterator for DerIter { + type Item = TwoLayers; + + fn next(&mut self) -> Option<Self::Item> { + // We iterate through two layered edges first. + match self.index { + DerIterIndex::Map(index) => { + if let Some(key) = self.second_array.get(index) { + if let Some((forest_source, accepting, edges)) = self.seconds.remove(key) { + self.index = DerIterIndex::Map(index + 1); + + Some(TwoLayers::Two(*key, forest_source, accepting, edges)) + } else { + // this should not happen + println!("a key does not exist in the hashmap: something is wrong when taking derivatives"); + None + } + } else { + self.index = DerIterIndex::Single(0); + + if let Some((edge, to)) = self.singles.first() { + self.index = DerIterIndex::Single(1); + + Some(TwoLayers::One(*edge, *to)) + } else { + None + } + } + } + DerIterIndex::Single(index) => { + if let Some((edge, to)) = self.singles.get(index) { + self.index = DerIterIndex::Single(index + 1); + + Some(TwoLayers::One(*edge, *to)) + } else { + None + } + } + } + } +} + /// A default implementation for the [`Chain`] trait. #[derive(Debug, Clone, Default)] -pub struct DefaultChain {} +pub struct DefaultChain { + graph: DLGraph<Edge>, + atom: DefaultAtom, + current: usize, + history: Vec<usize>, + forest: DefaultForest<GrammarLabel>, + accepting_vec: Vec<bool>, +} + +impl DefaultChain { + /// Return the current node. + #[inline] + pub fn current(&self) -> usize { + self.current + } + + /// Return the complete slice of histories. + #[inline] + pub fn history(&self) -> &[usize] { + self.history.as_ref() + } + + /// Return a reference to the associated forest. + #[inline] + pub fn forest(&self) -> &DefaultForest<GrammarLabel> { + &self.forest + } + + /// Print the rule positions of the labels. + pub fn print_rule_positions(&self) -> Result<(), Box<dyn std::error::Error>> { + let mut labels = std::collections::HashSet::<usize>::default(); + + for node in 0..self.graph.nodes_len() { + labels.extend(self.graph.labels_of(node)?.map(|(label, _)| label.label)); + } + + for label in labels.into_iter() { + println!("{}", self.atom.rule_pos_string(label)?); + } + + Ok(()) + } +} + +impl Graph for DefaultChain { + type Iter<'a> = <DLGraph<Edge> as Graph>::Iter<'a> + where + Self: 'a; + + #[inline] + fn is_empty(&self) -> bool { + self.graph.is_empty() + } + + #[inline] + fn nodes_len(&self) -> usize { + self.graph.nodes_len() + } + + #[inline] + fn edges_len(&self) -> Option<usize> { + self.graph.edges_len() + } + + #[inline] + fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GError> { + self.graph.children_of(node_id) + } + + #[inline] + fn degree(&self, node_id: usize) -> Result<usize, GError> { + self.graph.degree(node_id) + } + + #[inline] + fn is_empty_node(&self, node_id: usize) -> Result<bool, GError> { + self.graph.is_empty_node(node_id) + } + + #[inline] + fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> { + self.graph.has_edge(source, target) + } + + fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) { + unimplemented!("I shall refactor this") + } +} + +impl LabelGraph<Edge> for DefaultChain { + type Iter<'a> = <DLGraph<Edge> as LabelGraph<Edge>>::Iter<'a> + where + Self: 'a; + + type LabelIter<'a> = <DLGraph<Edge> as LabelGraph<Edge>>::LabelIter<'a> + where + Self: 'a, + Edge: 'a; + + type EdgeLabelIter<'a> = <DLGraph<Edge> as LabelGraph<Edge>>::EdgeLabelIter<'a> + where + Self: 'a, + Edge: 'a; + + #[inline] + fn edge_label(&self, source: usize, target: usize) -> Result<Self::EdgeLabelIter<'_>, GError> { + self.graph.edge_label(source, target) + } + + #[inline] + fn find_children_with_label( + &self, + node_id: usize, + label: &Edge, + ) -> Result<<Self as LabelGraph<Edge>>::Iter<'_>, GError> { + self.graph.find_children_with_label(node_id, label) + } + + #[inline] + fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GError> { + self.graph.labels_of(node_id) + } + + #[inline] + fn has_edge_label(&self, node_id: usize, label: &Edge, target: usize) -> Result<bool, GError> { + self.graph.has_edge_label(node_id, label, target) + } +} + +impl LabelExtGraph<Edge> for DefaultChain { + #[inline] + fn extend(&mut self, edges: impl IntoIterator<Item = (Edge, usize)>) -> Result<usize, GError> { + let new = self.graph.extend(edges)?; + let accepting_len = self.accepting_vec.len(); + + if self.accepting_vec.get(new).is_none() { + // assert it can only grow by one node at a time. + #[cfg(debug_assertions)] + assert_eq!(new, accepting_len); + + let mut updated = false; + + for (label, child_iter) in self.graph.labels_of(new)? { + let old_accepting = { + let mut result = false; + for child in child_iter { + if *self + .accepting_vec + .get(child) + .ok_or(GError::IndexOutOfBounds(child, accepting_len))? + { + result = true; + break; + } + } + + result + }; + + if !old_accepting { + self.accepting_vec.push(false); + updated = true; + + break; + } + + if label.is_accepting() { + self.accepting_vec.push(true); + updated = true; + + break; + } + } + + if !updated { + self.accepting_vec.push(false); + } + } + + Ok(new) + } +} impl Chain for DefaultChain { type Error = Error; - fn unit() -> Self { - todo!() + type Atom = DefaultAtom; + + fn unit(atom: Self::Atom) -> Result<Self, Self::Error> { + let mut builder: DLGBuilder<Edge> = Default::default(); + + let root = builder.add_vertex(); + let first = builder.add_vertex(); + + let empty_state = atom.empty(); + + let initial_nullable = atom + .is_nullable(0) + .map_err(|_| Error::IndexOutOfBounds(0, atom.non_num()))?; + + builder.add_edge( + first, + root, + Edge::new(empty_state, Parent::new(0, 0), initial_nullable), + )?; + + let graph = builder.build(); + + let forest = + DefaultForest::new_leaf(GrammarLabel::new(GrammarLabelType::TNT(TNT::Non(0)), 0)); + + #[cfg(debug_assertions)] + assert_eq!(forest.root(), Some(0)); + + let current = 1; + + let history = Vec::new(); + + let accepting_vec = vec![true, initial_nullable]; + + Ok(Self { + graph, + atom, + current, + history, + forest, + accepting_vec, + }) + } + + fn epsilon(&self) -> Result<bool, Self::Error> { + self.accepting_vec + .get(self.current) + .copied() + .ok_or(Error::IndexOutOfBounds( + self.current, + self.accepting_vec.len(), + )) + } + + fn update_history(&mut self, new: usize) { + debug_assert!(new < self.graph.nodes_len()); + + self.history.push(self.current); + + self.current = new; + } + + type DerIter = DerIter; + + fn derive(&mut self, t: usize, pos: usize) -> Result<Self::DerIter, Self::Error> { + use TNT::*; + + /// Convert an error telling us that an index is out of bounds. + /// + /// # Panics + /// + /// The function panics if the error is not of the expected + /// kind. + fn index_out_of_bounds_conversion(ge: GrammarError) -> Error { + match ge { + GrammarError::IndexOutOfBounds(index, bound) => { + Error::IndexOutOfBounds(index, bound) + } + _ => panic!("wrong error kind"), + } + } + + /// A helper function to generate edges to join. + /// + /// It first checks if the base edge is accepting. If yes, + /// then pull in the children of the target. + /// + /// Then check if the label of the base edge has children. If + /// no, then do not add this base edge itself. + /// + /// The generated edges will be pushed to `output` directly, + /// to save some allocations. + // TODO: Handle forests as well. + fn generate_edges( + chain: &DefaultChain, + child_iter: impl Iterator<Item = usize> + ExactSizeIterator + Clone, + atom_child_iter: impl Iterator<Item = usize> + Clone, + mut output: impl AsMut<Vec<(Edge, usize)>>, + ) -> Result<(), Error> { + // First check the values from iterators are all valid. + let graph_len = chain.graph.nodes_len(); + let atom_len = chain.atom.nodes_len(); + + for child in child_iter.clone() { + if !chain.graph.has_node(child) { + return Err(Error::IndexOutOfBounds(child, graph_len)); + } + } + + for atom_child in atom_child_iter.clone() { + if !chain.atom.has_node(atom_child) { + return Err(Error::IndexOutOfBounds(atom_child, atom_len)); + } + } + + // From now on the nodes are all valid, so we can just + // call `unwrap`. + + // Then calculate the number of edges to append, to avoid + // repeated allocations + let mut num = 0usize; + + let child_iter_total_degree = child_iter + .clone() + .map(|child| chain.graph.degree(child).unwrap()) + .sum::<usize>(); + + for atom_child in atom_child_iter.clone() { + let atom_child_accepting = chain.atom.is_accepting(atom_child).unwrap(); + let atom_child_empty_node = chain.atom.is_empty_node(atom_child).unwrap(); + + if !atom_child_empty_node { + num += child_iter.len(); + } + + if atom_child_accepting { + num += child_iter_total_degree; + } + } + + let num = num; + + let output = output.as_mut(); + + output.try_reserve(num)?; + + // now push into output + + let parent = Parent::new(0, 0); + + for atom_child in atom_child_iter.clone() { + let atom_child_accepting = chain.atom.is_accepting(atom_child).unwrap(); + let atom_child_empty_node = chain.atom.is_empty_node(atom_child).unwrap(); + + let edge = Edge::new(atom_child, parent, atom_child_accepting); + + if !atom_child_empty_node { + output.extend(child_iter.clone().map(|child| (edge, child))); + } + + if atom_child_accepting { + for child in child_iter.clone() { + for (child_label, child_child) in chain.graph.labels_of(child).unwrap() { + output.extend(child_child.map(|target| (*child_label, target))); + } + } + } + } + + Ok(()) + } + + let mut der_iter = DerIter::default(); + + for (label, child_iter) in self.graph.labels_of(self.current)? { + for (atom_label, atom_child_iter) in self.atom.labels_of(label.label())? { + if atom_label.is_left_p() { + // We do not consider left-linearly expanded + // children in the first layer. + continue; + } + + match *atom_label.get_value() { + Some(Ter(ter)) if ter == t => { + generate_edges( + self, + child_iter.clone(), + atom_child_iter.clone(), + &mut der_iter.singles, + )?; + } + Some(Non(non)) => { + let virtual_node = self + .atom + .atom(non, t) + .map_err(index_out_of_bounds_conversion)?; + + if let Some(virtual_node) = virtual_node { + let accepting = self + .atom + .is_accepting(virtual_node) + .map_err(index_out_of_bounds_conversion)?; + + let mut new_edges = Vec::new(); + + generate_edges( + self, + child_iter.clone(), + atom_child_iter.clone(), + &mut new_edges, + )?; + + if accepting { + der_iter.singles.extend(new_edges.clone()); + } + + let parent = Parent::new(0, 0); + + if !self.atom.is_empty_node(virtual_node).unwrap() { + der_iter.add_second_layer( + virtual_node, + parent, + accepting, + new_edges, + ); + + // account for atom_children without + // children. + + for atom_child in atom_child_iter { + // this has been checked in + // `generate_edges` + if self.atom.is_empty_node(atom_child).unwrap() { + der_iter.singles.extend(child_iter.clone().map(|child| { + (Edge::new(virtual_node, parent, accepting), child) + })); + } + } + } else { + for atom_child in atom_child_iter { + // this has been checked in + // `generate_edges` + if self.atom.is_empty_node(atom_child).unwrap() { + // flat flat map, hmm... + der_iter.singles.extend(child_iter.clone().flat_map( + |child| { + self.graph.labels_of(child).unwrap().flat_map( + |(child_label, child_child_iter)| { + child_child_iter.map(|child_child| { + (*child_label, child_child) + }) + }, + ) + }, + )); + } + } + } + } + } + _ => { + continue; + } + } + } + } + + Ok(der_iter) + } +} + +#[cfg(test)] +mod test_der_iter { + use super::*; + + #[test] + fn test() -> Result<(), Box<dyn std::error::Error>> { + let mut der_iter = DerIter::default(); + + let parent = Parent::new(0, 0); + + der_iter.singles.push((Edge::new(0, parent, true), 0)); + + der_iter.singles.push((Edge::new(1, parent, true), 0)); + + der_iter.singles.push((Edge::new(2, parent, true), 0)); + + der_iter.add_second_layer(3, parent, true, vec![(Edge::new(4, parent, true), 1)]); + + der_iter.add_second_layer(6, parent, true, vec![(Edge::new(5, parent, true), 1)]); + + // add an entry with a repeated label + der_iter.add_second_layer(3, parent, true, vec![(Edge::new(7, parent, true), 2)]); + + assert_eq!( + der_iter.next(), + Some(TwoLayers::Two( + 3, + parent, + true, + vec![ + (Edge::new(4, parent, true), 1), + (Edge::new(7, parent, true), 2) + ] + )) + ); + + assert_eq!( + der_iter.next(), + Some(TwoLayers::Two( + 6, + parent, + true, + vec![(Edge::new(5, parent, true), 1)] + )) + ); + + assert_eq!( + der_iter.next(), + Some(TwoLayers::One(Edge::new(0, parent, true), 0)) + ); + + assert_eq!( + der_iter.next(), + Some(TwoLayers::One(Edge::new(1, parent, true), 0)) + ); + + assert_eq!( + der_iter.next(), + Some(TwoLayers::One(Edge::new(2, parent, true), 0)) + ); + + assert_eq!(der_iter.next(), None); + assert_eq!(der_iter.next(), None); + + Ok(()) } +} + +#[cfg(test)] +mod test_chain { + use super::*; + use grammar::test_grammar_helper::*; + + #[test] + fn base_test() -> Result<(), Box<dyn std::error::Error>> { + let grammar = new_notes_grammar()?; + + let atom = DefaultAtom::from_grammar(grammar)?; + + let mut chain = DefaultChain::unit(atom)?; + + chain.chain(3, 00)?; + chain.chain(1, 01)?; + chain.chain(2, 02)?; + chain.chain(2, 03)?; + chain.chain(2, 04)?; + chain.chain(0, 05)?; + chain.chain(5, 06)?; + chain.chain(1, 07)?; + chain.chain(6, 08)?; + chain.chain(6, 09)?; + chain.chain(6, 10)?; + chain.chain(0, 11)?; + chain.chain(0, 12)?; + + assert!(matches!(chain.epsilon(), Ok(true))); + + #[cfg(feature = "test-print-viz")] + { + chain.graph.print_viz("chain.gv")?; + chain.atom.print_nfa("nfa.gv")?; + } - fn chain(&mut self, _t: usize) { - todo!() + Ok(()) } - fn epsilon(&self) -> bool { - todo!() + #[test] + fn test_speed() -> Result<(), Box<dyn std::error::Error>> { + let grammar = new_notes_grammar_no_regexp()?; + + println!("grammar: {grammar}"); + + let atom = DefaultAtom::from_grammar(grammar)?; + + let mut chain = DefaultChain::unit(atom)?; + + let input_template = vec![3, 1, 2, 2, 2, 0, 5, 1, 6, 6, 6, 0, 0]; + + let repeat_times = { + let mut result = 1; + + for arg in std::env::args() { + let parse_as_digit: Result<usize, _> = arg.parse(); + + if let Ok(parse_result) = parse_as_digit { + result = parse_result; + + break; + } + } + + result + }; + + println!("repeating {repeat_times} times"); + + let input = { + let mut result = Vec::with_capacity(input_template.len() * repeat_times); + + for _ in 0..repeat_times { + result.extend(input_template.iter().copied()); + } + + result + }; + + let start = std::time::Instant::now(); + + for (index, t) in input.iter().copied().enumerate() { + chain.chain(t, index)?; + } + + let elapsed = start.elapsed(); + + // assert!(matches!(chain.epsilon(), Ok(true))); + + dbg!(elapsed); + dbg!(chain.current()); + + println!("index: terminal, history"); + for (index, t) in input.iter().copied().enumerate().take(input.len() - 1) { + println!("{index}: {t}, {}", chain.history().get(index).unwrap()); + } + + #[cfg(feature = "test-print-viz")] + { + chain.graph.print_viz("chain.gv")?; + chain.atom.print_nfa("nfa.gv")?; + } + + Ok(()) } } diff --git a/chain/src/lib.rs b/chain/src/lib.rs index 4e21e1d..91c37f7 100644 --- a/chain/src/lib.rs +++ b/chain/src/lib.rs @@ -10,41 +10,189 @@ pub mod atom; -// TODO: Define errors. +use graph::{error::Error as GError, LabelExtGraph, Parent}; + +use forest::Error as ForestError; + +/// An edge in the Chain-Rule machine. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct Edge { + /// The position in the atomic languages. + label: usize, + /// The source of the associated forest edge. + forest_source: Parent, + /// Whether or not this edge is "accepting". + accepting: bool, +} + +impl Edge { + /// Construct a new edge. + pub fn new(label: usize, forest_source: Parent, accepting: bool) -> Self { + Self { + label, + forest_source, + accepting, + } + } + + /// Return the label of the edge. + pub fn label(&self) -> usize { + self.label + } + + /// Tell whether or not the edge is accepting. + pub fn is_accepting(&self) -> bool { + self.accepting + } + + /// Return the associated forest edge of the edge. + pub fn forest_source(&self) -> Parent { + self.forest_source + } +} + +impl core::fmt::Display for Edge { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let label = self.label(); + let forest_source = self.forest_source(); + + write!( + f, + "edge label {label} with forest source {} and edge index {}", + forest_source.node(), + forest_source.edge() + ) + } +} + +/// Each derivation is a concatenation of two items, so there are two +/// layers. But some items have no children and are accepting, in +/// which case we just skip that item completely, for performance +/// reasons, and hence there could be only one layer as well. +/// +/// It might even happen that both layers have no children, in which +/// case we shall just put all previous edges here. +#[derive(Debug, Clone, Eq, PartialEq)] +pub enum TwoLayers { + /// One layer has no children. + One(Edge, usize), + // REVIEW: Maybe we do not need to store an edge in the forest: we + // only need the source of the edge? + /// Both layers have children. + /// + /// The first element is the label of the second layer, the second + /// the source of the associated forest edge of the second layer, + /// and the third is a list of edges, which are the common first + /// layers. + Two(usize, Parent, bool, Vec<(Edge, usize)>), +} + +/// The type of a collapsed iterator. +pub struct Collapsed<'a, I, C> +where + I: Iterator<Item = TwoLayers>, + C: Chain<DerIter = I>, +{ + iter: I, + chain: &'a mut C, + stop: bool, +} + +impl<'a, I, C> Collapsed<'a, I, C> +where + I: Iterator<Item = TwoLayers>, + C: Chain<DerIter = I>, +{ + /// Collapse an iterator. + pub fn collapse(iter: I, chain: &'a mut C) -> Self { + let stop = false; + + Self { iter, chain, stop } + } +} + +impl<'a, I, C> Iterator for Collapsed<'a, I, C> +where + I: Iterator<Item = TwoLayers>, + C: Chain<DerIter = I>, +{ + type Item = Result<(Edge, usize), <C as Chain>::Error>; + + fn next(&mut self) -> Option<Self::Item> { + if self.stop { + return None; + } + + if let Some(two_layer) = self.iter.next() { + match two_layer { + TwoLayers::One(edge, to) => Some(Ok((edge, to))), + TwoLayers::Two(label, forest_source, accepting, edges) => { + let new_index = match self.chain.extend(edges.into_iter()) { + Ok(new) => new, + Err(error) => { + // Prevent further iterations. + + return Some(Err(error.into())); + } + }; + + Some(Ok((Edge::new(label, forest_source, accepting), new_index))) + } + } + } else { + None + } + } +} /// The expected behaviours of a language which can take derivatives /// by chain rule. -pub trait Chain: Default { +pub trait Chain: LabelExtGraph<Edge> { /// The implementations should choose a type to represent errors. - type Error: std::error::Error; + type Error: std::error::Error + From<GError> + From<ForestError>; + + /// A type of atomic languages that is chosen for this chain rule + /// machine. + type Atom: atom::Atom; /// Represents the language that is present after we parse the /// empty string, that is the initial configuration of the /// language. This may or may not be different from what /// `Default::default` gives. - fn unit() -> Self; - - /// Take the derivative by a terminal symbol. - /// - /// This takes care of the union and the prepending operations. - /// - /// # A little remark about the design - /// - /// I have thought to separate different operations (like the - /// union, the prepending, and single derivatives) and then define - /// a function to put everything together. But I think that - /// design is not convenient to use. Also, I really do not need - /// those operations other than to provide this derivative - /// operation, so why define them? And putting all things - /// together may reduce the number of bugs caused by wrong uses of - /// those component functions, and can reduce the amount of - /// documentation strings a library user needs to read, in order - /// to make use of this trait. So I ended up with this design. - fn chain(&mut self, t: usize); + fn unit(atom: Self::Atom) -> Result<Self, Self::Error>; /// Return true if and only if the language contains the empty /// string. - fn epsilon(&self) -> bool; + fn epsilon(&self) -> Result<bool, Self::Error>; + + /// Update the history + fn update_history(&mut self, new: usize); + + /// An iterator that iterates all layers that need to be merged. + type DerIter: Iterator<Item = TwoLayers>; + + /// Take the derivative by a terminal symbol at position `POS`. + fn derive(&mut self, t: usize, pos: usize) -> Result<Self::DerIter, Self::Error>; + + /// Take the union of all derivatives. + fn union(&mut self, der_iter: Self::DerIter) -> Result<Vec<(Edge, usize)>, Self::Error> { + // REVIEW: Find a way to avoid allocations. + Collapsed::<_, Self>::collapse(der_iter, self).collect() + } + + /// Use chain rule to compute the derivative with respect to a + /// terminal. + fn chain(&mut self, t: usize, pos: usize) -> Result<(), Self::Error> { + let der_iter = self.derive(t, pos)?; + + let edges = self.union(der_iter)?; + + let new_index = self.extend(edges.into_iter())?; + + self.update_history(new_index); + + Ok(()) + } } pub mod default; diff --git a/chain/src/plan.org b/chain/src/plan.org index 84192f0..c2a9037 100644 --- a/chain/src/plan.org +++ b/chain/src/plan.org @@ -88,13 +88,15 @@ + [X] Define a trait with the expected behaviours. + [X] Implement them as parents-knowing graphs. + [X] Test -- [-] Implement languages. [1/4] +- [-] Implement languages. [4/6] + [X] First define a trait with the expected behaviours. - + [ ] Then implement them as doubly labelled graphs. - + [ ] Then implement finding derivatives by use of the chain rule. - + [ ] Each edge in the chain-rule machine needs to be labelled also + + [X] Then implement them as doubly labelled graphs. + + [X] Each edge in the chain-rule machine needs to be labelled also with a position in the forest. This perfectly solves the problem of those "plugs"! + + [X] Then implement finding derivatives by use of the chain rule. + + [ ] Handle Forests + + [-] Tests - [ ] Implement semiring. [0/5] + [ ] Define the trait. + [ ] Implement the boolean semiring. diff --git a/forest/src/default.rs b/forest/src/default.rs index d79c1c7..9295f1a 100644 --- a/forest/src/default.rs +++ b/forest/src/default.rs @@ -69,7 +69,7 @@ impl<T: GraphLabel> Default for DefaultForest<T> { impl<T: GraphLabel> AsRef<DefaultForest<T>> for DefaultForest<T> { fn as_ref(&self) -> &DefaultForest<T> { - &self + self } } @@ -123,9 +123,24 @@ impl<T: GraphLabel> ParentsGraph for DefaultForest<T> { where Self:'a; + #[inline] fn parents_of(&self, node_id: usize) -> Result<<Self as ParentsGraph>::Iter<'_>, GError> { self.graph.parents_of(node_id) } + + #[inline] + fn nth_child(&self, node_id: usize, n: usize) -> Result<usize, GError> { + self.graph.nth_child(node_id, n) + } + + #[inline] + fn edge_to_parent( + &self, + source: usize, + target: usize, + ) -> Result<Option<graph::Parent>, GError> { + self.graph.edge_to_parent(source, target) + } } impl<T: GraphLabel> LabelGraph<T> for DefaultForest<T> { @@ -252,7 +267,7 @@ impl<T: GraphLabel> Forest<T> for DefaultForest<T> { Ok(frag_stack.is_empty() && self_stack.is_empty()) } - fn plant<F>(&mut self, node_id: usize, fragment: F) -> Result<(), Self::Error> + fn plant<F>(&mut self, node_id: usize, fragment: F, planted: bool) -> Result<(), Self::Error> where F: AsRef<Self>, { @@ -284,16 +299,6 @@ impl<T: GraphLabel> Forest<T> for DefaultForest<T> { let nodes_len = fragment.nodes_len(); - // First adjoin those nodes and join the edges later. - - for node in 0..nodes_len { - let label = fragment - .vertex_label(node)? - .ok_or(Error::NodeNoLabel(node))?; - - builder.add_vertex(label); - } - // If the fragment root has a duplicate label, the forest will // not grow, so we use the label to find the adjoined node // index. @@ -313,6 +318,25 @@ impl<T: GraphLabel> Forest<T> for DefaultForest<T> { } ); + // If the fragment has been planted before, we just add an + // edge. + + if planted { + builder.add_edge(node_id, conversion!(root), root_label)?; + + return Ok(()); + } + + // First adjoin those nodes and join the edges later. + + for node in 0..nodes_len { + let label = fragment + .vertex_label(node)? + .ok_or(Error::NodeNoLabel(node))?; + + builder.add_vertex(label); + } + // Don't forget to join the new sub-forest to the original // forest, at the specified position. @@ -403,7 +427,7 @@ mod forest_test { // add some child - forest.plant(0, leaf!(1))?; + forest.plant(0, leaf!(1), false)?; assert_eq!(forest.nodes_len(), 2); let mut children = forest.children_of(0)?; @@ -412,10 +436,10 @@ mod forest_test { // add more children - forest.plant(0, leaf!(2))?; - forest.plant(0, leaf!(3))?; - forest.plant(0, leaf!(4))?; - forest.plant(2, leaf!(5))?; + forest.plant(0, leaf!(2), false)?; + forest.plant(0, leaf!(3), false)?; + forest.plant(0, leaf!(4), false)?; + forest.plant(2, leaf!(5), false)?; assert_eq!(forest.nodes_len(), 6); let mut children = forest.children_of(0)?; @@ -428,32 +452,32 @@ mod forest_test { assert_eq!(children.next(), None); let mut test_forest = leaf!(0); - test_forest.plant(0, leaf!(1))?; - test_forest.plant(0, leaf!(2))?; - test_forest.plant(0, leaf!(3))?; - test_forest.plant(2, leaf!(5))?; + test_forest.plant(0, leaf!(1), false)?; + test_forest.plant(0, leaf!(2), false)?; + test_forest.plant(0, leaf!(3), false)?; + test_forest.plant(2, leaf!(5), false)?; assert!(forest.is_prefix(0, &test_forest)?); let mut test_forest = leaf!(0); - test_forest.plant(0, leaf!(1))?; - test_forest.plant(0, leaf!(2))?; + test_forest.plant(0, leaf!(1), false)?; + test_forest.plant(0, leaf!(2), false)?; // this child of the root should have label 3 in order to be a // prefix. - test_forest.plant(0, leaf!(4))?; - test_forest.plant(2, leaf!(5))?; + test_forest.plant(0, leaf!(4), false)?; + test_forest.plant(2, leaf!(5), false)?; assert!(!forest.is_prefix(0, &test_forest)?); let mut test_forest = leaf!(2); - test_forest.plant(0, leaf!(5))?; + test_forest.plant(0, leaf!(5), false)?; assert!(forest.is_prefix(2, &test_forest)?); // now test cloning // add a duplicate label - forest.plant(3, leaf!(5))?; + forest.plant(3, leaf!(5), false)?; let len = forest.nodes_len(); diff --git a/forest/src/design.org b/forest/src/design.org index 09db113..c32b1c9 100644 --- a/forest/src/design.org +++ b/forest/src/design.org @@ -112,7 +112,8 @@ The chain-rule operation can be described as follows: 2. Prepare a new forest fragment as follows. 1) For every child \(g\) of \(e\) in the atomic language, if \(g\) is the terminal that appears as the current input \(t\), let the - new fragment be defined as the node moved by \(g\), alone. + new fragment be defined as the node moved by \(g\), followed by + a terminal node, as a single edge. 2) If \(g\) is a non-terminal and its first-set contains \(t\), then for every left-linearly closed child of \(g\) that is labelled \(t\), denoted as \(h\), then let the fragment be diff --git a/forest/src/lib.rs b/forest/src/lib.rs index f843bc1..8c9b918 100644 --- a/forest/src/lib.rs +++ b/forest/src/lib.rs @@ -37,7 +37,7 @@ pub trait Forest<T: GraphLabel>: ParentsGraph + LabelGraph<T> { F: AsRef<Self>; /// Extend the forest by adjoining another forest at a given node. - fn plant<F>(&mut self, node_id: usize, fragment: F) -> Result<(), Self::Error> + fn plant<F>(&mut self, node_id: usize, fragment: F, planted: bool) -> Result<(), Self::Error> where F: AsRef<Self>; @@ -54,3 +54,4 @@ pub trait Forest<T: GraphLabel>: ParentsGraph + LabelGraph<T> { } pub mod default; +pub use default::Error; diff --git a/grammar/src/label.rs b/grammar/src/label.rs new file mode 100644 index 0000000..58eaddc --- /dev/null +++ b/grammar/src/label.rs @@ -0,0 +1,124 @@ +//! This file implements a type of labels that could be used as the +//! labels of a parse forest. + +use super::*; + +/// The actual label of a grammar label. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub enum GrammarLabelType { + /// A terminal or a non-terminal. + TNT(TNT), + /// A rule position. + Rule(usize), +} + +impl GrammarLabelType { + /// Return the name of this label with the help of the associated + /// grammar. + pub fn name(&self, grammar: &Grammar) -> Result<String, Error> { + match self { + Self::TNT(tnt) => grammar.name_of_tnt(*tnt), + Self::Rule(pos) => grammar.rule_pos_to_string(*pos), + } + } +} + +/// The label to be used in a forest. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct GrammarLabel { + /// The actual label. + label: GrammarLabelType, + /// The start in the input that this label correponds to. + start: usize, + /// The end in the input that this label correponds to. + end: Option<usize>, + /// A node in the forest might be a packed node. + packed_p: bool, +} + +impl core::fmt::Display for GrammarLabel { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + // Simply displaying this without the help of a grammar is not + // of much help, so we just use the debug method to cheat, + // haha. + + write!(f, "{:?}", self) + } +} + +impl GrammarLabel { + /// Construct a new label. + pub fn new(label: GrammarLabelType, start: usize) -> Self { + let end = None; + let packed_p = false; + + Self { + label, + start, + end, + packed_p, + } + } + + /// Return the end in the input. + pub fn end(&self) -> Option<usize> { + self.end + } + + /// Return the start in the input. + pub fn start(&self) -> usize { + self.start + } + + /// Return the actual label. + pub fn label(&self) -> GrammarLabelType { + self.label + } + + /// Update the end. + pub fn set_end(&mut self, end: usize) { + self.end = Some(end); + } + + /// Check whether the node is a packed node. + pub fn is_packed(&self) -> bool { + self.packed_p + } + + /// Update the packed status. + pub fn set_packed_p(&mut self, packed_p: bool) { + self.packed_p = packed_p; + } + + /// Return a string description with the help of the associated + /// grammar. + pub fn to_string(&self, grammar: &Grammar) -> Result<String, Error> { + // REVIEW: It needs at least 34 bytes, so we just double it. + // Of course we can also calculate the length exactly, but + // this can be postponed till later. + let mut s = String::with_capacity(68); + s.push_str("a "); + + if self.is_packed() { + s.push_str("packed "); + } else { + s.push_str("normal "); + } + + s.push_str("node labelled "); + + s.push_str(&self.label().name(grammar)?); + + s.push_str(" from "); + + s.push_str(&format!("{} ", self.start())); + + if let Some(end) = self.end() { + s.push_str(&format!("to {end}")); + } else { + s.push_str("onwards"); + } + + Ok(s) + } +} diff --git a/grammar/src/left_closure.rs b/grammar/src/left_closure.rs index 1630881..39461f0 100644 --- a/grammar/src/left_closure.rs +++ b/grammar/src/left_closure.rs @@ -114,10 +114,10 @@ impl Grammar { local_result.push(Or); builder.add_vertex(); - local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); - let non_lit_index = builder.add_vertex(); + // local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); + // let non_lit_index = builder.add_vertex(); - builder.add_edge(0, non_lit_index, ()).unwrap(); + // builder.add_edge(0, non_lit_index, ()).unwrap(); // If this non-terminal is nullable, add an empty variant. if self.is_nullable(n)? { diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs index 297cb66..50a2b9b 100644 --- a/grammar/src/lib.rs +++ b/grammar/src/lib.rs @@ -320,11 +320,39 @@ impl Grammar { self.accumulators.last().copied().unwrap_or(0) } + /// Return an element of the accumulator. + #[inline] + pub fn nth_accumulator(&self, n: usize) -> Result<usize, Error> { + self.accumulators + .get(n) + .copied() + .ok_or_else(|| Error::IndexOutOfBounds(n, self.total())) + } + + /// Return the index of the rules a rule position belongs to. + #[inline] + pub fn get_rule_num(&self, pos: usize) -> Result<usize, Error> { + let mut result = None; + + for (index, accumulator) in self.accumulators.iter().copied().skip(1).enumerate() { + if accumulator > pos { + result = Some(index); + break; + } + } + + if let Some(n) = result { + Ok(n) + } else { + Err(Error::IndexOutOfBounds(pos, self.total())) + } + } + /// Query if a position is the starting position of a /// non-terminal. If it is, return the non-terminal, else return /// `None` . #[inline] - pub fn is_nt_start_in_nfa_p(&self, pos: usize) -> Option<usize> { + pub fn get_nt_start_in_nfa(&self, pos: usize) -> Option<usize> { for (index, accumulator) in self.accumulators.iter().copied().enumerate() { let shifted_accumulator = accumulator << 1; @@ -456,7 +484,7 @@ impl Grammar { } } - Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| &v[..])) + Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| v.as_ref())) } // REVIEW: Do we have a better way to record expansion information @@ -483,7 +511,11 @@ impl Grammar { &mut self, two_edges: TwoEdges<LabelType<TNT>>, ) -> LabelType<TNT> { + #[cfg(debug_assertions)] let (first_source, first_target, first_label) = two_edges.first_edge(); + #[cfg(not(debug_assertions))] + let (first_source, _, first_label) = two_edges.first_edge(); + let (second_source, second_target, second_label) = two_edges.second_edge(); #[cfg(debug_assertions)] @@ -501,11 +533,11 @@ impl Grammar { // that is to say, the source of the second edge is the // starting edge of a non-terminal. - let mut left_p = first_label.get_left_p() || second_label.get_left_p(); + let mut left_p = first_label.is_left_p() || second_label.is_left_p(); // Record left-linear expansion information. - if let Some(second_nt) = self.is_nt_start_in_nfa_p(second_source) { + if let Some(second_nt) = self.get_nt_start_in_nfa(second_source) { left_p = true; if !self @@ -554,12 +586,55 @@ impl Grammar { .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? .iter()) } + + /// Return a string describing a rule position. + pub fn rule_pos_to_string(&self, pos: usize) -> Result<String, Error> { + let rule_num = { + let mut result = None; + + for (index, accumulator) in self.accumulators.iter().copied().skip(1).enumerate() { + if accumulator > pos { + result = Some(index); + break; + } + } + + if let Some(n) = result { + n + } else { + return Err(Error::IndexOutOfBounds(pos, self.total())); + } + }; + + assert!(rule_num < self.rules.len()); + + let display_tnt = |tnt| self.name_of_tnt(tnt).unwrap_or_else(|e| format!("{e}")); + + Ok(self + .rules + .get(rule_num) + .unwrap() + .regex + .to_string_with_dot( + display_tnt, + if rule_num == 0 { + pos + } else { + pos - self.accumulators.get(rule_num - 1).copied().unwrap() + }, + ) + .unwrap()) + } } pub mod first_set; pub mod left_closure; +pub mod label; + +pub use label::{GrammarLabel, GrammarLabelType}; + impl Display for Grammar { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { assert_eq!(self.non.len(), self.rules.len()); diff --git a/grammar/src/test_grammar_helper.rs b/grammar/src/test_grammar_helper.rs index 89f9844..984eb50 100644 --- a/grammar/src/test_grammar_helper.rs +++ b/grammar/src/test_grammar_helper.rs @@ -45,7 +45,7 @@ fn scan_tnt( 'T' => { let mut name = String::new(); - while let Some(c) = chars.next() { + for c in chars.by_ref() { if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) { len += 1; write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; @@ -63,7 +63,7 @@ fn scan_tnt( 'N' => { let mut name = String::new(); - while let Some(c) = chars.next() { + for c in chars.by_ref() { if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) { len += 1; write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; @@ -125,6 +125,130 @@ pub fn new_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { } /// Return a grammar that might serve as the grammar for my notes, +/// somehow, without regular expressions. +#[allow(dead_code)] +pub fn new_notes_grammar_no_regexp() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![ + Terminal::new("NL".to_owned()), + Terminal::new("SP".to_owned()), + Terminal::new("CON".to_owned()), + Terminal::new("STAR".to_owned()), + Terminal::new("NOTE".to_owned()), + Terminal::new("PRICE".to_owned()), + Terminal::new("DIGIT".to_owned()), + ]; + let non = vec![ + Nonterminal("document".to_owned()), + Nonterminal("item".to_owned()), + Nonterminal("header".to_owned()), + Nonterminal("title".to_owned()), + Nonterminal("note".to_owned()), + Nonterminal("note-content".to_owned()), + Nonterminal("price".to_owned()), + Nonterminal("ending".to_owned()), + Nonterminal("digits".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("NL", true); + regex_parser.add_tnt("SP", true); + regex_parser.add_tnt("CON", true); + regex_parser.add_tnt("STAR", true); + regex_parser.add_tnt("note", true); + regex_parser.add_tnt("price", true); + regex_parser.add_tnt("DIGIT", true); + regex_parser.add_tnt("document", false); + regex_parser.add_tnt("item", false); + regex_parser.add_tnt("header", false); + regex_parser.add_tnt("title", false); + regex_parser.add_tnt("note", false); + regex_parser.add_tnt("notecontent", false); + regex_parser.add_tnt("price", false); + regex_parser.add_tnt("ending", false); + regex_parser.add_tnt("digits", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("Nitem | Nitem Ndocument", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse( + "Nheader | Nheader Nprice | Nheader Nnote | Nheader Nprice Nnote", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse( + "TSP Ntitle TNL Nending | TSTAR TSP Ntitle TNL Nending", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule4 = Rule { + regex: regex_parser + .parse("TCON | TCON Ntitle", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule5 = Rule { + regex: regex_parser + .parse("Tnote Nnotecontent TNL Nending", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule6 = Rule { + regex: regex_parser + .parse("TCON | TCON Nnotecontent", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule7 = Rule { + regex: regex_parser + .parse("Tprice TSP Ndigits TNL Nending", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule8 = Rule { + regex: regex_parser + .parse("TNL Nending | TSP Nending | ()", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule9 = Rule { + regex: regex_parser + .parse("TDIGIT | TDIGIT Ndigits", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![ + rule1, rule2, rule3, rule4, rule5, rule6, rule7, rule8, rule9, + ]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a grammar that might serve as the grammar for my notes, /// somehow. #[allow(dead_code)] pub fn new_notes_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { diff --git a/graph/src/error.rs b/graph/src/error.rs index ce45acc..bf2714b 100644 --- a/graph/src/error.rs +++ b/graph/src/error.rs @@ -18,8 +18,6 @@ pub enum Error { /// The graph does not permit duplicate edges but encounters a /// repeated edge. DuplicatedEdge(usize, usize), - /// The source node has no room to add a new edge. - FullNode(usize), } impl Display for Error { @@ -37,7 +35,6 @@ impl Display for Error { "No duplicate edges permitted, but found one from {source} to {target}" ) } - Error::FullNode(index) => write!(f, "the node {index} has no room for new edges"), } } } diff --git a/graph/src/labelled/binary.rs b/graph/src/labelled/binary.rs index bfd8109..e3e1943 100644 --- a/graph/src/labelled/binary.rs +++ b/graph/src/labelled/binary.rs @@ -19,18 +19,22 @@ struct PLChildren { } impl PLChildren { + #[inline] fn iter(&self) -> Iter<'_, usize> { self.children.iter() } + #[inline] fn len(&self) -> usize { self.children.len() } + #[inline] fn is_empty(&self) -> bool { self.children.is_empty() } + #[inline] fn contains(&self, key: &usize) -> bool { self.indices.contains_key(key) } @@ -73,6 +77,15 @@ impl PLChildren { self.children.remove(key_index); } + + /// Retrieve the `n`-th child. + #[inline] + fn nth(&self, n: usize) -> Result<usize, Error> { + self.children + .get(n) + .copied() + .ok_or(Error::IndexOutOfBounds(n, self.children.len())) + } } /// A node has some children, some parents, and a label. @@ -187,7 +200,7 @@ impl<T: GraphLabel> Graph for PLGraph<T> { } fn replace_by_builder(&mut self, _builder: impl Builder<Result = Self>) { - todo!() + unimplemented!("use a `PLGBuilderMut` instead") } fn print_viz(&self, _filename: &str) -> Result<(), std::io::Error> { @@ -246,6 +259,31 @@ impl<T: GraphLabel> ParentsGraph for PLGraph<T> { Err(Error::IndexOutOfBounds(node_id, self.nodes.len())) } } + + #[inline] + fn nth_child(&self, node_id: usize, n: usize) -> Result<usize, Error> { + if let Some(node) = self.nodes.get(node_id) { + node.children.nth(n) + } else { + Err(Error::IndexOutOfBounds(node_id, self.nodes_len())) + } + } + + #[inline] + fn edge_to_parent(&self, source: usize, target: usize) -> Result<Option<Parent>, Error> { + if !self.has_edge(source, target)? { + return Ok(None); + } + + Ok(self + .nodes + .get(source) + .unwrap() + .children + .indices + .get(&target) + .map(|index| Parent::new(source, *index))) + } } impl<T: GraphLabel> RedirectGraph for PLGraph<T> { @@ -378,13 +416,13 @@ impl<'a, T: GraphLabel> std::ops::Deref for PLGBuilderMut<'a, T> { type Target = PLGraph<T>; fn deref(&self) -> &Self::Target { - &self.graph + self.graph } } impl<'a, T: GraphLabel> std::ops::DerefMut for PLGBuilderMut<'a, T> { fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.graph + self.graph } } diff --git a/graph/src/labelled/double.rs b/graph/src/labelled/double.rs index 4ab8a38..174c8ef 100644 --- a/graph/src/labelled/double.rs +++ b/graph/src/labelled/double.rs @@ -150,6 +150,8 @@ impl<T: GraphLabel> Graph for DLGraph<T> { } fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> { + let filename = format!("output/{filename}"); + let preamble = "digraph nfa { fontname=\"Helvetica,Arial,sans-serif\" node [fontname=\"Helvetica,Arial,sans-serif\"] @@ -170,14 +172,14 @@ impl<T: GraphLabel> Graph for DLGraph<T> { let result = format!("{preamble}{post}"); - if std::fs::metadata(filename).is_ok() { - std::fs::remove_file(filename)?; + if std::fs::metadata(&filename).is_ok() { + std::fs::remove_file(&filename)?; } let mut file = std::fs::File::options() .write(true) .create(true) - .open(filename)?; + .open(&filename)?; use std::io::Write; @@ -206,7 +208,7 @@ impl<'a> Iterator for LabelIndexIter<'a> { #[inline] fn next(&mut self) -> Option<Self::Item> { - self.iter.as_mut().and_then(|iterator| iterator.next()) + self.iter.as_mut().and_then(Iterator::next) } #[inline] @@ -242,7 +244,7 @@ impl<'a> From<&'a Set<usize>> for LabelIndexIter<'a> { } } -#[derive(Debug)] +#[derive(Debug, Clone)] /// A delegation of iterators. /// /// This is used to avoid a boxed pointer to an iterator. @@ -278,7 +280,7 @@ impl<'a, T> Iterator for LabelIter<'a, T> { } /// This is used to avoid a boxed pointer to an iterator. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct EdgeLabelIter<'a, T> { iter: Option<Iter<'a, T>>, } diff --git a/graph/src/lib.rs b/graph/src/lib.rs index 6af7889..2a0c50d 100644 --- a/graph/src/lib.rs +++ b/graph/src/lib.rs @@ -251,10 +251,14 @@ pub trait ParentsGraph: Graph { /// Return an iterator of parents for a node. fn parents_of(&self, node_id: usize) -> Result<<Self as ParentsGraph>::Iter<'_>, Error>; -} -// TODO: Design a trait of graphs which can "replace" a certain child -// by another child. To re-direct children, so to speak. + /// We need to be able to retrieve the `n`-the child, for the edge + /// index to be of any use. + fn nth_child(&self, node_id: usize, n: usize) -> Result<usize, Error>; + + /// Convert an edge to a `Parent` construct. + fn edge_to_parent(&self, source: usize, target: usize) -> Result<Option<Parent>, Error>; +} /// An /exended/ graph in the sense that it offers the ability to /// "redirect" children of a node to another node. diff --git a/graph_macro/src/lib.rs b/graph_macro/src/lib.rs index 5a0672b..a55efbb 100644 --- a/graph_macro/src/lib.rs +++ b/graph_macro/src/lib.rs @@ -1,12 +1,19 @@ +#![allow(unused_imports)] //! This file provides a macro to delegate the implementation of a //! type that wraps a graph. More precisely, the macro helps the //! wrapper type implement the various Graph-related traits. use proc_macro::TokenStream; +use core::iter::Peekable; + #[proc_macro_derive(Testing)] pub fn test_derive(input: TokenStream) -> TokenStream { - println!("input: {input:?}"); + let input = input.into_iter().peekable(); + + for tree in input { + println!("tree = {tree:?}"); + } TokenStream::new() } diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index a23c056..8d657d5 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -212,7 +212,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { let top_label = regex.vertex_label(top_index)?; let nfa_start = offset + 2 * top_index; - let nfa_end = offset + 2 * top_index + 1; + let nfa_end = nfa_start + 1; match top_label { RegexType::Kleene => { diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index c8ad370..1e3e87b 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -359,6 +359,147 @@ impl<T: GraphLabel> DefaultRegex<T> { Ok(result) } + + /// Use a function to format the labels of the regular expressions + /// and format the entire regular expression with this aid, with a + /// dot at a specified position. + pub fn to_string_with_dot<F>(&self, mut f: F, dot_pos: usize) -> Result<String, std::fmt::Error> + where + F: FnMut(T) -> String, + { + #[derive(Copy, Clone)] + enum StackElement { + Seen(usize), + Unseen(usize), + } + + impl StackElement { + fn index(&self) -> usize { + match self { + Seen(index) => *index, + Unseen(index) => *index, + } + } + + fn is_seen(self) -> bool { + matches!(self, Seen(_)) + } + } + + use StackElement::{Seen, Unseen}; + + let mut result = String::new(); + + let mut stack: Vec<StackElement> = Vec::new(); + let mut types = self.types.clone(); + types.push(RegexType::Paren); + + stack.push(Unseen(0)); + + while let Some(top) = stack.pop() { + let node_type = types.get(top.index()).unwrap(); + + match node_type { + RegexType::Kleene => { + if !top.is_seen() { + stack.push(Seen(top.index())); + + if self.degree(top.index()).unwrap() > 1 { + write!(result, "(")?; + stack.push(Unseen(types.len() - 1)); + } + + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(result, "*")?; + } + } + RegexType::Plus => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(result, "+")?; + } + } + RegexType::Optional => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(result, "?")?; + } + } + RegexType::Or => { + if !top.is_seen() { + write!(result, "(")?; + + let len = self.len(); + + stack.push(Unseen(types.len() - 1)); + + for (child_index, child) in self + .graph + .children_of(top.index()) + .unwrap() + .enumerate() + .rev() + { + if child_index != len - 1 && child_index != 0 { + stack.push(Unseen(child)); + stack.push(Seen(top.index())); + } else { + stack.push(Unseen(child)); + } + } + } else { + write!(result, "|")?; + } + } + RegexType::Paren => { + write!(result, ")")?; + } + RegexType::Empty => { + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + + if self.graph.is_empty_node(top.index()).unwrap() { + write!(result, "ε")?; + } + } + RegexType::Lit(label) => write!(result, "{}", f(*label))?, + } + + if top.index() == dot_pos { + write!(result, " · ")?; + } + } + + Ok(result) + } } impl<T: GraphLabel + Display + Debug> Display for DefaultRegex<T> { diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs index c1906e1..4440ea6 100644 --- a/nfa/src/lib.rs +++ b/nfa/src/lib.rs @@ -150,15 +150,17 @@ impl<T: GraphLabel> NfaLabel<T> { pub fn get_value(&self) -> T { self.value } + /// Retrieve the moved position from the label. #[inline] pub fn get_moved(&self) -> usize { self.moved } + /// Retrieve whether or not the label comes from left-linear /// expansions. #[inline] - pub fn get_left_p(&self) -> bool { + pub fn is_left_p(&self) -> bool { self.left_p } } |