From 18d7955b7d84c00467ede38baae53f4ce1fb6908 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 20 Jan 2023 13:48:26 +0800 Subject: chain: a prototype is added. I have an ostensibly working prototype now. Further tests are needed to make sure that the algorithm meets the time complexity requirement, though. --- chain/src/atom/default.rs | 214 +++++++++++++++++++++++++++++++++++++++++++--- chain/src/atom/mod.rs | 9 +- 2 files changed, 207 insertions(+), 16 deletions(-) (limited to 'chain/src/atom') diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs index 90133f4..0dc24c3 100644 --- a/chain/src/atom/default.rs +++ b/chain/src/atom/default.rs @@ -6,7 +6,7 @@ use grammar::Grammar; use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; use nfa::{ default::{nfa::DefaultNFA, regex::DefaultRegex}, - LabelType, + LabelType, NfaLabel, }; use core::fmt::Display; @@ -39,11 +39,55 @@ type VirtualMap = Map; pub struct DefaultAtom { grammar: Grammar, nfa: DefaultNFA>, + accepting_vec: Vec, // NOTE: This is mostly for printing and debugging regexp: Vec>, virtual_nodes: VirtualMap, } +impl DefaultAtom { + /// Return the string description of a rule position. + pub fn rule_pos_string(&self, pos: usize) -> Result> { + let rule_num = self.grammar.get_rule_num(pos)?; + + assert!(rule_num < self.grammar.non_num()); + + let display_tnt = |tnt| self.name_of_tnt(tnt).unwrap_or_else(|e| format!("{e}")); + + Ok(self.regexp.get(rule_num).unwrap().to_string_with_dot( + display_tnt, + if rule_num == 0 { + pos + } else { + pos - self.grammar.nth_accumulator(rule_num - 1)? + }, + )?) + } + + /// Print the underlying NFA. + pub fn print_nfa>(&self, filename: S) -> Result<(), std::io::Error> { + self.nfa.print_viz(filename.as_ref())?; + + let nullables: Vec<_> = self + .accepting_vec + .iter() + .enumerate() + .filter_map(|(index, pred)| if *pred { Some(index) } else { None }) + .collect(); + + if !nullables.is_empty() { + println!("nullables: {nullables:?}"); + } + + println!("printing virtual nodes:"); + for (vn, node) in self.virtual_nodes.iter() { + println!("[{}]^{{({})}}: {}", vn.s, vn.t, node); + } + + Ok(()) + } +} + impl Display for DefaultAtom { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let grammar = &self.grammar; @@ -260,15 +304,67 @@ impl DefaultAtom { .filter(|n| matches!(grammar.is_nullable(*n), Ok(true))) .collect(); + // Now record accepting information. + + let nfa_len = nfa.nodes_len(); + + let label_is_nullable = |label: NfaLabel>| { + if let Some(label) = *label.get_value() { + matches!(label, TNT::Non(n) if nullables.contains(&n)) + } else { + true + } + }; + + let mut accepting_vec: Vec = std::iter::repeat(false).take(nfa_len).collect(); + + for nfa_start in accumulators.iter().copied().take(regexp.len()) { + *accepting_vec.get_mut(nfa_start + 1).unwrap() = true; + } + + // The last is always accepting. + *accepting_vec.get_mut(nfa_len - 1).unwrap() = true; + + let mut updated = true; + + while updated { + updated = false; + + for node in nfa.nodes() { + // skip those that do not need to be updated + if *accepting_vec + .get(node) + .ok_or(GrammarError::IndexOutOfBounds(node, nfa_len))? + { + continue; + } + + 'label_loop: for (label, target_iter) in nfa + .labels_of(node) + .map_err(|_| GrammarError::IndexOutOfBounds(node, nfa_len))? + { + if label_is_nullable(*label) { + for target in target_iter { + if *accepting_vec + .get(target) + .ok_or(GrammarError::IndexOutOfBounds(target, nfa_len))? + { + // accepting_vec[node] must have been + // false, as we checked above + *accepting_vec.get_mut(node).unwrap() = true; + updated = true; + + break 'label_loop; + } + } + } + } + } + } + // Perform nulling and remove_epsilon at the same time. nfa.closure( - |label| { - if let Some(label) = *label.get_value() { - matches!(label, TNT::Non(n) if nullables.contains(&n)) - } else { - true - } - }, + label_is_nullable, true, |two_edges| grammar.transform_label_null_epsilon(two_edges), |label| label.get_value().is_none(), @@ -298,6 +394,8 @@ impl DefaultAtom { } } + // dbg!(&accumulators); + for nt in 0..nt_num { let children: std::collections::HashMap<_, _> = nfa // This is safe because of the above assertion. @@ -306,23 +404,92 @@ impl DefaultAtom { .map(|(label, target_iter)| (*label, target_iter)) .collect(); - let mut terminals_map: HashMap, usize)>> = + let mut terminals_map: HashMap, usize)>, bool)> = HashMap::new(); for (label, children_iter) in children.into_iter() { if let Some(TNT::Ter(t)) = *label.get_value() { - terminals_map - .entry(t) - .or_insert_with(|| HashSet::with_capacity(children_iter.len())) - .extend(children_iter.map(|target| (label, target))); + let estimated_len = { + let mut result = 0; + + for child in children_iter.clone() { + result += nfa.degree(child).map_err(index_out_of_bounds_conversion)?; + } + + result + }; + + let mut accepting = false; + + for child in children_iter { + accepting = accepting + || *accepting_vec.get(child).ok_or_else(|| { + GrammarError::IndexOutOfBounds(child, accepting_vec.len()) + })?; + + if nt == 3 + && nfa.degree(child).map_err(index_out_of_bounds_conversion)? == 0 + { + println!("accepting = {accepting}"); + } + + if let Some((_, old_accepting)) = terminals_map.get_mut(&t) { + *old_accepting = *old_accepting || accepting; + } else { + terminals_map + .insert(t, (HashSet::with_capacity(estimated_len), accepting)); + } + + for (child_label, child_children_iter) in nfa + .labels_of(child) + .map_err(index_out_of_bounds_conversion)? + { + // We checked this is safe above. + let (set, _) = terminals_map.get_mut(&t).unwrap(); + + set.extend(child_children_iter.map(|target| (*child_label, target))); + } + } } } - for (t, set) in terminals_map.into_iter() { + if nt == 3 { + println!("map = {terminals_map:?}"); + } + + for (t, (set, accepting)) in terminals_map.into_iter() { let new_index = nfa .extend(set.into_iter()) .map_err(index_out_of_bounds_conversion)?; + if accepting_vec.get(new_index).is_none() { + #[cfg(debug_assertions)] + assert_eq!(new_index, accepting_vec.len()); + + // let mut updated = false; + // let nfa_len = nfa.nodes_len(); + + // 'label_loop: for (label, target_iter) in nfa + // .labels_of(new_index) + // .map_err(|_| GrammarError::IndexOutOfBounds(new_index, nfa_len))? + // { + // if label_is_nullable(*label) { + // for target in target_iter { + // if *accepting_vec + // .get(target) + // .ok_or(GrammarError::IndexOutOfBounds(target, nfa_len))? + // { + // updated = true; + + // break 'label_loop; + // } + // } + // } + // } + + accepting_vec.push(accepting); + } + let virtual_node = VirtualNode::new(nt, t); virtual_nodes.insert(virtual_node, new_index); @@ -334,6 +501,7 @@ impl DefaultAtom { nfa, regexp, virtual_nodes, + accepting_vec, }) } } @@ -343,6 +511,14 @@ fn query(map: &VirtualMap, nt: usize, t: usize) -> Option { map.get(&VirtualNode::new(nt, t)).copied() } +impl std::ops::Deref for DefaultAtom { + type Target = Grammar; + + fn deref(&self) -> &Self::Target { + &self.grammar + } +} + impl Atom for DefaultAtom { fn atom(&self, nt: usize, t: usize) -> Result, GrammarError> { if nt >= self.grammar.non_num() { @@ -359,4 +535,14 @@ impl Atom for DefaultAtom { fn empty(&self) -> usize { self.grammar.total() << 1 } + + fn is_accepting(&self, node_id: usize) -> Result { + self.accepting_vec + .get(node_id) + .copied() + .ok_or(GrammarError::IndexOutOfBounds( + node_id, + self.accepting_vec.len(), + )) + } } diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs index 065640b..398edd2 100644 --- a/chain/src/atom/mod.rs +++ b/chain/src/atom/mod.rs @@ -6,17 +6,22 @@ //! Because this way I can easily substitute other implementations if //! I have better ideas in the future. -use grammar::{Error as GrammarError, TNT}; +use grammar::{Error as GrammarError, Grammar, TNT}; use nfa::{DOption, LabelType, Nfa}; +use std::ops::Deref; + /// The expected behaviours of an atomic language. -pub trait Atom: Nfa> { +pub trait Atom: Nfa> + Deref { /// Return the index of a node representing the derivative of the /// left-linear null closure of `nt` with respect to `t`. fn atom(&self, nt: usize, t: usize) -> Result, GrammarError>; /// Return the index of the empty state. fn empty(&self) -> usize; + + /// Tell whether a node is accepting. + fn is_accepting(&self, node_id: usize) -> Result; } pub mod default; -- cgit v1.2.3-18-g5258