//! This file provides a default implementation of the //! [`Atom`][super::Atom] trait. use super::*; use grammar::Grammar; use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; use nfa::{ default::{nfa::DefaultNFA, regex::DefaultRegex}, LabelType, NfaLabel, }; use core::fmt::Display; use std::collections::BTreeMap as Map; /// A virtual node represents the derivative of a non-terminal symbol /// `S` with respect to a terminal symbol `t`. #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Ord, PartialOrd)] struct VirtualNode { s: usize, t: usize, } impl Display for VirtualNode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "VN[{}]^({})", self.s, self.t) } } impl VirtualNode { fn new(s: usize, t: usize) -> Self { Self { s, t } } } type VirtualMap = Map; /// The type of atomic languages. #[derive(Debug, Clone, Default)] pub struct DefaultAtom { grammar: Grammar, nfa: DefaultNFA>, accepting_vec: Vec, // NOTE: This is mostly for printing and debugging regexp: Vec>, virtual_nodes: VirtualMap, } impl DefaultAtom { /// Return the string description of a rule position. pub fn rule_pos_string(&self, pos: usize) -> Result> { let rule_num = self.grammar.get_rule_num(pos)?; assert!(rule_num < self.grammar.non_num()); let display_tnt = |tnt| self.name_of_tnt(tnt).unwrap_or_else(|e| format!("{e}")); Ok(self.regexp.get(rule_num).unwrap().to_string_with_dot( display_tnt, if rule_num == 0 { pos } else { pos - self.grammar.nth_accumulator(rule_num - 1)? }, )?) } /// Print the underlying NFA. pub fn print_nfa>(&self, filename: S) -> Result<(), std::io::Error> { self.nfa.print_viz(filename.as_ref())?; let nullables: Vec<_> = self .accepting_vec .iter() .enumerate() .filter_map(|(index, pred)| if *pred { Some(index) } else { None }) .collect(); if !nullables.is_empty() { println!("nullables: {nullables:?}"); } println!("printing virtual nodes:"); for (vn, node) in self.virtual_nodes.iter() { println!("[{}]^{{({})}}: {}", vn.s, vn.t, node); } Ok(()) } } impl Display for DefaultAtom { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let grammar = &self.grammar; let error_to_string = |err| format!("{err}"); let tnt_to_string = |tnt, deco| { grammar .name_of_tnt(tnt) .map(|name| format!("{deco}({name})")) .unwrap_or_else(error_to_string) }; let display_tnt = |tnt| match tnt { TNT::Ter(t) => format!( "({})", grammar .unpack_tnt(t) .map(|tnt| tnt_to_string(tnt, "")) .unwrap_or_else(error_to_string) ), TNT::Non(_) => tnt_to_string(tnt, "H"), }; writeln!(f, "regular expressions:")?; let mut accumulators = Vec::with_capacity(self.regexp.len() + 1); accumulators.push(0usize); for regex in self.regexp.iter() { writeln!(f, "accumulator: {}", accumulators.last().unwrap())?; accumulators.push(regex.nodes_len() * 2 + accumulators.last().unwrap()); let string = regex.to_string_with(display_tnt)?; writeln!(f, "{string}")?; } writeln!(f, "total = {}", accumulators.last().unwrap())?; writeln!(f, "virtual nodes:")?; for (virtual_node, index) in self.virtual_nodes.iter() { writeln!(f, "{virtual_node}: {index}")?; } Ok(()) } } // Some boiler-plate delegation implementations for Graph and // LabelGraph, in order to implement Nfa. impl Graph for DefaultAtom { type Iter<'b> = > as Graph>::Iter<'b> where Self: 'b; fn is_empty(&self) -> bool { self.nfa.is_empty() } fn nodes_len(&self) -> usize { self.nfa.nodes_len() } fn children_of(&self, node_id: usize) -> Result, GraphError> { self.nfa.children_of(node_id) } fn degree(&self, node_id: usize) -> Result { self.nfa.degree(node_id) } fn is_empty_node(&self, node_id: usize) -> Result { self.nfa.is_empty_node(node_id) } fn has_edge(&self, source: usize, target: usize) -> Result { self.nfa.has_edge(source, target) } fn replace_by_builder(&mut self, _builder: impl graph::Builder) { // NOTE: We cannot replace by a builder whose result is an // atom, not the underlying graph type. unimplemented!() } } impl LabelGraph> for DefaultAtom { type Iter<'b> = > as LabelGraph>>::Iter<'b> where Self: 'b; type LabelIter<'b> = > as LabelGraph>>::LabelIter<'b> where Self: 'b, DOption: 'b; type EdgeLabelIter<'a> = > as LabelGraph>>::EdgeLabelIter<'a> where Self: 'a, DOption: 'a; #[inline] fn vertex_label(&self, node_id: usize) -> Result>, GraphError> { self.nfa.vertex_label(node_id) } #[inline] fn edge_label( &self, source: usize, target: usize, ) -> Result, GraphError> { self.nfa.edge_label(source, target) } #[inline] fn find_children_with_label( &self, node_id: usize, label: &LabelType, ) -> Result<>>::Iter<'_>, GraphError> { self.nfa.find_children_with_label(node_id, label) } #[inline] fn labels_of(&self, node_id: usize) -> Result, GraphError> { self.nfa.labels_of(node_id) } #[inline] fn has_edge_label( &self, node_id: usize, label: &LabelType, target: usize, ) -> Result { self.nfa.has_edge_label(node_id, label, target) } } impl LabelExtGraph> for DefaultAtom { #[inline] fn extend( &mut self, edges: impl IntoIterator, usize)>, ) -> Result { self.nfa.extend(edges) } } impl Nfa> for DefaultAtom { type FromRegex = (); #[inline] fn to_nfa( _regexps: &[impl nfa::Regex>>], _sub_pred: impl Fn(LabelType) -> Result>, nfa::error::Error>, _default: Option>, ) -> Result>>, nfa::error::Error> { // NOTE: We cannot construct an atom from a set of regular // languages alone. So it is appropriate to panic here, if // one tries to do so, for some reason. unimplemented!() } #[inline] fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), nfa::error::Error> { self.nfa.remove_dead(reserve) } #[inline] fn closure( &mut self, predicate: impl FnMut(LabelType) -> bool, remove_after_p: bool, transform: impl FnMut(nfa::TwoEdges>) -> LabelType, remove_predicate: impl FnMut(LabelType) -> bool, ) -> Result<(), nfa::error::Error> { self.nfa .closure(predicate, remove_after_p, transform, remove_predicate) } } impl DefaultAtom { /// Construct an atomic language from a grammar. pub fn from_grammar(mut grammar: Grammar) -> Result { grammar.compute_firsts()?; let regexp = grammar.left_closure()?; let mut nfa = grammar.left_closure_to_nfa(®exp)?; use std::collections::{HashMap, HashSet}; let accumulators: Vec = { let mut result = Vec::with_capacity(regexp.len() + 1); result.push(0); for regex in regexp.iter() { // Calling `unwrap` here is safe as `result` is always // non-empty. result.push(regex.nodes_len() * 2 + result.last().unwrap()); } result }; let accumulators_set: HashSet = accumulators.iter().copied().collect(); let nullables: HashSet = (0..grammar.non_num()) .filter(|n| matches!(grammar.is_nullable(*n), Ok(true))) .collect(); // Now record accepting information. let nfa_len = nfa.nodes_len(); let label_is_nullable = |label: NfaLabel>| { if let Some(label) = *label.get_value() { matches!(label, TNT::Non(n) if nullables.contains(&n)) } else { true } }; let mut accepting_vec: Vec = std::iter::repeat(false).take(nfa_len).collect(); for nfa_start in accumulators.iter().copied().take(regexp.len()) { *accepting_vec.get_mut(nfa_start + 1).unwrap() = true; } // The last is always accepting. *accepting_vec.get_mut(nfa_len - 1).unwrap() = true; let mut updated = true; while updated { updated = false; for node in nfa.nodes() { // skip those that do not need to be updated if *accepting_vec .get(node) .ok_or(GrammarError::IndexOutOfBounds(node, nfa_len))? { continue; } 'label_loop: for (label, target_iter) in nfa .labels_of(node) .map_err(|_| GrammarError::IndexOutOfBounds(node, nfa_len))? { if label_is_nullable(*label) { for target in target_iter { if *accepting_vec .get(target) .ok_or(GrammarError::IndexOutOfBounds(target, nfa_len))? { // accepting_vec[node] must have been // false, as we checked above *accepting_vec.get_mut(node).unwrap() = true; updated = true; break 'label_loop; } } } } } } // Perform nulling and remove_epsilon at the same time. nfa.closure( label_is_nullable, true, |two_edges| grammar.transform_label_null_epsilon(two_edges), |label| label.get_value().is_none(), )?; nfa.remove_dead(|node| accumulators_set.contains(&node))?; // Now add the virtual nodes. let mut virtual_nodes: VirtualMap = Default::default(); let nt_num = grammar.non_num(); assert!(nt_num <= accumulators.len()); /// Convert an error telling us that an index is out of bounds. /// /// # Panics /// /// The function panics if the error is not of the expected /// kind. fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError { match ge { GraphError::IndexOutOfBounds(index, bound) => { GrammarError::IndexOutOfBounds(index, bound) } _ => unreachable!(), } } // dbg!(&accumulators); for nt in 0..nt_num { let children: std::collections::HashMap<_, _> = nfa // This is safe because of the above assertion. .labels_of(*accumulators.get(nt).unwrap()) .map_err(index_out_of_bounds_conversion)? .map(|(label, target_iter)| (*label, target_iter)) .collect(); type TerminalsValue = (HashSet<(LabelType, usize)>, bool); let mut terminals_map: HashMap = HashMap::new(); for (label, children_iter) in children.into_iter() { if let Some(TNT::Ter(t)) = *label.get_value() { let estimated_len = { let mut result = 0; for child in children_iter.clone() { result += nfa.degree(child).map_err(index_out_of_bounds_conversion)?; } result }; let mut accepting = false; for child in children_iter { accepting = accepting || *accepting_vec.get(child).ok_or( GrammarError::IndexOutOfBounds(child, accepting_vec.len()), )?; if nt == 3 && nfa.degree(child).map_err(index_out_of_bounds_conversion)? == 0 { println!("accepting = {accepting}"); } if let Some((_, old_accepting)) = terminals_map.get_mut(&t) { *old_accepting = *old_accepting || accepting; } else { terminals_map .insert(t, (HashSet::with_capacity(estimated_len), accepting)); } for (child_label, child_children_iter) in nfa .labels_of(child) .map_err(index_out_of_bounds_conversion)? { // We checked this is safe above. let (set, _) = terminals_map.get_mut(&t).unwrap(); set.extend(child_children_iter.map(|target| (*child_label, target))); } } } } if nt == 3 { println!("map = {terminals_map:?}"); } for (t, (set, accepting)) in terminals_map.into_iter() { let new_index = nfa .extend(set.into_iter()) .map_err(index_out_of_bounds_conversion)?; if accepting_vec.get(new_index).is_none() { #[cfg(debug_assertions)] assert_eq!(new_index, accepting_vec.len()); // let mut updated = false; // let nfa_len = nfa.nodes_len(); // 'label_loop: for (label, target_iter) in nfa // .labels_of(new_index) // .map_err(|_| GrammarError::IndexOutOfBounds(new_index, nfa_len))? // { // if label_is_nullable(*label) { // for target in target_iter { // if *accepting_vec // .get(target) // .ok_or(GrammarError::IndexOutOfBounds(target, nfa_len))? // { // updated = true; // break 'label_loop; // } // } // } // } accepting_vec.push(accepting); } let virtual_node = VirtualNode::new(nt, t); virtual_nodes.insert(virtual_node, new_index); } } Ok(Self { grammar, nfa, regexp, virtual_nodes, accepting_vec, }) } } /// A convenient getter for the map of virtual nodes. fn query(map: &VirtualMap, nt: usize, t: usize) -> Option { map.get(&VirtualNode::new(nt, t)).copied() } impl std::ops::Deref for DefaultAtom { type Target = Grammar; fn deref(&self) -> &Self::Target { &self.grammar } } impl Atom for DefaultAtom { fn atom(&self, nt: usize, t: usize) -> Result, GrammarError> { if nt >= self.grammar.non_num() { return Err(GrammarError::IndexOutOfBounds(nt, self.grammar.non_num())); } if t >= self.grammar.ter_num() { return Err(GrammarError::IndexOutOfBounds(t, self.grammar.ter_num())); } Ok(query(&self.virtual_nodes, nt, t)) } fn empty(&self) -> usize { self.grammar.total() << 1 } fn is_accepting(&self, node_id: usize) -> Result { self.accepting_vec .get(node_id) .copied() .ok_or(GrammarError::IndexOutOfBounds( node_id, self.accepting_vec.len(), )) } }