diff options
author | JSDurand <mmemmew@gmail.com> | 2023-07-08 12:30:21 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-07-08 12:31:13 +0800 |
commit | 9a317e56f8a6126583f7d0c431bf878d9b1fe7b1 (patch) | |
tree | 7bb6004196b38446a5ab0cb3a0ab642d35f113e9 | |
parent | 691f969eb104fa3d4c2a1667693fd0382eb9d6b5 (diff) |
Finished the Emacs binding.
Now the binding part is finished.
What remains is a bug encountered when planting a fragment to the
forest which intersects a packed node, which would lead to invalid
forests. This will also cause problems when planting a packed
fragment, but until now my testing grammars do not produce packed
fragments, so this problem is not encountered yet.
I am still figuring out efficient ways to solve this problem.
34 files changed, 5605 insertions, 267 deletions
@@ -18,9 +18,15 @@ members = [ "graph", "receme", "nfa", "chain", "viz", "grammar", # testing the new resolver, even though this has no dependencies ;p resolver = "2" +[lib] +crate-type = ["cdylib"] + # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +chain = { path = "./chain" } +grammar = { path = "./grammar" } +graph = { path = "./graph" } [features] default = [] diff --git a/chain/src/archive.txt b/chain/src/archive.txt index 030ccba..7bd6f31 100644 --- a/chain/src/archive.txt +++ b/chain/src/archive.txt @@ -493,3 +493,50 @@ // PaVi::Empty => Ok(self.root().ok_or(Error::IndexOutOfBounds(0, 0))?), // } // } + +// /// Find the last child of the given node whose start and end +// /// positions contain the given position. If no such child is +// /// found, return `Ok(None)`. +// /// +// /// The returned tuple is of the form (child, index), where +// /// `child` is the index of the child node in question, and +// /// `index` means that the child is the `index`-th child of the +// /// node. +// #[allow(dead_code)] +// pub(crate) fn position_search( +// &self, +// node: usize, +// pos: usize, +// ) -> Result<Option<(usize, usize)>, Error> { +// fn range_contains(label: GrammarLabel, pos: usize) -> bool { +// let start = label.start(); +// +// if let Some(end) = label.end() { +// (start..end).contains(&pos) +// } else { +// (start..).contains(&pos) +// } +// } +// +// let node_label = self +// .vertex_label(node)? +// .ok_or(Error::NodeNoLabel(node))? +// .label(); +// +// if !range_contains(node_label, pos) { +// return Ok(None); +// } +// +// for (index, child) in self.children_of(node)?.enumerate().rev() { +// let child_label = self +// .vertex_label(child)? +// .ok_or(Error::NodeNoLabel(child))? 
+// .label(); +// +// if range_contains(child_label, pos) { +// return Ok(Some((child, index))); +// } +// } +// +// Ok(None) +// } diff --git a/chain/src/default.rs b/chain/src/default.rs index ed63ff4..7de720f 100644 --- a/chain/src/default.rs +++ b/chain/src/default.rs @@ -11,15 +11,19 @@ use crate::item::{ default::DefaultForest, generate_fragment, genins::index_out_of_bounds_conversion, Forest, ForestLabel, ForestLabelError, }; -use core::fmt::Display; use grammar::{GrammarLabel, GrammarLabelType, START_NONTERMINAL, TNT}; +#[allow(unused_imports)] use graph::{ labelled::DLGBuilder, Builder, DLGraph, Graph, LabelExtGraph, LabelGraph, ParentsGraph, }; +use std::fmt::Display; use graph_macro::Graph; -use std::collections::{HashMap as Map, HashSet, TryReserveError}; +use std::{ + borrow::Borrow, + collections::{HashMap as Map, HashSet, TryReserveError}, +}; /// The errors related to taking derivatives by chain rule. #[non_exhaustive] @@ -399,6 +403,10 @@ impl Chain for DefaultChain { }) } + fn atom(&self) -> &Self::Atom { + self.atom.borrow() + } + fn epsilon(&self) -> Result<bool, Self::Error> { self.accepting_vec .get(self.current) @@ -783,146 +791,119 @@ impl Chain for DefaultChain { Ok(der_iter) } - // FIXME: Do nothing more than to make this at the end of input, - // actually. - // - // This means we only close the expansions at the last accepting - // sources. - // - // As to the remaining still open fragments, they should be - // considered as bugs and should be fixed in the function - // `insert_item` instead. 
- - fn end_of_input(&mut self) -> Result<(), Self::Error> { - if !matches!(self.epsilon(), Ok(true)) { - return Ok(()); - } - - self.forest.remove_node(1)?; - - let mut stack = Vec::new(); - - dbg!(&self.accepting_sources); + type Item = DefaultForest<ForestLabel<GrammarLabel>>; - // self.forest.print_viz("dbg forest before.gv").unwrap(); + fn end_of_input(&mut self, pos: usize, ter: usize) -> Result<Self::Item, Error> { + let mut result = Default::default(); - for pavi in self.accepting_sources.iter() { - match pavi { - PaVi::Parent(node, _edge, child) => { - // REVIEW: This is a garbage node that has to be - // added when the machine starts. Is there a way - // to avoid this garbage? - if *node == 1 { - continue; - } + let root = if let Some(root) = self.forest.root() { + root + } else { + return Ok(result); + }; - stack.push(*child); - } - PaVi::Virtual(nt, t, node) => { - let node_label = self - .forest - .vertex_label(*node)? - .ok_or(Error::NodeNoLabel(*node))?; + // The 1-th node is a dummy node that should be removed. + assert_ne!(root, 1); - if node_label.label().end().is_none() { - let reduction_fragment = self.atom.generate_virtual_frags(*nt, *t, None); + self.forest.remove_node(1)?; - for frag in reduction_fragment.into_iter().flatten() { - let mut frag = frag.clone(); - frag.set_pos(node_label.label().start(), true)?; + let root_degree = self.forest.degree(root)?; - stack.push(self.forest.plant_at_start(*node, frag)?); - } - } - } - PaVi::Empty => {} - } - } + let root_label = self + .forest + .vertex_label(root)? + .ok_or(Error::NodeNoLabel(root))?; - let forest_nodes_len = self.forest.nodes_len(); - - let mut node_ends: Vec<Vec<usize>> = std::iter::repeat_with(Default::default) - .take(forest_nodes_len) - .collect(); - - // NOTE: Use iterator to avoid checking bounds when accessing - // `node_ends`. - for (node, node_end) in self.forest.nodes().zip(node_ends.iter_mut()) { - if let Some(end) = self - .forest - .vertex_label(node)? 
- .ok_or(Error::NodeNoLabel(node))? - .label() - .end() - { - node_end.push(end); - } - } + dbg!(root_degree, root_label); - #[derive(Copy, Clone)] - enum StackElement { - Seen(usize), - Unseen(usize), - } + // REVIEW: Why nodes? + let mut nodes = Vec::with_capacity(root_degree); - use StackElement::{Seen, Unseen}; + if root_label.is_packed() { + let mut all_completed_clones = Vec::with_capacity(root_degree); - // maybe we do not need so much space? - let mut stack = Vec::with_capacity(forest_nodes_len); + for clone in self.forest.children_of(root)? { + let mut all_completed = true; - stack.push(Unseen( - self.forest.root().ok_or(Error::IndexOutOfBounds(0, 0))?, - )); + // The last child does not count. + let clone_child_degree_minus_one = std::cmp::max(self.forest.degree(clone)?, 1) - 1; - let mut seen_nodes = HashSet::with_capacity(forest_nodes_len); + // The loop to determine whether or not it is all + // completed, except for the last child. + 'completion_det_loop: for clone_child in self + .forest + .children_of(clone)? + .take(clone_child_degree_minus_one) + { + let clone_child_label = self + .forest + .vertex_label(clone_child)? + .ok_or(Error::NodeNoLabel(clone_child))?; - // depth-first search - while let Some(top) = stack.pop() { - match top { - Seen(top) => { - // Every of its children should be processed - // already. + if clone_child_label.label().end().is_none() { + all_completed = false; - if !node_ends.get(top).unwrap().is_empty() { - continue; + break 'completion_det_loop; } - - let last_index = self.forest.degree(top).unwrap() - 1; - let last_child = self.forest.nth_child(top, last_index)?; - - *node_ends.get_mut(top).unwrap() = node_ends - .get(last_child) - .ok_or(Error::IndexOutOfBounds(last_child, forest_nodes_len))? - .clone(); } - Unseen(top) => { - if seen_nodes.contains(&top) { - continue; - } - seen_nodes.insert(top); - // NOTE: After the above check we can safely unwrap. 
+ if all_completed { + all_completed_clones.push(clone); + } + } - let mut top_no_children = true; + if all_completed_clones.is_empty() { + // Nothing to do + return Ok(result); + } - for child in self.forest.children_of(top).unwrap() { - if top_no_children { - stack.push(Seen(top)); - } + for clone in all_completed_clones { + nodes.push( + self.forest + .reduction(clone, pos, ter, self.atom.borrow(), true)?, + ); + } + } else if root_label.clone_index().is_some() { + panic!( + "If the root of the forest is cloned, \ + the root should be changed to the packed node." + ); + } else { + // The last child does not count. + let root_degree_minus_one = std::cmp::max(root_degree, 1) - 1; - top_no_children = false; + dbg!(root_degree_minus_one); - stack.push(Unseen(child)); - } + // The loop to determine whether or not it is all + // completed, except for the last child. + for root_child in self.forest.children_of(root)?.take(root_degree_minus_one) { + let root_child_label = self + .forest + .vertex_label(root_child)? 
+ .ok_or(Error::NodeNoLabel(root_child))?; - if !top_no_children { - continue; - } + if root_child_label.label().end().is_none() { + return Ok(result); } } + + nodes.push( + self.forest + .reduction(root, pos, ter, self.atom.borrow(), true)?, + ); } - Ok(()) + dbg!(&nodes); + + // self.forest + // .print_viz("forest before extraction.gv") + // .unwrap(); + + result = self.forest.extract(pos)?; + + // result.print_viz("extracted forest.gv").unwrap(); + + Ok(result) } } @@ -1051,7 +1032,7 @@ mod test_chain { chain.chain(0, 12, no_item)?; if !no_item { - chain.end_of_input()?; + let _output = chain.end_of_input(13, 0)?; chain.forest.print_viz("forest.gv")?; chain.graph.print_viz("chain.gv")?; @@ -1079,6 +1060,36 @@ mod test_chain { } #[test] + fn test_assumption() -> Result<(), Box<dyn std::error::Error>> { + use grammar::Grammar; + + dbg!(); + let grammar_str = std::fs::read_to_string( + "/Users/durand/Desktop/Centre/A propos de programmes/Rust/rep/grammar/abnf grammars/test.abnf", + ) + .unwrap(); + dbg!(); + let grammar: Grammar = grammar_str.parse()?; + + let atom = DefaultAtom::from_grammar(grammar)?; + let mut chain = DefaultChain::unit(atom)?; + + let no_item = false; + + let input: &[usize] = &[3, 0, 2, 2, 2, 1, 1, 1, 0, 1]; + + for (pos, t) in input.iter().copied().enumerate().take(2) { + chain.chain(t, pos, no_item)?; + chain.forest().print_viz(&format!("forest {pos}.gv"))?; + dbg!(pos, t); + } + + item::default::print_labels(&chain.atom, &chain.forest)?; + + Ok(()) + } + + #[test] fn test_ambiguity() -> Result<(), Box<dyn std::error::Error>> { if std::fs::metadata("debug.log").is_ok() { std::fs::remove_file("debug.log")?; @@ -1125,7 +1136,7 @@ mod test_chain { chain.chain(1, 4, no_item)?; if !no_item { - // chain.end_of_input()?; + let _output = chain.end_of_input(5, 1)?; chain.forest.print_viz("forest.gv")?; // chain.forest.print_closed_viz("closed.gv")?; } diff --git a/chain/src/item/default/extract.rs b/chain/src/item/default/extract.rs new file mode 
100644 index 0000000..96f5119 --- /dev/null +++ b/chain/src/item/default/extract.rs @@ -0,0 +1,509 @@ +//! This module defines a function for extracting the "completed part" +//! of a forest. +//! +//! # Completed sub-forest +//! +//! The completed part of a forest refers to the sub-forest of the +//! forest that consists entirely of nodes whose ending positions are +//! set, and whose subtrees also consist of such nodes. +//! +//! # Definition +//! +//! To be more rigorous, we call a node *n* of a forest *F* complete +//! if its ending position is set. We call a sub-forest *S* of *F* +//! completed if the following two conditions are satisfied: +//! +//! 1. Every node in *S* is complete. +//! 2. For every node *n* in *S*, the subtree of *F* rooted at *n* +//! consists entirely of complete nodes. +//! +//! # Uniqueness of the maximal completed sub-forest +//! +//! **Lemma** +//! +//! For any forest *F*, there is only one maximal completed sub-forest +//! *S* of *F*. Here the maximality means that if *T* is a completed +//! sub-forest of *F* which contains *S*, then *S* is equal to *T*. +//! +//! Then by **the completed part** of a forest *F* we refer to the +//! unique maximal completed sub-forest of *F*. +//! +//! **Proof** +//! +//! Note that if *S_1* and *S_2* are two maximal completed sub-forests +//! of *F*, and if *S_1* is not contained in *S_2*, say *n* is a node in +//! *S_1* but not in *S_2*, then by adjoining the subtree of *S_1* +//! rooted at *n* to *S_2* we obtain a completed sub-forest *S_3* that +//! properly contains *S_2*, contradicting the maximality of *S_2*. +//! Thus there can only be one maximal completed sub-forest of a forest. +//! +//! # Connected component +//! +//! The extraction function actually returns the connected component +//! of the completed part of a forest which contains its root, as that +//! is what we care about the most. 
+ +use super::*; + +impl DefaultForest<ForestLabel<GrammarLabel>> { + pub(crate) fn extract(&self, pos: usize) -> Result<Self, Error> { + // Preparations + + let nodes_len = self.nodes_len(); + + let mut graph = PLGraph::default(); + let mut builder = PLGBuilderMut::from_graph_mut(&mut graph); + let root = Some(0); + + // Now find the connected component of the completed part of + // `self` and clone that to graph by means of `builder`. + + // REVIEW: Maybe a small_vec can help here. + + // A fixed-size hash table, sort of. + let mut validity_array: Vec<bool> = std::iter::repeat(true).take(nodes_len).collect(); + + // Calculate the exact length to avoid too many allocations. + let mut stack_len = 0usize; + + // Use an iterator to avoid checking bounds in accessing + // elements of the array. + for (node, validity_ptr) in self.nodes().zip(validity_array.iter_mut()) { + if self + .vertex_label(node)? + .ok_or(Error::NodeNoLabel(node))? + .label() + .end() + .is_none() + { + *validity_ptr = false; + + stack_len += 1; + } + } + + dbg!(&validity_array); + + // A stack for propagating the falsehood to parents and + // children of incomplete nodes, like a plague. The only + // nodes that can stop the spread of this plague are packed + // nodes with a child that is not infected (yet) by the + // plague. + + let mut stack: Vec<usize> = Vec::with_capacity(stack_len); + + for (n, validity) in validity_array.iter().enumerate() { + if !*validity { + stack.push(n); + } + } + + while let Some(top) = stack.pop() { + 'parent_loop: for parent in self.parents_of(top)?.map(|p| p.node()) { + if !*validity_array + .get(parent) + .ok_or(Error::IndexOutOfBounds(parent, nodes_len))? + { + // already infected by the plague + + continue 'parent_loop; + } + + let should_spread_p = if self + .vertex_label(parent)? + .ok_or(Error::NodeNoLabel(parent))? + .is_packed() + { + !self + .children_of(parent)? 
+ .any(|node| validity_array.get(node).copied() == Some(true)) + } else { + true + }; + + if should_spread_p { + *validity_array + .get_mut(parent) + .ok_or(Error::IndexOutOfBounds(parent, nodes_len))? = false; + stack.push(parent); + } + } + } + + let validity_array = validity_array; + + /// A little macro to produce a vector of valid children. + macro_rules! valid_children { + ($node:ident) => { + self.children_of($node)? + .filter(|child| validity_array.get(*child).copied() == Some(true)) + .collect::<Vec<usize>>() + }; + } + + dbg!(&validity_array); + + if validity_array.iter().all(|validity| !*validity) { + // every element is false + + let root = None; + + return Ok(Self { graph, root }); + } + + // Finally clone the connected component to the new graph. + + let root_label = GrammarLabel::new_closed(TNT::Non(0), 0, pos); + + let packed_label = ForestLabel::new(root_label, ForestLabelType::Packed); + + let plain_label = ForestLabel::new(root_label, ForestLabelType::Plain); + + let original_root_label; + + let original_root = if let Some(packed_node) = self.query_label(packed_label) { + original_root_label = packed_label; + + packed_node + } else if let Some(plain_node) = self.query_label(plain_label) { + original_root_label = plain_label; + + plain_node + } else { + let root = None; + return Ok(Self { graph, root }); + }; + + let mut roots_stack: Vec<usize>; + + if original_root_label.is_packed() { + roots_stack = valid_children!(original_root); + + match roots_stack.len() { + 0 => { + let root = None; + + return Ok(Self { graph, root }); + } + 1 => { + let child = *roots_stack.first().unwrap(); + + builder.add_vertex(self.vertex_label(child)?.ok_or(Error::NodeNoLabel(child))?); + } + _ => { + let builder_root = builder.add_vertex(original_root_label); + + for child in roots_stack.iter().copied() { + let child_node = builder.add_vertex( + self.vertex_label(child)?.ok_or(Error::NodeNoLabel(child))?, + ); + + builder.add_edge(builder_root, child_node, 
original_root_label)?; + } + } + } + } else { + builder.add_vertex(original_root_label); + + roots_stack = vec![original_root]; + } + + while let Some(top) = roots_stack.pop() { + let top_label = self.vertex_label(top)?.ok_or(Error::NodeNoLabel(top))?; + + let top_in_builder = match builder.query_label(top_label) { + Some(top_node) => top_node, + None => { + // an old cloned node now becomes a plain node + let plain_label = ForestLabel::new(top_label.label(), ForestLabelType::Plain); + + builder + .query_label(plain_label) + .unwrap_or_else(|| panic!("the top {top} should be planted already")) + } + }; + + 'children_loop: for child in self.children_of(top)? { + let child_label = self.vertex_label(child)?.ok_or(Error::NodeNoLabel(child))?; + + // filter out invalid children + if validity_array.get(child).copied() != Some(true) { + continue 'children_loop; + } + + // avoid unnecessary effort + if let Some(child_node) = builder.query_label(child_label) { + builder.add_edge(top_in_builder, child_node, child_label)?; + + continue 'children_loop; + } + + if child_label.is_packed() { + let child_valid_children = valid_children!(child); + + match child_valid_children.len() { + 0 => { + panic!("this case should not happen"); + } + 1 => { + // If a packed node only has one valid + // child, we clone a plain node instead. 
+ + let child_child = *child_valid_children.first().unwrap(); + + let child_plain_label = + ForestLabel::new(child_label.label(), ForestLabelType::Plain); + + let child_plain_node = builder.add_vertex(child_plain_label); + + builder.add_edge( + top_in_builder, + child_plain_node, + child_plain_label, + )?; + + roots_stack.push(child_child); + } + _ => { + let child_node = builder.add_vertex(child_label); + + builder.add_edge(top_in_builder, child_node, child_label)?; + + roots_stack.push(child); + } + } + + continue 'children_loop; + } + + let child_node = builder.add_vertex(child_label); + + builder.add_edge(top_in_builder, child_node, child_label)?; + + roots_stack.push(child); + } + } + + Ok(Self { graph, root }) + } +} + +#[cfg(test)] +mod extract_tests { + use super::*; + + fn construct_test_forest( + ) -> Result<DefaultForest<ForestLabel<GrammarLabel>>, Box<dyn std::error::Error>> { + // node 0 + let mut result: DefaultForest<ForestLabel<GrammarLabel>> = DefaultForest::new_leaf( + GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Non(0)), 0, 3), + ); + + // node 1 + result.plant( + 0, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::Rule(15), 0, 3), + _ + ), + false, + )?; + + result.plant( + 1, + DefaultForest::new_leaf(GrammarLabel::new_closed( + GrammarLabelType::TNT(TNT::Non(0)), + 0, + 3, + )), + true, + )?; + + // node 2 + result.plant( + 0, + leaf!(GrammarLabel::new_closed(GrammarLabelType::Rule(6), 0, 1), _), + false, + )?; + + // node 3 + result.plant( + 2, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Ter(0)), 0, 1), + _ + ), + false, + )?; + + // node 4 + result.plant( + 0, + leaf!(GrammarLabel::new_closed(GrammarLabelType::Rule(7), 1, 3), _), + false, + )?; + + // node 5 + result.plant( + 4, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Non(0)), 1, 3), + _ + ), + false, + )?; + + // node 6 + result.plant( + 5, + leaf!(GrammarLabel::new_closed(GrammarLabelType::Rule(3), 1, 2), _), + false, + )?; + + // node 
7 + result.plant( + 6, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Non(1)), 1, 2), + _ + ), + false, + )?; + + // node 8 + result.plant( + 7, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::Rule(11), 1, 2), + _ + ), + false, + )?; + + // node 9 + result.plant( + 8, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Ter(2)), 1, 2), + _ + ), + false, + )?; + + // Clone node 5 to have node 10 and node 11 + result.clone_node(5, 0, false)?; + + // node 12 + result.plant( + 11, + leaf!(GrammarLabel::new_closed(GrammarLabelType::Rule(3), 1, 3), _), + false, + )?; + + // node 13 + result.plant( + 12, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Non(1)), 1, 3), + _ + ), + false, + )?; + + result.plant( + 13, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::Rule(11), 1, 2), + _ + ), + true, + )?; + + // node 14 + result.plant( + 13, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::Rule(13), 2, 3), + _ + ), + false, + )?; + + // node 15 + result.plant( + 14, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Ter(2)), 2, 3), + _ + ), + false, + )?; + + // node 16 + result.plant( + 5, + leaf!(GrammarLabel::new(GrammarLabelType::Rule(4), 2), _), + false, + )?; + + // node 17 + result.plant( + 16, + leaf!(GrammarLabel::new(GrammarLabelType::TNT(TNT::Non(0)), 2), _), + false, + )?; + + // node 18 + result.plant( + 17, + leaf!(GrammarLabel::new_closed(GrammarLabelType::Rule(3), 2, 3), _), + false, + )?; + + // node 19 + result.plant( + 18, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Non(1)), 2, 3), + _ + ), + false, + )?; + + // node 20 + result.plant( + 19, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::Rule(11), 2, 3), + _ + ), + false, + )?; + + result.plant( + 20, + leaf!( + GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Ter(2)), 2, 3), + _ + ), + true, + )?; + + Ok(result) + } + + #[test] + fn test_extract() -> Result<(), Box<dyn std::error::Error>> { + let forest = 
construct_test_forest()?; + + assert_eq!(forest.nodes_len(), 21); + + forest.print_viz("forest before extraction.gv")?; + + let extract_result = forest.extract(3)?; + + extract_result.print_viz("extracted forest.gv")?; + + Ok(()) + } +} diff --git a/chain/src/item/default/mod.rs b/chain/src/item/default/mod.rs index 47e8c90..28bdbee 100644 --- a/chain/src/item/default/mod.rs +++ b/chain/src/item/default/mod.rs @@ -12,7 +12,7 @@ use graph_macro::Graph; use std::collections::HashSet; -use core::fmt::Display; +use std::fmt::Display; /// The type of errors for forest operations. #[derive(Debug, Copy, Clone, PartialEq, Eq, Ord, PartialOrd)] @@ -216,6 +216,15 @@ impl<T: GraphLabel> Forest<T> for DefaultForest<ForestLabel<T>> { return Err(Error::IndexOutOfBounds(node_id, self.nodes_len())); } + // FIXME: We should take the presence of packed nodes into + // consideration. That is, the fragment might have already + // been planted and packed, and we need to account for such + // possibilities. + // + // Moreover, the fragments might also contain packed nodes, in + // which case we need to check these multiple possibilities + // are properly contained. + // We do a depth-first traversal to determine if every node // encountered has the same set of children (labels taken into // the consideration). @@ -450,7 +459,7 @@ impl<T: GraphLabel> Forest<T> for DefaultForest<ForestLabel<T>> { // Make sure it only has one parent, which is the // representative node. - assert_eq!(parents.len(), 1); + // assert_eq!(parents.len(), 1); // We checked its length is 1, so it is safe to unwrap // here. @@ -670,6 +679,8 @@ impl<T: GraphLabel> DefaultForest<ForestLabel<T>> { pub mod splone; +pub mod extract; + impl<T: GraphLabel> DefaultForest<T> { /// Create a forest with only one leaf from a raw label, unlike /// `new_leaf`, which transforms the label for convenience. 
@@ -798,53 +809,6 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { Ok(()) } - /// Find the last child of the given node whose start and end - /// positions contain the given position. If no such child is - /// found, return `Ok(None)`. - /// - /// The returned tuple is of the form (child, index), where - /// `child` is the index of the child node in question, and - /// `index` means that the child is the `index`-th child of the - /// node. - #[allow(dead_code)] - pub(crate) fn position_search( - &self, - node: usize, - pos: usize, - ) -> Result<Option<(usize, usize)>, Error> { - fn range_contains(label: GrammarLabel, pos: usize) -> bool { - let start = label.start(); - - if let Some(end) = label.end() { - (start..end).contains(&pos) - } else { - (start..).contains(&pos) - } - } - - let node_label = self - .vertex_label(node)? - .ok_or(Error::NodeNoLabel(node))? - .label(); - - if !range_contains(node_label, pos) { - return Ok(None); - } - - for (index, child) in self.children_of(node)?.enumerate().rev() { - let child_label = self - .vertex_label(child)? - .ok_or(Error::NodeNoLabel(child))? - .label(); - - if range_contains(child_label, pos) { - return Ok(Some((child, index))); - } - } - - Ok(None) - } - /// Set the position of every node. /// /// If ALLP is non-nil or if the node is a terminal node, also set diff --git a/chain/src/item/default/splone.rs b/chain/src/item/default/splone.rs index 581d1dc..da13c56 100644 --- a/chain/src/item/default/splone.rs +++ b/chain/src/item/default/splone.rs @@ -516,7 +516,13 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { if let Some((fragment, _planted)) = fragment { let modified_degree = std::cmp::max(child_degree, 1) - 1; - if self.has_same_children(child, node, modified_degree, edge_index)? + dbg!(node, end, edge_index, modified_degree); + + dbg!(child, child_degree); + + dbg!(&fragment); + + if self.has_same_children(child, node, modified_degree, edge_index + 1)? 
&& child_degree != 0 { let last_child = self.nth_child(child, child_degree - 1)?; @@ -651,7 +657,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { } else if labela.clone_index().is_some() { let mut parentsa = self.parents_of(nodea)?; - assert_eq!(parentsa.len(), 1); + // assert_eq!(parentsa.len(), 1); let parenta = parentsa.next().unwrap().node(); diff --git a/chain/src/item/genins.rs b/chain/src/item/genins.rs index 19f835b..e409c3c 100644 --- a/chain/src/item/genins.rs +++ b/chain/src/item/genins.rs @@ -32,7 +32,7 @@ pub(crate) fn index_out_of_bounds_conversion(ge: GrammarError) -> Error { /// Determine if a label is labelled by a terminal. fn is_labelled_by_terminal(label: GrammarLabelType) -> bool { - matches!(label.tnt(), Some(t) if matches!(t, TNT::Ter(_))) + matches!(label.tnt(), Some(tnt) if matches!(tnt, TNT::Ter(_))) } /// A helper function to generate a fragment of forest. @@ -211,7 +211,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { // Whether or not to print detailed graphs of each step of // operation for debugging purposes. - let to_print = false; + let to_print = true; let tnt_string = { let empty_p = atom_child_iter.len() == 0; @@ -376,7 +376,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { if let PaVi::Parent(node, edge, _) = pavi { let nth_child = self.nth_child(node, edge)?; - let reduced = self.reduction(nth_child, pos, ter, atom.borrow())?; + let reduced = self.reduction(nth_child, pos, ter, atom.borrow(), false)?; if reduced != nth_child && !self.is_empty_node(reduced)? { parents.clear(); @@ -700,10 +700,14 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { let reduction_fragment = atom.generate_virtual_frags(nt, t, None); - // NOTE: the case of the root is exceptional - if reduction_fragment.is_none() && self.root() != Some(node) { - return Err(Error::CannotClose(nt, t, node, node_label_start)); - } + // Maybe we do not have to force the reduciton here? 
+ + // // NOTE: the case of the root is exceptional + // if reduction_fragment.is_none() && self.root() != Some(node) { + // dbg!(self.root()); + // self.print_viz("cannot close.gv").unwrap(); + // return Err(Error::CannotClose(nt, t, node, node_label_start)); + // } for frag in reduction_fragment.into_iter().flatten() { let mut frag = frag.clone(); diff --git a/chain/src/item/mod.rs b/chain/src/item/mod.rs index 54ca946..d2c3127 100644 --- a/chain/src/item/mod.rs +++ b/chain/src/item/mod.rs @@ -7,7 +7,7 @@ use graph::{error::Error as GError, GraphLabel, LabelGraph, Parent, ParentsGraph}; -use core::borrow::Borrow; +use std::borrow::Borrow; /// A parent or a virtual segment. /// @@ -96,7 +96,7 @@ enum ForestLabelType { Cloned(usize), } -impl core::fmt::Display for ForestLabelType { +impl std::fmt::Display for ForestLabelType { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { Self::Packed => write!(f, "packed"), @@ -113,8 +113,8 @@ pub struct ForestLabel<T: GraphLabel> { status: ForestLabelType, } -impl<T: GraphLabel> core::fmt::Display for ForestLabel<T> { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl<T: GraphLabel> std::fmt::Display for ForestLabel<T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if !matches!(self.status, ForestLabelType::Plain) { write!(f, "{}, {}", self.label, self.status) } else { @@ -132,8 +132,8 @@ pub enum ForestLabelError { ClonePack, } -impl core::fmt::Display for ForestLabelError { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl std::fmt::Display for ForestLabelError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::PackClone => write!(f, "cannot pack a cloned node"), Self::ClonePack => write!(f, "cannot clone a packed node"), @@ -230,6 +230,9 @@ impl<T: GraphLabel> From<T> for ForestLabel<T> { } } +// REVIEW: Should we move this trait (and only this trait) to a +// 
separate crate, or is it enough to keep it here? + /// The expected behaviours of an item derivation forest. /// /// Note that it requires only a subset of the functionalities of diff --git a/chain/src/item/reduction.rs b/chain/src/item/reduction.rs index 89306e6..3c76c1e 100644 --- a/chain/src/item/reduction.rs +++ b/chain/src/item/reduction.rs @@ -122,6 +122,9 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { /// The parameter `ter` is used to fill in segments for virtual /// nodes if they are found along the way. /// + /// The parameter `accept_root` controls whether we want to + /// perform reduction on the root. + /// /// # Errors /// /// 1. Index out of bounds: some node number is corrupted. @@ -132,12 +135,13 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { pos: usize, ter: usize, atom: &DefaultAtom, + accept_root: bool, ) -> Result<usize, Error> { let mut result = node; // step 1: Determine if this needs reductions. - if self.root() == Some(node) { + if !accept_root && self.root() == Some(node) { return Ok(result); } @@ -152,9 +156,30 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { return Ok(result); } + // Data type for representing the status when performing a + // search. + + #[derive(PartialEq, Eq, Copy, Clone, Debug)] + enum Status { + Correct, + Incorrect, + Visited, + } + + impl From<bool> for Status { + fn from(value: bool) -> Self { + match value { + true => Self::Correct, + false => Self::Incorrect, + } + } + } + + use Status::*; + // step 2: Find descendents that need reductions. - let mut correct_ends: Map<usize, bool> = Default::default(); + let mut correct_ends: Map<usize, Status> = Default::default(); let mut order_of_correct_ends: Vec<usize> = Vec::new(); @@ -165,7 +190,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { use std::{borrow::BorrowMut, io::Write}; // Whether or not to write a debug file. 
- let to_write = false; + let to_write = true; if to_write { if let Ok(ref mut file) = file.borrow_mut() { @@ -181,14 +206,18 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { } } - if correct_ends.get(&top).is_some() { + let old_value = correct_ends.get(&top).copied(); + + if matches!(old_value, Some(Correct) | Some(Incorrect)) { continue 'stack_loop; } + correct_ends.insert(top, Visited); + let top_label = self.vertex_label(top)?.ok_or(Error::NodeNoLabel(top))?; if let Some(end) = top_label.label().end() { - correct_ends.insert(top, end == pos); + correct_ends.insert(top, (end == pos).into()); continue 'stack_loop; } @@ -216,8 +245,8 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { 'child_loop: for child in self.children_of(top)? { match correct_ends.get(&child).copied() { - Some(true) => { - correct_ends.insert(top, true); + Some(Correct) => { + correct_ends.insert(top, Correct); inserted = true; } @@ -232,12 +261,15 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { if has_unexplored_children { stack.push(top); - stack.extend(self.children_of(top)?); + stack.extend( + self.children_of(top)? + .filter(|child| correct_ends.get(child).is_none()), + ); } else if !inserted { - correct_ends.insert(top, false); + correct_ends.insert(top, Incorrect); } } else { - correct_ends.insert(top, false); + correct_ends.insert(top, Incorrect); } continue 'stack_loop; @@ -245,6 +277,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { if top_label.is_packed() { let mut has_unexplored_children = false; + let mut inserted = false; if to_write { if let Ok(ref mut file) = file.borrow_mut() { @@ -255,12 +288,13 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { for child in self.children_of(top)? { match correct_ends.get(&child).copied() { - Some(true) => { + Some(Correct) => { // NOTE: This is not recorded in the // correct orders, as we do not handle // packed nodes directly. 
- correct_ends.insert(top, true); + correct_ends.insert(top, Correct); + inserted = true; } None => { has_unexplored_children = true; @@ -280,9 +314,12 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { if has_unexplored_children { stack.push(top); - stack.extend(self.children_of(top)?); - } else if correct_ends.get(&top).is_none() { - correct_ends.insert(top, false); + stack.extend( + self.children_of(top)? + .filter(|child| correct_ends.get(child).is_none()), + ); + } else if !inserted { + correct_ends.insert(top, Incorrect); } continue 'stack_loop; @@ -309,7 +346,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { panic!("a terminal node {top} has no ending position?"); } Some(TNT::Non(nt)) => { - correct_ends.insert(top, true); + correct_ends.insert(top, Correct); self.close_pavi(atom.borrow(), PaVi::Virtual(nt, ter, top), pos)?; @@ -322,7 +359,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { Some(end) => end, }; - correct_ends.insert(top, end == pos); + correct_ends.insert(top, (end == pos).into()); if end == pos { order_of_correct_ends.push(top); @@ -341,10 +378,12 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { let last_child = self.nth_child(top, last_index)?; if let Some(child_correctness_value) = correct_ends.get(&last_child).copied() { - correct_ends.insert(top, child_correctness_value); + if child_correctness_value != Visited { + correct_ends.insert(top, child_correctness_value); - if child_correctness_value { - order_of_correct_ends.push(top); + if child_correctness_value == Correct { + order_of_correct_ends.push(top); + } } } else { stack.extend([top, last_child]); @@ -375,12 +414,6 @@ impl DefaultForest<ForestLabel<GrammarLabel>> { #[cfg(debug_assertions)] { - let label_label_label = label.label().label(); - - assert!(label_label_label.rule().is_none()); - - assert!(matches!(label_label_label.tnt(), Some(TNT::Non(_)))); - assert!(label.label().end().is_none()); assert_ne!(degree, 0); diff --git a/chain/src/lib.rs b/chain/src/lib.rs index 
9de1df7..bf3a8b4 100644 --- a/chain/src/lib.rs +++ b/chain/src/lib.rs @@ -240,6 +240,10 @@ pub trait Chain: LabelExtGraph<Edge> { /// language. fn unit(atom: Self::Atom) -> Result<Self, Self::Error>; + /// Produce the underlying atom, so that we can produce an initial + /// empty chain from the same atom again. + fn atom(&self) -> &Self::Atom; + /// Return true if and only if the language contains the empty /// string. fn epsilon(&self) -> Result<bool, Self::Error>; @@ -283,7 +287,8 @@ pub trait Chain: LabelExtGraph<Edge> { /// The argument `t` is the terminal we computet the derivative /// with. /// - /// The argument `pos` is the position within the input. + /// The argument `pos` is the zero-based position within the + /// input. /// /// The argument `no_item` determines whether we want the item /// derivation forest as well. @@ -307,9 +312,29 @@ pub trait Chain: LabelExtGraph<Edge> { Ok(()) } + // FIXME: I shall probably not use the trait of forests for this + // purpose, but I have not figured out yet wha the correct trait + // should be. + /// The type of output item that will be produced by this machine. + type Item: item::Forest<grammar::GrammarLabel>; + /// Signal to the parser that the end of the input is reached, so /// that the parser knows to generate suitable forests. - fn end_of_input(&mut self) -> Result<(), Self::Error>; + /// + /// This is not needed when recognizing instead of parsing. + /// + /// We also pass in the current position `pos` so that we have a + /// little flexibility in the position of the end of input. In + /// addition, this makes it easier to produce the suitable + /// forests. + /// + /// As a reminder, `pos` should be the position of the last + /// terminal, so if there are three terminals in total, the `pos` + /// should be `2` , instead of `3`. + /// + /// Similarly, we pass in the terminal at the position `pos` to + /// aid the production of the forest. 
+ fn end_of_input(&mut self, pos: usize, ter: usize) -> Result<Self::Item, Self::Error>; } pub mod default; diff --git a/grammar/src/abnf.rs b/grammar/src/abnf.rs deleted file mode 100644 index eb56819..0000000 --- a/grammar/src/abnf.rs +++ /dev/null @@ -1,37 +0,0 @@ -//! This file implements the function to read a grammar from a string, -//! in the [augmented Backus Naur -//! form](https://en.wikipedia.org/wiki/Augmented_Backus–Naur_form). -//! See [RFC5234](https://www.rfc-editor.org/rfc/rfc5234) for its -//! specifications. -//! -//! In fact, the rules of ABNF are considered to be too strict. For -//! example, only ASCII characters are allowed in the format, even -//! within comments. I think any character should be allowed within -//! comments, so the implementation here does that. - -use super::*; - -/// The type of errors encountered during parsing a grammar in the -/// ABNF format. See the function [`abnf_to_grammar`] for the parsing -/// function. -#[derive(Debug, Clone, Copy)] -pub enum Error { - /// Something invalid has occurred. - Invalid, -} - -impl core::fmt::Display for Error { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - match self { - Error::Invalid => write!(f, "invalid"), - } - } -} - -#[allow(unused)] -/// This function converts a string to a grammar. -pub fn abnf_to_grammar(str: impl AsRef<str>) -> Result<Grammar, Error> { - let str = str.as_ref(); - - todo!() -} diff --git a/grammar/src/abnf/boolean_fns.rs b/grammar/src/abnf/boolean_fns.rs new file mode 100644 index 0000000..08f317c --- /dev/null +++ b/grammar/src/abnf/boolean_fns.rs @@ -0,0 +1,42 @@ +//! This file collects some functions that return a boolean value from +//! a character. 
+ +pub(super) fn is_wsp(c: char) -> bool { + c == ' ' || c == '\t' +} + +pub(crate) fn is_v_char(c: char) -> bool { + ('!'..='~').contains(&c) +} + +pub(super) fn is_wsp_or_v_char(c: char) -> bool { + is_wsp(c) || is_v_char(c) +} + +pub(super) fn is_not_newline(c: char) -> bool { + c != '\n' && c != '\r' +} + +pub(super) fn is_alpha(c: char) -> bool { + c.is_ascii_alphabetic() +} + +pub(super) fn is_decimal_digit(c: char) -> bool { + c.is_ascii_digit() +} + +pub(super) fn is_binary_digit(c: char) -> bool { + c == '0' || c == '1' +} + +pub(super) fn is_hexadecimal_digit(c: char) -> bool { + is_decimal_digit(c) || ('A'..='F').contains(&c) || ('a'..='f').contains(&c) +} + +pub(super) fn is_alpha_or_digit_or_hyphen(c: char) -> bool { + is_alpha(c) || is_decimal_digit(c) || c == '-' +} + +pub(super) fn is_char_value(c: char) -> bool { + (' '..='~').contains(&c) && c != '"' +} diff --git a/grammar/src/abnf/mod.rs b/grammar/src/abnf/mod.rs new file mode 100644 index 0000000..f825bef --- /dev/null +++ b/grammar/src/abnf/mod.rs @@ -0,0 +1,2536 @@ +//! This file implements the function to read a grammar from a string, +//! in the [augmented Backus Naur +//! form](https://en.wikipedia.org/wiki/Augmented_Backus–Naur_form). +//! See [RFC5234](https://www.rfc-editor.org/rfc/rfc5234) for its +//! specifications. +//! +//! In fact, the rules of ABNF are considered to be too strict. For +//! example, only ASCII characters are allowed in the format, even +//! within comments. I think any character should be allowed within +//! comments, so the implementation here does that. + +#![allow(dead_code)] + +use super::*; + +/// The type of errors encountered during parsing a grammar in the +/// ABNF format. +#[derive(Debug)] +#[non_exhaustive] +pub enum Error { + /// A rule name without rule body. + HangingRuleName(usize), + /// The minimum is greater than maximum? + MinGreaterThanMax(usize), + /// A digit is expected, but is not encountered. 
+ InvalidDigit(usize), + /// A character that is not allowed. + InvalidCharacter(usize), + /// A numeric value is not completed. + HangingNumeric(usize), + /// A repetition is hanging. + HangingRepetition(usize), + /// A double quote is hanging. + HangingDQuote(usize), + /// A percentage sign is hanging. + HangingPercent(usize), + /// An equal sign is hanging. + HangingEqualsSign(usize), + /// An operation is unsupported. + UnsupportedOperation, + /// Encounters an unknown node when constructing the regular + /// expression. + RegexUnknownNode(usize), + /// An index is out of bounds when constructing the regular + /// expression. + RegexIndexOutOfBounds(usize, usize), + /// A node is duplicated when constructing the regular expression. + RegexDuplicatedNode(usize), + /// An edge is duplicated when constructing the regular + /// expression. + RegexDuplicatedEdge(usize, usize), + /// The constructed regular expression graph has no root. + RegexNoRoot, + /// The constructed regular expression graph has cycles. + RegexCycle, + /// The stack illegally becomes empty when constructing a regular + /// expression. + RegexEmptyStack(usize), + /// The action stack illegally becomes empty when constructing a + /// regular expression. + RegexEmptyActionStack(usize), + /// An unbalanced parenthesis is encountered when constructing a + /// regular expression. + RegexUnbalancedParen(usize), + /// An invalid repetition specification is encountered when + /// constructing a regular expression. + RegexInvalidRepeat(usize), + /// An invalid comment character is encountered. + RegexInvalidComment(usize), + /// An invalid numeric value is encoutnered. + RegexInvalidNumeric(usize), + /// A rule should have an equal sign. + MissingEqualSign(usize), + /// An unimplemented feature is required. + Unimplemented(usize), + /// Input-Output error. + IOError(std::io::Error), + /// The grammar is empty. + EmptyGrammar, + /// Found an appending rule associated to a non-existent rule. 
+ AppendToNothing(usize), + /// Found a rule declaration associated to an existing rule. + CreateAgain(usize), + /// This terminal is not found. + UnknownTerminal(String), + /// This non-terminal is not found. + UnknownNonterminal(String), + /// Something invalid has occurred. + Invalid, +} + +impl std::error::Error for Error {} + +impl From<std::io::Error> for Error { + fn from(e: std::io::Error) -> Self { + Self::IOError(e) + } +} + +impl From<graph::error::Error> for Error { + fn from(value: graph::error::Error) -> Self { + match value { + graph::error::Error::IndexOutOfBounds(index, bound) => { + Error::RegexIndexOutOfBounds(index, bound) + } + graph::error::Error::DuplicatedNode(n) => Error::RegexDuplicatedNode(n), + graph::error::Error::DuplicatedEdge(from, to) => Error::RegexDuplicatedEdge(from, to), + _ => Error::Invalid, + } + } +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::HangingRuleName(pos) => write!(f, "a rule name without body at {pos}"), + Error::MinGreaterThanMax(pos) => write!( + f, + "minimum should not be strictly greater than the maximum at {pos}." 
+ ), + Error::InvalidDigit(pos) => { + write!(f, "a digit is expected at {pos}, but is not found.") + } + Error::InvalidCharacter(pos) => write!(f, "the character at {pos} is not allowed"), + Error::HangingNumeric(pos) => write!(f, "a numeric value is hanging at {pos}"), + Error::HangingRepetition(pos) => write!(f, "a repetition is hanging at {pos}"), + Error::HangingDQuote(pos) => write!(f, "a double quote is hanging at {pos}"), + Error::HangingPercent(pos) => write!(f, "a percentage sign is hanging at {pos}"), + Error::HangingEqualsSign(pos) => write!(f, "an equal sign is hanging at {pos}"), + Error::UnsupportedOperation => write!(f, "an unsupported operation is encountered"), + Error::RegexUnknownNode(n) => write!(f, "an unknown node {n} is encountered \ + when constructing the regular expression."), + Error::RegexIndexOutOfBounds(index, bound) => write!(f, "an index {index} is out of \ + bound {bound} when constructing the regular expression."), + Error::RegexDuplicatedNode(n) => write!(f, "a node {n} is duplicated \ + when constructing the regular expression."), + Error::RegexDuplicatedEdge(from, to) => write!(f, "an edge from {from} to {to} is duplicated \ + when constructing the regular expression."), + Error::RegexNoRoot => write!(f, "the constructed regular expression has no root."), + Error::RegexCycle => write!(f, "the constructed regular expression has cycles."), + Error::RegexEmptyStack(pos) => write!(f, "the stack illegally becomes empty at {pos} \ + when constructing a regular expression."), + Error::RegexEmptyActionStack(pos) => write!(f, "the action stack illegally becomes empty at {pos} \ + when constructing a regular expression."), + Error::RegexUnbalancedParen(pos) => write!(f, "An unbalanced parenthesis is encountered at {pos} \ + when constructing a regular expression."), + Error::RegexInvalidRepeat(pos) => write!(f, "An invalid repetition specification is encountered \ + at {pos} when constructing a regular expression."), + 
Error::RegexInvalidComment(pos) => write!(f, "An invalid character is encountered in a comment \ + at {pos}"), + Error::RegexInvalidNumeric(pos) => write!(f, "An invalid numeric value is encoutnered at {pos}."), + Error::MissingEqualSign(pos) => write!(f, "A rule is missing an equal sign at {pos}"), + Error::Unimplemented(pos) => write!(f, "An unimplemented feature is required at {pos}."), + Error::IOError(e) => write!(f, "An IO error: {e}"), + Error::EmptyGrammar => write!(f, "the grammar is empty"), + Error::AppendToNothing(pos) => write!(f, "the rule at {pos} is appending to a non-existing rule."), + Error::CreateAgain(pos) => write!(f, "the rule at {pos} is creating a duplicated rule."), + Error::UnknownTerminal(name) => write!(f, "the terminal {name} is not found."), + Error::UnknownNonterminal(name) => write!(f, "the non-terminal {name} is not found."), + Error::Invalid => write!(f, "invalid"), + } + } +} + +/// Return the number of bytes taken by the characters for which the +/// function `f` returns `true`. +fn skip_by(s: &str, f: impl Fn(char) -> bool) -> usize { + for (index, c) in s.char_indices() { + if !f(c) { + return index; + } + } + + s.len() +} + +mod boolean_fns; + +pub(crate) use boolean_fns::*; + +fn skip_c_nl(s: &str) -> Option<usize> { + if s.is_empty() { + return Some(0); + } + + let s_len = s.len(); + + match s.chars().next().unwrap() { + ';' => { + let after_not_newline = skip_by(&s[1..], is_not_newline); + + if 1 + after_not_newline >= s_len { + Some(s_len) + } else { + let s = &s[1 + after_not_newline..]; + + match s.chars().next().unwrap() { + '\n' | '\r' => Some(after_not_newline + 2), + _ => { + // We skipped anything not newline, so this + // character must be newline. 
+ unreachable!(); + } + } + } + } + '\n' | '\r' => Some(1), + _ => None, + } +} + +fn skip_c_wsp(s: &str) -> usize { + let mut currently_skipped = 0usize; + let mut s = s; + + let mut continue_skipping = true; + + 'skipping_loop: while continue_skipping { + continue_skipping = false; + + let skip_wsp = skip_by(s, is_wsp); + + if skip_wsp > 0 { + if skip_wsp >= s.len() { + return currently_skipped + s.len(); + } + + continue_skipping = true; + s = &s[skip_wsp..]; + currently_skipped += skip_wsp; + + continue 'skipping_loop; + } + + if let Some(skip_c_nl_size) = skip_c_nl(s) { + if skip_c_nl_size >= s.len() { + return currently_skipped + s.len(); + } + + continue_skipping = true; + + s = &s[skip_c_nl_size..]; + currently_skipped += skip_c_nl_size; + + let skip_wsp_after_c_nl = skip_by(s, is_wsp); + + s = &s[skip_wsp_after_c_nl..]; + currently_skipped += skip_wsp_after_c_nl; + } + } + + currently_skipped +} + +#[derive(Debug, Eq, PartialEq)] +enum RepetitionSpec { + Star, + Equal(usize), + Min(usize), + Max(usize), + MinMax(usize, usize), +} + +#[derive(Debug)] +enum RepetitionAction { + /// Ignore the following element. + Ignore, + /// The following is put in a star. + Star, + /// The following is put in an optional construct. + Optional, + /// The following is put in a plus construct. + Plus, + /// Copy the following some number of times. + Copy(usize), + /// Copy the following some number of times, and then add a plus + /// of that same element. + CopyPlus(usize), + /// Copy the following some number of times, and then add another + /// number of *nested* optional construts of that same element. + /// + /// # Why nested? + /// + /// The optional constructs are nested here as a sequence of + /// optional constructs is exponentially more ambiguous than the + /// nested equivalent: "a?a?a?" is more ambiguous than + /// "(a(a(a)?)?)?". 
For example, the input "a" can have three + /// derivations according to "a?a?a?", but has only one derivation + /// according to "(a(a(a)?)?)?". + CopyOptional(usize, usize), +} + +impl TryFrom<Option<RepetitionSpec>> for RepetitionAction { + type Error = Error; + + fn try_from(spec: Option<RepetitionSpec>) -> Result<Self, Self::Error> { + if let Some(r) = spec { + match r { + RepetitionSpec::Star => Ok(Self::Star), + RepetitionSpec::Equal(0) => Ok(Self::Ignore), + RepetitionSpec::Equal(n) => Ok(Self::Copy(n)), + RepetitionSpec::Min(0) => Ok(Self::Star), + RepetitionSpec::Min(1) => Ok(Self::Plus), + RepetitionSpec::Min(n) => Ok(Self::CopyPlus(n - 1)), + RepetitionSpec::Max(0) => Ok(Self::Ignore), + RepetitionSpec::Max(1) => Ok(Self::Optional), + RepetitionSpec::Max(n) => Ok(Self::CopyOptional(0, n)), + RepetitionSpec::MinMax(0, 0) => Ok(Self::Ignore), + RepetitionSpec::MinMax(0, 1) => Ok(Self::Optional), + RepetitionSpec::MinMax(n, m) if n > m => Err(Error::MinGreaterThanMax(0)), + RepetitionSpec::MinMax(n, m) if n == m => Ok(Self::Copy(n)), + RepetitionSpec::MinMax(n, m) => Ok(Self::CopyOptional(n, m - n)), + } + } else { + Ok(Self::Copy(1)) + } + } +} + +fn parse_repetition_spec(s: &str) -> Option<(usize, RepetitionSpec)> { + let mut s = s; + + let digit_size = skip_by(s, is_decimal_digit); + + let mut min: Option<usize> = None; + let mut max: Option<usize> = None; + + if digit_size != 0 { + min = Some(s[..digit_size].parse::<usize>().unwrap()); + } + + let mut current_index = digit_size; + + if digit_size == s.len() { + if let Some(m) = min { + return Some((digit_size, RepetitionSpec::Equal(m))); + } else { + return None; + }; + } + + s = &s[current_index..]; + + if s.starts_with('*') { + if s.len() == 1 { + if let Some(m) = min { + return Some((current_index + 1, RepetitionSpec::Min(m))); + } else { + return Some((current_index + 1, RepetitionSpec::Star)); + } + } + + s = &s[1..]; + current_index += 1; + + let digit_size = skip_by(s, is_decimal_digit); + + if 
digit_size != 0 { + max = Some(s[..digit_size].parse::<usize>().unwrap()); + } + + current_index += digit_size; + + match (min, max) { + (Some(mi), Some(ma)) => Some((current_index, RepetitionSpec::MinMax(mi, ma))), + (Some(m), None) => Some((current_index, RepetitionSpec::Min(m))), + (None, Some(m)) => Some((current_index, RepetitionSpec::Max(m))), + _ => Some((current_index, RepetitionSpec::Star)), + } + } else { + min.map(|m| (current_index, RepetitionSpec::Equal(m))) + } +} + +fn parse_rulename(s: &str) -> Result<usize, Error> { + let first_char = s.chars().next().unwrap(); + + if !is_alpha(first_char) { + return Ok(0); + } + + let rulename_end = skip_by(s, is_alpha_or_digit_or_hyphen); + + if rulename_end >= s.len() { + // dbg!(); + return Err(Error::HangingRuleName(0)); + } + + Ok(rulename_end) +} + +#[derive(Debug, Eq, PartialEq, Hash)] +enum IntermediateConstruct { + Terminal(String), + Nonterminal(String), + Range(char, char), +} + +impl Display for IntermediateConstruct { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + IntermediateConstruct::Terminal(t) => write!(f, "t{t}"), + IntermediateConstruct::Nonterminal(n) => write!(f, "n{n}"), + IntermediateConstruct::Range(min, max) => write!(f, "[{min}-{max}]"), + } + } +} + +#[derive(Debug, Eq, PartialEq, Copy, Clone)] +enum Radix { + Binary, + Decimal, + Hexadecimal, +} + +fn parse_numeric(s: &str, radix: Radix) -> Result<(usize, IntermediateConstruct), Error> { + // first read a number + + let is_radix_digit = match radix { + Radix::Binary => is_binary_digit, + Radix::Decimal => is_decimal_digit, + Radix::Hexadecimal => is_hexadecimal_digit, + }; + + let radix = match radix { + Radix::Binary => 2, + Radix::Decimal => 10, + Radix::Hexadecimal => 16, + }; + + let mut s = s; + let mut current_index = 0; + + let first_digit_size = skip_by(s, is_radix_digit); + + if first_digit_size == 0 { + // dbg!(); + return Err(Error::InvalidDigit(current_index)); + } + + let 
first_number = + match std::char::from_u32(u32::from_str_radix(&s[..first_digit_size], radix).unwrap()) { + Some(c) => c, + None => { + // dbg!(); + return Err(Error::InvalidCharacter(current_index)); + } + }; + + if first_digit_size >= s.len() { + return Ok(( + current_index + s.len(), + IntermediateConstruct::Terminal(first_number.to_string()), + )); + } + + s = &s[first_digit_size..]; + + current_index += first_digit_size; + + // then read a dot or a dash + + match s.chars().next().unwrap() { + '.' => { + s = &s[1..]; + current_index += 1; + + let mut stop_reading = false; + let mut chars_vec = vec![first_number]; + + while !stop_reading { + if s.is_empty() { + return Err(Error::HangingNumeric(current_index - 1)); + } + + let digit_size = skip_by(s, is_radix_digit); + + if digit_size == 0 { + // dbg!(); + return Err(Error::InvalidDigit(current_index)); + } + + let a_number = + std::char::from_u32(u32::from_str_radix(&s[..digit_size], radix).unwrap()) + .ok_or(Error::InvalidCharacter(current_index))?; + + chars_vec.push(a_number); + + s = &s[digit_size..]; + current_index += digit_size; + + if !s.starts_with('.') { + stop_reading = true; + // println!( + // "digit_size = {digit_size} a number = {a_number}, s = {}", + // &s[..10] + // ); + } else { + s = &s[1..]; + current_index += 1; + } + } + + Ok(( + current_index, + IntermediateConstruct::Terminal(chars_vec.into_iter().collect()), + )) + } + '-' => { + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + return Err(Error::HangingNumeric(current_index - 1)); + } + + let second_digit_size = skip_by(s, is_radix_digit); + + if second_digit_size == 0 { + println!( + "s = {}", + s.chars() + .take(if s.len() >= 50 { 50 } else { s.len() }) + .collect::<String>() + ); + return Err(Error::InvalidDigit(current_index)); + } + + let _second_number = + std::char::from_u32(u32::from_str_radix(&s[..second_digit_size], radix).unwrap()) + .ok_or(Error::InvalidCharacter(current_index))?; + + 
Err(Error::Unimplemented(current_index - 1)) + + // Ok(( + // current_index + second_digit_size, + // IntermediateConstruct::Range(first_number, second_number), + // )) + } + c if is_wsp(c) || c == '\n' || c == '\r' || c == ';' => Ok(( + current_index, + IntermediateConstruct::Terminal(first_number.to_string()), + )), + _ => Err(Error::InvalidCharacter(current_index)), + } +} + +#[derive(Debug)] +struct AlternationConstruct { + intermediates: Vec<IntermediateConstruct>, + regex: DefaultRegex<usize>, +} + +fn parse_alternation(s: &str, indentation: usize) -> Result<(usize, AlternationConstruct), Error> { + let mut stack: Vec<usize> = vec![0, 1]; + let mut intermediates: Vec<IntermediateConstruct> = Vec::new(); + let mut types: Vec<RegexType<usize>> = vec![RegexType::Or, RegexType::Empty]; + let mut edges: Vec<Vec<usize>> = vec![vec![1], Vec::new()]; + + fn error_conversion(error: nfa::error::Error) -> Error { + match error { + nfa::error::Error::UnknownNode(n) => Error::RegexUnknownNode(n), + nfa::error::Error::UnsupportedOperation => Error::UnsupportedOperation, + nfa::error::Error::NoRoot => Error::RegexNoRoot, + nfa::error::Error::Graph(ge) => ge.into(), + nfa::error::Error::Cycle => Error::RegexCycle, + _ => Error::Invalid, + } + } + + let mut s = s; + let mut current_index = 0usize; + + let mut nonterminal_nodes: HashSet<usize> = HashSet::new(); + + if s.is_empty() { + let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?; + + return Ok(( + current_index, + AlternationConstruct { + intermediates, + regex, + }, + )); + } + + // A vector of tuples to record some information for each level of + // nesting. + // + // A tuple (len, action) means that the stack grows by LEN many + // tokens at this level, and the action ACTION is pending. + let mut repetition_action_stack: Vec<(usize, Option<RepetitionAction>)> = vec![]; + + while !stack.is_empty() { + // First read repetition specifications. 
+ let mut repetition_spec = None; + + if let Some((spec_size, spec)) = parse_repetition_spec(s) { + if spec_size >= s.len() { + return Err(Error::HangingRepetition(current_index)); + } + + s = &s[spec_size..]; + current_index += spec_size; + + repetition_spec = Some(spec); + } + + let repetition_spec = repetition_spec; + + // if a rulename + match parse_rulename(s) { + Ok(0) => {} + Ok(size) => { + let name = &s[..size].to_string(); + + intermediates.push(IntermediateConstruct::Nonterminal(name.clone())); + + let nt_index = intermediates.len() - 1; + + let action = repetition_spec.try_into()?; + // println!("rep_action = {action:?}"); + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + match action { + RepetitionAction::Ignore => {} + RepetitionAction::Star => { + types.push(RegexType::Kleene); + types.push(RegexType::Lit(nt_index)); + + edges.push(vec![types.len() - 1]); + edges.push(Vec::new()); + + nonterminal_nodes.insert(types.len() - 1); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::Optional => { + types.append(&mut vec![RegexType::Optional]); + types.push(RegexType::Lit(nt_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + nonterminal_nodes.insert(types.len() - 1); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::Plus => { + types.append(&mut vec![RegexType::Plus]); + types.push(RegexType::Lit(nt_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + nonterminal_nodes.insert(types.len() - 1); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? 
+ .push(types.len() - 2); + } + RepetitionAction::Copy(n) => { + let old_types_len = types.len(); + + nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| old_types_len + i)); + + types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + // for _ in 0..n { + // types.push(RegexType::Lit(nt_index)); + // edges.push(vec![]); + + // nonterminal_nodes.insert(types.len() - 1); + + // edges[stack_last].push(types.len() - 1); + // } + } + RepetitionAction::CopyPlus(n) => { + let old_types_len = types.len(); + + nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| old_types_len + i)); + + types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + // for _ in 0..n { + // types.push(RegexType::Lit(nt_index)); + // edges.push(vec![]); + + // edges[stack_last].push(types.len() - 1); + // } + + types.append(&mut vec![RegexType::Plus]); + types.push(RegexType::Lit(nt_index)); + + nonterminal_nodes.insert(types.len() - 1); + + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::CopyOptional(n, m) => { + let old_types_len = types.len(); + + nonterminal_nodes.extend((0..n).map(|i| old_types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? 
+ .extend((0..n).map(|i| old_types_len + i)); + + types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + // for _ in 0..n { + // types.push(RegexType::Lit(nt_index)); + // edges.push(vec![]); + + // nonterminal_nodes.insert(types.len() - 1); + + // edges[stack_last].push(types.len() - 1); + // } + + let types_len = types.len(); + + types.extend( + std::iter::repeat([RegexType::Optional, RegexType::Lit(nt_index)]) + .take(m) + .flatten(), + ); + + edges.extend((0..(m - 1)).flat_map(|i| { + [ + vec![types_len + 1 + 2 * i, types_len + 2 + 2 * i], + Vec::new(), + ] + })); + + edges.extend([vec![types.len() - 1], Vec::new()]); + + nonterminal_nodes.extend((0..m).map(|i| types_len + 2 * i + 1)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types_len); + + // for _ in 0..m { + // types.extend([RegexType::Optional, RegexType::Lit(nt_index)]); + // edges.extend([vec![types.len() - 1], Vec::new()]); + + // nonterminal_nodes.insert(types.len() - 1); + + // edges[stack_last].push(types.len() - 2); + // } + } + } + + s = &s[size..]; + current_index += size; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + Err(_) => { + intermediates.push(IntermediateConstruct::Nonterminal(s.to_owned())); + + types.push(RegexType::Lit(intermediates.len() - 1)); + edges.push(vec![]); + + let stack_last = stack + .last() + .copied() + .ok_or(Error::RegexEmptyStack(current_index))?; + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? 
+ .push(types.len() - 1); + nonterminal_nodes.insert(types.len() - 1); + + break; + } + } + + // match against the first charcter + + let first_char = s.chars().next().unwrap(); + + match first_char { + '(' => { + // group + + s = &s[1..]; + current_index += 1; + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + match action { + RepetitionAction::Star => { + types.extend([RegexType::Kleene, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + repetition_action_stack.push((3, None)); + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + } + RepetitionAction::Optional => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + repetition_action_stack.push((3, None)); + } + RepetitionAction::Plus => { + types.extend([RegexType::Plus, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? 
+ .push(types.len() - 3); + + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + repetition_action_stack.push((3, None)); + } + RepetitionAction::Copy(1) => { + types.extend([RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + + stack.extend([types.len() - 2, types.len() - 1]); + repetition_action_stack.push((2, None)); + } + RepetitionAction::CopyOptional(0, _) => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 3); + + stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]); + + repetition_action_stack.push((3, Some(action))); + } + _ => { + types.extend([RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? 
+ .push(types.len() - 2); + + stack.extend([types.len() - 2, types.len() - 1]); + + repetition_action_stack.push((2, Some(action))); + } + } + + s = &s[1..]; + current_index += 1; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + return Err(Error::RegexUnbalancedParen(current_index)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + ')' => { + // end group + + s = &s[1..]; + current_index += 1; + + if repetition_spec.is_some() { + return Err(Error::RegexInvalidRepeat(current_index)); + } + + let (stack_growth, repetition_action) = repetition_action_stack + .pop() + .ok_or(Error::RegexEmptyActionStack(current_index))?; + + match repetition_action { + Some(action) => { + if stack.len() <= stack_growth + 1 { + dbg!(); + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth + 1); + + // this is not going to fail because of the previous check + let stack_last = stack.pop().unwrap(); + let stack_second_last = stack.iter().copied().last().unwrap(); + + match action { + RepetitionAction::Ignore => { + edges[stack_second_last].pop(); + + types.truncate(stack_last); + edges.truncate(stack_last); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + 
.map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyPlus(n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_second_last].push(types.len() - 1); + + // if stack_second_last == 3 && nodes.len() == 4 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyOptional(n, m) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + 
edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + edges[stack_second_last].push(types.len()); + + let nodes_cloned_len = nodes_cloned.len(); + + for _ in 0..(m - 1) { + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.push(RegexType::Optional); + edges.push(vec![types.len(), types.len() + nodes_cloned_len]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Star + | RepetitionAction::Optional + | RepetitionAction::Plus => { + // this will not happen here + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + None => { + if stack.len() <= stack_growth { + dbg!(); + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + '[' => { + // option + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + match action { + 
RepetitionAction::Ignore => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + edges[stack_last].push(types.len() - 3); + + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((3, Some(action))); + } + RepetitionAction::Optional => { + types.extend([ + RegexType::Optional, + RegexType::Optional, + RegexType::Or, + RegexType::Empty, + ]); + edges.extend([ + vec![types.len() - 3], + vec![types.len() - 2], + vec![types.len() - 1], + Vec::new(), + ]); + + edges[stack_last].push(types.len() - 4); + + stack.push(types.len() - 4); + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((4, None)); + } + RepetitionAction::Star => { + types.extend([ + RegexType::Kleene, + RegexType::Optional, + RegexType::Or, + RegexType::Empty, + ]); + edges.extend([ + vec![types.len() - 3], + vec![types.len() - 2], + vec![types.len() - 1], + Vec::new(), + ]); + + edges[stack_last].push(types.len() - 4); + + stack.push(types.len() - 4); + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((4, None)); + } + RepetitionAction::Plus => { + types.extend([ + RegexType::Kleene, + RegexType::Optional, + RegexType::Or, + RegexType::Empty, + ]); + edges.extend([ + vec![types.len() - 3], + vec![types.len() - 2], + vec![types.len() - 1], + Vec::new(), + ]); + + edges[stack_last].push(types.len() - 4); + + stack.push(types.len() - 4); + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((4, None)); + } + RepetitionAction::Copy(1) => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + edges[stack_last].push(types.len() 
- 3); + + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((3, None)); + } + _ => { + types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]); + edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]); + + edges[stack_last].push(types.len() - 3); + + stack.push(types.len() - 3); + stack.push(types.len() - 2); + stack.push(types.len() - 1); + + repetition_action_stack.push((3, Some(action))); + } + } + + s = &s[1..]; + current_index += 1; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + return Err(Error::RegexUnbalancedParen(current_index)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + ']' => { + // end option + + s = &s[1..]; + current_index += 1; + + if repetition_spec.is_some() { + return Err(Error::RegexInvalidRepeat(current_index)); + } + + let (stack_growth, repetition_action) = repetition_action_stack + .pop() + .ok_or(Error::RegexEmptyActionStack(current_index))?; + + match repetition_action { + Some(action) => { + if stack.len() <= stack_growth + 1 { + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth + 1); + + // this is not going to fail because of the previous check + let stack_last = stack.pop().unwrap(); + let stack_second_last = stack.iter().copied().last().unwrap(); + + match action { + RepetitionAction::Ignore => { + edges[stack_second_last].pop(); + + types.truncate(stack_last); + edges.truncate(stack_last); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + 
edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyPlus(n) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_second_last].push(types.len() - 1); + + // if stack_second_last == 3 && nodes.len() == 4 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::CopyOptional(n, m) => { + let nodes_cloned = + types.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + let 
edges_cloned = + edges.iter().skip(stack_last).cloned().collect::<Vec<_>>(); + + for _ in 0..(n - 1) { + edges[stack_second_last].push(types.len()); + + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + let nodes_cloned_len = nodes_cloned.len(); + + for _ in 0..(m - 1) { + // if stack_second_last == 3 && nodes.len() == 3 { + // println!("haha"); + // } + + types.push(RegexType::Optional); + edges.push(vec![types.len(), types.len() + nodes_cloned_len]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + } + + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + types.extend(nodes_cloned.iter().copied()); + + let edges_offset = edges.len() - stack_last; + + edges.extend(edges_cloned.iter().map(|edges| { + edges + .iter() + .map(|des| des + edges_offset) + .collect::<Vec<_>>() + })); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + RepetitionAction::Star + | RepetitionAction::Optional + | RepetitionAction::Plus => { + // this will not happen here + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + None => { + if stack.len() <= stack_growth { + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.truncate(stack.len() - stack_growth); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if 
skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + } + } + '/' => { + // end concatenation + + s = &s[1..]; + current_index += 1; + + if repetition_spec.is_some() { + return Err(Error::RegexInvalidRepeat(current_index)); + } + + if stack.len() <= 1 { + return Err(Error::RegexEmptyStack(current_index)); + } + + stack.pop().unwrap(); + + let stack_last = stack.iter().copied().last().unwrap(); + + edges[stack_last].push(types.len()); + + types.push(RegexType::Empty); + edges.push(vec![]); + + stack.push(types.len() - 1); + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + '"' => { + // string + + s = &s[1..]; + current_index += 1; + + let skip_char_value = skip_by(s, is_char_value); + + if skip_char_value >= s.len() { + return Err(Error::HangingDQuote(current_index)); + } + + let fakes = &s[skip_char_value..]; + + if !fakes.starts_with('"') { + return Err(Error::HangingDQuote(current_index)); + } + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + let slice_value = s[..skip_char_value].to_owned(); + + intermediates.push(IntermediateConstruct::Terminal(slice_value.clone())); + + let t_index = intermediates.len() - 1; + + match action { + RepetitionAction::Ignore => {} + RepetitionAction::Star => { + types.push(RegexType::Kleene); + types.push(RegexType::Lit(t_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + edges[stack_last].push(types.len() - 2); + } + RepetitionAction::Optional => { + types.append(&mut vec![RegexType::Optional]); + types.push(RegexType::Lit(t_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + edges[stack_last].push(types.len() - 2); + } + RepetitionAction::Plus => { + types.append(&mut 
vec![RegexType::Plus]); + types.push(RegexType::Lit(t_index)); + edges.push(vec![types.len() - 1]); + edges.push(vec![]); + + edges[stack_last].push(types.len() - 2); + } + RepetitionAction::Copy(n) => { + let types_len = types.len(); + + types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| types_len + i)); + + // for _ in 0..n { + // types.push(RegexType::Lit(t_index)); + // edges.push(vec![]); + + // edges[stack_last].push(types.len() - 1); + // } + } + RepetitionAction::CopyPlus(n) => { + let types_len = types.len(); + + types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| types_len + i)); + + types.extend([RegexType::Plus, RegexType::Lit(t_index)]); + edges.extend([vec![types.len() - 1], Vec::new()]); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 2); + } + RepetitionAction::CopyOptional(n, m) => { + let types_len = types.len(); + + types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit)); + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .extend((0..n).map(|i| types_len + i)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? 
+ .push(types.len()); + + for _ in 0..(m - 1) { + types.extend([RegexType::Optional, RegexType::Lit(t_index)]); + edges.extend([vec![types.len() - 1, types.len()], Vec::new()]); + } + + types.extend([RegexType::Optional, RegexType::Lit(t_index)]); + edges.extend([vec![types.len() - 1], Vec::new()]); + } + } + + s = &fakes[1..]; + current_index += skip_char_value + 1; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + '%' => { + // numeric value + + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + return Err(Error::HangingPercent(current_index - 1)); + } + + let inner_first_char = s.chars().next().unwrap(); + + let intermediate_value: IntermediateConstruct; + + match inner_first_char { + 'b' => { + s = &s[1..]; + current_index += 1; + + let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Binary)?; + + intermediate_value = parsed_construct; + + s = &s[parsed_size..]; + current_index += parsed_size; + } + 'd' => { + s = &s[1..]; + current_index += 1; + + let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Decimal)?; + + intermediate_value = parsed_construct; + + s = &s[parsed_size..]; + current_index += parsed_size; + } + 'x' => { + s = &s[1..]; + current_index += 1; + + let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Hexadecimal)?; + // println!("size = {parsed_size}, construct = {parsed_construct:#?}"); + + intermediate_value = parsed_construct; + + s = &s[parsed_size..]; + current_index += parsed_size; + } + _ => { + return Err(Error::RegexInvalidNumeric(current_index - 1)); + } + } + + let action = repetition_spec.try_into()?; + + let stack_last = stack + .iter() + .copied() + .last() + .ok_or(Error::RegexEmptyStack(current_index))?; + + intermediates.push(intermediate_value); + + let inter_index = intermediates.len() - 1; + + match action { + RepetitionAction::Ignore => {} + 
RepetitionAction::Star => { + types.push(RegexType::Kleene); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::Optional => { + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::Plus => { + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::Copy(n) => { + types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); + + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 1); + + // for _ in 0..n { + // types.push(RegexType::Lit(inter_index)); + // edges.push(vec![]); + // edges[stack_last].push(types.len() - 1); + // } + } + RepetitionAction::CopyPlus(n) => { + types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); + + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? + .push(types.len() - 1); + + types.push(RegexType::Plus); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + RepetitionAction::CopyOptional(n, m) => { + types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit)); + + edges.extend(std::iter::repeat_with(Default::default).take(n)); + + let edges_len = edges.len(); + + edges + .get_mut(stack_last) + .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))? 
+ .push(types.len() - 1); + + for _ in 0..(m - 1) { + types.push(RegexType::Optional); + edges.push(vec![types.len(), types.len() + 1]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + + types.push(RegexType::Optional); + edges.push(vec![types.len()]); + + edges[stack_last].push(types.len() - 1); + + types.push(RegexType::Lit(inter_index)); + edges.push(vec![]); + } + } + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + break; + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + continue; + } + '<' => { + // prose value + return Err(Error::Unimplemented(current_index)); + } + ';' => { + // Possibly extend to the next line if the indentation + // of the next line is greater than `indentation`. + + s = &s[1..]; + current_index += 1; + + let mut in_comment = true; + + loop { + let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp }; + + let mut after_wsp_and_v_char = skip_by(s, skip_fn); + + if after_wsp_and_v_char >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + s = &s[after_wsp_and_v_char..]; + current_index += after_wsp_and_v_char; + + let first_char = s.chars().next().unwrap(); + + let mut skip_again = false; + + match first_char { + '\n' | '\r' => { + s = &s[1..]; + current_index += 1; + skip_again = true; + } + _ if in_comment => { + // dbg!("1815"); + return Err(Error::RegexInvalidComment(current_index)); + } + _ => {} + } + + if skip_again { + let skip_wsp = skip_by(s, is_wsp); + + if skip_wsp >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + after_wsp_and_v_char = skip_wsp; + + s = &s[skip_wsp..]; + current_index += skip_wsp; + } + + let first_char = s.chars().next().unwrap(); + + match first_char { + ';' => { + // continue looping + s = &s[1..]; + current_index += 1; + in_comment = true; + + continue; + } + '\n' | '\r' => { + in_comment = false; + s = &s[1..]; + 
current_index += 1; + + continue; + } + _ if after_wsp_and_v_char <= indentation => { + stack = vec![]; + break; + } + _ => { + break; + } + } + } + } + '\n' | '\r' => { + // Possibly extend to the next line if the indentation + // of the next line is greater than `indentation`. + // println!("s = {}", s.chars().take(10).collect::<String>()); + + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + break; + } + + let mut in_comment = false; + + loop { + let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp }; + + let mut after_wsp = skip_by(s, skip_fn); + + if after_wsp >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + s = &s[after_wsp..]; + current_index += after_wsp; + + let first_char = s.chars().next().unwrap(); + + let mut skip_again = false; + + match first_char { + '\n' | '\r' => { + s = &s[1..]; + current_index += 1; + skip_again = true; + } + _ if in_comment => { + // dbg!("1903"); + return Err(Error::RegexInvalidComment(current_index)); + } + _ => {} + } + + if skip_again { + let skip_wsp = skip_by(s, is_wsp); + + if skip_wsp >= s.len() { + current_index += s.len(); + stack = vec![]; + break; + } + + s = &s[skip_wsp..]; + current_index += skip_wsp; + + after_wsp = skip_wsp; + } + + let first_char = s.chars().next().unwrap(); + + match first_char { + ';' => { + // continue looping + s = &s[1..]; + current_index += 1; + in_comment = true; + + continue; + } + '\n' | '\r' => { + in_comment = false; + s = &s[1..]; + current_index += 1; + + continue; + } + _ if after_wsp <= indentation => { + stack = vec![]; + break; + } + _ => { + break; + } + } + } + } + _ => { + // invalid + println!("s = {}", &s[..(if s.len() > 50 { 50 } else { s.len() })]); + return Err(Error::InvalidCharacter(current_index)); + } + } + } + + let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?; + + Ok(( + current_index, + AlternationConstruct { + intermediates, + regex, + }, + )) +} + +type RuleConstruct = ( + // Name + 
String, + // Data + Vec<IntermediateConstruct>, + // Rule + DefaultRegex<usize>, + // extra_alternation_or_not + bool, +); + +fn parse_one_rule(s: &str, indentation: usize) -> Result<(usize, RuleConstruct), Error> { + if s.is_empty() { + return Ok((0, ("".to_owned(), Vec::new(), Default::default(), false))); + } + + let mut s = s; + + let mut current_index = 0usize; + + let mut appending_alternation = false; + + // rule name + + let rulename_end = parse_rulename(s)?; + let rule_name = s[..rulename_end].to_string(); + s = &s[rulename_end..]; + current_index += rulename_end; + + // c wsp + + let skip_c_wsp_size = skip_by(s, is_wsp); + + if skip_c_wsp_size >= s.len() { + // dbg!(); + return Err(Error::HangingRuleName(0)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + // equal or equal forward slash + + let next_char = s.chars().next().unwrap(); + + if next_char != '=' { + // println!("s = {}", &s[..50]); + // println!("rule_name = {rule_name}"); + return Err(Error::MissingEqualSign(current_index + 1)); + } + + s = &s[1..]; + current_index += 1; + + if s.is_empty() { + return Err(Error::HangingEqualsSign(current_index - 1)); + } + + let next_char = s.chars().next().unwrap(); + + if next_char == '/' { + appending_alternation = true; + s = &s[1..]; + current_index += 1; + } + + if s.is_empty() { + return Err(Error::HangingEqualsSign(current_index - 1)); + } + + // c wsp + + let skip_c_wsp_size = skip_c_wsp(s); + + if skip_c_wsp_size >= s.len() { + // dbg!(); + return Err(Error::HangingRuleName(0)); + } + + s = &s[skip_c_wsp_size..]; + current_index += skip_c_wsp_size; + + // elements + + // alternation + + let (alternation_size, alternation_result) = parse_alternation(s, indentation)?; + + let intermediates_data = alternation_result.intermediates; + let regex = alternation_result.regex; + + // println!("size = {alternation_size}, result = {alternation_result:?}"); + + if alternation_size >= s.len() { + s = ""; + } else { + s = 
&s[alternation_size..]; + } + + current_index += alternation_size; + + // c wsp + + let skip_c_wsp_size = skip_c_wsp(s); + + if skip_c_wsp_size >= s.len() { + s = ""; + } else { + s = &s[skip_c_wsp_size..]; + } + + current_index += skip_c_wsp_size; + + // c-nl + + if let Some(skip_c_nl) = skip_c_nl(s) { + current_index += skip_c_nl; + } + + Ok(( + current_index, + (rule_name, intermediates_data, regex, appending_alternation), + )) +} + +impl std::str::FromStr for Grammar { + // First skip whitespaces, linefeeds, carraige returns and + // semicolons until the first rule appears. The basic indentation + // level is set to the indentation level of the first rule's line. + // + // Then enter identifier reading mode, reading a letter followed + // by a sequence of letters, digits and hyphens. + // + // After skipping white spaces, read an equal sign. + // + // Then enter alternation reading mode, reading alternations of + // concatenations. + // + // Then enter element reading mode, reading optionally repetition + // specification followed by an identifier, or a numeral value, or + // a quoted string. + + type Err = Error; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + let mut before_first_rule = true; + let mut basic_indentation: usize = 0; + let mut current_skipped_index: usize = 0; + + let mut s = s; + + while before_first_rule { + before_first_rule = false; + + let after_wsp = skip_by(s, is_wsp); + + if after_wsp >= s.len() { + // The entire string consists of white-spaces and + // comments. + return Err(Error::EmptyGrammar); + } + + s = &s[after_wsp..]; + current_skipped_index += after_wsp; + + match skip_c_nl(s) { + Some(skipped_size) => { + if skipped_size >= s.len() { + return Err(Error::EmptyGrammar); + } + + // Repeat skipping until we find the first rule. + before_first_rule = true; + + s = &s[skipped_size..]; + current_skipped_index += skipped_size; + } + None => { + // Found the first rule. 
+ basic_indentation = after_wsp; + } + } + } + + // TODO: Core rules are not implemented for now. + + #[allow(unused_macros)] + macro_rules! log { + () => { + let mut file = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open("debug.log") + .unwrap(); + + let _ = file.write_all(format!("{}\n", line!()).as_bytes()); + }; + } + + let mut nonterminals_vec: Vec<String> = vec![]; + let mut nonterminals_map: HashMap<String, usize> = HashMap::new(); + + let mut all_rules: HashMap<String, DefaultRegex<TNT>> = Default::default(); + + let mut terminals_vec: Vec<String> = vec![]; + let mut terminals_map: HashMap<String, usize> = HashMap::new(); + + let mut temp = s; + + // First collect non-terminals in the order they are defined. + loop { + let (rule_size, (rule_name, _rule_data, _rule_regex, appending_p)) = + parse_one_rule(temp, basic_indentation)?; + + if rule_size == 0 { + break; + } + + if appending_p && nonterminals_map.get(&rule_name).is_none() { + return Err(Error::AppendToNothing(current_skipped_index)); + } else if !appending_p && nonterminals_map.get(&rule_name).is_some() { + return Err(Error::CreateAgain(current_skipped_index)); + } + + nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len()); + nonterminals_vec.push(rule_name); + + if rule_size > 0 && rule_size < temp.len() { + temp = &temp[rule_size..]; + current_skipped_index += rule_size; + } else { + break; + } + } + + let mut temp = s; + + // Then gather the data about names of terminals and + // non-terminals. 
+ loop { + let (rule_size, (_rule_name, rule_data, _rule_regex, _appending_p)) = + parse_one_rule(temp, basic_indentation)?; + + if rule_size == 0 { + break; + } + + for data in rule_data { + match data { + IntermediateConstruct::Terminal(name) => { + if terminals_map.get(&name).is_none() { + terminals_map.insert(name.clone(), terminals_vec.len()); + terminals_vec.push(name); + } + } + IntermediateConstruct::Nonterminal(name) => { + if nonterminals_map.get(&name).is_none() { + return Err(Error::UnknownNonterminal(name)); + } + } + IntermediateConstruct::Range(_, _) => unimplemented!(), + } + } + + if rule_size > 0 && rule_size < temp.len() { + temp = &temp[rule_size..]; + current_skipped_index += rule_size; + } else { + break; + } + } + + // Then collect the actual rules. + loop { + let (rule_size, (rule_name, rule_data, rule_regex, appending_p)) = + parse_one_rule(s, basic_indentation)?; + // println!("rule name = {rule_name}"); + // println!("regexp = {rule_regexp:#?}"); + + let rule_transform = |old_label: usize| -> Result<TNT, Error> { + if let Some(old_value) = rule_data.get(old_label) { + match old_value { + IntermediateConstruct::Terminal(name) => { + if let Some(index) = terminals_map.get(name) { + Ok(TNT::Ter(*index)) + } else { + dbg!(); + Err(Error::UnknownTerminal(name.clone())) + } + } + IntermediateConstruct::Nonterminal(name) => { + if let Some(index) = nonterminals_map.get(name) { + Ok(TNT::Non(*index)) + } else { + dbg!(); + Err(Error::UnknownNonterminal(name.clone())) + } + } + IntermediateConstruct::Range(_, _) => { + dbg!(); + Err(Error::UnsupportedOperation) + } + } + } else { + dbg!(); + Err(Error::RegexIndexOutOfBounds(old_label, rule_data.len())) + } + }; + + let transformed_rule = rule_regex.maps_to(rule_transform)?; + + if rule_size == 0 { + break; + } + + if !appending_p { + nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len()); + nonterminals_vec.push(rule_name.clone()); + + all_rules.insert(rule_name, transformed_rule); + } 
else { + all_rules + .get_mut(&rule_name) + .unwrap() + .append(transformed_rule); + } + + if rule_size > 0 && rule_size < s.len() { + s = &s[rule_size..]; + current_skipped_index += rule_size; + } else { + break; + } + } + + let rules_vec: Vec<Rule> = nonterminals_vec + .iter() + .map(|name| { + all_rules + .get(name) + .map(|regex| Rule::new(regex.clone())) + .ok_or(Error::UnknownNonterminal(name.clone())) + }) + .collect::<Result<_, _>>()?; + + let terminals: Vec<Terminal> = terminals_vec.into_iter().map(Terminal::new).collect(); + let nonterminals: Vec<Nonterminal> = + nonterminals_vec.into_iter().map(Nonterminal).collect(); + + Ok(Self::new(terminals, nonterminals, rules_vec)) + } +} + +#[cfg(test)] +mod tests; diff --git a/grammar/src/abnf/tests.rs b/grammar/src/abnf/tests.rs new file mode 100644 index 0000000..67dfaaa --- /dev/null +++ b/grammar/src/abnf/tests.rs @@ -0,0 +1,291 @@ +//! This module contains unit tests for the abnf grammar parser. + +use super::*; + +#[test] +fn test_skip_by() -> Result<(), Box<dyn std::error::Error>> { + let string = std::iter::repeat('a') + .take(10) + .chain(std::iter::repeat('哈').take(10)) + .chain(" \t\n".chars()) + .collect::<String>(); + + let mut s: &str = string.as_str(); + + dbg!(&s); + + let alpha_len = skip_by(s, is_alpha); + + assert_eq!(alpha_len, 10); + + s = &s[alpha_len..]; + + dbg!(&s); + + let not_newline_len = skip_by(s, is_not_newline); + + // 哈 takes 3 bytes, so the count should be 30 plus two bytes for + // the space and the tab. 
+ assert_eq!(not_newline_len, 32); + + Ok(()) +} + +#[test] +fn test_skip_c_nl() -> Result<(), Box<dyn std::error::Error>> { + let string = std::iter::repeat(' ') + .take(10) + .chain(std::iter::repeat('哈').take(10)) + .chain(" \t\n".chars()) + .collect::<String>(); + + let s = string.as_str(); + + dbg!(s); + + let c_nl = skip_c_nl(s); + + assert!(c_nl.is_none()); + + let new_string = std::iter::once(';') + .chain(string.chars()) + .chain(string.chars()) + .collect::<String>(); + + let s = new_string.as_str(); + + dbg!(s); + + let c_nl = skip_c_nl(s); + + assert_eq!(c_nl, Some(string.len() + 1)); + + let c_nl = skip_c_nl("\n"); + + assert_eq!(c_nl, Some(1)); + + Ok(()) +} + +#[test] +fn test_skip_c_wsp() -> Result<(), Box<dyn std::error::Error>> { + let mut string: String = std::iter::repeat(' ') + .take(10) + .chain(std::iter::repeat('\t').take(10)) + .collect(); + + string.extend( + std::iter::once(';') + .chain( + std::iter::repeat(std::iter::once('\n').chain(std::iter::once(';'))) + .take(10) + .flatten(), + ) + .chain(std::iter::once('\n')) + .chain(std::iter::repeat('a').take(10)), + ); + + let string = string; + + dbg!(&string); + + let skip_c_wsp_size = skip_c_wsp(&string); + + assert_eq!(skip_c_wsp_size, string.len() - 10); + + Ok(()) +} + +#[test] +fn test_parse_repetition() -> Result<(), Box<dyn std::error::Error>> { + let strs = ["*A", "0*A", "*0A", "2*A", "*2A", "5*10A", "A"]; + + let results = strs.map(parse_repetition_spec); + + dbg!(&results); + + let answers = [ + Some((1, RepetitionSpec::Star)), + Some((2, RepetitionSpec::Min(0))), + Some((2, RepetitionSpec::Max(0))), + Some((2, RepetitionSpec::Min(2))), + Some((2, RepetitionSpec::Max(2))), + Some((4, RepetitionSpec::MinMax(5, 10))), + None, + ]; + + for (result, answer) in results.iter().zip(answers.iter()) { + assert_eq!(result, answer); + } + + Ok(()) +} + +use super::Error; + +#[test] +fn test_parse_rulename() -> Result<(), Box<dyn std::error::Error>> { + let s = "NONTERMINAL"; + + 
assert!(matches!(parse_rulename(s), Err(Error::HangingRuleName(0)))); + + let s = " NONTERMINAL = "; + + assert!(matches!(parse_rulename(s), Ok(0))); + + let s = "NONTERMINAL = "; + + assert!(matches!(parse_rulename(s), Ok(11))); + + Ok(()) +} + +#[test] +fn test_parse_numeric() -> Result<(), Box<dyn std::error::Error>> { + let s = "haha"; + + assert!(matches!( + parse_numeric(s, Radix::Binary), + Err(Error::InvalidDigit(0)) + )); + + let s = "124"; + + assert!(matches!( + parse_numeric(s, Radix::Binary), + Err(Error::InvalidCharacter(1)) + )); + + let s = "32"; + + assert!(matches!( + parse_numeric(s, Radix::Decimal), + Ok((2, IntermediateConstruct::Terminal(string))) if string == " ".to_string() + )); + + let s = "32.65.66.66.65"; + + assert!(matches!( + parse_numeric(s, Radix::Decimal), + Ok((14, IntermediateConstruct::Terminal(string))) if string == " ABBA".to_string() + )); + + let s = "32-"; + + assert!(matches!( + parse_numeric(s, Radix::Decimal), + Err(Error::HangingNumeric(2)) + )); + + let s = "32-124"; + + assert!(matches!( + parse_numeric(s, Radix::Decimal), + Ok((6, IntermediateConstruct::Range(from, to))) if from == ' ' && to == '|', + )); + + Ok(()) +} + +#[test] +fn test_parse_alternation() -> Result<(), Box<dyn std::error::Error>> { + let s = "0*10nonterminal\n"; + + let (len, alt) = parse_alternation(s, 0)?; + + assert_eq!(len, s.len()); + + let regex = alt.regex; + + println!("regex = {regex}"); + + assert_eq!( + format!("{regex}"), + "((0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?)?)" + ); + + let s = "0*10nonterminal ( *nonterminal nonterminal / [ nonterminal ] 1*5nonterminal )\n"; + + let (len, alt) = parse_alternation(s, 0)?; + + assert_eq!(len, s.len()); + + let regex = alt.regex; + + println!("regex = {regex}"); + + assert_eq!( + format!("{regex}"), + "((0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?)?((1)*2|((3))?4(4(4(4(4)?)?)?)?))" + ); + + let s = "5nonterminal\n"; + + let (len, alt) = parse_alternation(s, 0)?; + + assert_eq!(len, s.len()); + + let 
regex = alt.regex; + + println!("regex = {regex}"); + + assert_eq!(format!("{regex}"), "(00000)"); + + Ok(()) +} + +#[test] +fn test_parse_onerule() -> Result<(), Box<dyn std::error::Error>> { + let s = "start = 1*10nonterminal [ nonterminal ] / 1*nonterminal\n"; + + let (len, (name, data, regex, extra_p)) = parse_one_rule(s, 0)?; + + println!("name = {name}"); + print!("data = "); + + for (index, non_terminal) in data.iter().enumerate() { + print!("({index}, {non_terminal})"); + } + + println!(); + + println!("regex = {regex}"); + + println!("extra_p = {extra_p}"); + + assert_eq!(len, s.len()); + + assert_eq!(name, "start".to_string()); + + let answers = ["nnonterminal", "nnonterminal", "nnonterminal"]; + + assert_eq!(data.len(), answers.len()); + + for (tnt, answer) in data.into_iter().zip(answers) { + assert_eq!(format!("{tnt}"), answer.to_string()); + } + + assert_eq!( + format!("{regex}"), + "(0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?((1))?|(2)+)".to_string() + ); + + assert!(!extra_p); + + Ok(()) +} + +#[test] +fn test_parse_grammar() -> Result<(), Box<dyn std::error::Error>> { + let grammar_file_name = "abnf grammars/test.abnf"; + + let file_content = std::fs::read_to_string(grammar_file_name)?; + + let grammar: Grammar = file_content.parse()?; + + println!("print grammar:"); + + println!("{grammar}"); + + Ok(()) +} diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs index 11cb161..ea1299f 100644 --- a/grammar/src/lib.rs +++ b/grammar/src/lib.rs @@ -93,7 +93,7 @@ impl Display for TNT { } /// Errors related to grammar operations. -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Clone)] #[non_exhaustive] pub enum Error { /// The operation requires the grammar to be after a certain @@ -101,6 +101,8 @@ pub enum Error { WrongState(GrammarState, GrammarState), /// The first component is the index, and the second the bound. IndexOutOfBounds(usize, usize), + /// The given name of a terminal or a non-terminal is unknown. 
+ UnknownTNTName(String), /// Fail to build the N-th regular expression, due to the /// ParseError. BuildFail(usize, ParseError), @@ -123,6 +125,11 @@ impl Display for Error { "Failed to build the {n}-th regular expression due to error: {pe}" ), Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), + Error::UnknownTNTName(name) => write!( + f, + "the name {name} is unknown \ + for a terminal or a non-terminal." + ), Error::WrongState(current, threshold) => { write!(f, "require state {threshold}, but in state {current}") } @@ -158,6 +165,12 @@ impl Rule { pub fn len(&self) -> usize { self.regex.len() } + + /// Wrap a regular expression into a rule. + #[inline] + pub fn new(regex: DefaultRegex<TNT>) -> Self { + Self { regex } + } } /// The state of Grammar. @@ -190,6 +203,37 @@ impl Display for GrammarState { } } +/// This enum represents the name of either a terminal or a +/// non-terminal. +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub enum TNTName { + /// The name of a terminal. + Ter(String), + /// The name of a non-terminal. + Non(String), +} + +impl TNTName { + /// Construct the name of a terminal or a non-terminal. + #[inline] + pub fn new(name: String, terminal_p: bool) -> Self { + if terminal_p { + Self::Ter(name) + } else { + Self::Non(name) + } + } + + /// Return the underlying name. + #[inline] + pub fn name(&self) -> &str { + match self { + TNTName::Ter(name) => name, + TNTName::Non(name) => name, + } + } +} + /// The type of a grammar. #[derive(Debug, Clone, Default)] pub struct Grammar { @@ -197,6 +241,8 @@ pub struct Grammar { ter: Vec<Terminal>, /// A list of non-terminals. non: Vec<Nonterminal>, + /// A map from the names of terminals or non-terminals to TNT. + tnt_map: HashMap<TNTName, TNT>, /// A list of rules. 
/// /// The length of the list must match that of the list of @@ -286,6 +332,22 @@ impl Grammar { let expansion_map = Default::default(); let reduction_map = Default::default(); + let mut tnt_map: HashMap<TNTName, TNT> = Default::default(); + + for (index, ter_element) in ter.iter().enumerate() { + tnt_map.insert( + TNTName::new(ter_element.name().to_string(), true), + TNT::Ter(index), + ); + } + + for (index, non_element) in non.iter().enumerate() { + tnt_map.insert( + TNTName::new(non_element.name().to_string(), false), + TNT::Non(index), + ); + } + // NOTE: We cannot calculate accumulators here, as we want the // accumulators of the regular expression of the left-closure, // not of the original one. @@ -294,6 +356,7 @@ impl Grammar { Self { ter, non, + tnt_map, rules, firsts, first_nodes, @@ -304,6 +367,16 @@ impl Grammar { } } + /// Convert from the name of a terminal or a non-terminal to a + /// struct TNT. + #[inline] + pub fn name_to_tnt(&self, name: &TNTName) -> Result<TNT, Error> { + self.tnt_map + .get(name) + .copied() + .ok_or_else(|| Error::UnknownTNTName(name.name().to_string())) + } + /// Return the name of a terminal or a non-terminal. pub fn name_of_tnt(&self, tnt: TNT) -> Result<String, Error> { match tnt { @@ -313,6 +386,14 @@ impl Grammar { .get(t) .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))? .name() + .chars() + .map(|c| if crate::abnf::is_v_char(c) { + c.to_string() + } else { + format!("{:#x}", c as usize) + }) + .collect::<Vec<_>>() + .join("") )), TNT::Non(n) => Ok(format!( "N{}", @@ -822,7 +903,7 @@ impl Display for Grammar { f, "{}", rule.regex.to_string_with(|tnt| format!( - "({})", + " {} ", self.name_of_tnt(tnt) .unwrap_or_else(|_| format!("Unknown {tnt:?}")) ))? diff --git a/graph/src/adlist.rs b/graph/src/adlist.rs index ba3077e..d36b250 100644 --- a/graph/src/adlist.rs +++ b/graph/src/adlist.rs @@ -3,9 +3,16 @@ //! [`Graph`][super::Graph]. This data type represents graphs using //! adjacency lists internally. 
-use std::ops::{Deref, DerefMut}; +use std::{ + borrow::{Borrow, BorrowMut}, + ops::{Deref, DerefMut}, +}; -use crate::{builder::Builder, error::Error, ExtGraph, Graph}; +use crate::{ + builder::{Builder, BuilderMut}, + error::Error, + ExtGraph, Graph, +}; // #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] // struct ALEdge { @@ -118,6 +125,99 @@ impl From<Vec<Vec<usize>>> for ALGraph { } } +/// A builder that modifies ALGraph in place. +#[derive(Debug)] +pub struct ALGBuilderMut<'a> { + graph: &'a mut ALGraph, +} + +impl<'a> std::ops::Deref for ALGBuilderMut<'a> { + type Target = ALGraph; + + fn deref(&self) -> &Self::Target { + self.graph.borrow() + } +} + +impl<'a> std::ops::DerefMut for ALGBuilderMut<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.graph.borrow_mut() + } +} + +impl<'a> BuilderMut for ALGBuilderMut<'a> { + type Label = (); + + type Graph = ALGraph; + + type ResultBuilder<'b> = ALGBuilderMut<'b> + where + Self::Label: 'b; + + fn from_graph_mut(graph: &mut Self::Graph) -> Self::ResultBuilder<'_> { + ALGBuilderMut { graph } + } + + fn add_vertex(&mut self, _label: Self::Label) -> usize { + self.nodes.push(Default::default()); + + self.nodes.len() - 1 + } + + fn add_edge(&mut self, source: usize, target: usize, _label: Self::Label) -> Result<(), Error> { + if self.graph.has_edge(source, target)? 
{ + return Ok(()); + } + + self.graph + .nodes + .get_mut(source) + .unwrap() + .children + .push(target); + + Ok(()) + } + + fn append(&mut self, other: Self::Graph) { + let self_len = self.nodes_len(); + + self.graph + .nodes + .extend(other.nodes.into_iter().map(|mut node| { + for child in node.children.iter_mut() { + *child += self_len; + } + + node + })); + } + + fn set_label(&mut self, _node_id: usize, _label: Self::Label) -> Result<(), Error> { + Ok(()) + } + + fn remove_edge<F>(&mut self, source: usize, target: usize, _predicate: F) -> Result<(), Error> + where + F: FnMut(Self::Label) -> bool, + { + let nodes_len = self.nodes.len(); + + let source_children = self + .nodes + .get_mut(source) + .ok_or(Error::IndexOutOfBounds(source, nodes_len))?; + + if !(0..nodes_len).contains(&target) { + return Err(Error::IndexOutOfBounds(target, nodes_len)); + } + + source_children.children.retain(|child| *child != target); + + Ok(()) + } +} + // Builder for this implementation of graph /// A builder for adjacency list graphs. diff --git a/graph/src/builder.rs b/graph/src/builder.rs index c5f9252..1278e26 100644 --- a/graph/src/builder.rs +++ b/graph/src/builder.rs @@ -82,6 +82,11 @@ pub trait BuilderMut { /// Add an edge from the source to the target. fn add_edge(&mut self, source: usize, target: usize, label: Self::Label) -> Result<(), Error>; + /// Append another graph to this builder. + fn append(&mut self, _other: Self::Graph) { + unimplemented!() + } + /// Set the label of an existing node to a new label. fn set_label(&mut self, node_id: usize, label: Self::Label) -> Result<(), Error>; diff --git a/graph/src/labelled/binary.rs b/graph/src/labelled/binary.rs index 9f3afa8..ccbd5cb 100644 --- a/graph/src/labelled/binary.rs +++ b/graph/src/labelled/binary.rs @@ -214,9 +214,11 @@ impl<T: GraphLabel> Graph for PLGraph<T> { let mut post = String::new(); - // FIXME: Find a way to print only used nodes. Maybe remove + // SOLVED: Find a way to print only used nodes. 
Maybe remove // unwanted edges from unwanted nodes, so that we can print // out only those used nodes. + // + // This is solved as we can extract the forest out. for node in self.nodes() { post.push_str(&format!( diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index 1b1b325..9b5fab1 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -1,6 +1,9 @@ //! This file provides a default implementation of Regex. -use graph::{error::Error as GError, ALGraph, ExtGraph, Graph, GraphLabel}; +use graph::{ + adlist::ALGBuilderMut, builder::BuilderMut, error::Error as GError, ALGraph, ExtGraph, Graph, + GraphLabel, +}; use crate::{desrec::DesRec, error::Error, Regex}; @@ -10,6 +13,7 @@ use graph_macro::Graph; use receme::{algebra::Algebra, catana::Cata}; use std::{ + borrow::BorrowMut, collections::HashMap, fmt::{Debug, Display, Write}, marker::PhantomData, @@ -271,10 +275,8 @@ impl<T: GraphLabel> DefaultRegex<T> { if !top.is_seen() { stack.push(Seen(top.index())); - if self.degree(top.index()).unwrap() > 1 { - write!(result, "(")?; - stack.push(Unseen(types.len() - 1)); - } + write!(result, "(")?; + stack.push(Unseen(types.len() - 1)); stack.extend( self.graph @@ -289,7 +291,11 @@ impl<T: GraphLabel> DefaultRegex<T> { } RegexType::Plus => { if !top.is_seen() { + write!(result, "(")?; + stack.push(Seen(top.index())); + stack.push(Unseen(types.len() - 1)); + stack.extend( self.graph .children_of(top.index()) @@ -303,7 +309,11 @@ impl<T: GraphLabel> DefaultRegex<T> { } RegexType::Optional => { if !top.is_seen() { + write!(result, "(")?; + stack.push(Seen(top.index())); + stack.push(Unseen(types.len() - 1)); + stack.extend( self.graph .children_of(top.index()) @@ -504,6 +514,56 @@ impl<T: GraphLabel> DefaultRegex<T> { Ok(result) } + + /// Use a function transform the labels of a regular expression. 
+ pub fn maps_to<S: GraphLabel, E>( + &self, + f: impl Fn(T) -> Result<S, E>, + ) -> Result<DefaultRegex<S>, E> { + let root = self.root(); + + let graph = self.internal_graph(); + + let types: Vec<RegexType<S>> = self + .types + .iter() + .map(|element| match element { + RegexType::Lit(value) => f(*value).map(RegexType::Lit), + RegexType::Kleene => Ok(RegexType::Kleene), + RegexType::Plus => Ok(RegexType::Plus), + RegexType::Optional => Ok(RegexType::Optional), + RegexType::Or => Ok(RegexType::Or), + RegexType::Paren => Ok(RegexType::Paren), + RegexType::Empty => Ok(RegexType::Empty), + }) + .collect::<Result<_, E>>()?; + + Ok(DefaultRegex { graph, types, root }) + } + + /// Append another regular expression to the end of this regular + /// expression. + pub fn append(&mut self, other: Self) { + if self.is_empty() { + // copy the other here + self.graph = other.graph; + self.types = other.types; + self.root = other.root; + } else if other.is_empty() { + // nothing to do + } else { + let mut builder_mut = ALGBuilderMut::from_graph_mut(self.graph.borrow_mut()); + + let old_len = builder_mut.nodes_len(); + + builder_mut.append(other.graph); + + if let Some(root) = self.root { + // Deliberately ignore errors here. + let _ = builder_mut.add_edge(root, old_len, ()); + } + } + } } impl<T: GraphLabel> Display for DefaultRegex<T> { diff --git a/semiring/src/lib.rs b/semiring/src/lib.rs index 6c0e0d5..b3e26b7 100644 --- a/semiring/src/lib.rs +++ b/semiring/src/lib.rs @@ -33,6 +33,8 @@ pub trait Semiring { /// The addition. fn add(&mut self, other: &Self); + // FIXME: The multiplication needs another information: the parent + // of the other value in the item derivation tree. /// The multiplication. 
fn mul(&mut self, other: &Self); diff --git a/src/big_endian.c b/src/big_endian.c new file mode 100644 index 0000000..2db8624 --- /dev/null +++ b/src/big_endian.c @@ -0,0 +1,28 @@ +#include "big_endian.h" + +void to_big_endian(uint64_t num, unsigned char *result) +{ + *(result+0) = (num >> 56) & 0xff; + *(result+1) = (num >> 48) & 0xff; + *(result+2) = (num >> 40) & 0xff; + *(result+3) = (num >> 32) & 0xff; + *(result+4) = (num >> 24) & 0xff; + *(result+5) = (num >> 16) & 0xff; + *(result+6) = (num >> 8) & 0xff; + *(result+7) = (num >> 0) & 0xff; +} + +uint64_t from_big_endian(unsigned char *num) +{ + uint64_t result = ((uint64_t) (*num) << 56); + + result += ((uint64_t) (*(num+1)) << 48); + result += ((uint64_t) (*(num+2)) << 40); + result += ((uint64_t) (*(num+3)) << 32); + result += ((uint64_t) (*(num+4)) << 24); + result += ((uint64_t) (*(num+5)) << 16); + result += ((uint64_t) (*(num+6)) << 8); + result += ((uint64_t) (*(num+7)) << 0); + + return result; +} diff --git a/src/big_endian.h b/src/big_endian.h new file mode 100644 index 0000000..e6c587e --- /dev/null +++ b/src/big_endian.h @@ -0,0 +1,9 @@ +#ifndef BIG_ENDIAN_H +#define BIG_ENDIAN_H +#include <stdint.h> + +void to_big_endian(uint64_t num, unsigned char *result); + +uint64_t from_big_endian(unsigned char *num); + +#endif diff --git a/src/big_endian.o b/src/big_endian.o Binary files differnew file mode 100644 index 0000000..6d77a13 --- /dev/null +++ b/src/big_endian.o diff --git a/src/binding.c b/src/binding.c new file mode 100644 index 0000000..43f73ba --- /dev/null +++ b/src/binding.c @@ -0,0 +1,286 @@ +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include <string.h> +#include <emacs-module.h> +#include "helper.h" + +int plugin_is_GPL_compatible = 3; + +emacs_value +rep_new_parser(emacs_env *env, ptrdiff_t narg, emacs_value *args, void *data) +{ + char *buffer = NULL; + ptrdiff_t len = 0; + emacs_value error_data[1]; + + unsigned char error_vec_len[8] = { 0 }; + unsigned char 
error_vec_cap[8] = { 0 }; + + struct SignedVec error_vec = { 0 }; + + error_vec.len = error_vec_len; + error_vec.capacity = error_vec_cap; + + env->copy_string_contents(env, *args, NULL, &len); + len++; + buffer = malloc(sizeof(char) * len); + + if (buffer == NULL) return env->intern(env, "nil"); + + env->copy_string_contents(env, *args, buffer, &len); + + struct parser *new_parser_result = new_parser(buffer, &error_vec); + + free(buffer); + + uint64_t error_length = from_big_endian(error_vec.len); + + if (error_length) { + error_data[0] = env->make_string(env, error_vec.data, (ptrdiff_t) error_length); + + clean_signed(&error_vec, 4); + + env->non_local_exit_signal(env, env->intern(env, "error"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + return env->make_user_ptr(env, clean_parser, new_parser_result); +} + +emacs_value +rep_parser_recognize(emacs_env *env, ptrdiff_t narg, emacs_value *args, void *data) +{ + struct parser *parser = env->get_user_ptr(env, *args); + + if (env->non_local_exit_check(env) != emacs_funcall_exit_return) + return env->intern(env, "nil"); + + unsigned char *buffer = NULL; + ptrdiff_t len = 0; + + emacs_value error_data[1]; + + if (!env->eq(env, + env->type_of(env, *(args+1)), + env->intern(env, "vector"))) { + error_data[0] = env->make_string + (env, "INPUT should be a vector of integers", 36); + + env->non_local_exit_signal(env, env->intern(env, "wrong-type-argument"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + len = env->vec_size(env, *(args+1)); + + buffer = malloc(sizeof(unsigned char) * len * 8); + + if (buffer == NULL) { + error_data[0] = env->make_string(env, "out of memory", 13); + + env->non_local_exit_signal(env, env->intern(env, "error"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + for (ptrdiff_t i = 0; i < len; i++) { + emacs_value 
element = env->vec_get(env, *(args+1), i); + + to_big_endian((uint64_t) env->extract_integer(env, element), + buffer + 8 * i); + } + + if (env->non_local_exit_check(env) != emacs_funcall_exit_return) { + free(buffer); + return env->intern(env, "nil"); + } + + unsigned char error_vec_len[8] = { 0 }; + unsigned char error_vec_cap[8] = { 0 }; + + struct SignedVec error_vec = { 0 }; + + error_vec.len = error_vec_len; + error_vec.capacity = error_vec_cap; + + unsigned char input_len[8] = { 0 }; + + to_big_endian((uint64_t) len * 8, input_len); + + struct UnsignedVec input_vec = (struct UnsignedVec) + { input_len, NULL, buffer }; + + int result = parser_recognize(parser, &input_vec, &error_vec, (unsigned char) 1); + + free(buffer); + + uint64_t error_length = from_big_endian(error_vec.len); + + if (error_length) { + error_data[0] = env->make_string(env, error_vec.data, (ptrdiff_t) error_length); + + clean_signed(&error_vec, 4); + + env->non_local_exit_signal(env, env->intern(env, "error"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + return (result) ? 
env->intern(env, "t") : env->intern(env, "nil"); +} + +void +clean_forest(void *ptr) +{ + clean_unsigned((struct UnsignedVec *) ptr, 15); +} + +emacs_value +rep_parser_parse(emacs_env *env, ptrdiff_t narg, emacs_value *args, void *data) +{ + struct parser *parser = env->get_user_ptr(env, *args); + + if (env->non_local_exit_check(env) != emacs_funcall_exit_return) + return env->intern(env, "nil"); + + unsigned char *buffer = NULL; + ptrdiff_t len = 0; + + emacs_value error_data[1]; + + if (!env->eq(env, + env->type_of(env, *(args+1)), + env->intern(env, "vector"))) { + error_data[0] = env->make_string + (env, "INPUT should be a vector of integers", 36); + + env->non_local_exit_signal(env, env->intern(env, "wrong-type-argument"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + len = env->vec_size(env, *(args+1)); + + buffer = malloc(sizeof(char) * len * 8); + + if (buffer == NULL) { + error_data[0] = env->make_string(env, "out of memory", 13); + + env->non_local_exit_signal(env, env->intern(env, "error"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + for (ptrdiff_t i = 0; i < len; i++) { + emacs_value element = env->vec_get(env, *(args+1), i); + + to_big_endian((uint64_t) env->extract_integer(env, element), + buffer + 8 * i); + } + + if (env->non_local_exit_check(env) != emacs_funcall_exit_return) { + free(buffer); + return env->intern(env, "nil"); + } + + unsigned char error_vec_len[8] = { 0 }; + unsigned char error_vec_cap[8] = { 0 }; + + struct SignedVec error_vec = { 0 }; + + error_vec.len = error_vec_len; + error_vec.capacity = error_vec_cap; + + unsigned char input_len[8] = { 0 }; + + to_big_endian((uint64_t) len * 8, input_len); + +struct UnsignedVec input_vec = (struct UnsignedVec) + { input_len, NULL, buffer }; + + struct UnsignedVec *result = parser_parse(parser, &input_vec, &error_vec, (unsigned char) 1); + + free(buffer); + + 
uint64_t error_length = from_big_endian(error_vec.len); + + if (error_length) { + error_data[0] = env->make_string(env, error_vec.data, (ptrdiff_t) error_length); + + clean_signed(&error_vec, 4); + + env->non_local_exit_signal(env, env->intern(env, "error"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + if (result) { + return env->make_user_ptr(env, clean_forest, result); + } + + return env->intern(env, "nil"); +} + +int +emacs_module_init(struct emacs_runtime *ert) +{ + emacs_env *env = ert->get_environment(ert); + + unsigned char emacs_version = 0; + + if ((unsigned long) env->size >= sizeof(struct emacs_env_27)) + emacs_version = 27; + else if ((unsigned long) env->size >= sizeof(struct emacs_env_26)) + emacs_version = 26; + else if ((unsigned long) env->size >= sizeof(struct emacs_env_25)) + emacs_version = 25; + else return 2; + + emacs_value newfunc = env->make_function + (env, 1, 1, rep_new_parser, + "Create a new parser from the grammar string GRAMMAR_STRING." + "\n\n\(fn grammar_string)", + NULL); + + env->funcall(env, env->intern(env, "defalias"), + 2, (emacs_value[]){ env->intern(env, "rep-new-parser"), newfunc }); + + emacs_value recognizefunc = env->make_function + (env, 2, 2, rep_parser_recognize, + "Recognize an INPUT using PARSER." + "\n\n\(fn parser INPUT)", + NULL); + + env->funcall(env, env->intern(env, "defalias"), + 2, (emacs_value[]){ env->intern(env, "rep-recognize"), recognizefunc }); + + emacs_value parsefunc = env->make_function + (env, 2, 2, rep_parser_parse, + "Parse an INPUT using PARSER." 
+ "\n\n\(fn parser input)", + NULL); + + env->funcall(env, env->intern(env, "defalias"), + 2, (emacs_value[]){ env->intern(env, "rep-parse"), parsefunc }); + + env->funcall(env, env->intern(env, "provide"), + 1, (emacs_value[]){ env->intern(env, "rep") }); + + return 0; +} diff --git a/src/bytes.rs b/src/bytes.rs new file mode 100644 index 0000000..f764077 --- /dev/null +++ b/src/bytes.rs @@ -0,0 +1,175 @@ +//! This module defines functions to turn a forest of forest labels +//! into a sequence of bytes. +//! +//! To be more specific, a forest consists of two parts: a vector of +//! node labels and a graph specifying the relations between nodes. +//! +//! # Number format (endianness) +//! +//! Every number mentionned in this format is an unsigned integer of +//! 64 bits, or 8 bytes. Every such number is specified in the big +//! endian format. +//! +//! # Parts of the sequence of bytes +//! +//! The sequence of bytes has three parts: +//! +//! ## Header +//! +//! The header specifies metadata of this forest. +//! +//! ### Special mark +//! +//! The first three bytes form the string "rep", as a special mark. +//! +//! ### Number of nodes +//! +//! The next 8 bytes specifies the number of nodes of this forest. +//! +//! ## Graph +//! +//! Next comes the underlying graph for this forest. This part +//! consists of a vector of vectors of numbers. Each vector of +//! numbers represents the list of children of a node. So the number +//! of vectors of numbers is equal to the number of nodes. +//! +//! ### Vector of vectors of numbers +//! +//! The vectors are not simply concatenated one after another, as that +//! way one cannot read a random node in a constant amount of time. +//! +//! Instead, we first specify the number of children of each node +//! first, along with the offset for that node of the vector of +//! children. +//! +//! As an example, if a graph has three nodes, represented as the +//! adjacency list: `[[1, 2], [2], []]`, then its representation as a +//! 
sequence of bytes is as follows: +//! ```text +//! 2, x, 1, y, 0, z, 1, 2, 2, +//! ^ ^ ^ +//! x y z +//! ``` +//! +//! This has the advantage that we can read the children of the `n`-th +//! node in constant time. +//! +//! ## Vector of labels +//! +//! Each label occupies a fixed number of bytes, so we simply put the +//! labels one after another. The only thing to note here is the +//! format of the labels. +//! +//! ### Labels +//! +//! Each label has 3 parts: +//! +//! 1. Status: either Packed, Cloned, or Plain. If the node is +//! cloned, it has a clone index. So in total this part occupies 1 +//! byte for the status and 8 bytes for the clone index. +//! 2. Start and end: the range in the input sentence. We just store +//! two numbers here. Hence this part occupies 16 bytes. +//! 3. Grammar label: either a terminal, a non-terminal, or a rule. +//! Each variant needs a number to speify its index. So in total this +//! part occupies 1 byte for the variant discriminant and 8 bytes for +//! the number. +//! +//! To sum up, each label occupies 34 bytes. + +use chain::item::{default::DefaultForest, ForestLabel}; +use grammar::{GrammarLabel, GrammarLabelType, TNT}; +use graph::{Graph, LabelGraph}; + +pub(super) fn forest_to_bytes(forest: &DefaultForest<ForestLabel<GrammarLabel>>) -> Vec<u8> { + // First calculate the total number of bytes. + let nodes_len = forest.nodes_len(); + + let degrees: Vec<_> = forest + .nodes() + .map(|node| forest.degree(node).unwrap_or(0)) + .collect(); + + let total_degree: usize = degrees.iter().copied().sum(); + + let len: usize = 8 // total number of bytes at the start + + 3 // special mark + + 8 // number of nodes + + 8 // offset of labels + + 16 * nodes_len // degree & offset for each node + + 8 * total_degree // children of each node + + 34 * nodes_len // labels + ; + + // Then fill in the bytes. 
+ + let mut bytes: Vec<u8> = Vec::with_capacity(len); + + // First the headers + + bytes.extend(len.to_be_bytes()); + bytes.extend([114, 101, 112]); // rep + bytes.extend(nodes_len.to_be_bytes()); + bytes.extend((len - 34 * nodes_len).to_be_bytes()); + + let mut accumulated: usize = 0; + + // Then degrees and offsets + + for degree in degrees.iter().copied() { + bytes.extend(degree.to_be_bytes()); + bytes.extend((8 + 3 + 8 + 8 + 16 * nodes_len + 8 * accumulated).to_be_bytes()); + + accumulated += degree; + } + + // Then the children + + bytes.extend(forest.nodes().flat_map(|node| { + forest + .children_of(node) + .unwrap_or_default() + .flat_map(|child| child.to_be_bytes()) + })); + + // Finally the labels + + 'nodes_loop: for node in forest.nodes() { + let label = match forest.vertex_label(node) { + Ok(Some(label)) => label, + _ => continue 'nodes_loop, + }; + + if label.is_plain() { + bytes.extend(std::iter::repeat(0).take(9)); + } else if label.is_packed() { + bytes.extend(std::iter::once(1).chain(std::iter::repeat(0).take(8))); + } else if let Some(index) = label.clone_index() { + bytes.extend(std::iter::once(2).chain(index.to_be_bytes())); + } + + let label = label.label(); + + bytes.extend(label.start().to_be_bytes()); + bytes.extend(label.end().unwrap_or(0).to_be_bytes()); + + let label = label.label(); + + match label { + GrammarLabelType::TNT(TNT::Ter(t)) => { + bytes.extend(std::iter::once(0).chain(t.to_be_bytes())); + } + GrammarLabelType::TNT(TNT::Non(n)) => { + bytes.extend(std::iter::once(1).chain(n.to_be_bytes())); + } + GrammarLabelType::Rule(r) => { + bytes.extend(std::iter::once(2).chain(r.to_be_bytes())); + } + } + } + + if bytes.len() != len { + dbg!(); + } + + bytes +} diff --git a/src/helper.c b/src/helper.c new file mode 100644 index 0000000..5d7e9f8 --- /dev/null +++ b/src/helper.c @@ -0,0 +1,73 @@ +#include "helper.h" + +struct Label +read_label(unsigned char *ptr) +{ + struct Label label = { 0 }; + + switch (*ptr) { + case Plain: + 
label.status = Plain; + break; + case Packed: + label.status = Packed; + break; + default: + label.status = Clone; + label.clone_index = from_big_endian(ptr+1); + break; + } + + label.start = from_big_endian(ptr+9); + label.end = from_big_endian(ptr+17); + + switch (*(ptr+25)) { + case Terminal: + label.variant = Terminal; + break; + case Nonterminal: + label.variant = Nonterminal; + break; + default: + label.variant = Rule; + break; + } + + label.content = from_big_endian(ptr+26); + + return label; +} + +void +print_label(struct Label label) +{ + switch (label.status) { + case Plain: + printf("a plain node "); + break; + case Packed: + printf("a packed node "); + break; + default: + printf("a cloned node with index %llu ", label.clone_index); + break; + } + + printf("spanning (%llu, %llu) ", label.start, label.end); + + printf("labelled as a "); + + switch (label.variant) { + case Terminal: + printf("terminal "); + break; + case Nonterminal: + printf("non-terminal "); + break; + default: + printf("rule "); + break; + } + + printf("%llu\n", label.content); +} diff --git a/src/helper.h b/src/helper.h new file mode 100644 index 0000000..8be7497 --- /dev/null +++ b/src/helper.h @@ -0,0 +1,75 @@ +#ifndef HELPER_H +#define HELPER_H + +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include <string.h> +#include "big_endian.h" + +struct SignedVec { + unsigned char *len; + unsigned char *capacity; + char *data; +}; + +struct UnsignedVec { + unsigned char *len; + unsigned char *capacity; + unsigned char *data; +}; + +struct parser; + +struct parser *new_parser(char *grammar_string, + struct SignedVec *error_vec); + +void clean_parser(void *parser); + +int parser_recognize(struct parser *parser, + struct UnsignedVec *input_vec, + struct SignedVec *error_vec, + unsigned char reset_p); + +struct UnsignedVec * +parser_parse +(struct parser *parser, + struct UnsignedVec *input_vec, + struct SignedVec *error_vec, + unsigned char reset_p ); + +struct UnsignedVec * 
+parser_parse(struct parser *parser, + struct UnsignedVec *input_vec, + struct SignedVec *error_vec, + unsigned char reset_p); + +void clean_signed(struct SignedVec *vec, unsigned char flag); +void clean_unsigned(struct UnsignedVec *vec, unsigned char flag); + +typedef enum Status { + Plain = 0, + Packed = 1, + Clone = 2, +} Status; + +typedef enum Variant { + Terminal = 0, + Nonterminal = 1, + Rule = 2, +} Variant; + +struct Label { + Status status; + uint64_t clone_index; + uint64_t start; + uint64_t end; + Variant variant; + uint64_t content; +}; + +struct Label read_label(unsigned char *ptr); + +void print_label(struct Label label); + +#endif diff --git a/src/helper.o b/src/helper.o Binary files differnew file mode 100644 index 0000000..e8801dd --- /dev/null +++ b/src/helper.o @@ -1,16 +1,548 @@ -// TODO: Add Emacs bindings +#![warn(missing_docs)] +//! This top level package provides necessary functions for Emacs to +//! call. -pub fn add(left: usize, right: usize) -> usize { - left + right +extern crate chain; +extern crate grammar; + +use chain::{atom::DefaultAtom, default::DefaultChain, Chain}; +use grammar::Grammar; + +/// This struct is the representation of a parser. +/// +/// When the user constructs a parser, an instance of this struct will +/// be constructed and the user will receive an opaque pointer to this +/// struct. +#[derive(Debug, Clone)] +#[repr(C)] +pub struct Parser { + chain: DefaultChain, +} + +impl Parser { + /// Construct a parser from the grammar string. + /// + /// The grammar is supposed to conform to the Augmented + /// Backus-Naur Format. See RFC 5234 for exact details of this + /// format. 
+ pub fn new(s: &str) -> Result<Self, String> { + let grammar: Grammar = s.parse().map_err(|err| format!("{err}"))?; + let atom: DefaultAtom = + DefaultAtom::from_grammar(grammar).map_err(|err| format!("{err}"))?; + + DefaultChain::unit(atom) + .map_err(|err| format!("{err}")) + .map(|chain| Self { chain }) + } +} + +/// Actual function that is called through C ABI. +/// +/// The parameter `ERROR_LEN` is supposed to point to an integer, +/// which will be set to the length of the error message, if and only +/// if an error occurs, in which case the `ERROR_STR` will be set to +/// point to the actual error message. +/// +/// It is expected that `*ERROR_STR` should hold the value `NULL` . +#[no_mangle] +extern "C" fn new_parser( + grammar_string: *mut std::os::raw::c_char, + error_vec: *mut LenVec<std::os::raw::c_char>, +) -> *mut Parser { + let parsed_str; + + let error_len = unsafe { (*error_vec).len }; + let error_cap = unsafe { (*error_vec).capacity }; + + unsafe { + match std::ffi::CStr::from_ptr(grammar_string).to_str() { + Ok(ccstr) => { + parsed_str = ccstr.to_string(); + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } + } + } + + match Parser::new(&parsed_str) { + Ok(result) => unsafe { + for i in 0..8 { + *(error_len.add(i)) = 0; + } + + Box::into_raw(Box::new(result)) + }, + Err(e) => unsafe { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + for i in 0..8 { + *(error_len.add(i)) = 
e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + + std::mem::forget(e_string); + + std::ptr::null_mut() + }, + } +} + +#[no_mangle] +extern "C" fn clean_parser(parser: *const std::ffi::c_void) { + unsafe { + drop(Box::from_raw(parser as *mut Parser)); + } +} + +/// To make it easier to pass arrays to and from C. +#[repr(C)] +pub struct LenVec<T> { + /// This must be an array of unsigned char of length 8. + /// + /// A less length leads to access to invalid memory location, + /// while a longer length will be ignored, possibly leading to + /// wrong length. + /// + /// The length will be interpreted as a 64-bits integer stored in + /// big endian format. + pub len: *mut std::os::raw::c_uchar, + /// This must be an array of unsigned chars of length 8. + /// + /// This is only used so that Rust knows how to reconstruct the + /// data from C, in order for Rust to deallocate the objects + /// exposed by this library. + pub capacity: *mut std::os::raw::c_uchar, + /// The actual pointer to the data. + /// + /// In case this should be set by the function on the Rust end, + /// this field should be `NULL`. 
+ pub data: *mut T, +} + +#[no_mangle] +extern "C" fn clean_signed(vec: *mut LenVec<std::os::raw::c_char>, flag: std::os::raw::c_uchar) { + let len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).len, 8) } + .try_into() + .unwrap(), + ); + let capacity = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).capacity, 8) } + .try_into() + .unwrap(), + ); + + if (flag & 1) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).len, 8, 8) }); + } + + if (flag & 2) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).capacity, 8, 8) }); + } + + if (flag & 4) != 0 { + drop(unsafe { String::from_raw_parts((*vec).data as *mut u8, len, capacity) }); + } + + if (flag & 8) != 0 { + drop(unsafe { Box::from_raw(vec) }); + } +} + +#[no_mangle] +extern "C" fn clean_unsigned(vec: *mut LenVec<std::os::raw::c_uchar>, flag: std::os::raw::c_uchar) { + let len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).len, 8) } + .try_into() + .unwrap(), + ); + let capacity = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).capacity, 8) } + .try_into() + .unwrap(), + ); + + if (flag & 1) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).len, 8, 8) }); + } + + if (flag & 2) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).capacity, 8, 8) }); + } + + if (flag & 4) != 0 { + drop(unsafe { Vec::<u8>::from_raw_parts((*vec).data, len, capacity) }); + } + + if (flag & 8) != 0 { + drop(unsafe { Box::from_raw(vec) }); + } +} + +#[no_mangle] +extern "C" fn parser_recognize( + parser: *mut Parser, + input_vec: *mut LenVec<std::os::raw::c_uchar>, + error_vec: *mut LenVec<std::os::raw::c_char>, + reset_p: std::os::raw::c_uchar, +) -> std::os::raw::c_int { + let input_len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*input_vec).len, 8) } + .try_into() + .unwrap(), + ); + + let mut parser_box; + let input_array_len = input_len; + let input_array; + + let error_len = unsafe { (*error_vec).len }; + let error_cap = unsafe { 
(*error_vec).capacity }; + + unsafe { + parser_box = Box::from_raw(parser); + + input_array = std::slice::from_raw_parts((*input_vec).data, input_array_len); + } + + // If the parser has already been used before, reset it to the + // initial state. + + if reset_p != 0 && !parser_box.chain.history().is_empty() { + match DefaultChain::unit(parser_box.chain.atom().clone()) { + Ok(chain) => { + parser_box.chain = chain; + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + Box::leak(parser_box); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return 0; + } + } + } + + if input_array_len.rem_euclid(8) != 0 { + let mut e_string = + format!("error: input length should be divisible by 8, but got {input_array_len}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return 0; + } + + #[cfg(target_pointer_width = "64")] + let input_iter = input_array + .chunks_exact(8) + .map(|chunk| usize::from_be_bytes(<[u8; 8]>::try_from(chunk).unwrap())); + + #[cfg(not(target_pointer_width = "64"))] + compile_error!("this program assumes to be run on 64-bits machines"); + + for (index, token) in input_iter.enumerate() { + let chain_result = parser_box.chain.chain(token, index, true); + + if let Err(e) = chain_result { + let mut e_string = 
format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return 0; + } + } + + match parser_box.chain.epsilon() { + Ok(result) => { + Box::leak(parser_box); + + result as std::os::raw::c_int + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + 0 + } + } } -#[cfg(test)] -mod tests { - use super::*; +#[no_mangle] +extern "C" fn parser_parse( + parser: *mut Parser, + input_vec: *mut LenVec<std::os::raw::c_uchar>, + error_vec: *mut LenVec<std::os::raw::c_char>, + reset_p: std::os::raw::c_uchar, +) -> *mut LenVec<std::os::raw::c_uchar> { + let input_len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*input_vec).len, 8) } + .try_into() + .unwrap(), + ); + + let mut parser_box; + let input_array; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); + let error_len = unsafe { (*error_vec).len }; + let error_cap = unsafe { (*error_vec).capacity }; + + unsafe { + parser_box = Box::from_raw(parser); + + input_array = std::slice::from_raw_parts((*input_vec).data, input_len); + } + + // If the parser has already been used before, reset it to the + // initial state. 
+ + if reset_p != 0 && !parser_box.chain.history().is_empty() { + match DefaultChain::unit(parser_box.chain.atom().clone()) { + Ok(chain) => { + parser_box.chain = chain; + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + Box::leak(parser_box); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } + } + } + + if input_len.rem_euclid(8) != 0 { + let mut e_string = + format!("error: input length should be divisible by 8, but got {input_len}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } else if input_len == 0 { + Box::leak(parser_box); + + return std::ptr::null_mut(); + } + + #[cfg(target_pointer_width = "64")] + let input_iter = input_array + .chunks_exact(8) + .map(|chunk| usize::from_be_bytes(<[u8; 8]>::try_from(chunk).unwrap())); + + #[cfg(not(target_pointer_width = "64"))] + compile_error!("this program assumes to be run on 64-bits machines"); + + let mut last_pos: usize = 0; + let mut last_token: usize = 0; + + for (index, token) in input_iter.enumerate() { + last_pos = index; + last_token = token; + + let chain_result = parser_box.chain.chain(token, index, false); + + if let Err(e) = chain_result { + let mut e_string = format!("error: {e}"); + + 
let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } + } + + match parser_box.chain.epsilon() { + Ok(result) => { + if result { + let forest = parser_box.chain.end_of_input(last_pos + 1, last_token); + + match forest { + Ok(forest) => { + Box::leak(parser_box); + + let mut bytes = bytes::forest_to_bytes(&forest); + + let bytes_len = bytes.len().to_be_bytes().to_vec(); + + let bytes_capacity = bytes.capacity().to_be_bytes().to_vec(); + + let bytes_vec: LenVec<std::os::raw::c_uchar> = LenVec { + len: Box::leak(bytes_len.into_boxed_slice()).as_mut_ptr(), + capacity: Box::leak(bytes_capacity.into_boxed_slice()).as_mut_ptr(), + data: bytes.as_mut_ptr(), + }; + + std::mem::forget(bytes); + + Box::into_raw(Box::new(bytes_vec)) + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + std::ptr::null_mut() + } + } + } else { + Box::leak(parser_box); + + std::ptr::null_mut() + } + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) 
= e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + std::ptr::null_mut() + } } } + +pub mod bytes; diff --git a/src/old binding.txt b/src/old binding.txt new file mode 100644 index 0000000..0311f49 --- /dev/null +++ b/src/old binding.txt @@ -0,0 +1,218 @@ +//! This file provides the Emacs binding for the core. + +#![allow(unused_imports)] + +extern crate repcore; + +use repcore::{ + derive::stdag::{PFRecorder, STDAGParser}, + grammar::{Grammar, GrammarInfo}, + semiring::{IdentityRecorder, Nat, PFS}, + tokenizer::to_tokenizer, + valuation::{Count, ParseForest}, +}; + +use repcore::parser::Parser as PParser; +use repcore::parser::ParserReportConfig as PParserConfig; + +// TODO: Write Emacs binding + +// Let's design the API for Emacs to use. +// +// Emacs offers the grammar file or grammar string, and then we +// generate a parser. +// +// Thereafter when we want to parse some input, Emacs gives the input +// file name or the input file string to this package, and then the +// package gives back the parse result. +// +// Besides, we need a data type to deal with errors. +// +// Handling a string is not a problem for us, but to return a parser +// to Emacs, and to return the parse result, are kind of troublesome. +// +// To achieve this, we need data types for a parser and for the parse +// result. Since a parser is just an implementation detail and should +// not be of concern to the user, we mught just represent a parser as +// just a custom user-ptr in Emacs. +// +// As to the parse result, we return a set of tuples: +// +// (label, i, k, j), +// +// where LABEL is represented as a tuple +// +// (INDEX, STRING), +// +// where index is the index of the rule and the string is the name of +// the non-terminal or the terminal that is spanned by this packed +// node. 
+// +// This is not intended for the user to inspect and play with. The +// package should provide another function to interpret this data, for +// example to search something or to display it. The reason we don't +// directly return something to display is that the plain forest might +// contain cyclic data, and is inconvenient to represent in Emacs. So +// we only try to display this information to the user if the need +// arises. + +#[derive(Debug, Clone)] +#[repr(C)] +pub struct Parser { + g: Grammar, + gi: GrammarInfo, + stparser: STDAGParser<PFS, ParseForest, PFRecorder>, +} + +impl Parser { + fn new(gs: &str) -> Result<Self, Box<dyn std::error::Error>> { + let g: Grammar = gs.parse()?; + + let mut stparser = STDAGParser::new(); + + let mut gi = g.generate_info()?; + + stparser.init(&g, &mut gi)?; + + Ok(Self { g, gi, stparser }) + } +} + +#[no_mangle] +extern "C" fn new_parser( + gs: *mut std::os::raw::c_char, + error: *mut std::os::raw::c_int, +) -> *mut Parser { + let parsed_str; + let parser_result; + + unsafe { + let cstr = std::ffi::CStr::from_ptr(gs).to_str(); + + if cstr.is_err() { + *error = 1; + return std::ptr::null_mut(); + } + + parsed_str = cstr.unwrap().to_owned(); + + parser_result = Parser::new(&parsed_str); + + if parser_result.is_err() { + *error = 2; + eprintln!("failed: {parser_result:?}"); + return std::ptr::null_mut(); + } + + *error = 0; + } + + Box::into_raw(Box::new(parser_result.unwrap())) +} + +#[no_mangle] +extern "C" fn clean_parser(parser: *const std::ffi::c_void) { + unsafe { + Box::from_raw(parser as *mut Parser); + } +} + +// #[no_mangle] +// extern "C" fn reset_parser(parser: *mut Parser) { +// let mut parser_box; +// unsafe { +// parser_box = Box::from_raw(parser); +// } + +// parser_box +// .stparser +// .init(&parser_box.g, &parser_box.gi) +// .unwrap(); + +// std::mem::forget(parser_box); +// } + +// FIXME: Use a pointer to int to indicate the type of errors, and use +// a pointer to a custom error struct to convey the 
error messages. +#[no_mangle] +extern "C" fn parser_parse( + parser: *mut Parser, + file: *const std::os::raw::c_char, +) -> std::os::raw::c_int { + let mut parser_box; + let cstr; + + unsafe { + parser_box = Box::from_raw(parser); + cstr = std::ffi::CStr::from_ptr(file).to_str(); + + if cstr.is_err() { + eprintln!("error reading string: {:?}", cstr); + return 0; + } + } + + parser_box.stparser.reinit(); + + let file_string = std::fs::read_to_string(cstr.unwrap()); + + if file_string.is_err() { + eprintln!("error reading file: {:?}", file_string); + return 0; + } + + let file_string = file_string.unwrap(); + + let parse_result = parser_box + .stparser + .parse(&file_string.chars().collect::<Vec<_>>()); + + if parse_result.is_err() { + eprintln!("failed to parse: {:?}", parse_result); + return 0; + } + + let (accepting, _) = parse_result.unwrap(); + + std::mem::forget(parser_box); + + accepting as std::os::raw::c_int +} + +#[no_mangle] +extern "C" fn parser_parse_string( + parser: *mut Parser, + input: *const std::os::raw::c_char, +) -> std::os::raw::c_int { + let mut parser_box; + let cstr; + + unsafe { + parser_box = Box::from_raw(parser); + cstr = std::ffi::CStr::from_ptr(input).to_str(); + + if cstr.is_err() { + eprintln!("error reading string: {:?}", cstr); + return 0; + } + } + + parser_box.stparser.reinit(); + + let file_string = cstr.unwrap().to_owned(); + + let parse_result = parser_box + .stparser + .parse(&file_string.chars().collect::<Vec<_>>()); + + if parse_result.is_err() { + eprintln!("failed to parse: {:?}", parse_result); + return 0; + } + + let (accepting, _) = parse_result.unwrap(); + + std::mem::forget(parser_box); + + accepting as std::os::raw::c_int +} diff --git a/src/rep.so b/src/rep.so Binary files differnew file mode 100755 index 0000000..2bc412c --- /dev/null +++ b/src/rep.so diff --git a/src/test b/src/test Binary files differnew file mode 100755 index 0000000..5806c2d --- /dev/null +++ b/src/test diff --git a/src/test.c b/src/test.c new 
file mode 100644 index 0000000..d309fb7 --- /dev/null +++ b/src/test.c @@ -0,0 +1,224 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <limits.h> +#include <string.h> +#include <emacs-module.h> + +#include "big_endian.h" +#include "helper.h" + +int +main(int argc, char **argv) +{ + unsigned char error_vec_len[8] = { 0 }; + unsigned char error_vec_cap[8] = { 0 }; + + struct SignedVec error_vec = { 0 }; + + error_vec.len = error_vec_len; + error_vec.capacity = error_vec_cap; + + struct parser *parser = new_parser("start = \"a\"\"b\"\n", &error_vec); + + uint64_t error_length = from_big_endian(error_vec.len); + + if (error_length) { + printf("error: "); + + char *error_str = error_vec.data; + + for (int i = 0; i < error_length; i++) { + printf("%c", *(error_str+i)); + } + + printf("\n"); + + clean_signed(&error_vec, 4); + + return 1; + } + + unsigned char *input = malloc (sizeof(unsigned char) * 16); + + for (int i = 0; i < 16; i++) *(input+i) = 0; + + *(input+15) = 1; + + unsigned char input_len[8] = { 0 }; + input_len[7] = 16; + + struct UnsignedVec input_vec = (struct UnsignedVec) { input_len, NULL, input }; + + int result = parser_recognize + (parser, + &input_vec, + &error_vec, + (unsigned char) 0); + + error_length = from_big_endian(error_vec.len); + + if (error_length) { + clean_parser((void *) parser); + free(input); + + printf("error: "); + + char *error_str = error_vec.data; + + for (uint64_t i = 0; i < error_length; i++) { + printf("%c", *(error_str + (unsigned long) i)); + } + + printf("\n"); + + clean_signed(&error_vec, 4); + + return 1; + } + + if (result) + printf("result = true\n"); + else + printf("result = false\n"); + + for (int i = 0; i < 8; i++) { + error_vec.len[i] = 0; + error_vec.capacity[i] = 0; + } + + struct UnsignedVec *forest_ptr = parser_parse + (parser, + &input_vec, + &error_vec, + (unsigned char) 1); + + error_length = from_big_endian(error_vec.len); + + if (error_length) { + clean_parser((void *) parser); + 
free(input); + + printf("error: "); + + char *error_str = error_vec.data; + + for (uint64_t i = 0; i < error_length; i++) { + printf("%c", *(error_str + (unsigned long) i)); + } + + printf("\n"); + + clean_signed(&error_vec, 4); + + return 1; + } + + free(input); + clean_parser((void *) parser); + + if (forest_ptr) { + uint64_t forest_len = from_big_endian(forest_ptr->len); + + /* forest_len++; */ + + printf("a non-empty forest of length %llu\n", forest_len); + + unsigned char *forest = forest_ptr->data; + + uint64_t forest_real_len = from_big_endian(forest); + + if (forest_real_len != forest_len) { + fprintf(stderr, "wrong length: %llu\n", forest_real_len); + + clean_unsigned(forest_ptr, 15); + + return 1; + } + + if (forest_len < 27) { + fprintf(stderr, "length too small: %llu\n", forest_len); + + clean_unsigned(forest_ptr, 15); + + return 1; + } + + printf("the lengths match\n"); + + if (*(forest+8) != 114 || *(forest+9) != 101 || *(forest+10) != 112) { + fprintf(stderr, "the forest does not begin with the special mark\n"); + + fprintf(stderr, "the first bytes are: "); + fprintf(stderr, "%c", *(forest+8)); + fprintf(stderr, "%c", *(forest+9)); + fprintf(stderr, "%c\n", *(forest+10)); + + clean_unsigned(forest_ptr, 15); + + return 1; + } + + printf("the special mark is correct.\n"); + + uint64_t nodes_len = from_big_endian(forest+11); + + printf("forest has %llu nodes\n", nodes_len); + + uint64_t labels_offset = from_big_endian(forest+19); + + printf("the offset of labels is %llu\n", labels_offset); + + if (forest_len < labels_offset || + forest_len < (27 + 16 * nodes_len)) { + fprintf(stderr, "length too small: %llu\n", forest_len); + + clean_unsigned(forest_ptr, 15); + + return 1; + } + + uint64_t total_degree = 0; + + for (uint64_t i = 0; i < nodes_len; i++) { + uint64_t offset = 27 + 16 * i; + + uint64_t degree = from_big_endian(forest+offset); + uint64_t node_offset = from_big_endian(forest+offset+8); + + total_degree += degree; + + printf("the %llu-th node 
has degree %llu and offset %llu\n", + i, degree, node_offset); + + printf("label: "); + print_label(read_label(forest + labels_offset + 34 * i)); + + for (uint64_t j = 0; j < degree; j++) { + uint64_t child = from_big_endian(forest+node_offset+8*j); + + printf("the %llu-th child is %llu\n", j, child); + } + + printf("\n"); + } + + uint64_t correct = 27 + 50 * nodes_len + 8 * total_degree; + + if (forest_len != correct) { + fprintf(stderr, "length does not conform to the format: %llu\n", correct); + + clean_unsigned(forest_ptr, 15); + + return 1; + } + + printf("the forest has the correct length according to the format\n"); + + clean_unsigned(forest_ptr, 15); + } else { + printf("no forest\n"); + } + + return 0; +} diff --git a/src/test.el b/src/test.el new file mode 100644 index 0000000..d106a03 --- /dev/null +++ b/src/test.el @@ -0,0 +1,28 @@ +(load-file "rep.so") +(setq rep-dir (vc-call-backend 'Git 'root default-directory)) + +(setq test-parser + (rep-new-parser + (with-temp-buffer + (insert-file-contents + (expand-file-name + "test.abnf" + (expand-file-name + "abnf grammars" + (expand-file-name "grammar" rep-dir)))) + (buffer-string)))) + +(defvar input nil "A vector that represents a testing input.") + +(setq input (vector 3 0 2 2 2 1 1 1 0 1)) + +(rep-recognize test-parser input) +(rep-parse test-parser input) + +;; (rep-parse-string +;; test-parser +;; "* title\nprice: 512\nnote: this is a test\n") + +;; (rep-parse-string +;; test-parser +;; "183t3ru") |