summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2023-07-08 12:30:21 +0800
committerJSDurand <mmemmew@gmail.com>2023-07-08 12:31:13 +0800
commit9a317e56f8a6126583f7d0c431bf878d9b1fe7b1 (patch)
tree7bb6004196b38446a5ab0cb3a0ab642d35f113e9
parent691f969eb104fa3d4c2a1667693fd0382eb9d6b5 (diff)
Finished the Emacs binding.
Now the binding part is finished. What remains is a bug encountered when planting a fragment to the forest which intersects a packed node, which would lead to invalid forests. This will also cause problems when planting a packed fragment, but until now my testing grammars do not produce packed fragments, so this problem is not encountered yet. I am still figuring out efficient ways to solve this problem.
-rw-r--r--Cargo.toml6
-rw-r--r--chain/src/archive.txt47
-rw-r--r--chain/src/default.rs247
-rw-r--r--chain/src/item/default/extract.rs509
-rw-r--r--chain/src/item/default/mod.rs62
-rw-r--r--chain/src/item/default/splone.rs10
-rw-r--r--chain/src/item/genins.rs18
-rw-r--r--chain/src/item/mod.rs15
-rw-r--r--chain/src/item/reduction.rs85
-rw-r--r--chain/src/lib.rs29
-rw-r--r--grammar/src/abnf.rs37
-rw-r--r--grammar/src/abnf/boolean_fns.rs42
-rw-r--r--grammar/src/abnf/mod.rs2536
-rw-r--r--grammar/src/abnf/tests.rs291
-rw-r--r--grammar/src/lib.rs85
-rw-r--r--graph/src/adlist.rs104
-rw-r--r--graph/src/builder.rs5
-rw-r--r--graph/src/labelled/binary.rs4
-rw-r--r--nfa/src/default/regex.rs70
-rw-r--r--semiring/src/lib.rs2
-rw-r--r--src/big_endian.c28
-rw-r--r--src/big_endian.h9
-rw-r--r--src/big_endian.obin0 -> 1112 bytes
-rw-r--r--src/binding.c286
-rw-r--r--src/bytes.rs175
-rw-r--r--src/helper.c73
-rw-r--r--src/helper.h75
-rw-r--r--src/helper.obin0 -> 1768 bytes
-rw-r--r--src/lib.rs552
-rw-r--r--src/old binding.txt218
-rwxr-xr-xsrc/rep.sobin0 -> 50480 bytes
-rwxr-xr-xsrc/testbin0 -> 33904 bytes
-rw-r--r--src/test.c224
-rw-r--r--src/test.el28
34 files changed, 5605 insertions, 267 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 1ddd0a0..a1409fa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,9 +18,15 @@ members = [ "graph", "receme", "nfa", "chain", "viz", "grammar",
# testing the new resolver, even though this has no dependencies ;p
resolver = "2"
+[lib]
+crate-type = ["cdylib"]
+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
+chain = { path = "./chain" }
+grammar = { path = "./grammar" }
+graph = { path = "./graph" }
[features]
default = []
diff --git a/chain/src/archive.txt b/chain/src/archive.txt
index 030ccba..7bd6f31 100644
--- a/chain/src/archive.txt
+++ b/chain/src/archive.txt
@@ -493,3 +493,50 @@
// PaVi::Empty => Ok(self.root().ok_or(Error::IndexOutOfBounds(0, 0))?),
// }
// }
+
+// /// Find the last child of the given node whose start and end
+// /// positions contain the given position. If no such child is
+// /// found, return `Ok(None)`.
+// ///
+// /// The returned tuple is of the form (child, index), where
+// /// `child` is the index of the child node in question, and
+// /// `index` means that the child is the `index`-th child of the
+// /// node.
+// #[allow(dead_code)]
+// pub(crate) fn position_search(
+// &self,
+// node: usize,
+// pos: usize,
+// ) -> Result<Option<(usize, usize)>, Error> {
+// fn range_contains(label: GrammarLabel, pos: usize) -> bool {
+// let start = label.start();
+//
+// if let Some(end) = label.end() {
+// (start..end).contains(&pos)
+// } else {
+// (start..).contains(&pos)
+// }
+// }
+//
+// let node_label = self
+// .vertex_label(node)?
+// .ok_or(Error::NodeNoLabel(node))?
+// .label();
+//
+// if !range_contains(node_label, pos) {
+// return Ok(None);
+// }
+//
+// for (index, child) in self.children_of(node)?.enumerate().rev() {
+// let child_label = self
+// .vertex_label(child)?
+// .ok_or(Error::NodeNoLabel(child))?
+// .label();
+//
+// if range_contains(child_label, pos) {
+// return Ok(Some((child, index)));
+// }
+// }
+//
+// Ok(None)
+// }
diff --git a/chain/src/default.rs b/chain/src/default.rs
index ed63ff4..7de720f 100644
--- a/chain/src/default.rs
+++ b/chain/src/default.rs
@@ -11,15 +11,19 @@ use crate::item::{
default::DefaultForest, generate_fragment, genins::index_out_of_bounds_conversion, Forest,
ForestLabel, ForestLabelError,
};
-use core::fmt::Display;
use grammar::{GrammarLabel, GrammarLabelType, START_NONTERMINAL, TNT};
+#[allow(unused_imports)]
use graph::{
labelled::DLGBuilder, Builder, DLGraph, Graph, LabelExtGraph, LabelGraph, ParentsGraph,
};
+use std::fmt::Display;
use graph_macro::Graph;
-use std::collections::{HashMap as Map, HashSet, TryReserveError};
+use std::{
+ borrow::Borrow,
+ collections::{HashMap as Map, HashSet, TryReserveError},
+};
/// The errors related to taking derivatives by chain rule.
#[non_exhaustive]
@@ -399,6 +403,10 @@ impl Chain for DefaultChain {
})
}
+ fn atom(&self) -> &Self::Atom {
+ self.atom.borrow()
+ }
+
fn epsilon(&self) -> Result<bool, Self::Error> {
self.accepting_vec
.get(self.current)
@@ -783,146 +791,119 @@ impl Chain for DefaultChain {
Ok(der_iter)
}
- // FIXME: Do nothing more than to make this at the end of input,
- // actually.
- //
- // This means we only close the expansions at the last accepting
- // sources.
- //
- // As to the remaining still open fragments, they should be
- // considered as bugs and should be fixed in the function
- // `insert_item` instead.
-
- fn end_of_input(&mut self) -> Result<(), Self::Error> {
- if !matches!(self.epsilon(), Ok(true)) {
- return Ok(());
- }
-
- self.forest.remove_node(1)?;
-
- let mut stack = Vec::new();
-
- dbg!(&self.accepting_sources);
+ type Item = DefaultForest<ForestLabel<GrammarLabel>>;
- // self.forest.print_viz("dbg forest before.gv").unwrap();
+ fn end_of_input(&mut self, pos: usize, ter: usize) -> Result<Self::Item, Error> {
+ let mut result = Default::default();
- for pavi in self.accepting_sources.iter() {
- match pavi {
- PaVi::Parent(node, _edge, child) => {
- // REVIEW: This is a garbage node that has to be
- // added when the machine starts. Is there a way
- // to avoid this garbage?
- if *node == 1 {
- continue;
- }
+ let root = if let Some(root) = self.forest.root() {
+ root
+ } else {
+ return Ok(result);
+ };
- stack.push(*child);
- }
- PaVi::Virtual(nt, t, node) => {
- let node_label = self
- .forest
- .vertex_label(*node)?
- .ok_or(Error::NodeNoLabel(*node))?;
+ // The node numbered 1 is a dummy node that should be removed.
+ assert_ne!(root, 1);
- if node_label.label().end().is_none() {
- let reduction_fragment = self.atom.generate_virtual_frags(*nt, *t, None);
+ self.forest.remove_node(1)?;
- for frag in reduction_fragment.into_iter().flatten() {
- let mut frag = frag.clone();
- frag.set_pos(node_label.label().start(), true)?;
+ let root_degree = self.forest.degree(root)?;
- stack.push(self.forest.plant_at_start(*node, frag)?);
- }
- }
- }
- PaVi::Empty => {}
- }
- }
+ let root_label = self
+ .forest
+ .vertex_label(root)?
+ .ok_or(Error::NodeNoLabel(root))?;
- let forest_nodes_len = self.forest.nodes_len();
-
- let mut node_ends: Vec<Vec<usize>> = std::iter::repeat_with(Default::default)
- .take(forest_nodes_len)
- .collect();
-
- // NOTE: Use iterator to avoid checking bounds when accessing
- // `node_ends`.
- for (node, node_end) in self.forest.nodes().zip(node_ends.iter_mut()) {
- if let Some(end) = self
- .forest
- .vertex_label(node)?
- .ok_or(Error::NodeNoLabel(node))?
- .label()
- .end()
- {
- node_end.push(end);
- }
- }
+ dbg!(root_degree, root_label);
- #[derive(Copy, Clone)]
- enum StackElement {
- Seen(usize),
- Unseen(usize),
- }
+ // REVIEW: Why nodes?
+ let mut nodes = Vec::with_capacity(root_degree);
- use StackElement::{Seen, Unseen};
+ if root_label.is_packed() {
+ let mut all_completed_clones = Vec::with_capacity(root_degree);
- // maybe we do not need so much space?
- let mut stack = Vec::with_capacity(forest_nodes_len);
+ for clone in self.forest.children_of(root)? {
+ let mut all_completed = true;
- stack.push(Unseen(
- self.forest.root().ok_or(Error::IndexOutOfBounds(0, 0))?,
- ));
+ // The last child does not count.
+ let clone_child_degree_minus_one = std::cmp::max(self.forest.degree(clone)?, 1) - 1;
- let mut seen_nodes = HashSet::with_capacity(forest_nodes_len);
+ // The loop to determine whether or not it is all
+ // completed, except for the last child.
+ 'completion_det_loop: for clone_child in self
+ .forest
+ .children_of(clone)?
+ .take(clone_child_degree_minus_one)
+ {
+ let clone_child_label = self
+ .forest
+ .vertex_label(clone_child)?
+ .ok_or(Error::NodeNoLabel(clone_child))?;
- // depth-first search
- while let Some(top) = stack.pop() {
- match top {
- Seen(top) => {
- // Every of its children should be processed
- // already.
+ if clone_child_label.label().end().is_none() {
+ all_completed = false;
- if !node_ends.get(top).unwrap().is_empty() {
- continue;
+ break 'completion_det_loop;
}
-
- let last_index = self.forest.degree(top).unwrap() - 1;
- let last_child = self.forest.nth_child(top, last_index)?;
-
- *node_ends.get_mut(top).unwrap() = node_ends
- .get(last_child)
- .ok_or(Error::IndexOutOfBounds(last_child, forest_nodes_len))?
- .clone();
}
- Unseen(top) => {
- if seen_nodes.contains(&top) {
- continue;
- }
- seen_nodes.insert(top);
- // NOTE: After the above check we can safely unwrap.
+ if all_completed {
+ all_completed_clones.push(clone);
+ }
+ }
- let mut top_no_children = true;
+ if all_completed_clones.is_empty() {
+ // Nothing to do
+ return Ok(result);
+ }
- for child in self.forest.children_of(top).unwrap() {
- if top_no_children {
- stack.push(Seen(top));
- }
+ for clone in all_completed_clones {
+ nodes.push(
+ self.forest
+ .reduction(clone, pos, ter, self.atom.borrow(), true)?,
+ );
+ }
+ } else if root_label.clone_index().is_some() {
+ panic!(
+ "If the root of the forest is cloned, \
+ the root should be changed to the packed node."
+ );
+ } else {
+ // The last child does not count.
+ let root_degree_minus_one = std::cmp::max(root_degree, 1) - 1;
- top_no_children = false;
+ dbg!(root_degree_minus_one);
- stack.push(Unseen(child));
- }
+ // The loop to determine whether or not it is all
+ // completed, except for the last child.
+ for root_child in self.forest.children_of(root)?.take(root_degree_minus_one) {
+ let root_child_label = self
+ .forest
+ .vertex_label(root_child)?
+ .ok_or(Error::NodeNoLabel(root_child))?;
- if !top_no_children {
- continue;
- }
+ if root_child_label.label().end().is_none() {
+ return Ok(result);
}
}
+
+ nodes.push(
+ self.forest
+ .reduction(root, pos, ter, self.atom.borrow(), true)?,
+ );
}
- Ok(())
+ dbg!(&nodes);
+
+ // self.forest
+ // .print_viz("forest before extraction.gv")
+ // .unwrap();
+
+ result = self.forest.extract(pos)?;
+
+ // result.print_viz("extracted forest.gv").unwrap();
+
+ Ok(result)
}
}
@@ -1051,7 +1032,7 @@ mod test_chain {
chain.chain(0, 12, no_item)?;
if !no_item {
- chain.end_of_input()?;
+ let _output = chain.end_of_input(13, 0)?;
chain.forest.print_viz("forest.gv")?;
chain.graph.print_viz("chain.gv")?;
@@ -1079,6 +1060,36 @@ mod test_chain {
}
#[test]
+ fn test_assumption() -> Result<(), Box<dyn std::error::Error>> {
+ use grammar::Grammar;
+
+ dbg!();
+ let grammar_str = std::fs::read_to_string(
+ "/Users/durand/Desktop/Centre/A propos de programmes/Rust/rep/grammar/abnf grammars/test.abnf",
+ )
+ .unwrap();
+ dbg!();
+ let grammar: Grammar = grammar_str.parse()?;
+
+ let atom = DefaultAtom::from_grammar(grammar)?;
+ let mut chain = DefaultChain::unit(atom)?;
+
+ let no_item = false;
+
+ let input: &[usize] = &[3, 0, 2, 2, 2, 1, 1, 1, 0, 1];
+
+ for (pos, t) in input.iter().copied().enumerate().take(2) {
+ chain.chain(t, pos, no_item)?;
+ chain.forest().print_viz(&format!("forest {pos}.gv"))?;
+ dbg!(pos, t);
+ }
+
+ item::default::print_labels(&chain.atom, &chain.forest)?;
+
+ Ok(())
+ }
+
+ #[test]
fn test_ambiguity() -> Result<(), Box<dyn std::error::Error>> {
if std::fs::metadata("debug.log").is_ok() {
std::fs::remove_file("debug.log")?;
@@ -1125,7 +1136,7 @@ mod test_chain {
chain.chain(1, 4, no_item)?;
if !no_item {
- // chain.end_of_input()?;
+ let _output = chain.end_of_input(5, 1)?;
chain.forest.print_viz("forest.gv")?;
// chain.forest.print_closed_viz("closed.gv")?;
}
diff --git a/chain/src/item/default/extract.rs b/chain/src/item/default/extract.rs
new file mode 100644
index 0000000..96f5119
--- /dev/null
+++ b/chain/src/item/default/extract.rs
@@ -0,0 +1,509 @@
+//! This module defines a function for extracting the "completed part"
+//! of a forest.
+//!
+//! # Completed sub-forest
+//!
+//! The completed part of a forest refers to the sub-forest of the
+//! forest that consists entirely of nodes whose ending positions are
+//! set, and whose subtrees also consist of such nodes.
+//!
+//! # Definition
+//!
+//! To be more rigorous, we call a node *n* of a forest *F* complete
+//! if its ending position is set. We call a sub-forest *S* of *F*
+//! completed if the following two conditions are satisfied:
+//!
+//! 1. Every node in *S* is complete.
+//! 2. For every node *n* in *S*, the subtree of *F* rooted at *n*
+//! consists entirely of complete nodes.
+//!
+//! # Uniqueness of the maximal completed sub-forest
+//!
+//! **Lemma**
+//!
+//! For any forest *F*, there is only one maximal completed sub-forest
+//! *S* of *F*. Here the maximality means that if *T* is a completed
+//! sub-forest of *F* which contains *S*, then *S* is equal to *T*.
+//!
+//! Then by **the completed part** of a forest *F* we refer to the
+//! unique maximal completed sub-forest of *F*.
+//!
+//! **Proof**
+//!
+//! Note that if *S_1* and *S_2* are two maximal completed sub-forests
+//! of *F*, and if *S_1* is not contained in *S_2*, say *n* is a node
+//! in *S_1* but not in *S_2*, then by adjoining the subtree of *S_1*
+//! rooted at *n* to *S_2* we obtain a completed sub-forest *S_3*
+//! which strictly contains *S_2*, contradicting the maximality of
+//! *S_2*.  Thus there can only be one maximal completed sub-forest.
+//!
+//! # Connected component
+//!
+//! The extraction function actually returns the connected component
+//! of the completed part of a forest which contains its root, as that
+//! is what we care about the most.
+
+use super::*;
+
+impl DefaultForest<ForestLabel<GrammarLabel>> {
+ pub(crate) fn extract(&self, pos: usize) -> Result<Self, Error> {
+ // Preparations
+
+ let nodes_len = self.nodes_len();
+
+ let mut graph = PLGraph::default();
+ let mut builder = PLGBuilderMut::from_graph_mut(&mut graph);
+ let root = Some(0);
+
+ // Now find the connected component of the completed part of
+ // `self` and clone that to graph by means of `builder`.
+
+ // REVIEW: Maybe a small_vec can help here.
+
+ // A fixed-size hash table, sort of.
+ let mut validity_array: Vec<bool> = std::iter::repeat(true).take(nodes_len).collect();
+
+ // Calculate the exact length to avoid too many allocations.
+ let mut stack_len = 0usize;
+
+ // Use an iterator to avoid checking bounds in accessing
+ // elements of the array.
+ for (node, validity_ptr) in self.nodes().zip(validity_array.iter_mut()) {
+ if self
+ .vertex_label(node)?
+ .ok_or(Error::NodeNoLabel(node))?
+ .label()
+ .end()
+ .is_none()
+ {
+ *validity_ptr = false;
+
+ stack_len += 1;
+ }
+ }
+
+ dbg!(&validity_array);
+
+ // A stack for propagating the falsehood to parents and
+ // children of incomplete nodes, like a plague. The only
+ // nodes that can stop the spread of this plague are packed
+ // nodes with a child that is not infected (yet) by the
+ // plague.
+
+ let mut stack: Vec<usize> = Vec::with_capacity(stack_len);
+
+ for (n, validity) in validity_array.iter().enumerate() {
+ if !*validity {
+ stack.push(n);
+ }
+ }
+
+ while let Some(top) = stack.pop() {
+ 'parent_loop: for parent in self.parents_of(top)?.map(|p| p.node()) {
+ if !*validity_array
+ .get(parent)
+ .ok_or(Error::IndexOutOfBounds(parent, nodes_len))?
+ {
+ // already infected by the plague
+
+ continue 'parent_loop;
+ }
+
+ let should_spread_p = if self
+ .vertex_label(parent)?
+ .ok_or(Error::NodeNoLabel(parent))?
+ .is_packed()
+ {
+ !self
+ .children_of(parent)?
+ .any(|node| validity_array.get(node).copied() == Some(true))
+ } else {
+ true
+ };
+
+ if should_spread_p {
+ *validity_array
+ .get_mut(parent)
+ .ok_or(Error::IndexOutOfBounds(parent, nodes_len))? = false;
+ stack.push(parent);
+ }
+ }
+ }
+
+ let validity_array = validity_array;
+
+ /// A little macro to produce a vector of valid children.
+ macro_rules! valid_children {
+ ($node:ident) => {
+ self.children_of($node)?
+ .filter(|child| validity_array.get(*child).copied() == Some(true))
+ .collect::<Vec<usize>>()
+ };
+ }
+
+ dbg!(&validity_array);
+
+ if validity_array.iter().all(|validity| !*validity) {
+ // every element is false
+
+ let root = None;
+
+ return Ok(Self { graph, root });
+ }
+
+ // Finally clone the connected component to the new graph.
+
+ let root_label = GrammarLabel::new_closed(TNT::Non(0), 0, pos);
+
+ let packed_label = ForestLabel::new(root_label, ForestLabelType::Packed);
+
+ let plain_label = ForestLabel::new(root_label, ForestLabelType::Plain);
+
+ let original_root_label;
+
+ let original_root = if let Some(packed_node) = self.query_label(packed_label) {
+ original_root_label = packed_label;
+
+ packed_node
+ } else if let Some(plain_node) = self.query_label(plain_label) {
+ original_root_label = plain_label;
+
+ plain_node
+ } else {
+ let root = None;
+ return Ok(Self { graph, root });
+ };
+
+ let mut roots_stack: Vec<usize>;
+
+ if original_root_label.is_packed() {
+ roots_stack = valid_children!(original_root);
+
+ match roots_stack.len() {
+ 0 => {
+ let root = None;
+
+ return Ok(Self { graph, root });
+ }
+ 1 => {
+ let child = *roots_stack.first().unwrap();
+
+ builder.add_vertex(self.vertex_label(child)?.ok_or(Error::NodeNoLabel(child))?);
+ }
+ _ => {
+ let builder_root = builder.add_vertex(original_root_label);
+
+ for child in roots_stack.iter().copied() {
+ let child_node = builder.add_vertex(
+ self.vertex_label(child)?.ok_or(Error::NodeNoLabel(child))?,
+ );
+
+ builder.add_edge(builder_root, child_node, original_root_label)?;
+ }
+ }
+ }
+ } else {
+ builder.add_vertex(original_root_label);
+
+ roots_stack = vec![original_root];
+ }
+
+ while let Some(top) = roots_stack.pop() {
+ let top_label = self.vertex_label(top)?.ok_or(Error::NodeNoLabel(top))?;
+
+ let top_in_builder = match builder.query_label(top_label) {
+ Some(top_node) => top_node,
+ None => {
+ // an old cloned node now becomes a plain node
+ let plain_label = ForestLabel::new(top_label.label(), ForestLabelType::Plain);
+
+ builder
+ .query_label(plain_label)
+ .unwrap_or_else(|| panic!("the top {top} should be planted already"))
+ }
+ };
+
+ 'children_loop: for child in self.children_of(top)? {
+ let child_label = self.vertex_label(child)?.ok_or(Error::NodeNoLabel(child))?;
+
+ // filter out invalid children
+ if validity_array.get(child).copied() != Some(true) {
+ continue 'children_loop;
+ }
+
+ // avoid unnecessary effort
+ if let Some(child_node) = builder.query_label(child_label) {
+ builder.add_edge(top_in_builder, child_node, child_label)?;
+
+ continue 'children_loop;
+ }
+
+ if child_label.is_packed() {
+ let child_valid_children = valid_children!(child);
+
+ match child_valid_children.len() {
+ 0 => {
+ panic!("this case should not happen");
+ }
+ 1 => {
+ // If a packed node only has one valid
+ // child, we clone a plain node instead.
+
+ let child_child = *child_valid_children.first().unwrap();
+
+ let child_plain_label =
+ ForestLabel::new(child_label.label(), ForestLabelType::Plain);
+
+ let child_plain_node = builder.add_vertex(child_plain_label);
+
+ builder.add_edge(
+ top_in_builder,
+ child_plain_node,
+ child_plain_label,
+ )?;
+
+ roots_stack.push(child_child);
+ }
+ _ => {
+ let child_node = builder.add_vertex(child_label);
+
+ builder.add_edge(top_in_builder, child_node, child_label)?;
+
+ roots_stack.push(child);
+ }
+ }
+
+ continue 'children_loop;
+ }
+
+ let child_node = builder.add_vertex(child_label);
+
+ builder.add_edge(top_in_builder, child_node, child_label)?;
+
+ roots_stack.push(child);
+ }
+ }
+
+ Ok(Self { graph, root })
+ }
+}
+
+#[cfg(test)]
+mod extract_tests {
+ use super::*;
+
+ fn construct_test_forest(
+ ) -> Result<DefaultForest<ForestLabel<GrammarLabel>>, Box<dyn std::error::Error>> {
+ // node 0
+ let mut result: DefaultForest<ForestLabel<GrammarLabel>> = DefaultForest::new_leaf(
+ GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Non(0)), 0, 3),
+ );
+
+ // node 1
+ result.plant(
+ 0,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::Rule(15), 0, 3),
+ _
+ ),
+ false,
+ )?;
+
+ result.plant(
+ 1,
+ DefaultForest::new_leaf(GrammarLabel::new_closed(
+ GrammarLabelType::TNT(TNT::Non(0)),
+ 0,
+ 3,
+ )),
+ true,
+ )?;
+
+ // node 2
+ result.plant(
+ 0,
+ leaf!(GrammarLabel::new_closed(GrammarLabelType::Rule(6), 0, 1), _),
+ false,
+ )?;
+
+ // node 3
+ result.plant(
+ 2,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Ter(0)), 0, 1),
+ _
+ ),
+ false,
+ )?;
+
+ // node 4
+ result.plant(
+ 0,
+ leaf!(GrammarLabel::new_closed(GrammarLabelType::Rule(7), 1, 3), _),
+ false,
+ )?;
+
+ // node 5
+ result.plant(
+ 4,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Non(0)), 1, 3),
+ _
+ ),
+ false,
+ )?;
+
+ // node 6
+ result.plant(
+ 5,
+ leaf!(GrammarLabel::new_closed(GrammarLabelType::Rule(3), 1, 2), _),
+ false,
+ )?;
+
+ // node 7
+ result.plant(
+ 6,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Non(1)), 1, 2),
+ _
+ ),
+ false,
+ )?;
+
+ // node 8
+ result.plant(
+ 7,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::Rule(11), 1, 2),
+ _
+ ),
+ false,
+ )?;
+
+ // node 9
+ result.plant(
+ 8,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Ter(2)), 1, 2),
+ _
+ ),
+ false,
+ )?;
+
+ // Clone node 5 to have node 10 and node 11
+ result.clone_node(5, 0, false)?;
+
+ // node 12
+ result.plant(
+ 11,
+ leaf!(GrammarLabel::new_closed(GrammarLabelType::Rule(3), 1, 3), _),
+ false,
+ )?;
+
+ // node 13
+ result.plant(
+ 12,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Non(1)), 1, 3),
+ _
+ ),
+ false,
+ )?;
+
+ result.plant(
+ 13,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::Rule(11), 1, 2),
+ _
+ ),
+ true,
+ )?;
+
+ // node 14
+ result.plant(
+ 13,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::Rule(13), 2, 3),
+ _
+ ),
+ false,
+ )?;
+
+ // node 15
+ result.plant(
+ 14,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Ter(2)), 2, 3),
+ _
+ ),
+ false,
+ )?;
+
+ // node 16
+ result.plant(
+ 5,
+ leaf!(GrammarLabel::new(GrammarLabelType::Rule(4), 2), _),
+ false,
+ )?;
+
+ // node 17
+ result.plant(
+ 16,
+ leaf!(GrammarLabel::new(GrammarLabelType::TNT(TNT::Non(0)), 2), _),
+ false,
+ )?;
+
+ // node 18
+ result.plant(
+ 17,
+ leaf!(GrammarLabel::new_closed(GrammarLabelType::Rule(3), 2, 3), _),
+ false,
+ )?;
+
+ // node 19
+ result.plant(
+ 18,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Non(1)), 2, 3),
+ _
+ ),
+ false,
+ )?;
+
+ // node 20
+ result.plant(
+ 19,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::Rule(11), 2, 3),
+ _
+ ),
+ false,
+ )?;
+
+ result.plant(
+ 20,
+ leaf!(
+ GrammarLabel::new_closed(GrammarLabelType::TNT(TNT::Ter(2)), 2, 3),
+ _
+ ),
+ true,
+ )?;
+
+ Ok(result)
+ }
+
+ #[test]
+ fn test_extract() -> Result<(), Box<dyn std::error::Error>> {
+ let forest = construct_test_forest()?;
+
+ assert_eq!(forest.nodes_len(), 21);
+
+ forest.print_viz("forest before extraction.gv")?;
+
+ let extract_result = forest.extract(3)?;
+
+ extract_result.print_viz("extracted forest.gv")?;
+
+ Ok(())
+ }
+}
diff --git a/chain/src/item/default/mod.rs b/chain/src/item/default/mod.rs
index 47e8c90..28bdbee 100644
--- a/chain/src/item/default/mod.rs
+++ b/chain/src/item/default/mod.rs
@@ -12,7 +12,7 @@ use graph_macro::Graph;
use std::collections::HashSet;
-use core::fmt::Display;
+use std::fmt::Display;
/// The type of errors for forest operations.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Ord, PartialOrd)]
@@ -216,6 +216,15 @@ impl<T: GraphLabel> Forest<T> for DefaultForest<ForestLabel<T>> {
return Err(Error::IndexOutOfBounds(node_id, self.nodes_len()));
}
+ // FIXME: We should take the presence of packed nodes into
+ // consideration. That is, the fragment might have already
+ // been planted and packed, and we need to account for such
+ // possibilities.
+ //
+ // Moreover, the fragments might also contain packed nodes, in
+ // which case we need to check these multiple possibilities
+ // are properly contained.
+
// We do a depth-first traversal to determine if every node
// encountered has the same set of children (labels taken into
// the consideration).
@@ -450,7 +459,7 @@ impl<T: GraphLabel> Forest<T> for DefaultForest<ForestLabel<T>> {
// Make sure it only has one parent, which is the
// representative node.
- assert_eq!(parents.len(), 1);
+ // assert_eq!(parents.len(), 1);
// We checked its length is 1, so it is safe to unwrap
// here.
@@ -670,6 +679,8 @@ impl<T: GraphLabel> DefaultForest<ForestLabel<T>> {
pub mod splone;
+pub mod extract;
+
impl<T: GraphLabel> DefaultForest<T> {
/// Create a forest with only one leaf from a raw label, unlike
/// `new_leaf`, which transforms the label for convenience.
@@ -798,53 +809,6 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
Ok(())
}
- /// Find the last child of the given node whose start and end
- /// positions contain the given position. If no such child is
- /// found, return `Ok(None)`.
- ///
- /// The returned tuple is of the form (child, index), where
- /// `child` is the index of the child node in question, and
- /// `index` means that the child is the `index`-th child of the
- /// node.
- #[allow(dead_code)]
- pub(crate) fn position_search(
- &self,
- node: usize,
- pos: usize,
- ) -> Result<Option<(usize, usize)>, Error> {
- fn range_contains(label: GrammarLabel, pos: usize) -> bool {
- let start = label.start();
-
- if let Some(end) = label.end() {
- (start..end).contains(&pos)
- } else {
- (start..).contains(&pos)
- }
- }
-
- let node_label = self
- .vertex_label(node)?
- .ok_or(Error::NodeNoLabel(node))?
- .label();
-
- if !range_contains(node_label, pos) {
- return Ok(None);
- }
-
- for (index, child) in self.children_of(node)?.enumerate().rev() {
- let child_label = self
- .vertex_label(child)?
- .ok_or(Error::NodeNoLabel(child))?
- .label();
-
- if range_contains(child_label, pos) {
- return Ok(Some((child, index)));
- }
- }
-
- Ok(None)
- }
-
/// Set the position of every node.
///
/// If ALLP is non-nil or if the node is a terminal node, also set
diff --git a/chain/src/item/default/splone.rs b/chain/src/item/default/splone.rs
index 581d1dc..da13c56 100644
--- a/chain/src/item/default/splone.rs
+++ b/chain/src/item/default/splone.rs
@@ -516,7 +516,13 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
if let Some((fragment, _planted)) = fragment {
let modified_degree = std::cmp::max(child_degree, 1) - 1;
- if self.has_same_children(child, node, modified_degree, edge_index)?
+ dbg!(node, end, edge_index, modified_degree);
+
+ dbg!(child, child_degree);
+
+ dbg!(&fragment);
+
+ if self.has_same_children(child, node, modified_degree, edge_index + 1)?
&& child_degree != 0
{
let last_child = self.nth_child(child, child_degree - 1)?;
@@ -651,7 +657,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
} else if labela.clone_index().is_some() {
let mut parentsa = self.parents_of(nodea)?;
- assert_eq!(parentsa.len(), 1);
+ // assert_eq!(parentsa.len(), 1);
let parenta = parentsa.next().unwrap().node();
diff --git a/chain/src/item/genins.rs b/chain/src/item/genins.rs
index 19f835b..e409c3c 100644
--- a/chain/src/item/genins.rs
+++ b/chain/src/item/genins.rs
@@ -32,7 +32,7 @@ pub(crate) fn index_out_of_bounds_conversion(ge: GrammarError) -> Error {
/// Determine if a label is labelled by a terminal.
fn is_labelled_by_terminal(label: GrammarLabelType) -> bool {
- matches!(label.tnt(), Some(t) if matches!(t, TNT::Ter(_)))
+ matches!(label.tnt(), Some(tnt) if matches!(tnt, TNT::Ter(_)))
}
/// A helper function to generate a fragment of forest.
@@ -211,7 +211,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
// Whether or not to print detailed graphs of each step of
// operation for debugging purposes.
- let to_print = false;
+ let to_print = true;
let tnt_string = {
let empty_p = atom_child_iter.len() == 0;
@@ -376,7 +376,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
if let PaVi::Parent(node, edge, _) = pavi {
let nth_child = self.nth_child(node, edge)?;
- let reduced = self.reduction(nth_child, pos, ter, atom.borrow())?;
+ let reduced = self.reduction(nth_child, pos, ter, atom.borrow(), false)?;
if reduced != nth_child && !self.is_empty_node(reduced)? {
parents.clear();
@@ -700,10 +700,14 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
let reduction_fragment = atom.generate_virtual_frags(nt, t, None);
- // NOTE: the case of the root is exceptional
- if reduction_fragment.is_none() && self.root() != Some(node) {
- return Err(Error::CannotClose(nt, t, node, node_label_start));
- }
+ // Maybe we do not have to force the reduction here?
+
+ // // NOTE: the case of the root is exceptional
+ // if reduction_fragment.is_none() && self.root() != Some(node) {
+ // dbg!(self.root());
+ // self.print_viz("cannot close.gv").unwrap();
+ // return Err(Error::CannotClose(nt, t, node, node_label_start));
+ // }
for frag in reduction_fragment.into_iter().flatten() {
let mut frag = frag.clone();
diff --git a/chain/src/item/mod.rs b/chain/src/item/mod.rs
index 54ca946..d2c3127 100644
--- a/chain/src/item/mod.rs
+++ b/chain/src/item/mod.rs
@@ -7,7 +7,7 @@
use graph::{error::Error as GError, GraphLabel, LabelGraph, Parent, ParentsGraph};
-use core::borrow::Borrow;
+use std::borrow::Borrow;
/// A parent or a virtual segment.
///
@@ -96,7 +96,7 @@ enum ForestLabelType {
Cloned(usize),
}
-impl core::fmt::Display for ForestLabelType {
+impl std::fmt::Display for ForestLabelType {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
Self::Packed => write!(f, "packed"),
@@ -113,8 +113,8 @@ pub struct ForestLabel<T: GraphLabel> {
status: ForestLabelType,
}
-impl<T: GraphLabel> core::fmt::Display for ForestLabel<T> {
- fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+impl<T: GraphLabel> std::fmt::Display for ForestLabel<T> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if !matches!(self.status, ForestLabelType::Plain) {
write!(f, "{}, {}", self.label, self.status)
} else {
@@ -132,8 +132,8 @@ pub enum ForestLabelError {
ClonePack,
}
-impl core::fmt::Display for ForestLabelError {
- fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+impl std::fmt::Display for ForestLabelError {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::PackClone => write!(f, "cannot pack a cloned node"),
Self::ClonePack => write!(f, "cannot clone a packed node"),
@@ -230,6 +230,9 @@ impl<T: GraphLabel> From<T> for ForestLabel<T> {
}
}
+// REVIEW: Should we move this trait (and only this trait) to a
+// separate crate, or is it enough to keep it here?
+
/// The expected behaviours of an item derivation forest.
///
/// Note that it requires only a subset of the functionalities of
diff --git a/chain/src/item/reduction.rs b/chain/src/item/reduction.rs
index 89306e6..3c76c1e 100644
--- a/chain/src/item/reduction.rs
+++ b/chain/src/item/reduction.rs
@@ -122,6 +122,9 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
/// The parameter `ter` is used to fill in segments for virtual
/// nodes if they are found along the way.
///
+ /// The parameter `accept_root` controls whether we want to
+ /// perform reduction on the root.
+ ///
/// # Errors
///
/// 1. Index out of bounds: some node number is corrupted.
@@ -132,12 +135,13 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
pos: usize,
ter: usize,
atom: &DefaultAtom,
+ accept_root: bool,
) -> Result<usize, Error> {
let mut result = node;
// step 1: Determine if this needs reductions.
- if self.root() == Some(node) {
+ if !accept_root && self.root() == Some(node) {
return Ok(result);
}
@@ -152,9 +156,30 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
return Ok(result);
}
+ // Data type for representing the status when performing a
+ // search.
+
+ #[derive(PartialEq, Eq, Copy, Clone, Debug)]
+ enum Status {
+ Correct,
+ Incorrect,
+ Visited,
+ }
+
+ impl From<bool> for Status {
+ fn from(value: bool) -> Self {
+ match value {
+ true => Self::Correct,
+ false => Self::Incorrect,
+ }
+ }
+ }
+
+ use Status::*;
+
// step 2: Find descendents that need reductions.
- let mut correct_ends: Map<usize, bool> = Default::default();
+ let mut correct_ends: Map<usize, Status> = Default::default();
let mut order_of_correct_ends: Vec<usize> = Vec::new();
@@ -165,7 +190,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
use std::{borrow::BorrowMut, io::Write};
// Whether or not to write a debug file.
- let to_write = false;
+ let to_write = true;
if to_write {
if let Ok(ref mut file) = file.borrow_mut() {
@@ -181,14 +206,18 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
}
}
- if correct_ends.get(&top).is_some() {
+ let old_value = correct_ends.get(&top).copied();
+
+ if matches!(old_value, Some(Correct) | Some(Incorrect)) {
continue 'stack_loop;
}
+ correct_ends.insert(top, Visited);
+
let top_label = self.vertex_label(top)?.ok_or(Error::NodeNoLabel(top))?;
if let Some(end) = top_label.label().end() {
- correct_ends.insert(top, end == pos);
+ correct_ends.insert(top, (end == pos).into());
continue 'stack_loop;
}
@@ -216,8 +245,8 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
'child_loop: for child in self.children_of(top)? {
match correct_ends.get(&child).copied() {
- Some(true) => {
- correct_ends.insert(top, true);
+ Some(Correct) => {
+ correct_ends.insert(top, Correct);
inserted = true;
}
@@ -232,12 +261,15 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
if has_unexplored_children {
stack.push(top);
- stack.extend(self.children_of(top)?);
+ stack.extend(
+ self.children_of(top)?
+ .filter(|child| correct_ends.get(child).is_none()),
+ );
} else if !inserted {
- correct_ends.insert(top, false);
+ correct_ends.insert(top, Incorrect);
}
} else {
- correct_ends.insert(top, false);
+ correct_ends.insert(top, Incorrect);
}
continue 'stack_loop;
@@ -245,6 +277,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
if top_label.is_packed() {
let mut has_unexplored_children = false;
+ let mut inserted = false;
if to_write {
if let Ok(ref mut file) = file.borrow_mut() {
@@ -255,12 +288,13 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
for child in self.children_of(top)? {
match correct_ends.get(&child).copied() {
- Some(true) => {
+ Some(Correct) => {
// NOTE: This is not recorded in the
// correct orders, as we do not handle
// packed nodes directly.
- correct_ends.insert(top, true);
+ correct_ends.insert(top, Correct);
+ inserted = true;
}
None => {
has_unexplored_children = true;
@@ -280,9 +314,12 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
if has_unexplored_children {
stack.push(top);
- stack.extend(self.children_of(top)?);
- } else if correct_ends.get(&top).is_none() {
- correct_ends.insert(top, false);
+ stack.extend(
+ self.children_of(top)?
+ .filter(|child| correct_ends.get(child).is_none()),
+ );
+ } else if !inserted {
+ correct_ends.insert(top, Incorrect);
}
continue 'stack_loop;
@@ -309,7 +346,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
panic!("a terminal node {top} has no ending position?");
}
Some(TNT::Non(nt)) => {
- correct_ends.insert(top, true);
+ correct_ends.insert(top, Correct);
self.close_pavi(atom.borrow(), PaVi::Virtual(nt, ter, top), pos)?;
@@ -322,7 +359,7 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
Some(end) => end,
};
- correct_ends.insert(top, end == pos);
+ correct_ends.insert(top, (end == pos).into());
if end == pos {
order_of_correct_ends.push(top);
@@ -341,10 +378,12 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
let last_child = self.nth_child(top, last_index)?;
if let Some(child_correctness_value) = correct_ends.get(&last_child).copied() {
- correct_ends.insert(top, child_correctness_value);
+ if child_correctness_value != Visited {
+ correct_ends.insert(top, child_correctness_value);
- if child_correctness_value {
- order_of_correct_ends.push(top);
+ if child_correctness_value == Correct {
+ order_of_correct_ends.push(top);
+ }
}
} else {
stack.extend([top, last_child]);
@@ -375,12 +414,6 @@ impl DefaultForest<ForestLabel<GrammarLabel>> {
#[cfg(debug_assertions)]
{
- let label_label_label = label.label().label();
-
- assert!(label_label_label.rule().is_none());
-
- assert!(matches!(label_label_label.tnt(), Some(TNT::Non(_))));
-
assert!(label.label().end().is_none());
assert_ne!(degree, 0);
diff --git a/chain/src/lib.rs b/chain/src/lib.rs
index 9de1df7..bf3a8b4 100644
--- a/chain/src/lib.rs
+++ b/chain/src/lib.rs
@@ -240,6 +240,10 @@ pub trait Chain: LabelExtGraph<Edge> {
/// language.
fn unit(atom: Self::Atom) -> Result<Self, Self::Error>;
+ /// Produce the underlying atom, so that we can produce an initial
+ /// empty chain from the same atom again.
+ fn atom(&self) -> &Self::Atom;
+
/// Return true if and only if the language contains the empty
/// string.
fn epsilon(&self) -> Result<bool, Self::Error>;
@@ -283,7 +287,8 @@ pub trait Chain: LabelExtGraph<Edge> {
/// The argument `t` is the terminal we computet the derivative
/// with.
///
- /// The argument `pos` is the position within the input.
+ /// The argument `pos` is the zero-based position within the
+ /// input.
///
/// The argument `no_item` determines whether we want the item
/// derivation forest as well.
@@ -307,9 +312,29 @@ pub trait Chain: LabelExtGraph<Edge> {
Ok(())
}
+ // FIXME: I shall probably not use the trait of forests for this
+ // purpose, but I have not figured out yet what the correct trait
+ // should be.
+ /// The type of output item that will be produced by this machine.
+ type Item: item::Forest<grammar::GrammarLabel>;
+
/// Signal to the parser that the end of the input is reached, so
/// that the parser knows to generate suitable forests.
- fn end_of_input(&mut self) -> Result<(), Self::Error>;
+ ///
+ /// This is not needed when recognizing instead of parsing.
+ ///
+ /// We also pass in the current position `pos` so that we have a
+ /// little flexibility in the position of the end of input. In
+ /// addition, this makes it easier to produce the suitable
+ /// forests.
+ ///
+ /// As a reminder, `pos` should be the position of the last
+ /// terminal, so if there are three terminals in total, the `pos`
+ /// should be `2`, instead of `3`.
+ ///
+ /// Similarly, we pass in the terminal at the position `pos` to
+ /// aid the production of the forest.
+ fn end_of_input(&mut self, pos: usize, ter: usize) -> Result<Self::Item, Self::Error>;
}
pub mod default;
diff --git a/grammar/src/abnf.rs b/grammar/src/abnf.rs
deleted file mode 100644
index eb56819..0000000
--- a/grammar/src/abnf.rs
+++ /dev/null
@@ -1,37 +0,0 @@
-//! This file implements the function to read a grammar from a string,
-//! in the [augmented Backus Naur
-//! form](https://en.wikipedia.org/wiki/Augmented_Backus–Naur_form).
-//! See [RFC5234](https://www.rfc-editor.org/rfc/rfc5234) for its
-//! specifications.
-//!
-//! In fact, the rules of ABNF are considered to be too strict. For
-//! example, only ASCII characters are allowed in the format, even
-//! within comments. I think any character should be allowed within
-//! comments, so the implementation here does that.
-
-use super::*;
-
-/// The type of errors encountered during parsing a grammar in the
-/// ABNF format. See the function [`abnf_to_grammar`] for the parsing
-/// function.
-#[derive(Debug, Clone, Copy)]
-pub enum Error {
- /// Something invalid has occurred.
- Invalid,
-}
-
-impl core::fmt::Display for Error {
- fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
- match self {
- Error::Invalid => write!(f, "invalid"),
- }
- }
-}
-
-#[allow(unused)]
-/// This function converts a string to a grammar.
-pub fn abnf_to_grammar(str: impl AsRef<str>) -> Result<Grammar, Error> {
- let str = str.as_ref();
-
- todo!()
-}
diff --git a/grammar/src/abnf/boolean_fns.rs b/grammar/src/abnf/boolean_fns.rs
new file mode 100644
index 0000000..08f317c
--- /dev/null
+++ b/grammar/src/abnf/boolean_fns.rs
@@ -0,0 +1,42 @@
+//! This file collects some functions that return a boolean value from
+//! a character.
+
+pub(super) fn is_wsp(c: char) -> bool {
+ c == ' ' || c == '\t'
+}
+
+pub(crate) fn is_v_char(c: char) -> bool {
+ ('!'..='~').contains(&c)
+}
+
+pub(super) fn is_wsp_or_v_char(c: char) -> bool {
+ is_wsp(c) || is_v_char(c)
+}
+
+pub(super) fn is_not_newline(c: char) -> bool {
+ c != '\n' && c != '\r'
+}
+
+pub(super) fn is_alpha(c: char) -> bool {
+ c.is_ascii_alphabetic()
+}
+
+pub(super) fn is_decimal_digit(c: char) -> bool {
+ c.is_ascii_digit()
+}
+
+pub(super) fn is_binary_digit(c: char) -> bool {
+ c == '0' || c == '1'
+}
+
+pub(super) fn is_hexadecimal_digit(c: char) -> bool {
+ is_decimal_digit(c) || ('A'..='F').contains(&c) || ('a'..='f').contains(&c)
+}
+
+pub(super) fn is_alpha_or_digit_or_hyphen(c: char) -> bool {
+ is_alpha(c) || is_decimal_digit(c) || c == '-'
+}
+
+pub(super) fn is_char_value(c: char) -> bool {
+ (' '..='~').contains(&c) && c != '"'
+}
diff --git a/grammar/src/abnf/mod.rs b/grammar/src/abnf/mod.rs
new file mode 100644
index 0000000..f825bef
--- /dev/null
+++ b/grammar/src/abnf/mod.rs
@@ -0,0 +1,2536 @@
+//! This file implements the function to read a grammar from a string,
+//! in the [augmented Backus Naur
+//! form](https://en.wikipedia.org/wiki/Augmented_Backus–Naur_form).
+//! See [RFC5234](https://www.rfc-editor.org/rfc/rfc5234) for its
+//! specifications.
+//!
+//! In fact, the rules of ABNF are considered to be too strict. For
+//! example, only ASCII characters are allowed in the format, even
+//! within comments. I think any character should be allowed within
+//! comments, so the implementation here does that.
+
+#![allow(dead_code)]
+
+use super::*;
+
+/// The type of errors encountered during parsing a grammar in the
+/// ABNF format.
+#[derive(Debug)]
+#[non_exhaustive]
+pub enum Error {
+ /// A rule name without rule body.
+ HangingRuleName(usize),
+ /// The minimum is greater than maximum?
+ MinGreaterThanMax(usize),
+ /// A digit is expected, but is not encountered.
+ InvalidDigit(usize),
+ /// A character that is not allowed.
+ InvalidCharacter(usize),
+ /// A numeric value is not completed.
+ HangingNumeric(usize),
+ /// A repetition is hanging.
+ HangingRepetition(usize),
+ /// A double quote is hanging.
+ HangingDQuote(usize),
+ /// A percentage sign is hanging.
+ HangingPercent(usize),
+ /// An equal sign is hanging.
+ HangingEqualsSign(usize),
+ /// An operation is unsupported.
+ UnsupportedOperation,
+ /// Encounters an unknown node when constructing the regular
+ /// expression.
+ RegexUnknownNode(usize),
+ /// An index is out of bounds when constructing the regular
+ /// expression.
+ RegexIndexOutOfBounds(usize, usize),
+ /// A node is duplicated when constructing the regular expression.
+ RegexDuplicatedNode(usize),
+ /// An edge is duplicated when constructing the regular
+ /// expression.
+ RegexDuplicatedEdge(usize, usize),
+ /// The constructed regular expression graph has no root.
+ RegexNoRoot,
+ /// The constructed regular expression graph has cycles.
+ RegexCycle,
+ /// The stack illegally becomes empty when constructing a regular
+ /// expression.
+ RegexEmptyStack(usize),
+ /// The action stack illegally becomes empty when constructing a
+ /// regular expression.
+ RegexEmptyActionStack(usize),
+ /// An unbalanced parenthesis is encountered when constructing a
+ /// regular expression.
+ RegexUnbalancedParen(usize),
+ /// An invalid repetition specification is encountered when
+ /// constructing a regular expression.
+ RegexInvalidRepeat(usize),
+ /// An invalid comment character is encountered.
+ RegexInvalidComment(usize),
+ /// An invalid numeric value is encountered.
+ RegexInvalidNumeric(usize),
+ /// A rule should have an equal sign.
+ MissingEqualSign(usize),
+ /// An unimplemented feature is required.
+ Unimplemented(usize),
+ /// Input-Output error.
+ IOError(std::io::Error),
+ /// The grammar is empty.
+ EmptyGrammar,
+ /// Found an appending rule associated to a non-existent rule.
+ AppendToNothing(usize),
+ /// Found a rule declaration associated to an existing rule.
+ CreateAgain(usize),
+ /// This terminal is not found.
+ UnknownTerminal(String),
+ /// This non-terminal is not found.
+ UnknownNonterminal(String),
+ /// Something invalid has occurred.
+ Invalid,
+}
+
+impl std::error::Error for Error {}
+
+impl From<std::io::Error> for Error {
+ fn from(e: std::io::Error) -> Self {
+ Self::IOError(e)
+ }
+}
+
+impl From<graph::error::Error> for Error {
+ fn from(value: graph::error::Error) -> Self {
+ match value {
+ graph::error::Error::IndexOutOfBounds(index, bound) => {
+ Error::RegexIndexOutOfBounds(index, bound)
+ }
+ graph::error::Error::DuplicatedNode(n) => Error::RegexDuplicatedNode(n),
+ graph::error::Error::DuplicatedEdge(from, to) => Error::RegexDuplicatedEdge(from, to),
+ _ => Error::Invalid,
+ }
+ }
+}
+
+impl std::fmt::Display for Error {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Error::HangingRuleName(pos) => write!(f, "a rule name without body at {pos}"),
+ Error::MinGreaterThanMax(pos) => write!(
+ f,
+ "minimum should not be strictly greater than the maximum at {pos}."
+ ),
+ Error::InvalidDigit(pos) => {
+ write!(f, "a digit is expected at {pos}, but is not found.")
+ }
+ Error::InvalidCharacter(pos) => write!(f, "the character at {pos} is not allowed"),
+ Error::HangingNumeric(pos) => write!(f, "a numeric value is hanging at {pos}"),
+ Error::HangingRepetition(pos) => write!(f, "a repetition is hanging at {pos}"),
+ Error::HangingDQuote(pos) => write!(f, "a double quote is hanging at {pos}"),
+ Error::HangingPercent(pos) => write!(f, "a percentage sign is hanging at {pos}"),
+ Error::HangingEqualsSign(pos) => write!(f, "an equal sign is hanging at {pos}"),
+ Error::UnsupportedOperation => write!(f, "an unsupported operation is encountered"),
+ Error::RegexUnknownNode(n) => write!(f, "an unknown node {n} is encountered \
+ when constructing the regular expression."),
+ Error::RegexIndexOutOfBounds(index, bound) => write!(f, "an index {index} is out of \
+ bound {bound} when constructing the regular expression."),
+ Error::RegexDuplicatedNode(n) => write!(f, "a node {n} is duplicated \
+ when constructing the regular expression."),
+ Error::RegexDuplicatedEdge(from, to) => write!(f, "an edge from {from} to {to} is duplicated \
+ when constructing the regular expression."),
+ Error::RegexNoRoot => write!(f, "the constructed regular expression has no root."),
+ Error::RegexCycle => write!(f, "the constructed regular expression has cycles."),
+ Error::RegexEmptyStack(pos) => write!(f, "the stack illegally becomes empty at {pos} \
+ when constructing a regular expression."),
+ Error::RegexEmptyActionStack(pos) => write!(f, "the action stack illegally becomes empty at {pos} \
+ when constructing a regular expression."),
+ Error::RegexUnbalancedParen(pos) => write!(f, "An unbalanced parenthesis is encountered at {pos} \
+ when constructing a regular expression."),
+ Error::RegexInvalidRepeat(pos) => write!(f, "An invalid repetition specification is encountered \
+ at {pos} when constructing a regular expression."),
+ Error::RegexInvalidComment(pos) => write!(f, "An invalid character is encountered in a comment \
+ at {pos}"),
+ Error::RegexInvalidNumeric(pos) => write!(f, "An invalid numeric value is encoutnered at {pos}."),
+ Error::MissingEqualSign(pos) => write!(f, "A rule is missing an equal sign at {pos}"),
+ Error::Unimplemented(pos) => write!(f, "An unimplemented feature is required at {pos}."),
+ Error::IOError(e) => write!(f, "An IO error: {e}"),
+ Error::EmptyGrammar => write!(f, "the grammar is empty"),
+ Error::AppendToNothing(pos) => write!(f, "the rule at {pos} is appending to a non-existing rule."),
+ Error::CreateAgain(pos) => write!(f, "the rule at {pos} is creating a duplicated rule."),
+ Error::UnknownTerminal(name) => write!(f, "the terminal {name} is not found."),
+ Error::UnknownNonterminal(name) => write!(f, "the non-terminal {name} is not found."),
+ Error::Invalid => write!(f, "invalid"),
+ }
+ }
+}
+
+/// Return the number of bytes taken by the characters for which the
+/// function `f` returns `true`.
+fn skip_by(s: &str, f: impl Fn(char) -> bool) -> usize {
+ for (index, c) in s.char_indices() {
+ if !f(c) {
+ return index;
+ }
+ }
+
+ s.len()
+}
+
+mod boolean_fns;
+
+pub(crate) use boolean_fns::*;
+
+fn skip_c_nl(s: &str) -> Option<usize> {
+ if s.is_empty() {
+ return Some(0);
+ }
+
+ let s_len = s.len();
+
+ match s.chars().next().unwrap() {
+ ';' => {
+ let after_not_newline = skip_by(&s[1..], is_not_newline);
+
+ if 1 + after_not_newline >= s_len {
+ Some(s_len)
+ } else {
+ let s = &s[1 + after_not_newline..];
+
+ match s.chars().next().unwrap() {
+ '\n' | '\r' => Some(after_not_newline + 2),
+ _ => {
+ // We skipped anything not newline, so this
+ // character must be newline.
+ unreachable!();
+ }
+ }
+ }
+ }
+ '\n' | '\r' => Some(1),
+ _ => None,
+ }
+}
+
+fn skip_c_wsp(s: &str) -> usize {
+ let mut currently_skipped = 0usize;
+ let mut s = s;
+
+ let mut continue_skipping = true;
+
+ 'skipping_loop: while continue_skipping {
+ continue_skipping = false;
+
+ let skip_wsp = skip_by(s, is_wsp);
+
+ if skip_wsp > 0 {
+ if skip_wsp >= s.len() {
+ return currently_skipped + s.len();
+ }
+
+ continue_skipping = true;
+ s = &s[skip_wsp..];
+ currently_skipped += skip_wsp;
+
+ continue 'skipping_loop;
+ }
+
+ if let Some(skip_c_nl_size) = skip_c_nl(s) {
+ if skip_c_nl_size >= s.len() {
+ return currently_skipped + s.len();
+ }
+
+ continue_skipping = true;
+
+ s = &s[skip_c_nl_size..];
+ currently_skipped += skip_c_nl_size;
+
+ let skip_wsp_after_c_nl = skip_by(s, is_wsp);
+
+ s = &s[skip_wsp_after_c_nl..];
+ currently_skipped += skip_wsp_after_c_nl;
+ }
+ }
+
+ currently_skipped
+}
+
+#[derive(Debug, Eq, PartialEq)]
+enum RepetitionSpec {
+ Star,
+ Equal(usize),
+ Min(usize),
+ Max(usize),
+ MinMax(usize, usize),
+}
+
+#[derive(Debug)]
+enum RepetitionAction {
+ /// Ignore the following element.
+ Ignore,
+ /// The following is put in a star.
+ Star,
+ /// The following is put in an optional construct.
+ Optional,
+ /// The following is put in a plus construct.
+ Plus,
+ /// Copy the following some number of times.
+ Copy(usize),
+ /// Copy the following some number of times, and then add a plus
+ /// of that same element.
+ CopyPlus(usize),
+ /// Copy the following some number of times, and then add another
+ /// number of *nested* optional constructs of that same element.
+ ///
+ /// # Why nested?
+ ///
+ /// The optional constructs are nested here as a sequence of
+ /// optional constructs is exponentially more ambiguous than the
+ /// nested equivalent: "a?a?a?" is more ambiguous than
+ /// "(a(a(a)?)?)?". For example, the input "a" can have three
+ /// derivations according to "a?a?a?", but has only one derivation
+ /// according to "(a(a(a)?)?)?".
+ CopyOptional(usize, usize),
+}
+
+impl TryFrom<Option<RepetitionSpec>> for RepetitionAction {
+ type Error = Error;
+
+ fn try_from(spec: Option<RepetitionSpec>) -> Result<Self, Self::Error> {
+ if let Some(r) = spec {
+ match r {
+ RepetitionSpec::Star => Ok(Self::Star),
+ RepetitionSpec::Equal(0) => Ok(Self::Ignore),
+ RepetitionSpec::Equal(n) => Ok(Self::Copy(n)),
+ RepetitionSpec::Min(0) => Ok(Self::Star),
+ RepetitionSpec::Min(1) => Ok(Self::Plus),
+ RepetitionSpec::Min(n) => Ok(Self::CopyPlus(n - 1)),
+ RepetitionSpec::Max(0) => Ok(Self::Ignore),
+ RepetitionSpec::Max(1) => Ok(Self::Optional),
+ RepetitionSpec::Max(n) => Ok(Self::CopyOptional(0, n)),
+ RepetitionSpec::MinMax(0, 0) => Ok(Self::Ignore),
+ RepetitionSpec::MinMax(0, 1) => Ok(Self::Optional),
+ RepetitionSpec::MinMax(n, m) if n > m => Err(Error::MinGreaterThanMax(0)),
+ RepetitionSpec::MinMax(n, m) if n == m => Ok(Self::Copy(n)),
+ RepetitionSpec::MinMax(n, m) => Ok(Self::CopyOptional(n, m - n)),
+ }
+ } else {
+ Ok(Self::Copy(1))
+ }
+ }
+}
+
+fn parse_repetition_spec(s: &str) -> Option<(usize, RepetitionSpec)> {
+ let mut s = s;
+
+ let digit_size = skip_by(s, is_decimal_digit);
+
+ let mut min: Option<usize> = None;
+ let mut max: Option<usize> = None;
+
+ if digit_size != 0 {
+ min = Some(s[..digit_size].parse::<usize>().unwrap());
+ }
+
+ let mut current_index = digit_size;
+
+ if digit_size == s.len() {
+ if let Some(m) = min {
+ return Some((digit_size, RepetitionSpec::Equal(m)));
+ } else {
+ return None;
+ };
+ }
+
+ s = &s[current_index..];
+
+ if s.starts_with('*') {
+ if s.len() == 1 {
+ if let Some(m) = min {
+ return Some((current_index + 1, RepetitionSpec::Min(m)));
+ } else {
+ return Some((current_index + 1, RepetitionSpec::Star));
+ }
+ }
+
+ s = &s[1..];
+ current_index += 1;
+
+ let digit_size = skip_by(s, is_decimal_digit);
+
+ if digit_size != 0 {
+ max = Some(s[..digit_size].parse::<usize>().unwrap());
+ }
+
+ current_index += digit_size;
+
+ match (min, max) {
+ (Some(mi), Some(ma)) => Some((current_index, RepetitionSpec::MinMax(mi, ma))),
+ (Some(m), None) => Some((current_index, RepetitionSpec::Min(m))),
+ (None, Some(m)) => Some((current_index, RepetitionSpec::Max(m))),
+ _ => Some((current_index, RepetitionSpec::Star)),
+ }
+ } else {
+ min.map(|m| (current_index, RepetitionSpec::Equal(m)))
+ }
+}
+
+fn parse_rulename(s: &str) -> Result<usize, Error> {
+ let first_char = s.chars().next().unwrap();
+
+ if !is_alpha(first_char) {
+ return Ok(0);
+ }
+
+ let rulename_end = skip_by(s, is_alpha_or_digit_or_hyphen);
+
+ if rulename_end >= s.len() {
+ // dbg!();
+ return Err(Error::HangingRuleName(0));
+ }
+
+ Ok(rulename_end)
+}
+
+#[derive(Debug, Eq, PartialEq, Hash)]
+enum IntermediateConstruct {
+ Terminal(String),
+ Nonterminal(String),
+ Range(char, char),
+}
+
+impl Display for IntermediateConstruct {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ IntermediateConstruct::Terminal(t) => write!(f, "t{t}"),
+ IntermediateConstruct::Nonterminal(n) => write!(f, "n{n}"),
+ IntermediateConstruct::Range(min, max) => write!(f, "[{min}-{max}]"),
+ }
+ }
+}
+
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+enum Radix {
+ Binary,
+ Decimal,
+ Hexadecimal,
+}
+
+fn parse_numeric(s: &str, radix: Radix) -> Result<(usize, IntermediateConstruct), Error> {
+ // first read a number
+
+ let is_radix_digit = match radix {
+ Radix::Binary => is_binary_digit,
+ Radix::Decimal => is_decimal_digit,
+ Radix::Hexadecimal => is_hexadecimal_digit,
+ };
+
+ let radix = match radix {
+ Radix::Binary => 2,
+ Radix::Decimal => 10,
+ Radix::Hexadecimal => 16,
+ };
+
+ let mut s = s;
+ let mut current_index = 0;
+
+ let first_digit_size = skip_by(s, is_radix_digit);
+
+ if first_digit_size == 0 {
+ // dbg!();
+ return Err(Error::InvalidDigit(current_index));
+ }
+
+ let first_number =
+ match std::char::from_u32(u32::from_str_radix(&s[..first_digit_size], radix).unwrap()) {
+ Some(c) => c,
+ None => {
+ // dbg!();
+ return Err(Error::InvalidCharacter(current_index));
+ }
+ };
+
+ if first_digit_size >= s.len() {
+ return Ok((
+ current_index + s.len(),
+ IntermediateConstruct::Terminal(first_number.to_string()),
+ ));
+ }
+
+ s = &s[first_digit_size..];
+
+ current_index += first_digit_size;
+
+ // then read a dot or a dash
+
+ match s.chars().next().unwrap() {
+ '.' => {
+ s = &s[1..];
+ current_index += 1;
+
+ let mut stop_reading = false;
+ let mut chars_vec = vec![first_number];
+
+ while !stop_reading {
+ if s.is_empty() {
+ return Err(Error::HangingNumeric(current_index - 1));
+ }
+
+ let digit_size = skip_by(s, is_radix_digit);
+
+ if digit_size == 0 {
+ // dbg!();
+ return Err(Error::InvalidDigit(current_index));
+ }
+
+ let a_number =
+ std::char::from_u32(u32::from_str_radix(&s[..digit_size], radix).unwrap())
+ .ok_or(Error::InvalidCharacter(current_index))?;
+
+ chars_vec.push(a_number);
+
+ s = &s[digit_size..];
+ current_index += digit_size;
+
+ if !s.starts_with('.') {
+ stop_reading = true;
+ // println!(
+ // "digit_size = {digit_size} a number = {a_number}, s = {}",
+ // &s[..10]
+ // );
+ } else {
+ s = &s[1..];
+ current_index += 1;
+ }
+ }
+
+ Ok((
+ current_index,
+ IntermediateConstruct::Terminal(chars_vec.into_iter().collect()),
+ ))
+ }
+ '-' => {
+ s = &s[1..];
+ current_index += 1;
+
+ if s.is_empty() {
+ return Err(Error::HangingNumeric(current_index - 1));
+ }
+
+ let second_digit_size = skip_by(s, is_radix_digit);
+
+ if second_digit_size == 0 {
+ println!(
+ "s = {}",
+ s.chars()
+ .take(if s.len() >= 50 { 50 } else { s.len() })
+ .collect::<String>()
+ );
+ return Err(Error::InvalidDigit(current_index));
+ }
+
+ let _second_number =
+ std::char::from_u32(u32::from_str_radix(&s[..second_digit_size], radix).unwrap())
+ .ok_or(Error::InvalidCharacter(current_index))?;
+
+ Err(Error::Unimplemented(current_index - 1))
+
+ // Ok((
+ // current_index + second_digit_size,
+ // IntermediateConstruct::Range(first_number, second_number),
+ // ))
+ }
+ c if is_wsp(c) || c == '\n' || c == '\r' || c == ';' => Ok((
+ current_index,
+ IntermediateConstruct::Terminal(first_number.to_string()),
+ )),
+ _ => Err(Error::InvalidCharacter(current_index)),
+ }
+}
+
+#[derive(Debug)]
+struct AlternationConstruct {
+ intermediates: Vec<IntermediateConstruct>,
+ regex: DefaultRegex<usize>,
+}
+
+fn parse_alternation(s: &str, indentation: usize) -> Result<(usize, AlternationConstruct), Error> {
+ let mut stack: Vec<usize> = vec![0, 1];
+ let mut intermediates: Vec<IntermediateConstruct> = Vec::new();
+ let mut types: Vec<RegexType<usize>> = vec![RegexType::Or, RegexType::Empty];
+ let mut edges: Vec<Vec<usize>> = vec![vec![1], Vec::new()];
+
+ fn error_conversion(error: nfa::error::Error) -> Error {
+ match error {
+ nfa::error::Error::UnknownNode(n) => Error::RegexUnknownNode(n),
+ nfa::error::Error::UnsupportedOperation => Error::UnsupportedOperation,
+ nfa::error::Error::NoRoot => Error::RegexNoRoot,
+ nfa::error::Error::Graph(ge) => ge.into(),
+ nfa::error::Error::Cycle => Error::RegexCycle,
+ _ => Error::Invalid,
+ }
+ }
+
+ let mut s = s;
+ let mut current_index = 0usize;
+
+ let mut nonterminal_nodes: HashSet<usize> = HashSet::new();
+
+ if s.is_empty() {
+ let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?;
+
+ return Ok((
+ current_index,
+ AlternationConstruct {
+ intermediates,
+ regex,
+ },
+ ));
+ }
+
+ // A vector of tuples to record some information for each level of
+ // nesting.
+ //
+ // A tuple (len, action) means that the stack grows by LEN many
+ // tokens at this level, and the action ACTION is pending.
+ let mut repetition_action_stack: Vec<(usize, Option<RepetitionAction>)> = vec![];
+
+ while !stack.is_empty() {
+ // First read repetition specifications.
+ let mut repetition_spec = None;
+
+ if let Some((spec_size, spec)) = parse_repetition_spec(s) {
+ if spec_size >= s.len() {
+ return Err(Error::HangingRepetition(current_index));
+ }
+
+ s = &s[spec_size..];
+ current_index += spec_size;
+
+ repetition_spec = Some(spec);
+ }
+
+ let repetition_spec = repetition_spec;
+
+ // if a rulename
+ match parse_rulename(s) {
+ Ok(0) => {}
+ Ok(size) => {
+ let name = &s[..size].to_string();
+
+ intermediates.push(IntermediateConstruct::Nonterminal(name.clone()));
+
+ let nt_index = intermediates.len() - 1;
+
+ let action = repetition_spec.try_into()?;
+ // println!("rep_action = {action:?}");
+
+ let stack_last = stack
+ .iter()
+ .copied()
+ .last()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ match action {
+ RepetitionAction::Ignore => {}
+ RepetitionAction::Star => {
+ types.push(RegexType::Kleene);
+ types.push(RegexType::Lit(nt_index));
+
+ edges.push(vec![types.len() - 1]);
+ edges.push(Vec::new());
+
+ nonterminal_nodes.insert(types.len() - 1);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+ }
+ RepetitionAction::Optional => {
+ types.append(&mut vec![RegexType::Optional]);
+ types.push(RegexType::Lit(nt_index));
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ nonterminal_nodes.insert(types.len() - 1);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+ }
+ RepetitionAction::Plus => {
+ types.append(&mut vec![RegexType::Plus]);
+ types.push(RegexType::Lit(nt_index));
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ nonterminal_nodes.insert(types.len() - 1);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+ }
+ RepetitionAction::Copy(n) => {
+ let old_types_len = types.len();
+
+ nonterminal_nodes.extend((0..n).map(|i| old_types_len + i));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| old_types_len + i));
+
+ types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ // for _ in 0..n {
+ // types.push(RegexType::Lit(nt_index));
+ // edges.push(vec![]);
+
+ // nonterminal_nodes.insert(types.len() - 1);
+
+ // edges[stack_last].push(types.len() - 1);
+ // }
+ }
+ RepetitionAction::CopyPlus(n) => {
+ let old_types_len = types.len();
+
+ nonterminal_nodes.extend((0..n).map(|i| old_types_len + i));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| old_types_len + i));
+
+ types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ // for _ in 0..n {
+ // types.push(RegexType::Lit(nt_index));
+ // edges.push(vec![]);
+
+ // edges[stack_last].push(types.len() - 1);
+ // }
+
+ types.append(&mut vec![RegexType::Plus]);
+ types.push(RegexType::Lit(nt_index));
+
+ nonterminal_nodes.insert(types.len() - 1);
+
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+ }
+ RepetitionAction::CopyOptional(n, m) => {
+ let old_types_len = types.len();
+
+ nonterminal_nodes.extend((0..n).map(|i| old_types_len + i));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| old_types_len + i));
+
+ types.extend(std::iter::repeat(nt_index).map(RegexType::Lit).take(n));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ // for _ in 0..n {
+ // types.push(RegexType::Lit(nt_index));
+ // edges.push(vec![]);
+
+ // nonterminal_nodes.insert(types.len() - 1);
+
+ // edges[stack_last].push(types.len() - 1);
+ // }
+
+ let types_len = types.len();
+
+ types.extend(
+ std::iter::repeat([RegexType::Optional, RegexType::Lit(nt_index)])
+ .take(m)
+ .flatten(),
+ );
+
+ edges.extend((0..(m - 1)).flat_map(|i| {
+ [
+ vec![types_len + 1 + 2 * i, types_len + 2 + 2 * i],
+ Vec::new(),
+ ]
+ }));
+
+ edges.extend([vec![types.len() - 1], Vec::new()]);
+
+ nonterminal_nodes.extend((0..m).map(|i| types_len + 2 * i + 1));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types_len);
+
+ // for _ in 0..m {
+ // types.extend([RegexType::Optional, RegexType::Lit(nt_index)]);
+ // edges.extend([vec![types.len() - 1], Vec::new()]);
+
+ // nonterminal_nodes.insert(types.len() - 1);
+
+ // edges[stack_last].push(types.len() - 2);
+ // }
+ }
+ }
+
+ s = &s[size..];
+ current_index += size;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ Err(_) => {
+ intermediates.push(IntermediateConstruct::Nonterminal(s.to_owned()));
+
+ types.push(RegexType::Lit(intermediates.len() - 1));
+ edges.push(vec![]);
+
+ let stack_last = stack
+ .last()
+ .copied()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 1);
+ nonterminal_nodes.insert(types.len() - 1);
+
+ break;
+ }
+ }
+
+ // match against the first charcter
+
+ let first_char = s.chars().next().unwrap();
+
+ match first_char {
+ '(' => {
+ // group
+
+ s = &s[1..];
+ current_index += 1;
+
+ let action = repetition_spec.try_into()?;
+
+ let stack_last = stack
+ .iter()
+ .copied()
+ .last()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ match action {
+ RepetitionAction::Star => {
+ types.extend([RegexType::Kleene, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 3);
+
+ repetition_action_stack.push((3, None));
+ stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]);
+ }
+ RepetitionAction::Optional => {
+ types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 3);
+
+ stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]);
+ repetition_action_stack.push((3, None));
+ }
+ RepetitionAction::Plus => {
+ types.extend([RegexType::Plus, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 3);
+
+ stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]);
+ repetition_action_stack.push((3, None));
+ }
+ RepetitionAction::Copy(1) => {
+ types.extend([RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+
+ stack.extend([types.len() - 2, types.len() - 1]);
+ repetition_action_stack.push((2, None));
+ }
+ RepetitionAction::CopyOptional(0, _) => {
+ types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 3);
+
+ stack.extend([types.len() - 3, types.len() - 2, types.len() - 1]);
+
+ repetition_action_stack.push((3, Some(action)));
+ }
+ _ => {
+ types.extend([RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+
+ stack.extend([types.len() - 2, types.len() - 1]);
+
+ repetition_action_stack.push((2, Some(action)));
+ }
+ }
+
+ s = &s[1..];
+ current_index += 1;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ return Err(Error::RegexUnbalancedParen(current_index));
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ ')' => {
+ // end group
+
+ s = &s[1..];
+ current_index += 1;
+
+ if repetition_spec.is_some() {
+ return Err(Error::RegexInvalidRepeat(current_index));
+ }
+
+ let (stack_growth, repetition_action) = repetition_action_stack
+ .pop()
+ .ok_or(Error::RegexEmptyActionStack(current_index))?;
+
+ match repetition_action {
+ Some(action) => {
+ if stack.len() <= stack_growth + 1 {
+ dbg!();
+ return Err(Error::RegexEmptyStack(current_index));
+ }
+
+ stack.truncate(stack.len() - stack_growth + 1);
+
+ // this is not going to fail because of the previous check
+ let stack_last = stack.pop().unwrap();
+ let stack_second_last = stack.iter().copied().last().unwrap();
+
+ match action {
+ RepetitionAction::Ignore => {
+ edges[stack_second_last].pop();
+
+ types.truncate(stack_last);
+ edges.truncate(stack_last);
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::CopyPlus(n) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ types.push(RegexType::Plus);
+ edges.push(vec![types.len()]);
+
+ edges[stack_second_last].push(types.len() - 1);
+
+ // if stack_second_last == 3 && nodes.len() == 4 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::CopyOptional(n, m) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ edges[stack_second_last].push(types.len());
+
+ let nodes_cloned_len = nodes_cloned.len();
+
+ for _ in 0..(m - 1) {
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len(), types.len() + nodes_cloned_len]);
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len()]);
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::Star
+ | RepetitionAction::Optional
+ | RepetitionAction::Plus => {
+ // this will not happen here
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ }
+ }
+ None => {
+ if stack.len() <= stack_growth {
+ dbg!();
+ return Err(Error::RegexEmptyStack(current_index));
+ }
+
+ stack.truncate(stack.len() - stack_growth);
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ }
+ }
+ '[' => {
+ // option
+
+ let action = repetition_spec.try_into()?;
+
+ let stack_last = stack
+ .iter()
+ .copied()
+ .last()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ match action {
+ RepetitionAction::Ignore => {
+ types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ edges[stack_last].push(types.len() - 3);
+
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((3, Some(action)));
+ }
+ RepetitionAction::Optional => {
+ types.extend([
+ RegexType::Optional,
+ RegexType::Optional,
+ RegexType::Or,
+ RegexType::Empty,
+ ]);
+ edges.extend([
+ vec![types.len() - 3],
+ vec![types.len() - 2],
+ vec![types.len() - 1],
+ Vec::new(),
+ ]);
+
+ edges[stack_last].push(types.len() - 4);
+
+ stack.push(types.len() - 4);
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((4, None));
+ }
+ RepetitionAction::Star => {
+ types.extend([
+ RegexType::Kleene,
+ RegexType::Optional,
+ RegexType::Or,
+ RegexType::Empty,
+ ]);
+ edges.extend([
+ vec![types.len() - 3],
+ vec![types.len() - 2],
+ vec![types.len() - 1],
+ Vec::new(),
+ ]);
+
+ edges[stack_last].push(types.len() - 4);
+
+ stack.push(types.len() - 4);
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((4, None));
+ }
+ RepetitionAction::Plus => {
+ types.extend([
+ RegexType::Kleene,
+ RegexType::Optional,
+ RegexType::Or,
+ RegexType::Empty,
+ ]);
+ edges.extend([
+ vec![types.len() - 3],
+ vec![types.len() - 2],
+ vec![types.len() - 1],
+ Vec::new(),
+ ]);
+
+ edges[stack_last].push(types.len() - 4);
+
+ stack.push(types.len() - 4);
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((4, None));
+ }
+ RepetitionAction::Copy(1) => {
+ types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ edges[stack_last].push(types.len() - 3);
+
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((3, None));
+ }
+ _ => {
+ types.extend([RegexType::Optional, RegexType::Or, RegexType::Empty]);
+ edges.extend([vec![types.len() - 2], vec![types.len() - 1], Vec::new()]);
+
+ edges[stack_last].push(types.len() - 3);
+
+ stack.push(types.len() - 3);
+ stack.push(types.len() - 2);
+ stack.push(types.len() - 1);
+
+ repetition_action_stack.push((3, Some(action)));
+ }
+ }
+
+ s = &s[1..];
+ current_index += 1;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ return Err(Error::RegexUnbalancedParen(current_index));
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ ']' => {
+ // end option
+
+ s = &s[1..];
+ current_index += 1;
+
+ if repetition_spec.is_some() {
+ return Err(Error::RegexInvalidRepeat(current_index));
+ }
+
+ let (stack_growth, repetition_action) = repetition_action_stack
+ .pop()
+ .ok_or(Error::RegexEmptyActionStack(current_index))?;
+
+ match repetition_action {
+ Some(action) => {
+ if stack.len() <= stack_growth + 1 {
+ return Err(Error::RegexEmptyStack(current_index));
+ }
+
+ stack.truncate(stack.len() - stack_growth + 1);
+
+ // this is not going to fail because of the previous check
+ let stack_last = stack.pop().unwrap();
+ let stack_second_last = stack.iter().copied().last().unwrap();
+
+ match action {
+ RepetitionAction::Ignore => {
+ edges[stack_second_last].pop();
+
+ types.truncate(stack_last);
+ edges.truncate(stack_last);
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::Copy(n) | RepetitionAction::CopyOptional(0, n) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::CopyPlus(n) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ types.push(RegexType::Plus);
+ edges.push(vec![types.len()]);
+
+ edges[stack_second_last].push(types.len() - 1);
+
+ // if stack_second_last == 3 && nodes.len() == 4 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::CopyOptional(n, m) => {
+ let nodes_cloned =
+ types.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ let edges_cloned =
+ edges.iter().skip(stack_last).cloned().collect::<Vec<_>>();
+
+ for _ in 0..(n - 1) {
+ edges[stack_second_last].push(types.len());
+
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ let nodes_cloned_len = nodes_cloned.len();
+
+ for _ in 0..(m - 1) {
+ // if stack_second_last == 3 && nodes.len() == 3 {
+ // println!("haha");
+ // }
+
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len(), types.len() + nodes_cloned_len]);
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+ }
+
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len()]);
+
+ types.extend(nodes_cloned.iter().copied());
+
+ let edges_offset = edges.len() - stack_last;
+
+ edges.extend(edges_cloned.iter().map(|edges| {
+ edges
+ .iter()
+ .map(|des| des + edges_offset)
+ .collect::<Vec<_>>()
+ }));
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ RepetitionAction::Star
+ | RepetitionAction::Optional
+ | RepetitionAction::Plus => {
+ // this will not happen here
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ }
+ }
+ None => {
+ if stack.len() <= stack_growth {
+ return Err(Error::RegexEmptyStack(current_index));
+ }
+
+ stack.truncate(stack.len() - stack_growth);
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ }
+ }
+ '/' => {
+ // end concatenation
+
+ s = &s[1..];
+ current_index += 1;
+
+ if repetition_spec.is_some() {
+ return Err(Error::RegexInvalidRepeat(current_index));
+ }
+
+ if stack.len() <= 1 {
+ return Err(Error::RegexEmptyStack(current_index));
+ }
+
+ stack.pop().unwrap();
+
+ let stack_last = stack.iter().copied().last().unwrap();
+
+ edges[stack_last].push(types.len());
+
+ types.push(RegexType::Empty);
+ edges.push(vec![]);
+
+ stack.push(types.len() - 1);
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ '"' => {
+ // string
+
+ s = &s[1..];
+ current_index += 1;
+
+ let skip_char_value = skip_by(s, is_char_value);
+
+ if skip_char_value >= s.len() {
+ return Err(Error::HangingDQuote(current_index));
+ }
+
+ let fakes = &s[skip_char_value..];
+
+ if !fakes.starts_with('"') {
+ return Err(Error::HangingDQuote(current_index));
+ }
+
+ let action = repetition_spec.try_into()?;
+
+ let stack_last = stack
+ .iter()
+ .copied()
+ .last()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ let slice_value = s[..skip_char_value].to_owned();
+
+ intermediates.push(IntermediateConstruct::Terminal(slice_value.clone()));
+
+ let t_index = intermediates.len() - 1;
+
+ match action {
+ RepetitionAction::Ignore => {}
+ RepetitionAction::Star => {
+ types.push(RegexType::Kleene);
+ types.push(RegexType::Lit(t_index));
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ edges[stack_last].push(types.len() - 2);
+ }
+ RepetitionAction::Optional => {
+ types.append(&mut vec![RegexType::Optional]);
+ types.push(RegexType::Lit(t_index));
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ edges[stack_last].push(types.len() - 2);
+ }
+ RepetitionAction::Plus => {
+ types.append(&mut vec![RegexType::Plus]);
+ types.push(RegexType::Lit(t_index));
+ edges.push(vec![types.len() - 1]);
+ edges.push(vec![]);
+
+ edges[stack_last].push(types.len() - 2);
+ }
+ RepetitionAction::Copy(n) => {
+ let types_len = types.len();
+
+ types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| types_len + i));
+
+ // for _ in 0..n {
+ // types.push(RegexType::Lit(t_index));
+ // edges.push(vec![]);
+
+ // edges[stack_last].push(types.len() - 1);
+ // }
+ }
+ RepetitionAction::CopyPlus(n) => {
+ let types_len = types.len();
+
+ types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| types_len + i));
+
+ types.extend([RegexType::Plus, RegexType::Lit(t_index)]);
+ edges.extend([vec![types.len() - 1], Vec::new()]);
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 2);
+ }
+ RepetitionAction::CopyOptional(n, m) => {
+ let types_len = types.len();
+
+ types.extend(std::iter::repeat(t_index).take(n).map(RegexType::Lit));
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .extend((0..n).map(|i| types_len + i));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len());
+
+ for _ in 0..(m - 1) {
+ types.extend([RegexType::Optional, RegexType::Lit(t_index)]);
+ edges.extend([vec![types.len() - 1, types.len()], Vec::new()]);
+ }
+
+ types.extend([RegexType::Optional, RegexType::Lit(t_index)]);
+ edges.extend([vec![types.len() - 1], Vec::new()]);
+ }
+ }
+
+ s = &fakes[1..];
+ current_index += skip_char_value + 1;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ '%' => {
+ // numeric value
+
+ s = &s[1..];
+ current_index += 1;
+
+ if s.is_empty() {
+ return Err(Error::HangingPercent(current_index - 1));
+ }
+
+ let inner_first_char = s.chars().next().unwrap();
+
+ let intermediate_value: IntermediateConstruct;
+
+ match inner_first_char {
+ 'b' => {
+ s = &s[1..];
+ current_index += 1;
+
+ let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Binary)?;
+
+ intermediate_value = parsed_construct;
+
+ s = &s[parsed_size..];
+ current_index += parsed_size;
+ }
+ 'd' => {
+ s = &s[1..];
+ current_index += 1;
+
+ let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Decimal)?;
+
+ intermediate_value = parsed_construct;
+
+ s = &s[parsed_size..];
+ current_index += parsed_size;
+ }
+ 'x' => {
+ s = &s[1..];
+ current_index += 1;
+
+ let (parsed_size, parsed_construct) = parse_numeric(s, Radix::Hexadecimal)?;
+ // println!("size = {parsed_size}, construct = {parsed_construct:#?}");
+
+ intermediate_value = parsed_construct;
+
+ s = &s[parsed_size..];
+ current_index += parsed_size;
+ }
+ _ => {
+ return Err(Error::RegexInvalidNumeric(current_index - 1));
+ }
+ }
+
+ let action = repetition_spec.try_into()?;
+
+ let stack_last = stack
+ .iter()
+ .copied()
+ .last()
+ .ok_or(Error::RegexEmptyStack(current_index))?;
+
+ intermediates.push(intermediate_value);
+
+ let inter_index = intermediates.len() - 1;
+
+ match action {
+ RepetitionAction::Ignore => {}
+ RepetitionAction::Star => {
+ types.push(RegexType::Kleene);
+ edges.push(vec![types.len()]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+ RepetitionAction::Optional => {
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len()]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+ RepetitionAction::Plus => {
+ types.push(RegexType::Plus);
+ edges.push(vec![types.len()]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+ RepetitionAction::Copy(n) => {
+ types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit));
+
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 1);
+
+ // for _ in 0..n {
+ // types.push(RegexType::Lit(inter_index));
+ // edges.push(vec![]);
+ // edges[stack_last].push(types.len() - 1);
+ // }
+ }
+ RepetitionAction::CopyPlus(n) => {
+ types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit));
+
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 1);
+
+ types.push(RegexType::Plus);
+ edges.push(vec![types.len()]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+ RepetitionAction::CopyOptional(n, m) => {
+ types.extend(std::iter::repeat(inter_index).take(n).map(RegexType::Lit));
+
+ edges.extend(std::iter::repeat_with(Default::default).take(n));
+
+ let edges_len = edges.len();
+
+ edges
+ .get_mut(stack_last)
+ .ok_or(Error::RegexIndexOutOfBounds(stack_last, edges_len))?
+ .push(types.len() - 1);
+
+ for _ in 0..(m - 1) {
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len(), types.len() + 1]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+
+ types.push(RegexType::Optional);
+ edges.push(vec![types.len()]);
+
+ edges[stack_last].push(types.len() - 1);
+
+ types.push(RegexType::Lit(inter_index));
+ edges.push(vec![]);
+ }
+ }
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ break;
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ continue;
+ }
+ '<' => {
+ // prose value
+ return Err(Error::Unimplemented(current_index));
+ }
+ ';' => {
+ // Possibly extend to the next line if the indentation
+ // of the next line is greater than `indentation`.
+
+ s = &s[1..];
+ current_index += 1;
+
+ let mut in_comment = true;
+
+ loop {
+ let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp };
+
+ let mut after_wsp_and_v_char = skip_by(s, skip_fn);
+
+ if after_wsp_and_v_char >= s.len() {
+ current_index += s.len();
+ stack = vec![];
+ break;
+ }
+
+ s = &s[after_wsp_and_v_char..];
+ current_index += after_wsp_and_v_char;
+
+ let first_char = s.chars().next().unwrap();
+
+ let mut skip_again = false;
+
+ match first_char {
+ '\n' | '\r' => {
+ s = &s[1..];
+ current_index += 1;
+ skip_again = true;
+ }
+ _ if in_comment => {
+ // dbg!("1815");
+ return Err(Error::RegexInvalidComment(current_index));
+ }
+ _ => {}
+ }
+
+ if skip_again {
+ let skip_wsp = skip_by(s, is_wsp);
+
+ if skip_wsp >= s.len() {
+ current_index += s.len();
+ stack = vec![];
+ break;
+ }
+
+ after_wsp_and_v_char = skip_wsp;
+
+ s = &s[skip_wsp..];
+ current_index += skip_wsp;
+ }
+
+ let first_char = s.chars().next().unwrap();
+
+ match first_char {
+ ';' => {
+ // continue looping
+ s = &s[1..];
+ current_index += 1;
+ in_comment = true;
+
+ continue;
+ }
+ '\n' | '\r' => {
+ in_comment = false;
+ s = &s[1..];
+ current_index += 1;
+
+ continue;
+ }
+ _ if after_wsp_and_v_char <= indentation => {
+ stack = vec![];
+ break;
+ }
+ _ => {
+ break;
+ }
+ }
+ }
+ }
+ '\n' | '\r' => {
+ // Possibly extend to the next line if the indentation
+ // of the next line is greater than `indentation`.
+ // println!("s = {}", s.chars().take(10).collect::<String>());
+
+ s = &s[1..];
+ current_index += 1;
+
+ if s.is_empty() {
+ break;
+ }
+
+ let mut in_comment = false;
+
+ loop {
+ let skip_fn = if in_comment { is_wsp_or_v_char } else { is_wsp };
+
+ let mut after_wsp = skip_by(s, skip_fn);
+
+ if after_wsp >= s.len() {
+ current_index += s.len();
+ stack = vec![];
+ break;
+ }
+
+ s = &s[after_wsp..];
+ current_index += after_wsp;
+
+ let first_char = s.chars().next().unwrap();
+
+ let mut skip_again = false;
+
+ match first_char {
+ '\n' | '\r' => {
+ s = &s[1..];
+ current_index += 1;
+ skip_again = true;
+ }
+ _ if in_comment => {
+ // dbg!("1903");
+ return Err(Error::RegexInvalidComment(current_index));
+ }
+ _ => {}
+ }
+
+ if skip_again {
+ let skip_wsp = skip_by(s, is_wsp);
+
+ if skip_wsp >= s.len() {
+ current_index += s.len();
+ stack = vec![];
+ break;
+ }
+
+ s = &s[skip_wsp..];
+ current_index += skip_wsp;
+
+ after_wsp = skip_wsp;
+ }
+
+ let first_char = s.chars().next().unwrap();
+
+ match first_char {
+ ';' => {
+ // continue looping
+ s = &s[1..];
+ current_index += 1;
+ in_comment = true;
+
+ continue;
+ }
+ '\n' | '\r' => {
+ in_comment = false;
+ s = &s[1..];
+ current_index += 1;
+
+ continue;
+ }
+ _ if after_wsp <= indentation => {
+ stack = vec![];
+ break;
+ }
+ _ => {
+ break;
+ }
+ }
+ }
+ }
+ _ => {
+ // invalid
+ println!("s = {}", &s[..(if s.len() > 50 { 50 } else { s.len() })]);
+ return Err(Error::InvalidCharacter(current_index));
+ }
+ }
+ }
+
+ let regex = DefaultRegex::new(edges.into(), types).map_err(error_conversion)?;
+
+ Ok((
+ current_index,
+ AlternationConstruct {
+ intermediates,
+ regex,
+ },
+ ))
+}
+
/// The result of parsing one ABNF rule, as produced by
/// [`parse_one_rule`]: the rule's name, the intermediate terminal /
/// non-terminal data referenced by the rule body, the rule body as a
/// regular expression over indices into that data, and whether the
/// rule was defined with `=/` (i.e. appends an alternation to an
/// already-defined rule) rather than `=`.
type RuleConstruct = (
    // Name
    String,
    // Data
    Vec<IntermediateConstruct>,
    // Rule
    DefaultRegex<usize>,
    // extra_alternation_or_not
    bool,
);
+
+fn parse_one_rule(s: &str, indentation: usize) -> Result<(usize, RuleConstruct), Error> {
+ if s.is_empty() {
+ return Ok((0, ("".to_owned(), Vec::new(), Default::default(), false)));
+ }
+
+ let mut s = s;
+
+ let mut current_index = 0usize;
+
+ let mut appending_alternation = false;
+
+ // rule name
+
+ let rulename_end = parse_rulename(s)?;
+ let rule_name = s[..rulename_end].to_string();
+ s = &s[rulename_end..];
+ current_index += rulename_end;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_by(s, is_wsp);
+
+ if skip_c_wsp_size >= s.len() {
+ // dbg!();
+ return Err(Error::HangingRuleName(0));
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ // equal or equal forward slash
+
+ let next_char = s.chars().next().unwrap();
+
+ if next_char != '=' {
+ // println!("s = {}", &s[..50]);
+ // println!("rule_name = {rule_name}");
+ return Err(Error::MissingEqualSign(current_index + 1));
+ }
+
+ s = &s[1..];
+ current_index += 1;
+
+ if s.is_empty() {
+ return Err(Error::HangingEqualsSign(current_index - 1));
+ }
+
+ let next_char = s.chars().next().unwrap();
+
+ if next_char == '/' {
+ appending_alternation = true;
+ s = &s[1..];
+ current_index += 1;
+ }
+
+ if s.is_empty() {
+ return Err(Error::HangingEqualsSign(current_index - 1));
+ }
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_c_wsp(s);
+
+ if skip_c_wsp_size >= s.len() {
+ // dbg!();
+ return Err(Error::HangingRuleName(0));
+ }
+
+ s = &s[skip_c_wsp_size..];
+ current_index += skip_c_wsp_size;
+
+ // elements
+
+ // alternation
+
+ let (alternation_size, alternation_result) = parse_alternation(s, indentation)?;
+
+ let intermediates_data = alternation_result.intermediates;
+ let regex = alternation_result.regex;
+
+ // println!("size = {alternation_size}, result = {alternation_result:?}");
+
+ if alternation_size >= s.len() {
+ s = "";
+ } else {
+ s = &s[alternation_size..];
+ }
+
+ current_index += alternation_size;
+
+ // c wsp
+
+ let skip_c_wsp_size = skip_c_wsp(s);
+
+ if skip_c_wsp_size >= s.len() {
+ s = "";
+ } else {
+ s = &s[skip_c_wsp_size..];
+ }
+
+ current_index += skip_c_wsp_size;
+
+ // c-nl
+
+ if let Some(skip_c_nl) = skip_c_nl(s) {
+ current_index += skip_c_nl;
+ }
+
+ Ok((
+ current_index,
+ (rule_name, intermediates_data, regex, appending_alternation),
+ ))
+}
+
+impl std::str::FromStr for Grammar {
+ // First skip whitespaces, linefeeds, carraige returns and
+ // semicolons until the first rule appears. The basic indentation
+ // level is set to the indentation level of the first rule's line.
+ //
+ // Then enter identifier reading mode, reading a letter followed
+ // by a sequence of letters, digits and hyphens.
+ //
+ // After skipping white spaces, read an equal sign.
+ //
+ // Then enter alternation reading mode, reading alternations of
+ // concatenations.
+ //
+    // Then enter element reading mode, reading an optional
+    // repetition specification followed by an identifier, a numeral
+    // value, or a quoted string.
+
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
+ let mut before_first_rule = true;
+ let mut basic_indentation: usize = 0;
+ let mut current_skipped_index: usize = 0;
+
+ let mut s = s;
+
+ while before_first_rule {
+ before_first_rule = false;
+
+ let after_wsp = skip_by(s, is_wsp);
+
+ if after_wsp >= s.len() {
+ // The entire string consists of white-spaces and
+ // comments.
+ return Err(Error::EmptyGrammar);
+ }
+
+ s = &s[after_wsp..];
+ current_skipped_index += after_wsp;
+
+ match skip_c_nl(s) {
+ Some(skipped_size) => {
+ if skipped_size >= s.len() {
+ return Err(Error::EmptyGrammar);
+ }
+
+ // Repeat skipping until we find the first rule.
+ before_first_rule = true;
+
+ s = &s[skipped_size..];
+ current_skipped_index += skipped_size;
+ }
+ None => {
+ // Found the first rule.
+ basic_indentation = after_wsp;
+ }
+ }
+ }
+
+ // TODO: Core rules are not implemented for now.
+
+ #[allow(unused_macros)]
+ macro_rules! log {
+ () => {
+ let mut file = std::fs::OpenOptions::new()
+ .create(true)
+ .append(true)
+ .open("debug.log")
+ .unwrap();
+
+ let _ = file.write_all(format!("{}\n", line!()).as_bytes());
+ };
+ }
+
+ let mut nonterminals_vec: Vec<String> = vec![];
+ let mut nonterminals_map: HashMap<String, usize> = HashMap::new();
+
+ let mut all_rules: HashMap<String, DefaultRegex<TNT>> = Default::default();
+
+ let mut terminals_vec: Vec<String> = vec![];
+ let mut terminals_map: HashMap<String, usize> = HashMap::new();
+
+ let mut temp = s;
+
+ // First collect non-terminals in the order they are defined.
+ loop {
+ let (rule_size, (rule_name, _rule_data, _rule_regex, appending_p)) =
+ parse_one_rule(temp, basic_indentation)?;
+
+ if rule_size == 0 {
+ break;
+ }
+
+ if appending_p && nonterminals_map.get(&rule_name).is_none() {
+ return Err(Error::AppendToNothing(current_skipped_index));
+ } else if !appending_p && nonterminals_map.get(&rule_name).is_some() {
+ return Err(Error::CreateAgain(current_skipped_index));
+ }
+
+ nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len());
+ nonterminals_vec.push(rule_name);
+
+ if rule_size > 0 && rule_size < temp.len() {
+ temp = &temp[rule_size..];
+ current_skipped_index += rule_size;
+ } else {
+ break;
+ }
+ }
+
+ let mut temp = s;
+
+ // Then gather the data about names of terminals and
+ // non-terminals.
+ loop {
+ let (rule_size, (_rule_name, rule_data, _rule_regex, _appending_p)) =
+ parse_one_rule(temp, basic_indentation)?;
+
+ if rule_size == 0 {
+ break;
+ }
+
+ for data in rule_data {
+ match data {
+ IntermediateConstruct::Terminal(name) => {
+ if terminals_map.get(&name).is_none() {
+ terminals_map.insert(name.clone(), terminals_vec.len());
+ terminals_vec.push(name);
+ }
+ }
+ IntermediateConstruct::Nonterminal(name) => {
+ if nonterminals_map.get(&name).is_none() {
+ return Err(Error::UnknownNonterminal(name));
+ }
+ }
+ IntermediateConstruct::Range(_, _) => unimplemented!(),
+ }
+ }
+
+ if rule_size > 0 && rule_size < temp.len() {
+ temp = &temp[rule_size..];
+ current_skipped_index += rule_size;
+ } else {
+ break;
+ }
+ }
+
+ // Then collect the actual rules.
+ loop {
+ let (rule_size, (rule_name, rule_data, rule_regex, appending_p)) =
+ parse_one_rule(s, basic_indentation)?;
+ // println!("rule name = {rule_name}");
+ // println!("regexp = {rule_regexp:#?}");
+
+ let rule_transform = |old_label: usize| -> Result<TNT, Error> {
+ if let Some(old_value) = rule_data.get(old_label) {
+ match old_value {
+ IntermediateConstruct::Terminal(name) => {
+ if let Some(index) = terminals_map.get(name) {
+ Ok(TNT::Ter(*index))
+ } else {
+ dbg!();
+ Err(Error::UnknownTerminal(name.clone()))
+ }
+ }
+ IntermediateConstruct::Nonterminal(name) => {
+ if let Some(index) = nonterminals_map.get(name) {
+ Ok(TNT::Non(*index))
+ } else {
+ dbg!();
+ Err(Error::UnknownNonterminal(name.clone()))
+ }
+ }
+ IntermediateConstruct::Range(_, _) => {
+ dbg!();
+ Err(Error::UnsupportedOperation)
+ }
+ }
+ } else {
+ dbg!();
+ Err(Error::RegexIndexOutOfBounds(old_label, rule_data.len()))
+ }
+ };
+
+ let transformed_rule = rule_regex.maps_to(rule_transform)?;
+
+ if rule_size == 0 {
+ break;
+ }
+
+ if !appending_p {
+ nonterminals_map.insert(rule_name.clone(), nonterminals_vec.len());
+ nonterminals_vec.push(rule_name.clone());
+
+ all_rules.insert(rule_name, transformed_rule);
+ } else {
+ all_rules
+ .get_mut(&rule_name)
+ .unwrap()
+ .append(transformed_rule);
+ }
+
+ if rule_size > 0 && rule_size < s.len() {
+ s = &s[rule_size..];
+ current_skipped_index += rule_size;
+ } else {
+ break;
+ }
+ }
+
+ let rules_vec: Vec<Rule> = nonterminals_vec
+ .iter()
+ .map(|name| {
+ all_rules
+ .get(name)
+ .map(|regex| Rule::new(regex.clone()))
+ .ok_or(Error::UnknownNonterminal(name.clone()))
+ })
+ .collect::<Result<_, _>>()?;
+
+ let terminals: Vec<Terminal> = terminals_vec.into_iter().map(Terminal::new).collect();
+ let nonterminals: Vec<Nonterminal> =
+ nonterminals_vec.into_iter().map(Nonterminal).collect();
+
+ Ok(Self::new(terminals, nonterminals, rules_vec))
+ }
+}
+
+#[cfg(test)]
+mod tests;
diff --git a/grammar/src/abnf/tests.rs b/grammar/src/abnf/tests.rs
new file mode 100644
index 0000000..67dfaaa
--- /dev/null
+++ b/grammar/src/abnf/tests.rs
@@ -0,0 +1,291 @@
+//! This module contains unit tests for the abnf grammar parser.
+
+use super::*;
+
+#[test]
+fn test_skip_by() -> Result<(), Box<dyn std::error::Error>> {
+ let string = std::iter::repeat('a')
+ .take(10)
+ .chain(std::iter::repeat('哈').take(10))
+ .chain(" \t\n".chars())
+ .collect::<String>();
+
+ let mut s: &str = string.as_str();
+
+ dbg!(&s);
+
+ let alpha_len = skip_by(s, is_alpha);
+
+ assert_eq!(alpha_len, 10);
+
+ s = &s[alpha_len..];
+
+ dbg!(&s);
+
+ let not_newline_len = skip_by(s, is_not_newline);
+
+ // 哈 takes 3 bytes, so the count should be 30 plus two bytes for
+ // the space and the tab.
+ assert_eq!(not_newline_len, 32);
+
+ Ok(())
+}
+
+#[test]
+fn test_skip_c_nl() -> Result<(), Box<dyn std::error::Error>> {
+ let string = std::iter::repeat(' ')
+ .take(10)
+ .chain(std::iter::repeat('哈').take(10))
+ .chain(" \t\n".chars())
+ .collect::<String>();
+
+ let s = string.as_str();
+
+ dbg!(s);
+
+ let c_nl = skip_c_nl(s);
+
+ assert!(c_nl.is_none());
+
+ let new_string = std::iter::once(';')
+ .chain(string.chars())
+ .chain(string.chars())
+ .collect::<String>();
+
+ let s = new_string.as_str();
+
+ dbg!(s);
+
+ let c_nl = skip_c_nl(s);
+
+ assert_eq!(c_nl, Some(string.len() + 1));
+
+ let c_nl = skip_c_nl("\n");
+
+ assert_eq!(c_nl, Some(1));
+
+ Ok(())
+}
+
+#[test]
+fn test_skip_c_wsp() -> Result<(), Box<dyn std::error::Error>> {
+ let mut string: String = std::iter::repeat(' ')
+ .take(10)
+ .chain(std::iter::repeat('\t').take(10))
+ .collect();
+
+ string.extend(
+ std::iter::once(';')
+ .chain(
+ std::iter::repeat(std::iter::once('\n').chain(std::iter::once(';')))
+ .take(10)
+ .flatten(),
+ )
+ .chain(std::iter::once('\n'))
+ .chain(std::iter::repeat('a').take(10)),
+ );
+
+ let string = string;
+
+ dbg!(&string);
+
+ let skip_c_wsp_size = skip_c_wsp(&string);
+
+ assert_eq!(skip_c_wsp_size, string.len() - 10);
+
+ Ok(())
+}
+
+#[test]
+fn test_parse_repetition() -> Result<(), Box<dyn std::error::Error>> {
+ let strs = ["*A", "0*A", "*0A", "2*A", "*2A", "5*10A", "A"];
+
+ let results = strs.map(parse_repetition_spec);
+
+ dbg!(&results);
+
+ let answers = [
+ Some((1, RepetitionSpec::Star)),
+ Some((2, RepetitionSpec::Min(0))),
+ Some((2, RepetitionSpec::Max(0))),
+ Some((2, RepetitionSpec::Min(2))),
+ Some((2, RepetitionSpec::Max(2))),
+ Some((4, RepetitionSpec::MinMax(5, 10))),
+ None,
+ ];
+
+ for (result, answer) in results.iter().zip(answers.iter()) {
+ assert_eq!(result, answer);
+ }
+
+ Ok(())
+}
+
+use super::Error;
+
+#[test]
+fn test_parse_rulename() -> Result<(), Box<dyn std::error::Error>> {
+ let s = "NONTERMINAL";
+
+ assert!(matches!(parse_rulename(s), Err(Error::HangingRuleName(0))));
+
+ let s = " NONTERMINAL = ";
+
+ assert!(matches!(parse_rulename(s), Ok(0)));
+
+ let s = "NONTERMINAL = ";
+
+ assert!(matches!(parse_rulename(s), Ok(11)));
+
+ Ok(())
+}
+
+#[test]
+fn test_parse_numeric() -> Result<(), Box<dyn std::error::Error>> {
+ let s = "haha";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Binary),
+ Err(Error::InvalidDigit(0))
+ ));
+
+ let s = "124";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Binary),
+ Err(Error::InvalidCharacter(1))
+ ));
+
+ let s = "32";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Decimal),
+ Ok((2, IntermediateConstruct::Terminal(string))) if string == " ".to_string()
+ ));
+
+ let s = "32.65.66.66.65";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Decimal),
+ Ok((14, IntermediateConstruct::Terminal(string))) if string == " ABBA".to_string()
+ ));
+
+ let s = "32-";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Decimal),
+ Err(Error::HangingNumeric(2))
+ ));
+
+ let s = "32-124";
+
+ assert!(matches!(
+ parse_numeric(s, Radix::Decimal),
+ Ok((6, IntermediateConstruct::Range(from, to))) if from == ' ' && to == '|',
+ ));
+
+ Ok(())
+}
+
+#[test]
+fn test_parse_alternation() -> Result<(), Box<dyn std::error::Error>> {
+ let s = "0*10nonterminal\n";
+
+ let (len, alt) = parse_alternation(s, 0)?;
+
+ assert_eq!(len, s.len());
+
+ let regex = alt.regex;
+
+ println!("regex = {regex}");
+
+ assert_eq!(
+ format!("{regex}"),
+ "((0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?)?)"
+ );
+
+ let s = "0*10nonterminal ( *nonterminal nonterminal / [ nonterminal ] 1*5nonterminal )\n";
+
+ let (len, alt) = parse_alternation(s, 0)?;
+
+ assert_eq!(len, s.len());
+
+ let regex = alt.regex;
+
+ println!("regex = {regex}");
+
+ assert_eq!(
+ format!("{regex}"),
+ "((0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?)?((1)*2|((3))?4(4(4(4(4)?)?)?)?))"
+ );
+
+ let s = "5nonterminal\n";
+
+ let (len, alt) = parse_alternation(s, 0)?;
+
+ assert_eq!(len, s.len());
+
+ let regex = alt.regex;
+
+ println!("regex = {regex}");
+
+ assert_eq!(format!("{regex}"), "(00000)");
+
+ Ok(())
+}
+
+#[test]
+fn test_parse_onerule() -> Result<(), Box<dyn std::error::Error>> {
+ let s = "start = 1*10nonterminal [ nonterminal ] / 1*nonterminal\n";
+
+ let (len, (name, data, regex, extra_p)) = parse_one_rule(s, 0)?;
+
+ println!("name = {name}");
+ print!("data = ");
+
+ for (index, non_terminal) in data.iter().enumerate() {
+ print!("({index}, {non_terminal})");
+ }
+
+ println!();
+
+ println!("regex = {regex}");
+
+ println!("extra_p = {extra_p}");
+
+ assert_eq!(len, s.len());
+
+ assert_eq!(name, "start".to_string());
+
+ let answers = ["nnonterminal", "nnonterminal", "nnonterminal"];
+
+ assert_eq!(data.len(), answers.len());
+
+ for (tnt, answer) in data.into_iter().zip(answers) {
+ assert_eq!(format!("{tnt}"), answer.to_string());
+ }
+
+ assert_eq!(
+ format!("{regex}"),
+ "(0(0(0(0(0(0(0(0(0(0)?)?)?)?)?)?)?)?)?((1))?|(2)+)".to_string()
+ );
+
+ assert!(!extra_p);
+
+ Ok(())
+}
+
+#[test]
+fn test_parse_grammar() -> Result<(), Box<dyn std::error::Error>> {
+ let grammar_file_name = "abnf grammars/test.abnf";
+
+ let file_content = std::fs::read_to_string(grammar_file_name)?;
+
+ let grammar: Grammar = file_content.parse()?;
+
+ println!("print grammar:");
+
+ println!("{grammar}");
+
+ Ok(())
+}
diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs
index 11cb161..ea1299f 100644
--- a/grammar/src/lib.rs
+++ b/grammar/src/lib.rs
@@ -93,7 +93,7 @@ impl Display for TNT {
}
/// Errors related to grammar operations.
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum Error {
/// The operation requires the grammar to be after a certain
@@ -101,6 +101,8 @@ pub enum Error {
WrongState(GrammarState, GrammarState),
/// The first component is the index, and the second the bound.
IndexOutOfBounds(usize, usize),
+ /// The given name of a terminal or a non-terminal is unknown.
+ UnknownTNTName(String),
/// Fail to build the N-th regular expression, due to the
/// ParseError.
BuildFail(usize, ParseError),
@@ -123,6 +125,11 @@ impl Display for Error {
"Failed to build the {n}-th regular expression due to error: {pe}"
),
Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"),
+ Error::UnknownTNTName(name) => write!(
+ f,
+ "the name {name} is unknown \
+ for a terminal or a non-terminal."
+ ),
Error::WrongState(current, threshold) => {
write!(f, "require state {threshold}, but in state {current}")
}
@@ -158,6 +165,12 @@ impl Rule {
pub fn len(&self) -> usize {
self.regex.len()
}
+
+ /// Wrap a regular expression into a rule.
+ #[inline]
+ pub fn new(regex: DefaultRegex<TNT>) -> Self {
+ Self { regex }
+ }
}
/// The state of Grammar.
@@ -190,6 +203,37 @@ impl Display for GrammarState {
}
}
+/// This enum represents the name of either a terminal or a
+/// non-terminal.
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
+pub enum TNTName {
+ /// The name of a terminal.
+ Ter(String),
+ /// The name of a non-terminal.
+ Non(String),
+}
+
+impl TNTName {
+ /// Construct the name of a terminal or a non-terminal.
+ #[inline]
+ pub fn new(name: String, terminal_p: bool) -> Self {
+ if terminal_p {
+ Self::Ter(name)
+ } else {
+ Self::Non(name)
+ }
+ }
+
+ /// Return the underlying name.
+ #[inline]
+ pub fn name(&self) -> &str {
+ match self {
+ TNTName::Ter(name) => name,
+ TNTName::Non(name) => name,
+ }
+ }
+}
+
/// The type of a grammar.
#[derive(Debug, Clone, Default)]
pub struct Grammar {
@@ -197,6 +241,8 @@ pub struct Grammar {
ter: Vec<Terminal>,
/// A list of non-terminals.
non: Vec<Nonterminal>,
+ /// A map from the names of terminals or non-terminals to TNT.
+ tnt_map: HashMap<TNTName, TNT>,
/// A list of rules.
///
/// The length of the list must match that of the list of
@@ -286,6 +332,22 @@ impl Grammar {
let expansion_map = Default::default();
let reduction_map = Default::default();
+ let mut tnt_map: HashMap<TNTName, TNT> = Default::default();
+
+ for (index, ter_element) in ter.iter().enumerate() {
+ tnt_map.insert(
+ TNTName::new(ter_element.name().to_string(), true),
+ TNT::Ter(index),
+ );
+ }
+
+ for (index, non_element) in non.iter().enumerate() {
+ tnt_map.insert(
+ TNTName::new(non_element.name().to_string(), false),
+ TNT::Non(index),
+ );
+ }
+
// NOTE: We cannot calculate accumulators here, as we want the
// accumulators of the regular expression of the left-closure,
// not of the original one.
@@ -294,6 +356,7 @@ impl Grammar {
Self {
ter,
non,
+ tnt_map,
rules,
firsts,
first_nodes,
@@ -304,6 +367,16 @@ impl Grammar {
}
}
+ /// Convert from the name of a terminal or a non-terminal to a
+ /// struct TNT.
+ #[inline]
+ pub fn name_to_tnt(&self, name: &TNTName) -> Result<TNT, Error> {
+ self.tnt_map
+ .get(name)
+ .copied()
+ .ok_or_else(|| Error::UnknownTNTName(name.name().to_string()))
+ }
+
/// Return the name of a terminal or a non-terminal.
pub fn name_of_tnt(&self, tnt: TNT) -> Result<String, Error> {
match tnt {
@@ -313,6 +386,14 @@ impl Grammar {
.get(t)
.ok_or(Error::IndexOutOfBounds(t, self.ter.len()))?
.name()
+ .chars()
+ .map(|c| if crate::abnf::is_v_char(c) {
+ c.to_string()
+ } else {
+ format!("{:#x}", c as usize)
+ })
+ .collect::<Vec<_>>()
+ .join("")
)),
TNT::Non(n) => Ok(format!(
"N{}",
@@ -822,7 +903,7 @@ impl Display for Grammar {
f,
"{}",
rule.regex.to_string_with(|tnt| format!(
- "({})",
+ " {} ",
self.name_of_tnt(tnt)
.unwrap_or_else(|_| format!("Unknown {tnt:?}"))
))?
diff --git a/graph/src/adlist.rs b/graph/src/adlist.rs
index ba3077e..d36b250 100644
--- a/graph/src/adlist.rs
+++ b/graph/src/adlist.rs
@@ -3,9 +3,16 @@
//! [`Graph`][super::Graph]. This data type represents graphs using
//! adjacency lists internally.
-use std::ops::{Deref, DerefMut};
+use std::{
+ borrow::{Borrow, BorrowMut},
+ ops::{Deref, DerefMut},
+};
-use crate::{builder::Builder, error::Error, ExtGraph, Graph};
+use crate::{
+ builder::{Builder, BuilderMut},
+ error::Error,
+ ExtGraph, Graph,
+};
// #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)]
// struct ALEdge {
@@ -118,6 +125,99 @@ impl From<Vec<Vec<usize>>> for ALGraph {
}
}
+/// A builder that modifies ALGraph in place.
+#[derive(Debug)]
+pub struct ALGBuilderMut<'a> {
+ graph: &'a mut ALGraph,
+}
+
+impl<'a> std::ops::Deref for ALGBuilderMut<'a> {
+ type Target = ALGraph;
+
+ fn deref(&self) -> &Self::Target {
+ self.graph.borrow()
+ }
+}
+
+impl<'a> std::ops::DerefMut for ALGBuilderMut<'a> {
+ fn deref_mut(&mut self) -> &mut Self::Target {
+ self.graph.borrow_mut()
+ }
+}
+
+impl<'a> BuilderMut for ALGBuilderMut<'a> {
+ type Label = ();
+
+ type Graph = ALGraph;
+
+ type ResultBuilder<'b> = ALGBuilderMut<'b>
+ where
+ Self::Label: 'b;
+
+ fn from_graph_mut(graph: &mut Self::Graph) -> Self::ResultBuilder<'_> {
+ ALGBuilderMut { graph }
+ }
+
+ fn add_vertex(&mut self, _label: Self::Label) -> usize {
+ self.nodes.push(Default::default());
+
+ self.nodes.len() - 1
+ }
+
+ fn add_edge(&mut self, source: usize, target: usize, _label: Self::Label) -> Result<(), Error> {
+ if self.graph.has_edge(source, target)? {
+ return Ok(());
+ }
+
+ self.graph
+ .nodes
+ .get_mut(source)
+ .unwrap()
+ .children
+ .push(target);
+
+ Ok(())
+ }
+
+ fn append(&mut self, other: Self::Graph) {
+ let self_len = self.nodes_len();
+
+ self.graph
+ .nodes
+ .extend(other.nodes.into_iter().map(|mut node| {
+ for child in node.children.iter_mut() {
+ *child += self_len;
+ }
+
+ node
+ }));
+ }
+
+ fn set_label(&mut self, _node_id: usize, _label: Self::Label) -> Result<(), Error> {
+ Ok(())
+ }
+
+ fn remove_edge<F>(&mut self, source: usize, target: usize, _predicate: F) -> Result<(), Error>
+ where
+ F: FnMut(Self::Label) -> bool,
+ {
+ let nodes_len = self.nodes.len();
+
+ let source_children = self
+ .nodes
+ .get_mut(source)
+ .ok_or(Error::IndexOutOfBounds(source, nodes_len))?;
+
+ if !(0..nodes_len).contains(&target) {
+ return Err(Error::IndexOutOfBounds(target, nodes_len));
+ }
+
+ source_children.children.retain(|child| *child != target);
+
+ Ok(())
+ }
+}
+
// Builder for this implementation of graph
/// A builder for adjacency list graphs.
diff --git a/graph/src/builder.rs b/graph/src/builder.rs
index c5f9252..1278e26 100644
--- a/graph/src/builder.rs
+++ b/graph/src/builder.rs
@@ -82,6 +82,11 @@ pub trait BuilderMut {
/// Add an edge from the source to the target.
fn add_edge(&mut self, source: usize, target: usize, label: Self::Label) -> Result<(), Error>;
+ /// Append another graph to this builder.
+ fn append(&mut self, _other: Self::Graph) {
+ unimplemented!()
+ }
+
/// Set the label of an existing node to a new label.
fn set_label(&mut self, node_id: usize, label: Self::Label) -> Result<(), Error>;
diff --git a/graph/src/labelled/binary.rs b/graph/src/labelled/binary.rs
index 9f3afa8..ccbd5cb 100644
--- a/graph/src/labelled/binary.rs
+++ b/graph/src/labelled/binary.rs
@@ -214,9 +214,11 @@ impl<T: GraphLabel> Graph for PLGraph<T> {
let mut post = String::new();
- // FIXME: Find a way to print only used nodes. Maybe remove
+ // SOLVED: Find a way to print only used nodes. Maybe remove
// unwanted edges from unwanted nodes, so that we can print
// out only those used nodes.
+ //
+ // This is solved as we can extract the forest out.
for node in self.nodes() {
post.push_str(&format!(
diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs
index 1b1b325..9b5fab1 100644
--- a/nfa/src/default/regex.rs
+++ b/nfa/src/default/regex.rs
@@ -1,6 +1,9 @@
//! This file provides a default implementation of Regex.
-use graph::{error::Error as GError, ALGraph, ExtGraph, Graph, GraphLabel};
+use graph::{
+ adlist::ALGBuilderMut, builder::BuilderMut, error::Error as GError, ALGraph, ExtGraph, Graph,
+ GraphLabel,
+};
use crate::{desrec::DesRec, error::Error, Regex};
@@ -10,6 +13,7 @@ use graph_macro::Graph;
use receme::{algebra::Algebra, catana::Cata};
use std::{
+ borrow::BorrowMut,
collections::HashMap,
fmt::{Debug, Display, Write},
marker::PhantomData,
@@ -271,10 +275,8 @@ impl<T: GraphLabel> DefaultRegex<T> {
if !top.is_seen() {
stack.push(Seen(top.index()));
- if self.degree(top.index()).unwrap() > 1 {
- write!(result, "(")?;
- stack.push(Unseen(types.len() - 1));
- }
+ write!(result, "(")?;
+ stack.push(Unseen(types.len() - 1));
stack.extend(
self.graph
@@ -289,7 +291,11 @@ impl<T: GraphLabel> DefaultRegex<T> {
}
RegexType::Plus => {
if !top.is_seen() {
+ write!(result, "(")?;
+
stack.push(Seen(top.index()));
+ stack.push(Unseen(types.len() - 1));
+
stack.extend(
self.graph
.children_of(top.index())
@@ -303,7 +309,11 @@ impl<T: GraphLabel> DefaultRegex<T> {
}
RegexType::Optional => {
if !top.is_seen() {
+ write!(result, "(")?;
+
stack.push(Seen(top.index()));
+ stack.push(Unseen(types.len() - 1));
+
stack.extend(
self.graph
.children_of(top.index())
@@ -504,6 +514,56 @@ impl<T: GraphLabel> DefaultRegex<T> {
Ok(result)
}
+
+    /// Use a function to transform the labels of a regular expression.
+ pub fn maps_to<S: GraphLabel, E>(
+ &self,
+ f: impl Fn(T) -> Result<S, E>,
+ ) -> Result<DefaultRegex<S>, E> {
+ let root = self.root();
+
+ let graph = self.internal_graph();
+
+ let types: Vec<RegexType<S>> = self
+ .types
+ .iter()
+ .map(|element| match element {
+ RegexType::Lit(value) => f(*value).map(RegexType::Lit),
+ RegexType::Kleene => Ok(RegexType::Kleene),
+ RegexType::Plus => Ok(RegexType::Plus),
+ RegexType::Optional => Ok(RegexType::Optional),
+ RegexType::Or => Ok(RegexType::Or),
+ RegexType::Paren => Ok(RegexType::Paren),
+ RegexType::Empty => Ok(RegexType::Empty),
+ })
+ .collect::<Result<_, E>>()?;
+
+ Ok(DefaultRegex { graph, types, root })
+ }
+
+ /// Append another regular expression to the end of this regular
+ /// expression.
+ pub fn append(&mut self, other: Self) {
+ if self.is_empty() {
+ // copy the other here
+ self.graph = other.graph;
+ self.types = other.types;
+ self.root = other.root;
+ } else if other.is_empty() {
+ // nothing to do
+ } else {
+ let mut builder_mut = ALGBuilderMut::from_graph_mut(self.graph.borrow_mut());
+
+ let old_len = builder_mut.nodes_len();
+
+ builder_mut.append(other.graph);
+
+ if let Some(root) = self.root {
+ // Deliberately ignore errors here.
+ let _ = builder_mut.add_edge(root, old_len, ());
+ }
+ }
+ }
}
impl<T: GraphLabel> Display for DefaultRegex<T> {
diff --git a/semiring/src/lib.rs b/semiring/src/lib.rs
index 6c0e0d5..b3e26b7 100644
--- a/semiring/src/lib.rs
+++ b/semiring/src/lib.rs
@@ -33,6 +33,8 @@ pub trait Semiring {
/// The addition.
fn add(&mut self, other: &Self);
+ // FIXME: The multiplication needs another information: the parent
+ // of the other value in the item derivation tree.
/// The multiplication.
fn mul(&mut self, other: &Self);
diff --git a/src/big_endian.c b/src/big_endian.c
new file mode 100644
index 0000000..2db8624
--- /dev/null
+++ b/src/big_endian.c
@@ -0,0 +1,28 @@
+#include "big_endian.h"
+
+void to_big_endian(uint64_t num, unsigned char *result)
+{
+ *(result+0) = (num >> 56) & 0xff;
+ *(result+1) = (num >> 48) & 0xff;
+ *(result+2) = (num >> 40) & 0xff;
+ *(result+3) = (num >> 32) & 0xff;
+ *(result+4) = (num >> 24) & 0xff;
+ *(result+5) = (num >> 16) & 0xff;
+ *(result+6) = (num >> 8) & 0xff;
+ *(result+7) = (num >> 0) & 0xff;
+}
+
+uint64_t from_big_endian(unsigned char *num)
+{
+ uint64_t result = ((uint64_t) (*num) << 56);
+
+ result += ((uint64_t) (*(num+1)) << 48);
+ result += ((uint64_t) (*(num+2)) << 40);
+ result += ((uint64_t) (*(num+3)) << 32);
+ result += ((uint64_t) (*(num+4)) << 24);
+ result += ((uint64_t) (*(num+5)) << 16);
+ result += ((uint64_t) (*(num+6)) << 8);
+ result += ((uint64_t) (*(num+7)) << 0);
+
+ return result;
+}
diff --git a/src/big_endian.h b/src/big_endian.h
new file mode 100644
index 0000000..e6c587e
--- /dev/null
+++ b/src/big_endian.h
@@ -0,0 +1,9 @@
+#ifndef BIG_ENDIAN_H
+#define BIG_ENDIAN_H
+#include <stdint.h>
+
+void to_big_endian(uint64_t num, unsigned char *result);
+
+uint64_t from_big_endian(unsigned char *num);
+
+#endif
diff --git a/src/big_endian.o b/src/big_endian.o
new file mode 100644
index 0000000..6d77a13
--- /dev/null
+++ b/src/big_endian.o
Binary files differ
diff --git a/src/binding.c b/src/binding.c
new file mode 100644
index 0000000..43f73ba
--- /dev/null
+++ b/src/binding.c
@@ -0,0 +1,286 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <emacs-module.h>
+#include "helper.h"
+
+int plugin_is_GPL_compatible = 3;
+
+emacs_value
+rep_new_parser(emacs_env *env, ptrdiff_t narg, emacs_value *args, void *data)
+{
+ char *buffer = NULL;
+ ptrdiff_t len = 0;
+ emacs_value error_data[1];
+
+ unsigned char error_vec_len[8] = { 0 };
+ unsigned char error_vec_cap[8] = { 0 };
+
+ struct SignedVec error_vec = { 0 };
+
+ error_vec.len = error_vec_len;
+ error_vec.capacity = error_vec_cap;
+
+ env->copy_string_contents(env, *args, NULL, &len);
+ len++;
+ buffer = malloc(sizeof(char) * len);
+
+ if (buffer == NULL) return env->intern(env, "nil");
+
+ env->copy_string_contents(env, *args, buffer, &len);
+
+ struct parser *new_parser_result = new_parser(buffer, &error_vec);
+
+ free(buffer);
+
+ uint64_t error_length = from_big_endian(error_vec.len);
+
+ if (error_length) {
+ error_data[0] = env->make_string(env, error_vec.data, (ptrdiff_t) error_length);
+
+ clean_signed(&error_vec, 4);
+
+ env->non_local_exit_signal(env, env->intern(env, "error"),
+ env->funcall(env, env->intern(env, "list"),
+ 1, error_data));
+
+ return env->intern(env, "nil");
+ }
+
+ return env->make_user_ptr(env, clean_parser, new_parser_result);
+}
+
+emacs_value
+rep_parser_recognize(emacs_env *env, ptrdiff_t narg, emacs_value *args, void *data)
+{
+ struct parser *parser = env->get_user_ptr(env, *args);
+
+ if (env->non_local_exit_check(env) != emacs_funcall_exit_return)
+ return env->intern(env, "nil");
+
+ unsigned char *buffer = NULL;
+ ptrdiff_t len = 0;
+
+ emacs_value error_data[1];
+
+ if (!env->eq(env,
+ env->type_of(env, *(args+1)),
+ env->intern(env, "vector"))) {
+ error_data[0] = env->make_string
+ (env, "INPUT should be a vector of integers", 36);
+
+ env->non_local_exit_signal(env, env->intern(env, "wrong-type-argument"),
+ env->funcall(env, env->intern(env, "list"),
+ 1, error_data));
+
+ return env->intern(env, "nil");
+ }
+
+ len = env->vec_size(env, *(args+1));
+
+ buffer = malloc(sizeof(unsigned char) * len * 8);
+
+ if (buffer == NULL) {
+ error_data[0] = env->make_string(env, "out of memory", 13);
+
+ env->non_local_exit_signal(env, env->intern(env, "error"),
+ env->funcall(env, env->intern(env, "list"),
+ 1, error_data));
+
+ return env->intern(env, "nil");
+ }
+
+ for (ptrdiff_t i = 0; i < len; i++) {
+ emacs_value element = env->vec_get(env, *(args+1), i);
+
+ to_big_endian((uint64_t) env->extract_integer(env, element),
+ buffer + 8 * i);
+ }
+
+ if (env->non_local_exit_check(env) != emacs_funcall_exit_return) {
+ free(buffer);
+ return env->intern(env, "nil");
+ }
+
+ unsigned char error_vec_len[8] = { 0 };
+ unsigned char error_vec_cap[8] = { 0 };
+
+ struct SignedVec error_vec = { 0 };
+
+ error_vec.len = error_vec_len;
+ error_vec.capacity = error_vec_cap;
+
+ unsigned char input_len[8] = { 0 };
+
+ to_big_endian((uint64_t) len * 8, input_len);
+
+ struct UnsignedVec input_vec = (struct UnsignedVec)
+ { input_len, NULL, buffer };
+
+ int result = parser_recognize(parser, &input_vec, &error_vec, (unsigned char) 1);
+
+ free(buffer);
+
+ uint64_t error_length = from_big_endian(error_vec.len);
+
+ if (error_length) {
+ error_data[0] = env->make_string(env, error_vec.data, (ptrdiff_t) error_length);
+
+ clean_signed(&error_vec, 4);
+
+ env->non_local_exit_signal(env, env->intern(env, "error"),
+ env->funcall(env, env->intern(env, "list"),
+ 1, error_data));
+
+ return env->intern(env, "nil");
+ }
+
+ return (result) ? env->intern(env, "t") : env->intern(env, "nil");
+}
+
+void
+clean_forest(void *ptr)
+{
+ clean_unsigned((struct UnsignedVec *) ptr, 15);
+}
+
+emacs_value
+rep_parser_parse(emacs_env *env, ptrdiff_t narg, emacs_value *args, void *data)
+{
+ struct parser *parser = env->get_user_ptr(env, *args);
+
+ if (env->non_local_exit_check(env) != emacs_funcall_exit_return)
+ return env->intern(env, "nil");
+
+ unsigned char *buffer = NULL;
+ ptrdiff_t len = 0;
+
+ emacs_value error_data[1];
+
+ if (!env->eq(env,
+ env->type_of(env, *(args+1)),
+ env->intern(env, "vector"))) {
+ error_data[0] = env->make_string
+ (env, "INPUT should be a vector of integers", 36);
+
+ env->non_local_exit_signal(env, env->intern(env, "wrong-type-argument"),
+ env->funcall(env, env->intern(env, "list"),
+ 1, error_data));
+
+ return env->intern(env, "nil");
+ }
+
+ len = env->vec_size(env, *(args+1));
+
+ buffer = malloc(sizeof(char) * len * 8);
+
+ if (buffer == NULL) {
+ error_data[0] = env->make_string(env, "out of memory", 13);
+
+ env->non_local_exit_signal(env, env->intern(env, "error"),
+ env->funcall(env, env->intern(env, "list"),
+ 1, error_data));
+
+ return env->intern(env, "nil");
+ }
+
+ for (ptrdiff_t i = 0; i < len; i++) {
+ emacs_value element = env->vec_get(env, *(args+1), i);
+
+ to_big_endian((uint64_t) env->extract_integer(env, element),
+ buffer + 8 * i);
+ }
+
+ if (env->non_local_exit_check(env) != emacs_funcall_exit_return) {
+ free(buffer);
+ return env->intern(env, "nil");
+ }
+
+ unsigned char error_vec_len[8] = { 0 };
+ unsigned char error_vec_cap[8] = { 0 };
+
+ struct SignedVec error_vec = { 0 };
+
+ error_vec.len = error_vec_len;
+ error_vec.capacity = error_vec_cap;
+
+ unsigned char input_len[8] = { 0 };
+
+ to_big_endian((uint64_t) len * 8, input_len);
+
+struct UnsignedVec input_vec = (struct UnsignedVec)
+ { input_len, NULL, buffer };
+
+ struct UnsignedVec *result = parser_parse(parser, &input_vec, &error_vec, (unsigned char) 1);
+
+ free(buffer);
+
+ uint64_t error_length = from_big_endian(error_vec.len);
+
+ if (error_length) {
+ error_data[0] = env->make_string(env, error_vec.data, (ptrdiff_t) error_length);
+
+ clean_signed(&error_vec, 4);
+
+ env->non_local_exit_signal(env, env->intern(env, "error"),
+ env->funcall(env, env->intern(env, "list"),
+ 1, error_data));
+
+ return env->intern(env, "nil");
+ }
+
+ if (result) {
+ return env->make_user_ptr(env, clean_forest, result);
+ }
+
+ return env->intern(env, "nil");
+}
+
+int
+emacs_module_init(struct emacs_runtime *ert)
+{
+ emacs_env *env = ert->get_environment(ert);
+
+ unsigned char emacs_version = 0;
+
+ if ((unsigned long) env->size >= sizeof(struct emacs_env_27))
+ emacs_version = 27;
+ else if ((unsigned long) env->size >= sizeof(struct emacs_env_26))
+ emacs_version = 26;
+ else if ((unsigned long) env->size >= sizeof(struct emacs_env_25))
+ emacs_version = 25;
+ else return 2;
+
+ emacs_value newfunc = env->make_function
+ (env, 1, 1, rep_new_parser,
+ "Create a new parser from the grammar string GRAMMAR_STRING."
+ "\n\n\(fn grammar_string)",
+ NULL);
+
+ env->funcall(env, env->intern(env, "defalias"),
+ 2, (emacs_value[]){ env->intern(env, "rep-new-parser"), newfunc });
+
+ emacs_value recognizefunc = env->make_function
+ (env, 2, 2, rep_parser_recognize,
+ "Recognize an INPUT using PARSER."
+ "\n\n\(fn parser INPUT)",
+ NULL);
+
+ env->funcall(env, env->intern(env, "defalias"),
+ 2, (emacs_value[]){ env->intern(env, "rep-recognize"), recognizefunc });
+
+ emacs_value parsefunc = env->make_function
+ (env, 2, 2, rep_parser_parse,
+ "Parse an INPUT using PARSER."
+ "\n\n\(fn parser input)",
+ NULL);
+
+ env->funcall(env, env->intern(env, "defalias"),
+ 2, (emacs_value[]){ env->intern(env, "rep-parse"), parsefunc });
+
+ env->funcall(env, env->intern(env, "provide"),
+ 1, (emacs_value[]){ env->intern(env, "rep") });
+
+ return 0;
+}
diff --git a/src/bytes.rs b/src/bytes.rs
new file mode 100644
index 0000000..f764077
--- /dev/null
+++ b/src/bytes.rs
@@ -0,0 +1,175 @@
+//! This module defines functions to turn a forest of forest labels
+//! into a sequence of bytes.
+//!
+//! To be more specific, a forest consists of two parts: a vector of
+//! node labels and a graph specifying the relations between nodes.
+//!
+//! # Number format (endianness)
+//!
+//! Every number mentioned in this format is an unsigned integer of
+//! 64 bits, or 8 bytes. Every such number is specified in the big
+//! endian format.
+//!
+//! # Parts of the sequence of bytes
+//!
+//! The sequence of bytes has three parts:
+//!
+//! ## Header
+//!
+//! The header specifies metadata of this forest.
+//!
+//! ### Special mark
+//!
+//! The first three bytes form the string "rep", as a special mark.
+//!
+//! ### Number of nodes
+//!
+//! The next 8 bytes specifies the number of nodes of this forest.
+//!
+//! ## Graph
+//!
+//! Next comes the underlying graph for this forest. This part
+//! consists of a vector of vectors of numbers. Each vector of
+//! numbers represents the list of children of a node. So the number
+//! of vectors of numbers is equal to the number of nodes.
+//!
+//! ### Vector of vectors of numbers
+//!
+//! The vectors are not simply concatenated one after another, as that
+//! way one cannot read a random node in a constant amount of time.
+//!
+//! Instead, we first specify, for each node, the number of its
+//! children, together with the offset into the vector of children at
+//! which that node's children begin.
+//!
+//! As an example, if a graph has three nodes, represented as the
+//! adjacency list: `[[1, 2], [2], []]`, then its representation as a
+//! sequence of bytes is as follows:
+//! ```text
+//! 2, x, 1, y, 0, z, 1, 2, 2,
+//! ^ ^ ^
+//! x y z
+//! ```
+//!
+//! This has the advantage that we can read the children of the `n`-th
+//! node in constant time.
+//!
+//! ## Vector of labels
+//!
+//! Each label occupies a fixed number of bytes, so we simply put the
+//! labels one after another. The only thing to note here is the
+//! format of the labels.
+//!
+//! ### Labels
+//!
+//! Each label has 3 parts:
+//!
+//! 1. Status: either Packed, Cloned, or Plain. If the node is
+//! cloned, it has a clone index. So in total this part occupies 1
+//! byte for the status and 8 bytes for the clone index.
+//! 2. Start and end: the range in the input sentence. We just store
+//! two numbers here. Hence this part occupies 16 bytes.
+//! 3. Grammar label: either a terminal, a non-terminal, or a rule.
+//! Each variant needs a number to specify its index. So in total this
+//! part occupies 1 byte for the variant discriminant and 8 bytes for
+//! the number.
+//!
+//! To sum up, each label occupies 34 bytes.
+
+use chain::item::{default::DefaultForest, ForestLabel};
+use grammar::{GrammarLabel, GrammarLabelType, TNT};
+use graph::{Graph, LabelGraph};
+
+/// Serialize FOREST into the byte format described in the module
+/// documentation above.
+///
+/// Layout: total length (8 bytes), the mark "rep" (3), the number of
+/// nodes (8), the offset of the label section (8), then per node its
+/// degree and the offset of its children list (16 per node), the
+/// concatenated children lists (8 per edge), and finally one 34-byte
+/// label per node.  All numbers are 64-bit big-endian.
+pub(super) fn forest_to_bytes(forest: &DefaultForest<ForestLabel<GrammarLabel>>) -> Vec<u8> {
+    // First calculate the total number of bytes, so the buffer is
+    // allocated exactly once.
+    let nodes_len = forest.nodes_len();
+
+    let degrees: Vec<_> = forest
+        .nodes()
+        .map(|node| forest.degree(node).unwrap_or(0))
+        .collect();
+
+    let total_degree: usize = degrees.iter().copied().sum();
+
+    let len: usize = 8 // total number of bytes at the start
+        + 3 // special mark
+        + 8 // number of nodes
+        + 8 // offset of labels
+        + 16 * nodes_len // degree & offset for each node
+        + 8 * total_degree // children of each node
+        + 34 * nodes_len // labels
+        ;
+
+    // Then fill in the bytes.
+
+    let mut bytes: Vec<u8> = Vec::with_capacity(len);
+
+    // Header: total length, special mark, node count, and the offset
+    // at which the label section starts.
+    bytes.extend(len.to_be_bytes());
+    bytes.extend([114, 101, 112]); // "rep" in ASCII
+    bytes.extend(nodes_len.to_be_bytes());
+    bytes.extend((len - 34 * nodes_len).to_be_bytes());
+
+    let mut accumulated: usize = 0;
+
+    // Per-node degree and the absolute offset of its children list.
+    for degree in degrees.iter().copied() {
+        bytes.extend(degree.to_be_bytes());
+        bytes.extend((8 + 3 + 8 + 8 + 16 * nodes_len + 8 * accumulated).to_be_bytes());
+
+        accumulated += degree;
+    }
+
+    // The concatenated children lists.
+    bytes.extend(forest.nodes().flat_map(|node| {
+        forest
+            .children_of(node)
+            .unwrap_or_default()
+            .flat_map(|child| child.to_be_bytes())
+    }));
+
+    // The labels: status (1 + 8 bytes), span (8 + 8 bytes), grammar
+    // label discriminant and content (1 + 8 bytes) = 34 bytes each.
+    'nodes_loop: for node in forest.nodes() {
+        let label = match forest.vertex_label(node) {
+            Ok(Some(label)) => label,
+            // A node without a label is silently skipped.
+            _ => continue 'nodes_loop,
+        };
+
+        if label.is_plain() {
+            bytes.extend(std::iter::repeat(0).take(9));
+        } else if label.is_packed() {
+            bytes.extend(std::iter::once(1).chain(std::iter::repeat(0).take(8)));
+        } else if let Some(index) = label.clone_index() {
+            bytes.extend(std::iter::once(2).chain(index.to_be_bytes()));
+        }
+
+        let label = label.label();
+
+        bytes.extend(label.start().to_be_bytes());
+        // An absent end position is encoded as 0.
+        bytes.extend(label.end().unwrap_or(0).to_be_bytes());
+
+        let label = label.label();
+
+        match label {
+            GrammarLabelType::TNT(TNT::Ter(t)) => {
+                bytes.extend(std::iter::once(0).chain(t.to_be_bytes()));
+            }
+            GrammarLabelType::TNT(TNT::Non(n)) => {
+                bytes.extend(std::iter::once(1).chain(n.to_be_bytes()));
+            }
+            GrammarLabelType::Rule(r) => {
+                bytes.extend(std::iter::once(2).chain(r.to_be_bytes()));
+            }
+        }
+    }
+
+    // The pre-computed length must match what was actually written;
+    // check the invariant in debug builds instead of the previous
+    // bare `dbg!()` left over from debugging.
+    debug_assert_eq!(bytes.len(), len, "serialized forest length mismatch");
+
+    bytes
+}
diff --git a/src/helper.c b/src/helper.c
new file mode 100644
index 0000000..5d7e9f8
--- /dev/null
+++ b/src/helper.c
@@ -0,0 +1,73 @@
+#include "helper.h"
+
+/* Decode one 34-byte label record starting at PTR.
+ *
+ * Record layout (multi-byte numbers are big-endian u64; this mirrors
+ * the writer in src/bytes.rs):
+ *   byte 0      status (Plain, Packed, anything else means Clone)
+ *   bytes 1-8   clone index (meaningful for Clone only)
+ *   bytes 9-16  start of the spanned range
+ *   bytes 17-24 end of the spanned range
+ *   byte 25     variant (Terminal, Nonterminal, anything else Rule)
+ *   bytes 26-33 content (index of the terminal/non-terminal/rule)  */
+struct Label
+read_label(unsigned char *ptr)
+{
+  struct Label label = { 0 };
+
+  switch (*ptr) {
+  case Plain:
+    label.status = Plain;
+    break;
+  case Packed:
+    label.status = Packed;
+    break;
+  default:
+    /* Any other status byte is treated as a cloned node.  */
+    label.status = Clone;
+    label.clone_index = from_big_endian(ptr+1);
+    break;
+  }
+
+  label.start = from_big_endian(ptr+9);
+  label.end = from_big_endian(ptr+17);
+
+  switch (*(ptr+25)) {
+  case Terminal:
+    label.variant = Terminal;
+    break;
+  case Nonterminal:
+    label.variant = Nonterminal;
+    break;
+  default:
+    label.variant = Rule;
+    break;
+  }
+
+  label.content = from_big_endian(ptr+26);
+
+  return label;
+}
+
+/* Print a human-readable, one-line description of LABEL to stdout.
+ *
+ * NOTE(review): the "%llu" specifiers assume uint64_t is unsigned
+ * long long on this platform; PRIu64 from <inttypes.h> would be the
+ * portable spelling.  */
+void
+print_label(struct Label label)
+{
+  switch (label.status) {
+  case Plain:
+    printf("a plain node ");
+    break;
+  case Packed:
+    printf("a packed node ");
+    break;
+  default:
+    printf("a cloned node with index %llu ", label.clone_index);
+    break;
+  }
+
+  printf("spanning (%llu, %llu) ", label.start, label.end);
+
+  printf("labelled as a ");
+
+  switch (label.variant) {
+  case Terminal:
+    printf("terminal ");
+    break;
+  case Nonterminal:
+    printf("non-terminal ");
+    break;
+  default:
+    printf("rule ");
+    break;
+  }
+
+  printf("%llu\n", label.content);
+}
diff --git a/src/helper.h b/src/helper.h
new file mode 100644
index 0000000..8be7497
--- /dev/null
+++ b/src/helper.h
@@ -0,0 +1,75 @@
+#ifndef HELPER_H
+#define HELPER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>   /* uint64_t, used by struct Label below */
+#include <limits.h>
+#include <string.h>
+#include "big_endian.h"
+
+/* Byte vectors exchanged with the Rust core.  `len' and `capacity'
+   each point to 8 bytes holding a 64-bit big-endian number; `data'
+   points to the actual bytes.  */
+struct SignedVec {
+  unsigned char *len;
+  unsigned char *capacity;
+  char *data;
+};
+
+struct UnsignedVec {
+  unsigned char *len;
+  unsigned char *capacity;
+  unsigned char *data;
+};
+
+/* Opaque parser object owned by the Rust side.  */
+struct parser;
+
+/* On failure these functions fill ERROR_VEC with an error message;
+   an error length of zero means success.  */
+struct parser *new_parser(char *grammar_string,
+                          struct SignedVec *error_vec);
+
+void clean_parser(void *parser);
+
+int parser_recognize(struct parser *parser,
+                     struct UnsignedVec *input_vec,
+                     struct SignedVec *error_vec,
+                     unsigned char reset_p);
+
+struct UnsignedVec *
+parser_parse(struct parser *parser,
+             struct UnsignedVec *input_vec,
+             struct SignedVec *error_vec,
+             unsigned char reset_p);
+
+/* FLAG is a bit mask deciding which parts to free: bit 0 the length
+   buffer, bit 1 the capacity buffer, bit 2 the data, bit 3 the
+   vector struct itself.  */
+void clean_signed(struct SignedVec *vec, unsigned char flag);
+void clean_unsigned(struct UnsignedVec *vec, unsigned char flag);
+
+/* Status of a forest node, mirroring the Rust `ForestLabel'.  */
+typedef enum Status {
+  Plain = 0,
+  Packed = 1,
+  Clone = 2,
+} Status;
+
+/* Kind of grammar symbol a label carries.  */
+typedef enum Variant {
+  Terminal = 0,
+  Nonterminal = 1,
+  Rule = 2,
+} Variant;
+
+/* Decoded form of one 34-byte label record; see read_label.  */
+struct Label {
+  Status status;
+  uint64_t clone_index;
+  uint64_t start;
+  uint64_t end;
+  Variant variant;
+  uint64_t content;
+};
+
+struct Label read_label(unsigned char *ptr);
+
+void print_label(struct Label label);
+
+#endif
diff --git a/src/helper.o b/src/helper.o
new file mode 100644
index 0000000..e8801dd
--- /dev/null
+++ b/src/helper.o
Binary files differ
diff --git a/src/lib.rs b/src/lib.rs
index f5457c3..685c66e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,16 +1,548 @@
-// TODO: Add Emacs bindings
+#![warn(missing_docs)]
+//! This top level package provides necessary functions for Emacs to
+//! call.
-pub fn add(left: usize, right: usize) -> usize {
- left + right
+extern crate chain;
+extern crate grammar;
+
+use chain::{atom::DefaultAtom, default::DefaultChain, Chain};
+use grammar::Grammar;
+
+/// This struct is the representation of a parser.
+///
+/// When the user constructs a parser, an instance of this struct will
+/// be constructed and the user will receive an opaque pointer to this
+/// struct.
+#[derive(Debug, Clone)]
+#[repr(C)]
+pub struct Parser {
+    // The chain-rule machine that drives recognition and parsing.
+    chain: DefaultChain,
+}
+
+impl Parser {
+    /// Construct a parser from the grammar string.
+    ///
+    /// The grammar is supposed to conform to the Augmented
+    /// Backus-Naur Format.  See RFC 5234 for exact details of this
+    /// format.
+    ///
+    /// # Errors
+    ///
+    /// Returns the grammar-parsing, atom-construction, or
+    /// chain-construction error formatted as a `String`.
+    pub fn new(s: &str) -> Result<Self, String> {
+        let grammar: Grammar = s.parse().map_err(|err| format!("{err}"))?;
+        let atom: DefaultAtom =
+            DefaultAtom::from_grammar(grammar).map_err(|err| format!("{err}"))?;
+
+        DefaultChain::unit(atom)
+            .map_err(|err| format!("{err}"))
+            .map(|chain| Self { chain })
+    }
+}
+
+/// Actual function that is called through the C ABI to construct a
+/// parser from GRAMMAR_STRING.
+///
+/// On error, the `len` and `capacity` fields of `ERROR_VEC` are
+/// filled with the length and capacity of the error message, as
+/// big-endian 64-bit numbers, and its `data` field is set to point
+/// to the message bytes; a null pointer is then returned.  On
+/// success the 8 bytes behind `ERROR_VEC.len` are zeroed and a
+/// pointer to a heap-allocated [`Parser`] is returned, to be
+/// released later with [`clean_parser`].
+#[no_mangle]
+extern "C" fn new_parser(
+    grammar_string: *mut std::os::raw::c_char,
+    error_vec: *mut LenVec<std::os::raw::c_char>,
+) -> *mut Parser {
+    let parsed_str;
+
+    let error_len = unsafe { (*error_vec).len };
+    let error_cap = unsafe { (*error_vec).capacity };
+
+    unsafe {
+        match std::ffi::CStr::from_ptr(grammar_string).to_str() {
+            Ok(ccstr) => {
+                parsed_str = ccstr.to_string();
+            }
+            Err(e) => {
+                let mut e_string = format!("error: {e}");
+
+                let e_string_len_slice = e_string.len().to_be_bytes();
+                let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+                // Write length and capacity out byte by byte in
+                // big-endian order.
+                for i in 0..8 {
+                    *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                    *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+                }
+
+                (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+
+                // The C side now owns the message; it is reclaimed by
+                // `clean_signed`.
+                std::mem::forget(e_string);
+
+                return std::ptr::null_mut();
+            }
+        }
+    }
+
+    match Parser::new(&parsed_str) {
+        Ok(result) => unsafe {
+            // Zero the error length to signal success.
+            for i in 0..8 {
+                *(error_len.add(i)) = 0;
+            }
+
+            Box::into_raw(Box::new(result))
+        },
+        Err(e) => unsafe {
+            let mut e_string = format!("error: {e}");
+
+            let e_string_len_slice = e_string.len().to_be_bytes();
+            let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+            for i in 0..8 {
+                *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+            }
+
+            (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+
+            std::mem::forget(e_string);
+
+            std::ptr::null_mut()
+        },
+    }
+}
+
+/// Reclaim and drop a parser previously returned by [`new_parser`].
+///
+/// PARSER must be a pointer obtained from [`new_parser`] and must
+/// not be used after this call.
+#[no_mangle]
+extern "C" fn clean_parser(parser: *const std::ffi::c_void) {
+    unsafe {
+        drop(Box::from_raw(parser as *mut Parser));
+    }
+}
+
+/// To make it easier to pass arrays to and from C.
+///
+/// This mirrors `struct SignedVec` / `struct UnsignedVec` declared
+/// in the C header `helper.h`.
+#[repr(C)]
+pub struct LenVec<T> {
+    /// This must be an array of unsigned char of length 8.
+    ///
+    /// A less length leads to access to invalid memory location,
+    /// while a longer length will be ignored, possibly leading to
+    /// wrong length.
+    ///
+    /// The length will be interpreted as a 64-bits integer stored in
+    /// big endian format.
+    pub len: *mut std::os::raw::c_uchar,
+    /// This must be an array of unsigned chars of length 8.
+    ///
+    /// This is only used so that Rust knows how to reconstruct the
+    /// data from C, in order for Rust to deallocate the objects
+    /// exposed by this library.
+    pub capacity: *mut std::os::raw::c_uchar,
+    /// The actual pointer to the data.
+    ///
+    /// In case this should be set by the function on the Rust end,
+    /// this field should be `NULL`.
+    pub data: *mut T,
+}
+
+/// Free the parts of VEC selected by FLAG, a bit mask:
+///
+/// * bit 0: the 8-byte length buffer,
+/// * bit 1: the 8-byte capacity buffer,
+/// * bit 2: the data, reconstructed as a `String` from the stored
+///   length and capacity,
+/// * bit 3: the `LenVec` struct itself.
+#[no_mangle]
+extern "C" fn clean_signed(vec: *mut LenVec<std::os::raw::c_char>, flag: std::os::raw::c_uchar) {
+    // Read length and capacity before any of the buffers are freed.
+    let len = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts((*vec).len, 8) }
+            .try_into()
+            .unwrap(),
+    );
+    let capacity = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts((*vec).capacity, 8) }
+            .try_into()
+            .unwrap(),
+    );
+
+    if (flag & 1) != 0 {
+        drop(unsafe { Vec::from_raw_parts((*vec).len, 8, 8) });
+    }
+
+    if (flag & 2) != 0 {
+        drop(unsafe { Vec::from_raw_parts((*vec).capacity, 8, 8) });
+    }
+
+    if (flag & 4) != 0 {
+        drop(unsafe { String::from_raw_parts((*vec).data as *mut u8, len, capacity) });
+    }
+
+    if (flag & 8) != 0 {
+        drop(unsafe { Box::from_raw(vec) });
+    }
+}
+
+/// Counterpart of [`clean_signed`] for unsigned byte vectors.
+///
+/// FLAG is the same bit mask: bit 0 frees the length buffer, bit 1
+/// the capacity buffer, bit 2 the data (as a `Vec<u8>`), bit 3 the
+/// `LenVec` struct itself.
+#[no_mangle]
+extern "C" fn clean_unsigned(vec: *mut LenVec<std::os::raw::c_uchar>, flag: std::os::raw::c_uchar) {
+    // Read length and capacity before any of the buffers are freed.
+    let len = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts((*vec).len, 8) }
+            .try_into()
+            .unwrap(),
+    );
+    let capacity = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts((*vec).capacity, 8) }
+            .try_into()
+            .unwrap(),
+    );
+
+    if (flag & 1) != 0 {
+        drop(unsafe { Vec::from_raw_parts((*vec).len, 8, 8) });
+    }
+
+    if (flag & 2) != 0 {
+        drop(unsafe { Vec::from_raw_parts((*vec).capacity, 8, 8) });
+    }
+
+    if (flag & 4) != 0 {
+        drop(unsafe { Vec::<u8>::from_raw_parts((*vec).data, len, capacity) });
+    }
+
+    if (flag & 8) != 0 {
+        drop(unsafe { Box::from_raw(vec) });
+    }
+}
+
+/// C-ABI entry point: feed the tokens in INPUT_VEC to PARSER and
+/// return 1 when the input so far is accepted, 0 otherwise.
+///
+/// The input data is a sequence of 64-bit big-endian token numbers,
+/// so its byte length must be divisible by 8.  If RESET_P is
+/// non-zero and the parser has already consumed input, it is first
+/// reset to its initial state.  On error the message is written into
+/// ERROR_VEC (length and capacity as big-endian 64-bit numbers,
+/// `data` pointing to the leaked message bytes) and 0 is returned.
+#[no_mangle]
+extern "C" fn parser_recognize(
+    parser: *mut Parser,
+    input_vec: *mut LenVec<std::os::raw::c_uchar>,
+    error_vec: *mut LenVec<std::os::raw::c_char>,
+    reset_p: std::os::raw::c_uchar,
+) -> std::os::raw::c_int {
+    let input_len = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts((*input_vec).len, 8) }
+            .try_into()
+            .unwrap(),
+    );
+
+    let mut parser_box;
+    let input_array_len = input_len;
+    let input_array;
+
+    let error_len = unsafe { (*error_vec).len };
+    let error_cap = unsafe { (*error_vec).capacity };
+
+    unsafe {
+        parser_box = Box::from_raw(parser);
+
+        input_array = std::slice::from_raw_parts((*input_vec).data, input_array_len);
+    }
+
+    // If the parser has already been used before, reset it to the
+    // initial state.
+
+    if reset_p != 0 && !parser_box.chain.history().is_empty() {
+        match DefaultChain::unit(parser_box.chain.atom().clone()) {
+            Ok(chain) => {
+                parser_box.chain = chain;
+            }
+            Err(e) => {
+                let mut e_string = format!("error: {e}");
+
+                // The caller still owns the parser, so leak the box
+                // back instead of dropping it.
+                Box::leak(parser_box);
+
+                let e_string_len_slice = e_string.len().to_be_bytes();
+                let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+                unsafe {
+                    for i in 0..8 {
+                        *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                        *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+                    }
+
+                    (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+                }
+
+                // Ownership of the message passes to the C side; it
+                // is reclaimed later by `clean_signed`.
+                std::mem::forget(e_string);
+
+                return 0;
+            }
+        }
+    }
+
+    if input_array_len.rem_euclid(8) != 0 {
+        let mut e_string =
+            format!("error: input length should be divisible by 8, but got {input_array_len}");
+
+        let e_string_len_slice = e_string.len().to_be_bytes();
+        let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+        Box::leak(parser_box);
+
+        unsafe {
+            for i in 0..8 {
+                *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+            }
+
+            (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+        }
+
+        std::mem::forget(e_string);
+
+        return 0;
+    }
+
+    // Decode the input as 64-bit big-endian token numbers.
+    #[cfg(target_pointer_width = "64")]
+    let input_iter = input_array
+        .chunks_exact(8)
+        .map(|chunk| usize::from_be_bytes(<[u8; 8]>::try_from(chunk).unwrap()));
+
+    #[cfg(not(target_pointer_width = "64"))]
+    compile_error!("this program assumes to be run on 64-bits machines");
+
+    for (index, token) in input_iter.enumerate() {
+        let chain_result = parser_box.chain.chain(token, index, true);
+
+        if let Err(e) = chain_result {
+            let mut e_string = format!("error: {e}");
+
+            let e_string_len_slice = e_string.len().to_be_bytes();
+            let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+            Box::leak(parser_box);
+
+            unsafe {
+                for i in 0..8 {
+                    *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                    *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+                }
+
+                (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+            }
+
+            std::mem::forget(e_string);
+
+            return 0;
+        }
+    }
+
+    // NOTE(review): acceptance is decided by `epsilon()` here --
+    // presumably whether the machine can terminate at this point.
+    match parser_box.chain.epsilon() {
+        Ok(result) => {
+            Box::leak(parser_box);
+
+            result as std::os::raw::c_int
+        }
+        Err(e) => {
+            let mut e_string = format!("error: {e}");
+
+            let e_string_len_slice = e_string.len().to_be_bytes();
+            let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+            Box::leak(parser_box);
+
+            unsafe {
+                for i in 0..8 {
+                    *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                    *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+                }
+
+                (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+            }
+
+            std::mem::forget(e_string);
+
+            0
+        }
+    }
}
-#[cfg(test)]
-mod tests {
- use super::*;
+/// C-ABI entry point: feed the tokens in INPUT_VEC to PARSER and, if
+/// the input is accepted, return the parse forest serialized by
+/// `bytes::forest_to_bytes` as a heap-allocated [`LenVec`].
+///
+/// Returns a null pointer when the input is empty, not accepted, or
+/// an error occurred; in the error case the message is written into
+/// ERROR_VEC (length and capacity as big-endian 64-bit numbers).
+/// The input format and the RESET_P behaviour are the same as for
+/// [`parser_recognize`].
+#[no_mangle]
+extern "C" fn parser_parse(
+    parser: *mut Parser,
+    input_vec: *mut LenVec<std::os::raw::c_uchar>,
+    error_vec: *mut LenVec<std::os::raw::c_char>,
+    reset_p: std::os::raw::c_uchar,
+) -> *mut LenVec<std::os::raw::c_uchar> {
+    let input_len = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts((*input_vec).len, 8) }
+            .try_into()
+            .unwrap(),
+    );
+
+    let mut parser_box;
+    let input_array;
-    #[test]
-    fn it_works() {
-        let result = add(2, 2);
-        assert_eq!(result, 4);
+    let error_len = unsafe { (*error_vec).len };
+    let error_cap = unsafe { (*error_vec).capacity };
+
+    unsafe {
+        parser_box = Box::from_raw(parser);
+
+        input_array = std::slice::from_raw_parts((*input_vec).data, input_len);
+    }
+
+    // If the parser has already been used before, reset it to the
+    // initial state.
+
+    if reset_p != 0 && !parser_box.chain.history().is_empty() {
+        match DefaultChain::unit(parser_box.chain.atom().clone()) {
+            Ok(chain) => {
+                parser_box.chain = chain;
+            }
+            Err(e) => {
+                let mut e_string = format!("error: {e}");
+
+                // The caller still owns the parser, so leak the box
+                // back instead of dropping it.
+                Box::leak(parser_box);
+
+                let e_string_len_slice = e_string.len().to_be_bytes();
+                let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+                unsafe {
+                    for i in 0..8 {
+                        *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                        *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+                    }
+
+                    (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+                }
+
+                std::mem::forget(e_string);
+
+                return std::ptr::null_mut();
+            }
+        }
+    }
+
+    if input_len.rem_euclid(8) != 0 {
+        let mut e_string =
+            format!("error: input length should be divisible by 8, but got {input_len}");
+
+        let e_string_len_slice = e_string.len().to_be_bytes();
+        let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+        Box::leak(parser_box);
+
+        unsafe {
+            for i in 0..8 {
+                *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+            }
+
+            (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+        }
+
+        std::mem::forget(e_string);
+
+        return std::ptr::null_mut();
+    } else if input_len == 0 {
+        // Nothing to parse: hand the parser back and return no forest.
+        Box::leak(parser_box);
+
+        return std::ptr::null_mut();
+    }
+
+    // Decode the input as 64-bit big-endian token numbers.
+    #[cfg(target_pointer_width = "64")]
+    let input_iter = input_array
+        .chunks_exact(8)
+        .map(|chunk| usize::from_be_bytes(<[u8; 8]>::try_from(chunk).unwrap()));
+
+    #[cfg(not(target_pointer_width = "64"))]
+    compile_error!("this program assumes to be run on 64-bits machines");
+
+    // Remember the last position and token for `end_of_input` below.
+    let mut last_pos: usize = 0;
+    let mut last_token: usize = 0;
+
+    for (index, token) in input_iter.enumerate() {
+        last_pos = index;
+        last_token = token;
+
+        let chain_result = parser_box.chain.chain(token, index, false);
+
+        if let Err(e) = chain_result {
+            let mut e_string = format!("error: {e}");
+
+            let e_string_len_slice = e_string.len().to_be_bytes();
+            let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+            Box::leak(parser_box);
+
+            unsafe {
+                for i in 0..8 {
+                    *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                    *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+                }
+
+                (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+            }
+
+            std::mem::forget(e_string);
+
+            return std::ptr::null_mut();
+        }
+    }
+
+    match parser_box.chain.epsilon() {
+        Ok(result) => {
+            if result {
+                let forest = parser_box.chain.end_of_input(last_pos + 1, last_token);
+
+                match forest {
+                    Ok(forest) => {
+                        Box::leak(parser_box);
+
+                        let mut bytes = bytes::forest_to_bytes(&forest);
+
+                        // Hand the bytes to C: length and capacity go
+                        // into leaked 8-byte buffers so the allocation
+                        // can be reconstructed and freed later by
+                        // `clean_unsigned`.
+                        let bytes_len = bytes.len().to_be_bytes().to_vec();
+
+                        let bytes_capacity = bytes.capacity().to_be_bytes().to_vec();
+
+                        let bytes_vec: LenVec<std::os::raw::c_uchar> = LenVec {
+                            len: Box::leak(bytes_len.into_boxed_slice()).as_mut_ptr(),
+                            capacity: Box::leak(bytes_capacity.into_boxed_slice()).as_mut_ptr(),
+                            data: bytes.as_mut_ptr(),
+                        };
+
+                        std::mem::forget(bytes);
+
+                        Box::into_raw(Box::new(bytes_vec))
+                    }
+                    Err(e) => {
+                        let mut e_string = format!("error: {e}");
+
+                        let e_string_len_slice = e_string.len().to_be_bytes();
+                        let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+                        Box::leak(parser_box);
+
+                        unsafe {
+                            for i in 0..8 {
+                                *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                                *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+                            }
+
+                            (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+                        }
+
+                        std::mem::forget(e_string);
+
+                        std::ptr::null_mut()
+                    }
+                }
+            } else {
+                // Input not accepted: no forest to return.
+                Box::leak(parser_box);
+
+                std::ptr::null_mut()
+            }
+        }
+        Err(e) => {
+            let mut e_string = format!("error: {e}");
+
+            let e_string_len_slice = e_string.len().to_be_bytes();
+            let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+            Box::leak(parser_box);
+
+            unsafe {
+                for i in 0..8 {
+                    *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                    *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+                }
+
+                (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+            }
+
+            std::mem::forget(e_string);
+
+            std::ptr::null_mut()
+        }
    }
}
+
+pub mod bytes;
diff --git a/src/old binding.txt b/src/old binding.txt
new file mode 100644
index 0000000..0311f49
--- /dev/null
+++ b/src/old binding.txt
@@ -0,0 +1,218 @@
+//! This file provides the Emacs binding for the core.
+
+#![allow(unused_imports)]
+
+extern crate repcore;
+
+use repcore::{
+ derive::stdag::{PFRecorder, STDAGParser},
+ grammar::{Grammar, GrammarInfo},
+ semiring::{IdentityRecorder, Nat, PFS},
+ tokenizer::to_tokenizer,
+ valuation::{Count, ParseForest},
+};
+
+use repcore::parser::Parser as PParser;
+use repcore::parser::ParserReportConfig as PParserConfig;
+
+// TODO: Write Emacs binding
+
+// Let's design the API for Emacs to use.
+//
+// Emacs offers the grammar file or grammar string, and then we
+// generate a parser.
+//
+// Thereafter when we want to parse some input, Emacs gives the input
+// file name or the input file string to this package, and then the
+// package gives back the parse result.
+//
+// Besides, we need a data type to deal with errors.
+//
+// Handling a string is not a problem for us, but to return a parser
+// to Emacs, and to return the parse result, are kind of troublesome.
+//
+// To achieve this, we need data types for a parser and for the parse
+// result. Since a parser is just an implementation detail and should
+// not be of concern to the user, we might just represent a parser as
+// just a custom user-ptr in Emacs.
+//
+// As to the parse result, we return a set of tuples:
+//
+// (label, i, k, j),
+//
+// where LABEL is represented as a tuple
+//
+// (INDEX, STRING),
+//
+// where index is the index of the rule and the string is the name of
+// the non-terminal or the terminal that is spanned by this packed
+// node.
+//
+// This is not intended for the user to inspect and play with. The
+// package should provide another function to interpret this data, for
+// example to search something or to display it. The reason we don't
+// directly return something to display is that the plain forest might
+// contain cyclic data, and is inconvenient to represent in Emacs. So
+// we only try to display this information to the user if the need
+// arises.
+
+#[derive(Debug, Clone)]
+#[repr(C)]
+pub struct Parser {
+ g: Grammar,
+ gi: GrammarInfo,
+ stparser: STDAGParser<PFS, ParseForest, PFRecorder>,
+}
+
+impl Parser {
+ fn new(gs: &str) -> Result<Self, Box<dyn std::error::Error>> {
+ let g: Grammar = gs.parse()?;
+
+ let mut stparser = STDAGParser::new();
+
+ let mut gi = g.generate_info()?;
+
+ stparser.init(&g, &mut gi)?;
+
+ Ok(Self { g, gi, stparser })
+ }
+}
+
+#[no_mangle]
+extern "C" fn new_parser(
+ gs: *mut std::os::raw::c_char,
+ error: *mut std::os::raw::c_int,
+) -> *mut Parser {
+ let parsed_str;
+ let parser_result;
+
+ unsafe {
+ let cstr = std::ffi::CStr::from_ptr(gs).to_str();
+
+ if cstr.is_err() {
+ *error = 1;
+ return std::ptr::null_mut();
+ }
+
+ parsed_str = cstr.unwrap().to_owned();
+
+ parser_result = Parser::new(&parsed_str);
+
+ if parser_result.is_err() {
+ *error = 2;
+ eprintln!("failed: {parser_result:?}");
+ return std::ptr::null_mut();
+ }
+
+ *error = 0;
+ }
+
+ Box::into_raw(Box::new(parser_result.unwrap()))
+}
+
+#[no_mangle]
+extern "C" fn clean_parser(parser: *const std::ffi::c_void) {
+ unsafe {
+ Box::from_raw(parser as *mut Parser);
+ }
+}
+
+// #[no_mangle]
+// extern "C" fn reset_parser(parser: *mut Parser) {
+// let mut parser_box;
+// unsafe {
+// parser_box = Box::from_raw(parser);
+// }
+
+// parser_box
+// .stparser
+// .init(&parser_box.g, &parser_box.gi)
+// .unwrap();
+
+// std::mem::forget(parser_box);
+// }
+
+// FIXME: Use a pointer to int to indicate the type of errors, and use
+// a pointer to a custom error struct to convey the error messages.
+#[no_mangle]
+extern "C" fn parser_parse(
+ parser: *mut Parser,
+ file: *const std::os::raw::c_char,
+) -> std::os::raw::c_int {
+ let mut parser_box;
+ let cstr;
+
+ unsafe {
+ parser_box = Box::from_raw(parser);
+ cstr = std::ffi::CStr::from_ptr(file).to_str();
+
+ if cstr.is_err() {
+ eprintln!("error reading string: {:?}", cstr);
+ return 0;
+ }
+ }
+
+ parser_box.stparser.reinit();
+
+ let file_string = std::fs::read_to_string(cstr.unwrap());
+
+ if file_string.is_err() {
+ eprintln!("error reading file: {:?}", file_string);
+ return 0;
+ }
+
+ let file_string = file_string.unwrap();
+
+ let parse_result = parser_box
+ .stparser
+ .parse(&file_string.chars().collect::<Vec<_>>());
+
+ if parse_result.is_err() {
+ eprintln!("failed to parse: {:?}", parse_result);
+ return 0;
+ }
+
+ let (accepting, _) = parse_result.unwrap();
+
+ std::mem::forget(parser_box);
+
+ accepting as std::os::raw::c_int
+}
+
+#[no_mangle]
+extern "C" fn parser_parse_string(
+ parser: *mut Parser,
+ input: *const std::os::raw::c_char,
+) -> std::os::raw::c_int {
+ let mut parser_box;
+ let cstr;
+
+ unsafe {
+ parser_box = Box::from_raw(parser);
+ cstr = std::ffi::CStr::from_ptr(input).to_str();
+
+ if cstr.is_err() {
+ eprintln!("error reading string: {:?}", cstr);
+ return 0;
+ }
+ }
+
+ parser_box.stparser.reinit();
+
+ let file_string = cstr.unwrap().to_owned();
+
+ let parse_result = parser_box
+ .stparser
+ .parse(&file_string.chars().collect::<Vec<_>>());
+
+ if parse_result.is_err() {
+ eprintln!("failed to parse: {:?}", parse_result);
+ return 0;
+ }
+
+ let (accepting, _) = parse_result.unwrap();
+
+ std::mem::forget(parser_box);
+
+ accepting as std::os::raw::c_int
+}
diff --git a/src/rep.so b/src/rep.so
new file mode 100755
index 0000000..2bc412c
--- /dev/null
+++ b/src/rep.so
Binary files differ
diff --git a/src/test b/src/test
new file mode 100755
index 0000000..5806c2d
--- /dev/null
+++ b/src/test
Binary files differ
diff --git a/src/test.c b/src/test.c
new file mode 100644
index 0000000..d309fb7
--- /dev/null
+++ b/src/test.c
@@ -0,0 +1,224 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <limits.h>
+#include <string.h>
+#include <emacs-module.h>
+
+#include "big_endian.h"
+#include "helper.h"
+
+/* Smoke test for the C interface: build a parser for the grammar
+ * "start = \"a\"\"b\"", recognize and parse a two-token input, then
+ * walk the returned forest and check that it conforms to the byte
+ * format produced by src/bytes.rs.  */
+int
+main(int argc, char **argv)
+{
+  /* Stack-allocated 8-byte big-endian length/capacity buffers.  */
+  unsigned char error_vec_len[8] = { 0 };
+  unsigned char error_vec_cap[8] = { 0 };
+
+  struct SignedVec error_vec = { 0 };
+
+  error_vec.len = error_vec_len;
+  error_vec.capacity = error_vec_cap;
+
+  struct parser *parser = new_parser("start = \"a\"\"b\"\n", &error_vec);
+
+  uint64_t error_length = from_big_endian(error_vec.len);
+
+  if (error_length) {
+    printf("error: ");
+
+    char *error_str = error_vec.data;
+
+    for (int i = 0; i < error_length; i++) {
+      printf("%c", *(error_str+i));
+    }
+
+    printf("\n");
+
+    /* Flag 4: free only the message data; len/capacity live on the
+       stack here.  */
+    clean_signed(&error_vec, 4);
+
+    return 1;
+  }
+
+  /* The input is two 8-byte big-endian tokens: 0 and 1.  */
+  unsigned char *input = malloc (sizeof(unsigned char) * 16);
+
+  for (int i = 0; i < 16; i++) *(input+i) = 0;
+
+  *(input+15) = 1;
+
+  unsigned char input_len[8] = { 0 };
+  input_len[7] = 16;
+
+  /* Capacity is NULL: it is only needed when Rust frees the data.  */
+  struct UnsignedVec input_vec = (struct UnsignedVec) { input_len, NULL, input };
+
+  int result = parser_recognize
+    (parser,
+     &input_vec,
+     &error_vec,
+     (unsigned char) 0);
+
+  error_length = from_big_endian(error_vec.len);
+
+  if (error_length) {
+    clean_parser((void *) parser);
+    free(input);
+
+    printf("error: ");
+
+    char *error_str = error_vec.data;
+
+    for (uint64_t i = 0; i < error_length; i++) {
+      printf("%c", *(error_str + (unsigned long) i));
+    }
+
+    printf("\n");
+
+    clean_signed(&error_vec, 4);
+
+    return 1;
+  }
+
+  if (result)
+    printf("result = true\n");
+  else
+    printf("result = false\n");
+
+  /* Reset the error buffers before the next call.  */
+  for (int i = 0; i < 8; i++) {
+    error_vec.len[i] = 0;
+    error_vec.capacity[i] = 0;
+  }
+
+  /* reset_p = 1: the parser was used by parser_recognize above.  */
+  struct UnsignedVec *forest_ptr = parser_parse
+    (parser,
+     &input_vec,
+     &error_vec,
+     (unsigned char) 1);
+
+  error_length = from_big_endian(error_vec.len);
+
+  if (error_length) {
+    clean_parser((void *) parser);
+    free(input);
+
+    printf("error: ");
+
+    char *error_str = error_vec.data;
+
+    for (uint64_t i = 0; i < error_length; i++) {
+      printf("%c", *(error_str + (unsigned long) i));
+    }
+
+    printf("\n");
+
+    clean_signed(&error_vec, 4);
+
+    return 1;
+  }
+
+  free(input);
+  clean_parser((void *) parser);
+
+  if (forest_ptr) {
+    uint64_t forest_len = from_big_endian(forest_ptr->len);
+
+    printf("a non-empty forest of length %llu\n", forest_len);
+
+    unsigned char *forest = forest_ptr->data;
+
+    /* The first 8 bytes of the payload repeat the total length.  */
+    uint64_t forest_real_len = from_big_endian(forest);
+
+    if (forest_real_len != forest_len) {
+      fprintf(stderr, "wrong length: %llu\n", forest_real_len);
+
+      /* Flag 15: free len, capacity, data, and the vector struct.  */
+      clean_unsigned(forest_ptr, 15);
+
+      return 1;
+    }
+
+    /* 27 = 8 (length) + 3 (mark) + 8 (node count) + 8 (label offset):
+       the minimum size of a well-formed header.  */
+    if (forest_len < 27) {
+      fprintf(stderr, "length too small: %llu\n", forest_len);
+
+      clean_unsigned(forest_ptr, 15);
+
+      return 1;
+    }
+
+    printf("the lengths match\n");
+
+    /* Bytes 8-10 must spell "rep" in ASCII.  */
+    if (*(forest+8) != 114 || *(forest+9) != 101 || *(forest+10) != 112) {
+      fprintf(stderr, "the forest does not begin with the special mark\n");
+
+      fprintf(stderr, "the first bytes are: ");
+      fprintf(stderr, "%c", *(forest+8));
+      fprintf(stderr, "%c", *(forest+9));
+      fprintf(stderr, "%c\n", *(forest+10));
+
+      clean_unsigned(forest_ptr, 15);
+
+      return 1;
+    }
+
+    printf("the special mark is correct.\n");
+
+    uint64_t nodes_len = from_big_endian(forest+11);
+
+    printf("forest has %llu nodes\n", nodes_len);
+
+    uint64_t labels_offset = from_big_endian(forest+19);
+
+    printf("the offset of labels is %llu\n", labels_offset);
+
+    if (forest_len < labels_offset ||
+	forest_len < (27 + 16 * nodes_len)) {
+      fprintf(stderr, "length too small: %llu\n", forest_len);
+
+      clean_unsigned(forest_ptr, 15);
+
+      return 1;
+    }
+
+    uint64_t total_degree = 0;
+
+    /* Walk the (degree, offset) table, printing each node's label
+       and children.  */
+    for (uint64_t i = 0; i < nodes_len; i++) {
+      uint64_t offset = 27 + 16 * i;
+
+      uint64_t degree = from_big_endian(forest+offset);
+      uint64_t node_offset = from_big_endian(forest+offset+8);
+
+      total_degree += degree;
+
+      printf("the %llu-th node has degree %llu and offset %llu\n",
+	     i, degree, node_offset);
+
+      printf("label: ");
+      print_label(read_label(forest + labels_offset + 34 * i));
+
+      for (uint64_t j = 0; j < degree; j++) {
+	uint64_t child = from_big_endian(forest+node_offset+8*j);
+
+	printf("the %llu-th child is %llu\n", j, child);
+      }
+
+      printf("\n");
+    }
+
+    /* 27-byte header + 50 bytes per node (16 degree/offset + 34
+       label) + 8 bytes per edge.  */
+    uint64_t correct = 27 + 50 * nodes_len + 8 * total_degree;
+
+    if (forest_len != correct) {
+      fprintf(stderr, "length does not conform to the format: %llu\n", correct);
+
+      clean_unsigned(forest_ptr, 15);
+
+      return 1;
+    }
+
+    printf("the forest has the correct length according to the format\n");
+
+    clean_unsigned(forest_ptr, 15);
+  } else {
+    printf("no forest\n");
+  }
+
+  return 0;
+}
diff --git a/src/test.el b/src/test.el
new file mode 100644
index 0000000..d106a03
--- /dev/null
+++ b/src/test.el
@@ -0,0 +1,28 @@
+;; Manual test script for the Emacs binding: load the dynamic module,
+;; construct a parser from the repository's test ABNF grammar, and
+;; exercise `rep-recognize' and `rep-parse' on a sample input.
+(load-file "rep.so")
+;; Repository root, used to locate the grammar file below.
+(setq rep-dir (vc-call-backend 'Git 'root default-directory))
+
+(setq test-parser
+      (rep-new-parser
+       (with-temp-buffer
+         (insert-file-contents
+          (expand-file-name
+           "test.abnf"
+           (expand-file-name
+            "abnf grammars"
+            (expand-file-name "grammar" rep-dir))))
+         (buffer-string))))
+
+(defvar input nil "A vector that represents a testing input.")
+
+;; NOTE(review): each element is presumably a terminal index for the
+;; grammar above -- confirm against the grammar's terminal numbering.
+(setq input (vector 3 0 2 2 2 1 1 1 0 1))
+
+(rep-recognize test-parser input)
+(rep-parse test-parser input)
+
+;; (rep-parse-string
+;;  test-parser
+;;  "* title\nprice: 512\nnote: this is a test\n")
+
+;; (rep-parse-string
+;;  test-parser
+;;  "183t3ru")