#![warn(missing_docs)]
//! This file implements the extected behaviours of grammars.

// NOTE: We shall first start with a parser that works at the level of
// characters.  The purpose is to first experiment with the workings
// and the performance of the algorithms, before optimising by using
// regular expressions to classify inputs into tokens.  In other
// words, the current focus is not on the optimisations, whereas
// scanners are for optimisations only, so to speak.

// REVIEW: Separate contents into modules.

use nfa::{
    default::{
        nfa::DefaultNFA,
        regex::{DefaultRegex, ParseError, RegexType},
    },
    LabelType, Nfa, NfaLabel, Regex, SoC, TwoEdges,
};

use graph::{adlist::ALGBuilder, builder::Builder, Graph};

use std::{
    collections::{HashMap, HashSet},
    fmt::Display,
};

/// The type of a terminal.
///
/// For the time being this is a wrapper around a string, but in the
/// future it may hold more information of scanners.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Terminal {
    // If we want to use scanners, per chance add them as a new field
    // here.
    name: String,
}

impl Terminal {
    /// Create a terminal with the given name.
    #[inline]
    pub fn new(name: String) -> Self {
        Self { name }
    }

    /// Return the name of the terminal.
    #[inline]
    pub fn name(&self) -> &str {
        &self.name
    }
}

/// The type of a non-terminal.
///
/// This is just a wrapper around a string.
#[derive(Debug, Clone)]
pub struct Nonterminal(String);

impl Nonterminal {
    /// Return the name of the nonterminal.
    ///
    /// Just to improve readability.
    #[inline]
    pub fn name(&self) -> &str {
        &self.0
    }
}

/// The type of a terminal or a non-terminal.
///
/// Only an index is stored here.  Actual data are stored in two other
/// arrays.
#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)]
pub enum TNT {
    /// Terminal variant
    Ter(usize),
    /// Nonterminal variant
    Non(usize),
}

impl Display for TNT {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TNT::Ter(t) => write!(f, "T({t})"),
            TNT::Non(n) => write!(f, "N({n})"),
        }
    }
}

/// Errors related to grammar operations.
#[derive(Debug, Copy, Clone)]
#[non_exhaustive]
pub enum Error {
    /// The operation requires the grammar to be after a certain
    /// state, but the grammar is not after that state yet.
    WrongState(GrammarState, GrammarState),
    /// The first component is the index, and the second the bound.
    IndexOutOfBounds(usize, usize),
    /// Fail to build the N-th regular expression, due to the
    /// ParseError.
    BuildFail(usize, ParseError),
    /// fail to build NFA
    NFAFail(nfa::error::Error),
}

impl From<nfa::error::Error> for Error {
    fn from(nfae: nfa::error::Error) -> Self {
        Self::NFAFail(nfae)
    }
}

impl Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"),
            Error::BuildFail(n, pe) => write!(
                f,
                "Failed to build the {n}-th regular expression due to error: {pe}"
            ),
            Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"),
            Error::WrongState(current, threshold) => {
                write!(f, "require state {threshold}, but in state {current}")
            }
        }
    }
}

impl std::error::Error for Error {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        if let Error::NFAFail(error) = self {
            Some(error)
        } else {
            None
        }
    }
}

/// A rule is a regular expression of terminals or non-terminals.
#[derive(Debug, Clone)]
pub struct Rule {
    regex: DefaultRegex<TNT>,
}

impl Rule {
    /// Return true if and only if the rule is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.regex.is_empty()
    }

    /// Return the length of the rule.
    #[inline]
    pub fn len(&self) -> usize {
        self.regex.len()
    }
}

/// The state of Grammar.
///
/// This is used to ensure that the grammar preparation methods are
/// called in the correct order.
#[derive(Debug, Copy, Clone, Default)]
pub enum GrammarState {
    /// Just initialized
    #[default]
    Initial,
    /// compute_firsts has been called
    AfterComputeFirst,
    /// left_closure has been called.
    AfterLeftClosure,
    /// left_closure_to_nfa has been called.
    AfterNFA,
}

impl Display for GrammarState {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use GrammarState::*;

        match self {
            Initial => write!(f, "initial"),
            AfterComputeFirst => write!(f, "after computation of first set"),
            AfterLeftClosure => write!(f, "after computation of closure"),
            AfterNFA => write!(f, "after computation of NFA"),
        }
    }
}

/// The type of a grammar.
#[derive(Debug, Clone, Default)]
pub struct Grammar {
    /// A list of terminals.
    ter: Vec<Terminal>,
    /// A list of non-terminals.
    non: Vec<Nonterminal>,
    /// A list of rules.
    ///
    /// The length of the list must match that of the list of
    /// non-terminals.
    rules: Vec<Rule>,
    /// The list of successive sums of lengths of rules.
    accumulators: Vec<usize>,
    // The following two attributes are empty until we call
    // `compute_firsts` on the grammar.
    /// The list of sets of "first terminals".
    ///
    /// The length must match that of the list of non-terminals.
    firsts: Vec<HashSet<Option<usize>>>,
    /// The list of lists of nodes that are reachable after a nullable
    /// transition in the regular expression.
    ///
    /// The length must match that of the list of non-terminals.
    first_nodes: Vec<Vec<usize>>,
    // The following attribute is empty until we call `closure` on the
    // NFA with `transform_label_null_epsilon` as the transformer.
    /// A hash map that maps a tuple `(pos1, pos2)` of positions
    /// `pos1` and `pos2` in the rules to a vector of rule positions.
    /// This vector means that in order to expand from `pos1` to
    /// `pos`, it is necessary to expand according to the positions in
    /// the vector, so we need to add all these expansions into the
    /// parse forest later.
    expansion_map: HashMap<(usize, usize), Vec<usize>>,
    /// The state of the grammar, which tells us what information has
    /// been computed for this grammar.
    state: GrammarState,
}

/// A private type to aid the recursive looping of rergular
/// expressions.
#[derive(Copy, Clone)]
enum StackElement {
    Seen(usize),
    Unseen(usize),
}

impl StackElement {
    fn index(self) -> usize {
        match self {
            Self::Seen(index) => index,
            Self::Unseen(index) => index,
        }
    }

    fn is_seen(self) -> bool {
        matches!(self, Self::Seen(_))
    }
}

impl Grammar {
    /// Construct a grammar from a vector of terminals, a vector of
    /// non-terminals, and a vector of rules for the non-temrinals.
    ///
    /// # Panic
    ///
    /// If the length of `non` is not equal to that of `rules`, then
    /// the function panics.
    pub fn new(ter: Vec<Terminal>, non: Vec<Nonterminal>, rules: Vec<Rule>) -> Self {
        assert_eq!(non.len(), rules.len());

        // One more room is reserved for the `None` value.
        let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1))
            .take(non.len())
            .collect();

        let first_nodes = rules
            .iter()
            .map(|rule| Vec::with_capacity(rule.len()))
            .collect();

        let state = Default::default();

        let expansion_map = Default::default();

        // NOTE: We cannot calculate accumulators here, as we want the
        // accumulators of the regular expression of the left-closure,
        // not of the original one.
        let accumulators = Vec::new();

        Self {
            ter,
            non,
            rules,
            firsts,
            first_nodes,
            state,
            expansion_map,
            accumulators,
        }
    }

    /// Return the name of a terminal or a non-terminal.
    pub fn name_of_tnt(&self, tnt: TNT) -> Result<String, Error> {
        match tnt {
            TNT::Ter(t) => Ok(format!(
                "T{}",
                self.ter
                    .get(t)
                    .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))?
                    .name()
            )),
            TNT::Non(n) => Ok(format!(
                "N{}",
                self.non
                    .get(n)
                    .ok_or(Error::IndexOutOfBounds(n, self.non.len()))?
                    .name()
            )),
        }
    }

    /// Return true if and only if there are no non-terminals in the
    /// grammar.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.non.is_empty()
    }

    /// Return the total length of all rules.
    #[inline]
    pub fn total(&self) -> usize {
        self.accumulators.last().copied().unwrap_or(0)
    }

    /// Query if a position is the starting position of a
    /// non-terminal.  If it is, return the non-terminal, else return
    /// `None` .
    #[inline]
    pub fn is_nt_start_in_nfa_p(&self, pos: usize) -> Option<usize> {
        for (index, accumulator) in self.accumulators.iter().copied().enumerate() {
            let shifted_accumulator = accumulator << 1;

            // NOTE: Clippy suggests to call `cmp`, but it seems
            // compiler might not yet be smart enough to inline that
            // call, so I just silence clippy here.
            #[allow(clippy::comparison_chain)]
            if pos == shifted_accumulator {
                return Some(index);
            } else if pos < shifted_accumulator {
                break;
            }
        }

        None
    }

    /// Return the number of terminals.
    #[inline]
    pub fn ter_num(&self) -> usize {
        self.ter.len()
    }

    /// Return the number of non-terminals.
    #[inline]
    pub fn non_num(&self) -> usize {
        self.non.len()
    }

    /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a
    /// single number to represent terminals and non-terminals.
    ///
    /// # Bounds
    ///
    /// If a terminal index is greater than or equal to the number of
    /// terminals, then this signals an error; mutatis mutandis for
    /// non-terminals.
    ///
    /// # Related
    ///
    /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt].
    #[inline]
    pub fn pack_tnt(&self, tnt: TNT) -> Result<usize, Error> {
        let ter_num = self.ter.len();
        let non_num = self.non.len();

        match tnt {
            TNT::Ter(t) => {
                if t >= ter_num {
                    Err(Error::IndexOutOfBounds(t, ter_num))
                } else {
                    Ok(t)
                }
            }
            TNT::Non(n) => {
                if n >= non_num {
                    Err(Error::IndexOutOfBounds(n, non_num))
                } else {
                    Ok(n + ter_num)
                }
            }
        }
    }

    /// Convert a single number to either a terminal or a
    /// non-terminal.
    ///
    /// # Bounds
    ///
    /// If the number is greater than or equal to the sum of the
    /// numbers of terminals and of non-terminals, then this signals
    /// an error.
    ///
    /// # Related
    ///
    /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt].
    ///
    /// # Errors
    ///
    /// This function is supposed to return only one type of errors,
    /// namely, the IndexOutOfBounds error that results from a bounds
    /// check.  Breaking this is breaking the guarantee of this
    /// function, and is considered a bug.  This behaviour can and
    /// should be tested.  But I have not found a convenient test
    /// method for testing various grammars.
    #[inline]
    pub fn unpack_tnt(&self, flat: usize) -> Result<TNT, Error> {
        let ter_num = self.ter.len();
        let non_num = self.non.len();

        if flat < ter_num {
            Ok(TNT::Ter(flat))
        } else if flat < ter_num + non_num {
            Ok(TNT::Non(flat - ter_num))
        } else {
            Err(Error::IndexOutOfBounds(flat, ter_num + non_num))
        }
    }

    /// Return true if and only if the non-terminal is nullable.
    #[inline]
    pub fn is_nullable(&self, non_terminal: usize) -> Result<bool, Error> {
        Ok(self
            .firsts
            .get(non_terminal)
            .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
            .contains(&None))
    }

    /// Query the expansion information from the position `pos1` to
    /// the position `pos2` .
    #[inline]
    pub fn query_expansion(&self, pos1: usize, pos2: usize) -> Result<Option<&[usize]>, Error> {
        if pos1 >= self.total() {
            return Err(Error::IndexOutOfBounds(pos1, self.total()));
        }

        if pos2 >= self.total() {
            return Err(Error::IndexOutOfBounds(pos2, self.total()));
        }

        match self.state {
            GrammarState::AfterLeftClosure => {}
            _ => {
                return Err(Error::WrongState(
                    self.state,
                    GrammarState::AfterLeftClosure,
                ));
            }
        }

        Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| &v[..]))
    }

    // REVIEW: Do we have a better way to record expansion information
    // than to compute the transitive closure?

    /// A transformer of labels to be fed into
    /// [`closure`][nfa::default::nfa::DefaultNFA::closure], with the
    /// predicate that returns true if and only if the label of the
    /// first edge is either empty or a nullable non-terminal.
    pub fn transform_label_null_epsilon(
        &mut self,
        two_edges: TwoEdges<LabelType<TNT>>,
    ) -> LabelType<TNT> {
        let (first_source, first_target, first_label) = two_edges.first_edge();
        let (second_source, second_target, second_label) = two_edges.second_edge();

        #[cfg(debug_assertions)]
        {
            assert_eq!(first_target, second_source);

            if let Some(tnt) = *first_label.get_value() {
                assert!(matches!(tnt, TNT::Non(n) if matches!(self.is_nullable(n), Ok(true))));
            }
        }

        // Compute if this is from left-linear expansion: it is so if
        // and only if one if either the edges comes from left-linear
        // expansion or we are moving across a non-terminal expansion,
        // that is to say, the source of the second edge is the
        // starting edge of a non-terminal.

        let mut left_p = first_label.get_left_p() || second_label.get_left_p();

        // Record left-linear expansion information.

        if let Some(second_nt) = self.is_nt_start_in_nfa_p(second_source) {
            left_p = true;

            if !self
                .expansion_map
                .contains_key(&(first_source, second_target))
            {
                let original_expansion = self.expansion_map.get(&(second_source, second_target));

                self.expansion_map.insert(
                    (first_source, second_target),
                    if let Some(original_expansion) = original_expansion {
                        let mut result = original_expansion.clone();
                        result.push(second_nt);

                        result
                    } else {
                        vec![second_nt]
                    },
                );
            }
        }

        NfaLabel::new(second_label.get_value(), second_label.get_moved(), left_p)
    }

    /// For a NON_TERMINAL, return an iterator that goes over the
    /// nodes that are reachable from the non-terminal through an
    /// empty transition of the nondeterministic finite automaton.
    #[inline]
    pub fn first_nodes_of(&self, non_terminal: usize) -> Result<std::slice::Iter<usize>, Error> {
        match self.state {
            GrammarState::Initial => {
                return Err(Error::WrongState(
                    self.state,
                    GrammarState::AfterComputeFirst,
                ));
            }
            GrammarState::AfterComputeFirst
            | GrammarState::AfterLeftClosure
            | GrammarState::AfterNFA => {}
        }

        Ok(self
            .first_nodes
            .get(non_terminal)
            .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
            .iter())
    }
}

pub mod first_set;

pub mod left_closure;

impl Display for Grammar {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        assert_eq!(self.non.len(), self.rules.len());

        for (nt, rule) in self.non.iter().zip(self.rules.iter()) {
            write!(f, "{}: ", nt.name())?;

            writeln!(
                f,
                "{}",
                rule.regex.to_string_with(|tnt| format!(
                    "({})",
                    self.name_of_tnt(tnt)
                        .unwrap_or_else(|_| format!("Unknown {tnt:?}"))
                ))?
            )?;
        }

        Ok(())
    }
}

// A helper module that provides some grammars for testing.
#[cfg(feature = "test-helper")]
pub mod test_grammar_helper;

#[cfg(test)]
mod tests;