;;; generalized-suffiex-tree.el --- Building a generalized suffix tree -*- lexical-binding: t; -*- ;;; Author: Durand ;;; Version: 0.0.2 ;;; Commentary: ;; Our node is represented as a list of the following elements: ;; number , from which string this edge comes ;; start , which is the starting index of the edge going from its parent ;; node to this node ;; end , the index of the end of the above edge ;; suffix-link, the index of the node this node's suffix-link points to ;; children , a hash-table of pairs of integers and indices of its ;; children ;; leaf-labels, a list of pairs (number start), where ;; number means the number-th string and ;; start is the starting position of this edge. ;; This element is nil for the internal nodes. ;; To compute the length of the edge going into NODE we use: ;; (- (min end (1+ position)) start) ;; which is actually how far the position is on that edge, ;; if it is on that edge. ;; We use only one terminating symbol, and that is -1. ;;; Code: ;;;###autoload (defun gst-min (&rest args) "Return the minimum among ARGS. If an argument is 'infty, then it is considered greater than every number." (apply #'min (delq nil (mapcar (lambda (arg) (cond ((number-or-marker-p arg) arg) ((eq arg 'infty) nil))) args)))) ;;;###autoload (defun gst-edge-length (node position num str-lens) "Return the length of the edge into NODE. See the comment above this function for the reason POSITION and NUM are here. And STR-LENS is here so that we know the lengths of previous strings." (let* ((node-active-num (car node)) (leafp (eq (car (cdr (cdr node))) 'infty)) (leaf-labels (cond (leafp (car (nthcdr 5 node)))))) (- (cond ((and (/= node-active-num num) leafp) (nth node-active-num str-lens)) ((/= node-active-num num) (caddr node)) (t (gst-min (caddr node) (1+ position)))) (cond ((and (= node-active-num num) leafp) (or (cadr (assoc num leaf-labels #'eq)) (cadr node))) (t (cadr node)))))) ;;;###autoload (defun gst-new-node (tree last-added number start &optional end suffix-start) "Make a new node with START and END as the coming edge. NUMBER is the number of string the label belongs to. Then add the new node to TREE. LAST-ADDED is the number of elements already in the TREE." (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf ) (suffix-link 0) ;; The suffix link is initially 0 (new-node (cond ((eq end 'infty) (list number start end suffix-link (make-hash-table) (list (list number suffix-start)))) (t (list number start end suffix-link (make-hash-table)))))) (puthash (1+ last-added) new-node tree) (1+ last-added))) ;;;###autoload (defun gst-add-leaf-label (tree leaf number start) "Add a label to LEAF in TREE. NUMBER and START represent the label." (let* ((actual-node (gethash leaf tree)) (leaf-labels (cdr (cdr (cdr (cdr (cdr actual-node))))))) (cond ((and (consp leaf-labels) (not (eq (caaar leaf-labels) number))) (setcar leaf-labels (cons (list number start) (car leaf-labels))))))) ;;;###autoload (defun gst-add-suffix-link (tree need-sl node) "If NEED-SL is positive, then add the suffix link. In particular, the node corresponding to NEED-SL in TREE gets a suffix link pointing to NODE. This always returns NODE." (cond ((and (> need-sl 1) (/= need-sl node)) (setcar (cdr (cdr (cdr (gethash need-sl tree)))) node))) node) ;;;###autoload (defun gst-canonize (tree node position active-number active-edge-index active-length active-node str-lens) "Walk down TREE to find the correct active point. To be precise, we start from NODE, use POSITION, ACTIVE-NUMBER, and STR-LENS to calculate the length of the edge, and compare with ACTIVE-LENGTH. If it is less than that, then that means we shall continue walking down the tree, so we return t in the first element, while setting other variables to the correct values so that the caller of this function can use the updated values. ACTIVE-EDGE-INDEX is for the purpose of updating solely." (let* ((actual-node (gethash node tree)) (node-edge-length (gst-edge-length actual-node position active-number str-lens))) (cond ((>= active-length node-edge-length) (list t active-number (+ active-edge-index node-edge-length) (- active-length node-edge-length) node)) (t (list nil active-number active-edge-index active-length active-node))))) ;;;###autoload (defsubst gst-aref (strs num index) "Return the INDEX th element in NUM th element of STRS." (aref (nth num strs) index)) ;;;###autoload (defun gst-extend-tree (tree last-added position remain active-node active-number active-edge-index active-length character strs num str-lens) "Extend TREE by CHARACTER in NUM-th string of STRS. The return value is \(tree last-added remain active-node active-number active-edge-index active-length) Other parameters: LAST-ADDED, POSITION, REMAIN, ACTIVE-NODE, ACTIVE-NUMBER, ACTIVE-EDGE-INDEX, and ACTIVE-LENGTH have special meanings in the algorithm. STR-LENS can be supplied so that we don't have to calculate the lengths repeatedly." (let* ((need-sl 0) (remain (1+ remain)) continue-p breakp terminating-no-split) (while (and (not breakp) (> remain 0)) (setq continue-p nil breakp nil) (setq terminating-no-split nil) (cond ((= active-length 0) (setq active-edge-index position) (setq active-number num))) (let* ((actual-node (gethash active-node tree)) (nxt (cond (actual-node (gethash (gst-aref strs active-number active-edge-index) (car (nthcdr 4 actual-node))))))) (cond ((null nxt) (let ((leaf (gst-new-node tree last-added num position nil (- position remain -1)))) (setq last-added leaf) (puthash (gst-aref strs active-number active-edge-index) leaf (car (nthcdr 4 (gethash active-node tree)))) (setq need-sl (gst-add-suffix-link tree need-sl active-node))) ;; rule 2 ) (t (let* ((result (gst-canonize tree nxt position active-number active-edge-index active-length active-node str-lens))) (cond ((car result) ;; observation 2 (setq active-number (car (cdr result))) (setq active-edge-index (car (cdr (cdr result)))) (setq active-length (car (cdr (cdr (cdr result))))) (setq active-node (car (cdr (cdr (cdr (cdr result)))))) (setq continue-p t)) (t (cond ((eq (gst-aref strs (car (gethash nxt tree)) (+ active-length (cadr (gethash nxt tree)))) character) ;; observation 1 (setq active-length (1+ active-length)) (setq need-sl (gst-add-suffix-link tree need-sl active-node)) ;; (setq breakp t) ;; terminating symbol special handling (cond ((eq character -1) ; terminating symbol (gst-add-leaf-label tree nxt num (- position remain -1)) (setq active-length (1- active-length)) ;; We don't want to split since this is a match. But ;; we don't want to break since this is a ;; terminating symbol. So we invent a new variable ;; to do this. (setq terminating-no-split t)) (t (setq breakp t))))) (cond ((or breakp terminating-no-split)) (t ; splitting (let ((split (gst-new-node tree last-added (car (gethash nxt tree)) (cadr (gethash nxt tree)) (+ (cadr (gethash nxt tree)) active-length)))) (setq last-added split) (puthash (gst-aref strs active-number active-edge-index) split (cadr (cdr (cdr (cdr (gethash active-node tree)))))) (let ((leaf (gst-new-node tree last-added num position nil (- position remain -1)))) (setq last-added leaf) (puthash character leaf (cadddr (cdr (gethash split tree)))) (setcar (cdr (gethash nxt tree)) (min (+ (cadr (gethash nxt tree)) active-length) (1- (nth (car (gethash nxt tree)) str-lens)))) (puthash (gst-aref strs (car (gethash nxt tree)) (cadr (gethash nxt tree))) nxt (cadddr (cdr (gethash split tree)))) ;; rule 2 (setq need-sl (gst-add-suffix-link tree need-sl split))))))))))) (cond ((or continue-p breakp)) (t (setq remain (1- remain)) (cond ((and (eq active-node 1) ; root (> active-length 0)) (setq active-length (1- active-length)) (setq active-edge-index (1+ (- position remain)))) (t (setq active-node (let ((slink (cadddr (gethash active-node tree)))) (cond ; follow the suffix link or go to root ((> slink 1) slink) (t 1)))))))))) (list tree last-added remain active-node active-number active-edge-index active-length))) ;;;###autoload (defun gst-build-for-strs (strs &optional str-lens) "Build the generalized suffix tree for STRS. One can optionally provide STR-LENS to avoid calculating the lengths again." (let* ((len (length strs)) (index 0) (tree (make-hash-table)) (last-added 0) (active-node (gst-new-node tree last-added index -1 -1)) (str-lens (or str-lens (mapcar #'length strs)))) (setq last-added active-node) (while (< index len) (let* ((position 0) (character (ignore-errors (gst-aref strs index position))) (remain 0) (active-node 1) ; start from the root (active-number index) (active-edge-index 0) (active-length 0) result) ; temporary holder (while character ;; (setq old-character character) (setq result (gst-extend-tree tree last-added position remain active-node active-number active-edge-index active-length character strs index str-lens)) (setq tree (pop result)) (setq last-added (pop result)) (setq remain (pop result)) (setq active-node (pop result)) (setq active-number (pop result)) (setq active-edge-index (pop result)) (setq active-length (pop result)) (setq position (1+ position)) (setq character (ignore-errors (gst-aref strs index position))) ;; (cond ;; ((characterp old-character) ;; (insert (format "After adding character %c:\n" old-character)) ;; (gst-print-tree tree ;; (append (seq-take strs index) ;; (list (concat ;; (seq-take (nth index strs) position) ;; "$")))) ;; (insert "\n\n")) ;; ((= old-character -1) ;; (insert (format "After adding character -1:\n")) ;; (gst-print-tree tree ;; (seq-take strs (1+ index))) ;; (insert "\n\n"))) )) (setq index (1+ index)) ;; (insert "\n \n") ) tree)) ;;; Printing ;; This section is not necessary. It is here just to print the trees ;; to make sure we don't make some strange errors. (require 'hierarchy) (require 'seq) ;;;###autoload (defun gst-print-strs (strs) "Print strings STRS nicely." (mapc (lambda (str) (mapc (lambda (char) (cond ((characterp char) (insert char)) (t (insert ", ")))) str)) strs)) ;;;###autoload (defun gst-pretty-hash-table (table) "Only return the useful parts from TABLE. To be precise, this returns the keys and values of the hash-table, in the form of an alist." (let (keys-values-alist) (maphash (lambda (key value) (setq keys-values-alist (cons (cons key value) keys-values-alist))) table) keys-values-alist)) ;;;###autoload (defun gst-pretty-node (node) "Outputs a prettified NODE." (format "(%d %d %s %d %s %S)" (car node) (cadr node) (caddr node) (cadddr node) (gst-pretty-hash-table (car (nthcdr 4 node))) (car (nthcdr 5 node)))) ;;;###autoload (defun gst-print-tree-for-strs (strs) "Print the generalized suffix tree for STRS." (let* ((strs (mapcar (lambda (str) (vconcat str (list -1))) strs)) (symbol (make-symbol "gtree"))) (set symbol (gst-build-for-strs strs)) (insert "Generalized suffix tree for: ") (gst-print-strs strs) (delete-region (- (point) 2) (point)) (insert ":\n") (gst-print-tree (symbol-value symbol) strs))) ;;;###autoload (defvar gst-full-error-p nil "Whether to print full nodes in the output.") ;;;###autoload (defun gst-print-tree (tree strs) "Print TREE with the aid of STRS." (let* ((symbol-tree (make-symbol "new-hierarchy")) (strs (mapcar (lambda (str) (mapcar (lambda (c) (cond ((eq c -1) ?$) (c))) str)) strs))) (set symbol-tree (hierarchy-new)) (maphash (lambda (key value) (hierarchy-add-tree (symbol-value symbol-tree) key nil (lambda (item) (hash-table-values (car (nthcdr 4 (gethash item tree))))))) tree) (hierarchy-print (symbol-value symbol-tree) (lambda (item) (cond ((= item 1) "root") (t (let ((actual-str (let* ((node (gethash item tree)) (leafp (eq (caddr node) 'infty)) (leaf-labels (cond (leafp (car (nthcdr 5 node)))))) (concat (seq-subseq (nth (car node) strs) (min (cadr node) (1- (length (nth (car node) strs)))) (cond (leafp -1) (t (caddr node)))) (cond (leafp (apply #'concat (mapcar (lambda (label) (let ((first (car label)) (second (cadr label))) (format "$ (%d : %d)" first second))) leaf-labels)))))))) (cond (gst-full-error-p (concat (format "key: %d, %s, %s" item actual-str (gst-pretty-node (gethash item tree))))) (t actual-str))))))))) (provide 'generalized-suffix-tree) ;;; generalized-suffix-tree.el ends here