diff options
Diffstat (limited to 'suffix tree/generalized-suffix-tree.el')
-rw-r--r-- | suffix tree/generalized-suffix-tree.el | 398 |
1 files changed, 398 insertions, 0 deletions
diff --git a/suffix tree/generalized-suffix-tree.el b/suffix tree/generalized-suffix-tree.el new file mode 100644 index 0000000..227aea9 --- /dev/null +++ b/suffix tree/generalized-suffix-tree.el @@ -0,0 +1,398 @@ +;;; generalized-suffiex-tree.el --- Building a generalized suffix tree -*- lexical-binding: t; -*- + +;;; Author: Durand +;;; Version: 0.0.2 + +;;; Commentary: + +;; Our node is represented as a list of the following elements: + +;; number , from which string this edge comes +;; start , which is the starting index of the edge going from its parent +;; node to this node +;; end , the index of the end of the above edge +;; suffix-link, the index of the node this node's suffix-link points to +;; children , a hash-table of pairs of integers and indices of its +;; children +;; leaf-labels, a list of pairs (number start), where +;; number means the number-th string and +;; start is the starting position of this edge. +;; This element is nil for the internal nodes. + +;; To compute the length of the edge going into NODE we use: +;; (- (min end (1+ position)) start) +;; which is actually how far the position is on that edge, +;; if it is on that edge. + +;; We use only one terminating symbol, and that is -1. + +;;; Code: + +;;;###autoload +(defun gst-min (&rest args) + "Return the minimum among ARGS. +If an argument is 'infty, then it is considered greater than +every number." + (apply #'min (delq nil + (mapcar + (lambda (arg) + (cond + ((number-or-marker-p arg) arg) + ((eq arg 'infty) nil))) + args)))) + +;;;###autoload +(defun gst-edge-length (node position num str-lens) + "Return the length of the edge into NODE. +See the comment above this function for the reason +POSITION and NUM are here." + (let* ((node-active-num (car node)) + (leafp (eq (car (cdr (cdr node))) 'infty)) + (leaf-labels (cond (leafp (car (nthcdr 5 node)))))) + (- (cond + ((and (/= node-active-num num) + leafp) + (nth node-active-num str-lens)) + ((/= node-active-num num) + (caddr node)) + (t (gst-min (caddr node) (1+ position)))) + (cond + ((and (= node-active-num num) + leafp) + (or (cadr (assoc num leaf-labels #'eq)) + (cadr node))) + (t (cadr node)))))) + +;;;###autoload +(defun gst-new-node (tree last-added number start &optional end suffix-start) + "Make a new node with START and END as the coming edge. +NUMBER is the number of string the label belongs to. +Then add the new node to TREE. +LAST-ADDED is the number of elements already in the TREE." + (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf + ) + (suffix-link 0) ;; The suffix link is initially 0 + (new-node + (cond ((eq end 'infty) + (list number start end suffix-link (make-hash-table) + (list (list number suffix-start)))) + (t + (list number start end suffix-link (make-hash-table)))))) + (puthash (1+ last-added) new-node tree) + (1+ last-added))) + +;;;###autoload +(defun gst-add-leaf-label (tree leaf number start) + "Add a label to LEAF." + (let* ((actual-node (gethash leaf tree)) + (leaf-labels (cdr (cdr (cdr (cdr (cdr actual-node))))))) + (cond ((consp leaf-labels) + (setcar leaf-labels + (cons (list number start) + (car leaf-labels))))))) + +;;;###autoload +(defun gst-add-suffix-link (tree need-sl node) + "If NEED-SL is positive, then add the suffix link. +In particular, the node corresponding to NEED-SL in TREE +gets a suffix link pointing to NODE. + +This always returns NODE." + (cond + ((and (> need-sl 1) (/= need-sl node)) + (setcar (cdr (cdr (cdr (gethash need-sl tree)))) node))) + node) + +;;;###autoload +(defun gst-canonize (tree node position active-number active-edge-index active-length active-node + str-lens) + "Walk down TREE to find the correct active point." + (let* ((actual-node (gethash node tree)) + (node-edge-length (gst-edge-length actual-node position active-number + str-lens))) + (cond + ((>= active-length node-edge-length) + (list t + active-number + (+ active-edge-index + node-edge-length) + (- active-length + node-edge-length) + node)) + (t + (list nil + active-number + active-edge-index + active-length + active-node))))) + +;;;###autoload +(defsubst gst-aref (strs num index) + "Return the INDEX-th element in NUM-th element of STRS." + (aref (nth num strs) index)) + +;;;###autoload +(defun gst-extend-tree (tree last-added position remain + active-node active-number active-edge-index + active-length character strs num str-lens) + "Extend a tree by CHARACTER in NUM-th string of STRS. +The return value is +(tree + last-added remain active-node + active-number active-edge-index active-length)" + (let* ((need-sl 0) + (remain (1+ remain)) + continue-p breakp) + (while (and (not breakp) (> remain 0)) + (setq continue-p nil breakp nil) + (cond + ((= active-length 0) + (setq active-edge-index position) + (setq active-number num))) + (let* ((actual-node (gethash active-node tree)) + (nxt (cond + (actual-node + (gethash (gst-aref strs active-number active-edge-index) + (cadr (cdr (cdr (cdr actual-node))))))))) + (cond + ((null nxt) + (let ((leaf (gst-new-node tree last-added num position nil + (- position remain -1)))) + (setq last-added leaf) + (puthash (gst-aref strs active-number active-edge-index) + leaf + (cadr (cdr (cdr (cdr (gethash active-node tree)))))) + (setq need-sl (gst-add-suffix-link tree need-sl active-node))) + ;; rule 2 + ) + (t + (let* ((result (gst-canonize + tree nxt position active-number + active-edge-index active-length active-node + str-lens))) + (cond + ((car result) + ;; observation 2 + (setq active-number (car (cdr result))) + (setq active-edge-index (car (cdr (cdr result)))) + (setq active-length (car (cdr (cdr (cdr result))))) + (setq active-node (car (cdr (cdr (cdr (cdr result)))))) + (setq continue-p t)) + (t + (cond + ((eq (gst-aref strs + (car (gethash nxt tree)) + (+ active-length + (cadr (gethash nxt tree)))) + character) + ;; observation 1 + (setq active-length (1+ active-length)) + (setq need-sl (gst-add-suffix-link tree need-sl active-node)) + (setq breakp t) + ;; add a label + (cond + ((eq character -1) ; terminating symbol + (gst-add-leaf-label tree nxt num (- position remain -1)))))) + (cond + (breakp) + (t ;; splitting + (let ((split (gst-new-node + tree last-added (car (gethash nxt tree)) (cadr (gethash nxt tree)) + (+ (cadr (gethash nxt tree)) active-length)))) + (setq last-added split) + (puthash + (gst-aref strs active-number active-edge-index) + split (cadr (cdr (cdr (cdr (gethash active-node tree)))))) + (let ((leaf (gst-new-node tree last-added num position + nil (- position remain -1)))) + (setq last-added leaf) + (puthash character leaf + (cadddr (cdr (gethash split tree)))) + (setcar (cdr (gethash nxt tree)) + (+ (cadr (gethash nxt tree)) + active-length)) + (puthash (gst-aref strs + (car (gethash nxt tree)) + (cadr (gethash nxt tree))) + nxt + (cadddr (cdr (gethash split tree)))) + ;; rule 2 + (setq need-sl + (gst-add-suffix-link tree need-sl split))))))))))) + (cond + ((or continue-p breakp)) + (t + (setq remain (1- remain)) + (cond + ((and (eq active-node 1) ; root + (> active-length 0)) + (setq active-length (1- active-length)) + (setq active-edge-index + (1+ (- position remain)))) + (t + (setq active-node + (let ((slink (cadddr (gethash active-node tree)))) + (cond ; follow the suffix link or go to root + ((> slink 1) slink) + (t 1)))))))))) + (list tree last-added remain active-node + active-number active-edge-index active-length))) + +;;;###autoload +(defun gst-build-for-strs (strs &optional str-lens) + "Build the generalized suffix tree for STRS. +One can optionally provide STR-LENS to avoid calculating the +lengths again." + (let* ((len (length strs)) + (index 0) + (tree (make-hash-table)) + (last-added 0) + (active-node (gst-new-node tree last-added index -1 -1)) + (str-lens (or str-lens (mapcar #'length strs)))) + (setq last-added active-node) + (while (< index len) + (let* ((position 0) + (character (ignore-errors (gst-aref strs index position))) + (remain 0) + (active-node 1) ; start from the root + (active-number index) + (active-edge-index 0) + (active-length 0) + old-character result) ; temporary holder + (while character + (setq old-character character) + (setq result (gst-extend-tree tree last-added position remain + active-node active-number active-edge-index + active-length character strs index + str-lens)) + (setq tree (pop result)) + (setq last-added (pop result)) + (setq remain (pop result)) + (setq active-node (pop result)) + (setq active-number (pop result)) + (setq active-edge-index (pop result)) + (setq active-length (pop result)) + (setq position (1+ position)) + (setq character (ignore-errors (gst-aref strs index position))) + (cond + ((characterp old-character) + (insert (format "After adding character %c:\n" old-character)) + (gst-print-tree tree + (append (seq-take strs index) + (list (concat + (seq-take (nth index strs) position) + "$")))) + (insert "\n\n")) + ((= old-character -1) + (insert (format "After adding character -1:\n")) + (gst-print-tree tree + (seq-take strs (1+ index))) + (insert "\n\n"))))) + (setq index (1+ index)) + (insert "\n\n")) + tree)) + +;;; Printing + +;; This section is not necessary. It is here just to print the trees +;; to make sure we don't make some strange errors. + +(require 'hierarchy) +(require 'seq) + +;;;###autoload +(defun gst-print-strs (strs) + "Print strings STRS nicely." + (mapc (lambda (str) + (mapc (lambda (char) + (cond ((characterp char) (insert char)) + (t (insert ", ")))) + str)) + strs)) + +;;;###autoload +(defun gst-pretty-hash-table (table) + "Only returns the useful parts from a table." + (let (keys-values-alist) + (maphash + (lambda (key value) + (setq keys-values-alist + (cons + (cons key value) + keys-values-alist))) + table) + keys-values-alist)) + +;;;###autoload +(defun gst-pretty-node (node) + "Outputs a prettified NODE." + (format "(%d %d %s %d %s %S)" + (car node) (cadr node) + (caddr node) (cadddr node) + (gst-pretty-hash-table (car (nthcdr 4 node))) + (car (nthcdr 5 node)))) + +;;;###autoload +(defun gst-print-tree-for-strs (strs) + "Print the generalized suffix tree for STRS." + (let* ((strs (mapcar (lambda (str) + (vconcat str (list -1))) + strs)) + (symbol (make-symbol "gtree"))) + (set symbol (gst-build-for-strs strs)) + (insert "Generalized suffix tree for: ") + (gst-print-strs strs) + (delete-region (- (point) 2) (point)) + (insert ":\n") + (gst-print-tree (symbol-value symbol) strs))) + +;;;###autoload +(defvar gst-full-error-p nil + "Whether to print full nodes in the output.") + +;;;###autoload +(defun gst-print-tree (tree strs) + "Print TREE with the aid of STRS." + (let* ((symbol-tree (make-symbol "new-hierarchy"))) + (set symbol-tree (hierarchy-new)) + (maphash + (lambda (key value) + (hierarchy-add-tree + (symbol-value symbol-tree) key nil + (lambda (item) + (hash-table-values (car (nthcdr 4 (gethash item tree))))))) + tree) + (hierarchy-print + (symbol-value symbol-tree) + (lambda (item) + (cond ((= item 1) "root") + (t (let ((actual-str + (let* ((node (gethash item tree)) + (leafp (eq (caddr node) 'infty)) + (leaf-labels + (cond (leafp (car (nthcdr 5 node)))))) + (concat + (seq-subseq + (nth (car node) strs) + (min (cadr node) (1- (length (nth (car node) strs)))) + (cond (leafp -1) + (t (caddr node)))) + (cond + (leafp + (apply + #'concat + (mapcar + (lambda (label) + (let ((first (car label)) + (second (cadr label))) + (format "$ (%d : %d)" + first second))) + leaf-labels)))))))) + (cond (gst-full-error-p + (concat + (format "key: %d, %s, %s" item actual-str + (gst-pretty-node (gethash item tree))))) + (t actual-str))))))))) + +(provide 'generalized-suffix-tree) +;;; generalized-suffiex-tree.el ends here |