summaryrefslogtreecommitdiff
path: root/suffix tree/generalized-suffix-tree.el
diff options
context:
space:
mode:
Diffstat (limited to 'suffix tree/generalized-suffix-tree.el')
-rw-r--r--suffix tree/generalized-suffix-tree.el398
1 files changed, 398 insertions, 0 deletions
diff --git a/suffix tree/generalized-suffix-tree.el b/suffix tree/generalized-suffix-tree.el
new file mode 100644
index 0000000..227aea9
--- /dev/null
+++ b/suffix tree/generalized-suffix-tree.el
@@ -0,0 +1,398 @@
+;;; generalized-suffiex-tree.el --- Building a generalized suffix tree -*- lexical-binding: t; -*-
+
+;;; Author: Durand
+;;; Version: 0.0.2
+
+;;; Commentary:
+
+;; Our node is represented as a list of the following elements:
+
+;; number , from which string this edge comes
+;; start , which is the starting index of the edge going from its parent
+;; node to this node
+;; end , the index of the end of the above edge
+;; suffix-link, the index of the node this node's suffix-link points to
+;; children , a hash-table of pairs of integers and indices of its
+;; children
+;; leaf-labels, a list of pairs (number start), where
+;; number means the number-th string and
+;; start is the starting position of this edge.
+;; This element is nil for the internal nodes.
+
+;; To compute the length of the edge going into NODE we use:
+;; (- (min end (1+ position)) start)
+;; which is actually how far the position is on that edge,
+;; if it is on that edge.
+
+;; We use only one terminating symbol, and that is -1.
+
+;;; Code:
+
+;;;###autoload
+(defun gst-min (&rest args)
+ "Return the minimum among ARGS.
+If an argument is 'infty, then it is considered greater than
+every number."
+ (apply #'min (delq nil
+ (mapcar
+ (lambda (arg)
+ (cond
+ ((number-or-marker-p arg) arg)
+ ((eq arg 'infty) nil)))
+ args))))
+
+;;;###autoload
+(defun gst-edge-length (node position num str-lens)
+ "Return the length of the edge into NODE.
+See the comment above this function for the reason
+POSITION and NUM are here."
+ (let* ((node-active-num (car node))
+ (leafp (eq (car (cdr (cdr node))) 'infty))
+ (leaf-labels (cond (leafp (car (nthcdr 5 node))))))
+ (- (cond
+ ((and (/= node-active-num num)
+ leafp)
+ (nth node-active-num str-lens))
+ ((/= node-active-num num)
+ (caddr node))
+ (t (gst-min (caddr node) (1+ position))))
+ (cond
+ ((and (= node-active-num num)
+ leafp)
+ (or (cadr (assoc num leaf-labels #'eq))
+ (cadr node)))
+ (t (cadr node))))))
+
+;;;###autoload
+(defun gst-new-node (tree last-added number start &optional end suffix-start)
+ "Make a new node with START and END as the coming edge.
+NUMBER is the number of string the label belongs to.
+Then add the new node to TREE.
+LAST-ADDED is the number of elements already in the TREE."
+ (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf
+ )
+ (suffix-link 0) ;; The suffix link is initially 0
+ (new-node
+ (cond ((eq end 'infty)
+ (list number start end suffix-link (make-hash-table)
+ (list (list number suffix-start))))
+ (t
+ (list number start end suffix-link (make-hash-table))))))
+ (puthash (1+ last-added) new-node tree)
+ (1+ last-added)))
+
+;;;###autoload
+(defun gst-add-leaf-label (tree leaf number start)
+ "Add a label to LEAF."
+ (let* ((actual-node (gethash leaf tree))
+ (leaf-labels (cdr (cdr (cdr (cdr (cdr actual-node)))))))
+ (cond ((consp leaf-labels)
+ (setcar leaf-labels
+ (cons (list number start)
+ (car leaf-labels)))))))
+
+;;;###autoload
+(defun gst-add-suffix-link (tree need-sl node)
+ "If NEED-SL is positive, then add the suffix link.
+In particular, the node corresponding to NEED-SL in TREE
+gets a suffix link pointing to NODE.
+
+This always returns NODE."
+ (cond
+ ((and (> need-sl 1) (/= need-sl node))
+ (setcar (cdr (cdr (cdr (gethash need-sl tree)))) node)))
+ node)
+
+;;;###autoload
+(defun gst-canonize (tree node position active-number active-edge-index active-length active-node
+ str-lens)
+ "Walk down TREE to find the correct active point."
+ (let* ((actual-node (gethash node tree))
+ (node-edge-length (gst-edge-length actual-node position active-number
+ str-lens)))
+ (cond
+ ((>= active-length node-edge-length)
+ (list t
+ active-number
+ (+ active-edge-index
+ node-edge-length)
+ (- active-length
+ node-edge-length)
+ node))
+ (t
+ (list nil
+ active-number
+ active-edge-index
+ active-length
+ active-node)))))
+
+;;;###autoload
+(defsubst gst-aref (strs num index)
+ "Return the INDEX-th element in NUM-th element of STRS."
+ (aref (nth num strs) index))
+
+;;;###autoload
+(defun gst-extend-tree (tree last-added position remain
+ active-node active-number active-edge-index
+ active-length character strs num str-lens)
+ "Extend a tree by CHARACTER in NUM-th string of STRS.
+The return value is
+(tree
+ last-added remain active-node
+ active-number active-edge-index active-length)"
+ (let* ((need-sl 0)
+ (remain (1+ remain))
+ continue-p breakp)
+ (while (and (not breakp) (> remain 0))
+ (setq continue-p nil breakp nil)
+ (cond
+ ((= active-length 0)
+ (setq active-edge-index position)
+ (setq active-number num)))
+ (let* ((actual-node (gethash active-node tree))
+ (nxt (cond
+ (actual-node
+ (gethash (gst-aref strs active-number active-edge-index)
+ (cadr (cdr (cdr (cdr actual-node)))))))))
+ (cond
+ ((null nxt)
+ (let ((leaf (gst-new-node tree last-added num position nil
+ (- position remain -1))))
+ (setq last-added leaf)
+ (puthash (gst-aref strs active-number active-edge-index)
+ leaf
+ (cadr (cdr (cdr (cdr (gethash active-node tree))))))
+ (setq need-sl (gst-add-suffix-link tree need-sl active-node)))
+ ;; rule 2
+ )
+ (t
+ (let* ((result (gst-canonize
+ tree nxt position active-number
+ active-edge-index active-length active-node
+ str-lens)))
+ (cond
+ ((car result)
+ ;; observation 2
+ (setq active-number (car (cdr result)))
+ (setq active-edge-index (car (cdr (cdr result))))
+ (setq active-length (car (cdr (cdr (cdr result)))))
+ (setq active-node (car (cdr (cdr (cdr (cdr result))))))
+ (setq continue-p t))
+ (t
+ (cond
+ ((eq (gst-aref strs
+ (car (gethash nxt tree))
+ (+ active-length
+ (cadr (gethash nxt tree))))
+ character)
+ ;; observation 1
+ (setq active-length (1+ active-length))
+ (setq need-sl (gst-add-suffix-link tree need-sl active-node))
+ (setq breakp t)
+ ;; add a label
+ (cond
+ ((eq character -1) ; terminating symbol
+ (gst-add-leaf-label tree nxt num (- position remain -1))))))
+ (cond
+ (breakp)
+ (t ;; splitting
+ (let ((split (gst-new-node
+ tree last-added (car (gethash nxt tree)) (cadr (gethash nxt tree))
+ (+ (cadr (gethash nxt tree)) active-length))))
+ (setq last-added split)
+ (puthash
+ (gst-aref strs active-number active-edge-index)
+ split (cadr (cdr (cdr (cdr (gethash active-node tree))))))
+ (let ((leaf (gst-new-node tree last-added num position
+ nil (- position remain -1))))
+ (setq last-added leaf)
+ (puthash character leaf
+ (cadddr (cdr (gethash split tree))))
+ (setcar (cdr (gethash nxt tree))
+ (+ (cadr (gethash nxt tree))
+ active-length))
+ (puthash (gst-aref strs
+ (car (gethash nxt tree))
+ (cadr (gethash nxt tree)))
+ nxt
+ (cadddr (cdr (gethash split tree))))
+ ;; rule 2
+ (setq need-sl
+ (gst-add-suffix-link tree need-sl split)))))))))))
+ (cond
+ ((or continue-p breakp))
+ (t
+ (setq remain (1- remain))
+ (cond
+ ((and (eq active-node 1) ; root
+ (> active-length 0))
+ (setq active-length (1- active-length))
+ (setq active-edge-index
+ (1+ (- position remain))))
+ (t
+ (setq active-node
+ (let ((slink (cadddr (gethash active-node tree))))
+ (cond ; follow the suffix link or go to root
+ ((> slink 1) slink)
+ (t 1))))))))))
+ (list tree last-added remain active-node
+ active-number active-edge-index active-length)))
+
+;;;###autoload
+(defun gst-build-for-strs (strs &optional str-lens)
+ "Build the generalized suffix tree for STRS.
+One can optionally provide STR-LENS to avoid calculating the
+lengths again."
+ (let* ((len (length strs))
+ (index 0)
+ (tree (make-hash-table))
+ (last-added 0)
+ (active-node (gst-new-node tree last-added index -1 -1))
+ (str-lens (or str-lens (mapcar #'length strs))))
+ (setq last-added active-node)
+ (while (< index len)
+ (let* ((position 0)
+ (character (ignore-errors (gst-aref strs index position)))
+ (remain 0)
+ (active-node 1) ; start from the root
+ (active-number index)
+ (active-edge-index 0)
+ (active-length 0)
+ old-character result) ; temporary holder
+ (while character
+ (setq old-character character)
+ (setq result (gst-extend-tree tree last-added position remain
+ active-node active-number active-edge-index
+ active-length character strs index
+ str-lens))
+ (setq tree (pop result))
+ (setq last-added (pop result))
+ (setq remain (pop result))
+ (setq active-node (pop result))
+ (setq active-number (pop result))
+ (setq active-edge-index (pop result))
+ (setq active-length (pop result))
+ (setq position (1+ position))
+ (setq character (ignore-errors (gst-aref strs index position)))
+ (cond
+ ((characterp old-character)
+ (insert (format "After adding character %c:\n" old-character))
+ (gst-print-tree tree
+ (append (seq-take strs index)
+ (list (concat
+ (seq-take (nth index strs) position)
+ "$"))))
+ (insert "\n\n"))
+ ((= old-character -1)
+ (insert (format "After adding character -1:\n"))
+ (gst-print-tree tree
+ (seq-take strs (1+ index)))
+ (insert "\n\n")))))
+ (setq index (1+ index))
+ (insert "\n \n"))
+ tree))
+
+;;; Printing
+
+;; This section is not necessary. It is here just to print the trees
+;; to make sure we don't make some strange errors.
+
+(require 'hierarchy)
+(require 'seq)
+
+;;;###autoload
+(defun gst-print-strs (strs)
+ "Print strings STRS nicely."
+ (mapc (lambda (str)
+ (mapc (lambda (char)
+ (cond ((characterp char) (insert char))
+ (t (insert ", "))))
+ str))
+ strs))
+
+;;;###autoload
+(defun gst-pretty-hash-table (table)
+ "Only returns the useful parts from a table."
+ (let (keys-values-alist)
+ (maphash
+ (lambda (key value)
+ (setq keys-values-alist
+ (cons
+ (cons key value)
+ keys-values-alist)))
+ table)
+ keys-values-alist))
+
+;;;###autoload
+(defun gst-pretty-node (node)
+ "Outputs a prettified NODE."
+ (format "(%d %d %s %d %s %S)"
+ (car node) (cadr node)
+ (caddr node) (cadddr node)
+ (gst-pretty-hash-table (car (nthcdr 4 node)))
+ (car (nthcdr 5 node))))
+
+;;;###autoload
+(defun gst-print-tree-for-strs (strs)
+ "Print the generalized suffix tree for STRS."
+ (let* ((strs (mapcar (lambda (str)
+ (vconcat str (list -1)))
+ strs))
+ (symbol (make-symbol "gtree")))
+ (set symbol (gst-build-for-strs strs))
+ (insert "Generalized suffix tree for: ")
+ (gst-print-strs strs)
+ (delete-region (- (point) 2) (point))
+ (insert ":\n")
+ (gst-print-tree (symbol-value symbol) strs)))
+
+;;;###autoload
+(defvar gst-full-error-p nil
+ "Whether to print full nodes in the output.")
+
+;;;###autoload
+(defun gst-print-tree (tree strs)
+ "Print TREE with the aid of STRS."
+ (let* ((symbol-tree (make-symbol "new-hierarchy")))
+ (set symbol-tree (hierarchy-new))
+ (maphash
+ (lambda (key value)
+ (hierarchy-add-tree
+ (symbol-value symbol-tree) key nil
+ (lambda (item)
+ (hash-table-values (car (nthcdr 4 (gethash item tree)))))))
+ tree)
+ (hierarchy-print
+ (symbol-value symbol-tree)
+ (lambda (item)
+ (cond ((= item 1) "root")
+ (t (let ((actual-str
+ (let* ((node (gethash item tree))
+ (leafp (eq (caddr node) 'infty))
+ (leaf-labels
+ (cond (leafp (car (nthcdr 5 node))))))
+ (concat
+ (seq-subseq
+ (nth (car node) strs)
+ (min (cadr node) (1- (length (nth (car node) strs))))
+ (cond (leafp -1)
+ (t (caddr node))))
+ (cond
+ (leafp
+ (apply
+ #'concat
+ (mapcar
+ (lambda (label)
+ (let ((first (car label))
+ (second (cadr label)))
+ (format "$ (%d : %d)"
+ first second)))
+ leaf-labels))))))))
+ (cond (gst-full-error-p
+ (concat
+ (format "key: %d, %s, %s" item actual-str
+ (gst-pretty-node (gethash item tree)))))
+ (t actual-str)))))))))
+
+(provide 'generalized-suffix-tree)
+;;; generalized-suffiex-tree.el ends here