summaryrefslogtreecommitdiff
path: root/suffix tree/suffix-tree.el
diff options
context:
space:
mode:
Diffstat (limited to 'suffix tree/suffix-tree.el')
-rw-r--r--suffix tree/suffix-tree.el397
1 files changed, 397 insertions, 0 deletions
diff --git a/suffix tree/suffix-tree.el b/suffix tree/suffix-tree.el
new file mode 100644
index 0000000..0d929d0
--- /dev/null
+++ b/suffix tree/suffix-tree.el
@@ -0,0 +1,397 @@
+;;; suffiex-tree.el --- Ukkonen algorithm for building a suffix tree -*- lexical-binding: t; -*-
+
+;;; Author: Durand
+;;; Version: 0.0.1
+
+;;; Commentary:
+
+;; Our node is represented as a list of the following elements:
+
+;; start , which is the starting index of the edge going from its parent
+;; node to this node
+;; end , the index of the end of the above edge
+;; suffix-link, the index of the node this node's suffix-link points to
+;; children , a hash-table of pairs of integers and indices of its
+;; children
+
+;; To compute the length of the edge going into NODE we use:
+;; (- (min end (1+ position)) start)
+;; which is actually how far the position is on that edge,
+;; if it is on that edge.
+
+;;; Code:
+
+;;;###autoload
+(defun st-min (&rest args)
+ "Return the minimum among ARGS.
+If an argument is 'infty, then it is considered greater than
+every number."
+ (apply #'min (delq nil
+ (mapcar
+ (lambda (arg)
+ (cond
+ ((number-or-marker-p arg) arg)
+ ((eq arg 'infty) nil)))
+ args))))
+
+;;;###autoload
+(defun st-edge-length (node position)
+ "Return the length of the edge into NODE.
+See the comment above this function for the reason
+POSITION is here."
+ (- (st-min (car (cdr node)) (1+ position))
+ (car node)))
+
+;;;###autoload
+(defun st-new-node (tree last-added start &optional end)
+ "Make a new node with START and END as the coming edge.
+Then add the new node to TREE.
+LAST-ADDED is the number of elements already in the TREE."
+ (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf
+ )
+ (suffix-link 0) ;; The suffix link is initially 0
+ (new-node
+ (list start end
+ suffix-link
+ (make-hash-table))))
+ (puthash (1+ last-added) new-node tree)
+ (1+ last-added)))
+
+;;;###autoload
+(defun st-add-suffix-link (tree need-sl node)
+ "If NEED-SL is positive, then add the suffix link.
+In particular, the node corresponding to NEED-SL in TREE
+gets a suffix link pointing to NODE.
+
+This always returns NODE."
+ (cond
+ ((and (> need-sl 1) (/= need-sl node))
+ (setcar (cdr (cdr (gethash need-sl tree))) node)))
+ node)
+
+;;;###autoload
+(defun st-canonize (tree node position active-edge-index active-length active-node)
+ "Walk down TREE to find the correct active point."
+ (let ((node-edge-length (st-edge-length (gethash node tree) position)))
+ (cond
+ ((>= active-length node-edge-length)
+ (list t
+ (+ active-edge-index
+ node-edge-length)
+ (- active-length
+ node-edge-length)
+ node))
+ (t
+ (list nil
+ active-edge-index
+ active-length
+ active-node)))))
+
+;;;###autoload
+(defun st-extend-tree (tree last-added position remain
+ active-node active-edge-index
+ active-length character str)
+ "Extend a tree by CHARACTER.
+The return value is
+(tree last-added remain active-node
+ active-edge-index active-length)"
+ (let* ((need-sl 0)
+ (remain (1+ remain))
+ continue-p breakp)
+ (while (and (not breakp) (> remain 0))
+ (setq continue-p nil breakp nil)
+ (cond
+ ((= active-length 0)
+ (setq active-edge-index position)))
+ (let* ((actual-node (gethash active-node tree))
+ (nxt (cond
+ (actual-node
+ (gethash (aref str active-edge-index)
+ (cadr (cdr (cdr actual-node))))))))
+ (cond
+ ((null nxt)
+ (let ((leaf (st-new-node tree last-added position)))
+ (setq last-added leaf)
+ (puthash (aref str active-edge-index)
+ leaf
+ (cadr (cdr (cdr (gethash active-node tree)))))
+ (setq need-sl (st-add-suffix-link tree need-sl active-node)))
+ ;; rule 2
+ )
+ (t
+ (let* ((result (st-canonize
+ tree nxt position active-edge-index
+ active-length active-node)))
+ (cond
+ ((car result)
+ ;; observation 2
+ (setq active-edge-index (cadr result))
+ (setq active-length (caddr result))
+ (setq active-node (cadr (cddr result)))
+ (setq continue-p t))
+ (t
+ (cond
+ ((eq (aref str (+ active-length
+ (car (gethash nxt tree))))
+ character)
+ ;; observation 1
+ (setq active-length (1+ active-length))
+ (setq need-sl (st-add-suffix-link tree need-sl active-node))
+ (setq breakp t)))
+ (cond
+ (breakp)
+ (t ;; splitting
+ (let ((split (st-new-node
+ tree last-added (car (gethash nxt tree))
+ (+ (car (gethash nxt tree)) active-length))))
+ (setq last-added split)
+ (puthash
+ (aref str active-edge-index)
+ split (cadr (cdr (cdr (gethash active-node tree)))))
+ (let ((leaf (st-new-node tree last-added position)))
+ (setq last-added leaf)
+ (puthash character leaf
+ (cadddr (gethash split tree)))
+ (setcar (gethash nxt tree)
+ (+ (car (gethash nxt tree))
+ active-length))
+ (puthash (aref str (car (gethash nxt tree))) nxt
+ (cadddr (gethash split tree)))
+ ;; rule 2
+ (setq need-sl
+ (st-add-suffix-link tree need-sl split)))))))))))
+ (cond
+ ((or continue-p breakp))
+ (t
+ (setq remain (1- remain))
+ (cond
+ ((and (eq active-node 1) ; root
+ (> active-length 0))
+ (setq active-length (1- active-length))
+ (setq active-edge-index
+ (1+ (- position remain))))
+ (t
+ (setq active-node
+ (let ((slink (caddr (gethash active-node tree))))
+ (cond
+ ((> slink 1) slink)
+ ; or root
+ (t 1))))))))))
+ (list tree last-added remain active-node
+ active-edge-index active-length)))
+
+;;;###autoload
+(defun st-build-for-str (str)
+ "Build the suffix tree for STR."
+ (let* ((position 0)
+ (character (ignore-error 'error (aref str position)))
+ (tree (make-hash-table))
+ (last-added 0)
+ (remain 0)
+ (active-node (st-new-node tree last-added -1 -1))
+ (active-edge-index 0)
+ (active-length 0)
+ result)
+ (setq last-added active-node)
+ (while character
+ (setq result (st-extend-tree tree last-added position remain
+ active-node active-edge-index
+ active-length character str))
+ (setq tree (pop result))
+ (setq last-added (pop result))
+ (setq remain (pop result))
+ (setq active-node (pop result))
+ (setq active-edge-index (pop result))
+ (setq active-length (pop result))
+ (setq position (1+ position))
+ (setq character (ignore-error 'error (aref str position))))
+ tree))
+
+;;; Some printing functions
+
+(require 'hierarchy)
+
+;;;###autoload
+(defun st-print-str (str)
+ "Print generalized string"
+ (mapc (lambda (char)
+ (cond ((characterp char) (insert char))
+ (t (insert ", "))))
+ str))
+
+;;;###autoload
+(defun st-print-tree-for-str (str)
+ "Print the suffix tree for STR."
+ (let* ((symbol (make-symbol "test")))
+ (set symbol (st-build-for-str str))
+ (insert "suffix tree for strings: ")
+ (st-print-str str)
+ (insert "\n")
+ (st-print-tree (symbol-value symbol) str)))
+
+;;;###autoload
+(defun st-reduce-index (node strs)
+ "Reduce the second element of the NODE."
+ (let* ((lengths (mapcar #'length strs))
+ (start (car node))
+ (end (cadr node))
+ (first-greater-than-start
+ (let ((sum -1) (index 0) done result)
+ (while (and (not done) (< index (length strs)))
+ (cond
+ ((<= (+ sum (nth index lengths)) start)
+ (setq index (1+ index))
+ (setq sum (+ sum (nth index lengths))))
+ (t (setq result (+ sum (nth index lengths)))
+ (setq done t))))
+ (cond ((null result) (setq result (1- (apply #'+ lengths)))))
+ result)))
+ (st-min end first-greater-than-start))
+ ;; (let* ((start (car node))
+ ;; (end (cadr node))
+ ;; (index start)
+ ;; result stop)
+ ;; (while (and (not stop) (or (eq end 'infty) (<= index end)))
+ ;; (cond
+ ;; ((and (< index (length str)) (characterp (aref str index))) (setq index (1+ index)))
+ ;; (t (setq stop t)
+ ;; (setq result index))))
+ ;; (cond ((and (numberp end) (= index (1+ end))) (setq result end)))
+ ;; (cond ((<= result start) (setq result 'end)))
+ ;; result)
+ )
+
+;; ;;;###autoload
+;; (defun st-build-generalized-tree (strs)
+;; (let* ((copy strs)
+;; (long-str
+;; (let ((index 1) result)
+;; (while (consp copy)
+;; (setq result (vconcat result
+;; (car copy)
+;; (vector (- index))))
+;; (setq index (1+ index))
+;; (setq copy (cdr copy)))
+;; result))
+;; (tree (make-symbol "tree")))
+;; (set tree (st-build-for-str long-str))))
+
+;;;###autoload
+(defun st-print-tree (tree str)
+ "Print TREE with the aid of STR."
+ (let* ((symbol-tree (make-symbol "test-tree")))
+ (set symbol-tree (hierarchy-new))
+ (maphash
+ (lambda (key value)
+ (hierarchy-add-tree
+ (symbol-value symbol-tree) key nil
+ (lambda (item)
+ (hash-table-values (cadr (cdr (cdr (gethash item tree))))))))
+ tree)
+ (hierarchy-print (symbol-value symbol-tree)
+ (lambda (item)
+ (cond ((= item 1) "root")
+ (t (let* ((node (gethash item tree))
+ ;; (reduced (st-reduce-index node str))
+ )
+ (substring str (car node)
+ (cond ((integerp (cadr node)) (cadr node))))
+ ;; (cond ((eq reduced 'end) "$")
+ ;; (t (apply #'string
+ ;; (append
+ ;; (seq-subseq str (car node)
+ ;; reduced
+ ;; ;; (cond ((integerp (cadr node)) (cadr node)))
+ ;; )
+ ;; nil))))
+
+ ;; (format "%d): (%S, %S): (%d): \"%s\""
+ ;; item
+ ;; (car node)
+ ;; (cadr node)
+ ;; (caddr node)
+ ;; (substring str (car node) (cond ((integerp (cadr node)) (cadr node))))
+ ;; )
+ )))))))
+
+(provide 'suffix-tree)
+;;; suffiex-tree.el ends here
+
+;;; archive
+
+;; ;;;###autoload
+;; (defvar st-root nil
+;; "The root of the suffix tree.")
+
+;; ;;;###autoload
+;; (defvar st-position nil
+;; "The current position in the string.")
+
+;; ;;;###autoload
+;; (defvar st-wait-for-link nil
+;; "The node that is waiting to have a suffix link.")
+
+;; ;;;###autoload
+;; (defvar st-remain nil
+;; "The number of remaining suffixes to insert.")
+
+;; ;;;###autoload
+;; (defvar st-active-node nil
+;; "Where to insert a new suffix.")
+
+;; ;;;###autoload
+;; (defvar st-active-edge-index nil
+;; "The index of the active edge.")
+
+;; ;;;###autoload
+;; (defvar st-active-length nil
+;; "How far down is the active point down the active edge.")
+
+;; (insert (format "after character %c, the tree becomes\n" character))
+;; (insert (format "string is %s\n" (substring str 0 (1+ position))))
+;; (insert (format "active-node: %d\n" active-node))
+;; (insert (format "active-edge: %c\n" (aref str active-edge-index)))
+;; (insert (format "active-length: %d\n" active-length))
+;; (cond ((eq last-added 11)
+;; (insert (format "slink: %d\n" (caddr (gethash active-node tree))))
+;; (insert (format "con: %S" continue-p))
+;; (insert (format "breakp: %S\n" breakp))))
+;; (st-print-tree tree (substring str 0 (1+ position)))
+;; (insert "\n\n")
+
+
+;; ;;;###autoload
+;; (defvar st-output-buffer-name "*suffix-tree*"
+;; "The name of the buffer that contains the output.")
+
+;; ;;;###autoload
+;; (defun st-print-tree (tree node str prefix)
+;; "Print TREE in the dedicated output buffer starting at NODE.
+;; PREFIX is used to recursively call this function.
+
+;; We need STR since the information we stored in the tree
+;; only contains the index into STR,
+;; for the sake of optimizations."
+;; (with-temp-buffer-window st-output-buffer-name '((display-buffer-at-bottom)) nil
+;; (st-print-tree-internal tree node str prefix)))
+
+;; ;;;###autoload
+;; (defun st-print-tree-internal (tree node str prefix)
+;; "The function that really prints out the data."
+;; (let* ((node-data (gethash node tree))
+;; (start (car node-data))
+;; (end (cadr node-data))
+;; (edges (cdddr node-data))
+;; (node-num (format "(%d)" node))
+;; (node-label (format "%s-%s-%s"
+;; prefix node-num
+;; (substring str start end))))
+;; (prin1 node-label)
+;; ;; print the first edge
+;; (let* ((prefix-base (concat
+;; prefix
+;; (make-string (length node-label) 32)))
+;; ;; TODO
+;; ))))
+
+