summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--comb/suffix-tree.el338
1 files changed, 338 insertions, 0 deletions
diff --git a/comb/suffix-tree.el b/comb/suffix-tree.el
new file mode 100644
index 0000000..f7149da
--- /dev/null
+++ b/comb/suffix-tree.el
@@ -0,0 +1,338 @@
+;;; suffiex-tree.el --- Ukkonen algorithm for building a suffix tree -*- lexical-binding: t; -*-
+
+
+;; Our node is represented as a list with the following elements:
+
+;; start,
+;; which is the starting index of the edge going from its parent
+;; node
+;; end, the index of the end index
+;; suffix-link, the index of the node its suffix-link points to
+;; children,
+;; a hash-table of pairs of integers and indices of its
+;; children
+
+;; To compute the length of the edge going into NODE we use:
+;; (- (min end (1+ st-position)) start)
+;; which is actually how far the position is on that edge,
+;; if it is on that edge.
+
+;;;###autoload
+(defun st-min (&rest args)
+ "Return the minimum among ARGS.
+If an argument is 'infty, then it is considered greater than
+every number."
+ (apply #'min (delq nil
+ (mapcar
+ (lambda (arg)
+ (cond
+ ((number-or-marker-p arg) arg)
+ ((eq arg 'infty) nil)))
+ args))))
+
+;;;###autoload
+(defun st-edge-length (node position)
+ "Return the length of the edge into NODE.
+See the comment above this function for the reason
+POSITION is here."
+ (- (st-min (car (cdr node)) (1+ position))
+ (car node)))
+
+;;;###autoload
+(defun st-new-node (tree last-added start &optional end)
+ "Make a new node with START and END as the coming edge.
+Then add the new node to TREE.
+LAST-ADDED is the number of elements already in the TREE."
+ (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf
+ )
+ (suffix-link 0) ;; The suffix link is initially 0
+ (new-node
+ (list start end
+ suffix-link
+ (make-hash-table))))
+ (puthash (1+ last-added) new-node tree)
+ (1+ last-added)))
+
+;;;###autoload
+(defun st-add-suffix-link (tree need-sl node)
+ "If NEED-SL is positive, then add the suffix link.
+In particular, the node corresponding to NEED-SL in TREE
+gets a suffix link pointing to NODE.
+
+This always returns NODE."
+ (cond
+ ((> need-sl 0)
+ (setcar (cdr (cdr (gethash need-sl tree))) node)))
+ node)
+
+;;;###autoload
+(defun st-canonize (tree node position active-edge-index active-length active-node)
+ "Walk down TREE to find the correct active point."
+ (let ((node-edge-length (st-edge-length (gethash node tree) position)))
+ (cond
+ ((>= active-length node-edge-length)
+ (list t
+ (+ active-edge-index
+ node-edge-length)
+ (- active-length
+ node-edge-length)
+ node))
+ (t
+ (list nil
+ active-edge-index
+ active-length
+ active-node)))))
+
+;;; For review later
+;;;###autoload
+;; (defun st-init)
+;; {
+;; needSL = 0, last_added = 0, pos = -1,
+;; remainder = 0, active_node = 0, active_e = 0, active_len = 0;
+;; root = active_node = new_node(-1, -1);
+;; }
+
+;;;###autoload
+(defun st-extend-tree (tree last-added position remain
+ active-node active-edge-index
+ active-length character str)
+ "Extend a tree by CHARACTER.
+The return value is
+(tree last-added remain active-node
+ active-edge-index active-length)"
+ (let* ((need-sl 0)
+ (remain (1+ remain))
+ continue-p breakp)
+ (while (and (not breakp) (> remain 0))
+ (cond
+ ((= active-length 0)
+ (setq active-edge-index position)))
+ (let* ((actual-node (gethash active-node tree))
+ (nxt (cond
+ (actual-node
+ (gethash (aref str active-edge-index)
+ (cadr (cdr (cdr actual-node))))))))
+ ;; (cond ((null actual-node) (user-error "active-node: %S" active-node)))
+ ;; (message "aei: %S" active-edge-index)
+ ;; (message "act: %S" (gethash active-node tree))
+ (cond
+ ((null nxt)
+ (let ((leaf (st-new-node tree last-added position)))
+ (setq last-added leaf)
+ (puthash (aref str active-edge-index)
+ leaf
+ (cadr (cdr (cdr (gethash active-node tree)))))
+ (setq need-sl (st-add-suffix-link tree need-sl active-node))
+ ;; rule 2
+ ))
+ (t
+ (let* ((result (st-canonize
+ tree nxt position active-edge-index
+ active-length active-node)))
+ (cond
+ ((car result)
+ ;; observation 2
+ (setq active-edge-index (cadr result))
+ (setq active-length (caddr result))
+ (setq active-node (cadr (cddr result)))
+ (setq continue-p t))
+ (t
+ (cond
+ ((eq (aref str (+ active-length
+ (car (gethash nxt tree))))
+ character)
+ ;; observation 1
+ (setq active-length (1+ active-length))
+ (setq need-sl (st-add-suffix-link tree need-sl active-node))
+ (setq breakp t)))
+ (cond
+ (breakp)
+ (t ;; splitting
+ (let ((split (st-new-node
+ tree last-added (car (gethash nxt tree))
+ (+ (car (gethash nxt tree)) active-length))))
+ (setq last-added split)
+ (puthash
+ (aref str active-edge-index)
+ split (cadr (cdr (cdr (gethash active-node tree)))))
+ (let ((leaf (st-new-node tree last-added position)))
+ (setq last-added leaf)
+ (puthash character leaf
+ (cadddr (gethash split tree)))
+ (setcar (gethash nxt tree)
+ (+ (car (gethash nxt tree))
+ active-length))
+ (puthash (aref str (car (gethash nxt tree))) nxt
+ (cadddr (gethash split tree)))
+ ;; rule 2
+ (setq need-sl
+ (st-add-suffix-link tree need-sl split)))))))))))
+ (cond
+ ((or continue-p breakp))
+ (t
+ (setq remain (1- remain))
+ (cond
+ ((and (eq active-node 1) ; root
+ (> active-length 0))
+ (setq active-length (1- active-length))
+ (setq active-edge-index
+ (1+ (- position remain))))
+ (t
+ (setq active-node
+ (let ((slink (caddr (gethash active-node tree))))
+ (cond
+ ((> slink 0) slink)
+ ; or root
+ (t 1))))))))))
+ ;; (message "remain is %d" remain)
+ (list tree last-added remain active-node
+ active-edge-index active-length)))
+
+;;;###autoload
+(defun st-build-for-str (str)
+ "Build the suffix tree for STR."
+ (let* ((position 0)
+ (character (ignore-error 'error (aref str position)))
+ (tree (make-hash-table))
+ (last-added 0)
+ (remain 0)
+ (active-node (st-new-node tree last-added -1 -1))
+ (active-edge-index 0)
+ (active-length 0)
+ result)
+ (setq last-added active-node)
+ (while character
+ (setq result (st-extend-tree tree last-added position remain
+ active-node active-edge-index
+ active-length character str))
+ (setq tree (pop result))
+ (setq last-added (pop result))
+ (setq remain (pop result))
+ (setq active-node (pop result))
+ (setq active-edge-index (pop result))
+ (setq active-length (pop result))
+ (setq position (1+ position))
+ (setq character (ignore-error 'error (aref str position))))
+ tree))
+
+;;;###autoload
+(defun st-print-tree-for-str (str)
+ "Print the suffix tree for STR."
+ (let* ((symbol (make-symbol "test")))
+ (set symbol (st-build-for-str str))
+ (insert "suffix tree for \"" str "\"" "\n")
+ (st-print-tree (symbol-value symbol) str)))
+
+;;;###autoload
+(defun st-print-tree (tree str)
+ "Print TREE with the aid of STR."
+ (let* ((symbol-tree (make-symbol "test-tree")))
+ (set symbol-tree (hierarchy-new))
+ (maphash
+ (lambda (key value)
+ (hierarchy-add-tree
+ (symbol-value symbol-tree) key nil
+ (lambda (item)
+ (hash-table-values (cadr (cdr (cdr (gethash item tree))))))))
+ tree)
+ (hierarchy-print (symbol-value symbol-tree)
+ (lambda (item)
+ (cond ((= item 1) "root")
+ (t (let ((node (gethash item tree)))
+ (substring str (car node) (cond ((integerp (cadr node)) (cadr node)))))))))))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+;;; Some printing functions
+
+(use-package "hierarchy" 'hierarchy)
+
+;; ;;;###autoload
+;; (defvar st-output-buffer-name "*suffix-tree*"
+;; "The name of the buffer that contains the output.")
+
+;; ;;;###autoload
+;; (defun st-print-tree (tree node str prefix)
+;; "Print TREE in the dedicated output buffer starting at NODE.
+;; PREFIX is used to recursively call this function.
+
+;; We need STR since the information we stored in the tree
+;; only contains the index into STR,
+;; for the sake of optimizations."
+;; (with-temp-buffer-window st-output-buffer-name '((display-buffer-at-bottom)) nil
+;; (st-print-tree-internal tree node str prefix)))
+
+;; ;;;###autoload
+;; (defun st-print-tree-internal (tree node str prefix)
+;; "The function that really prints out the data."
+;; (let* ((node-data (gethash node tree))
+;; (start (car node-data))
+;; (end (cadr node-data))
+;; (edges (cdddr node-data))
+;; (node-num (format "(%d)" node))
+;; (node-label (format "%s-%s-%s"
+;; prefix node-num
+;; (substring str start end))))
+;; (prin1 node-label)
+;; ;; print the first edge
+;; (let* ((prefix-base (concat
+;; prefix
+;; (make-string (length node-label) 32)))
+;; ;; TODO
+;; ))))
+
+
+
+
+
+
+
+
+
+
+
+(provide 'suffix-tree)
+;;; suffiex-tree.el ends here.
+
+;;; archive
+
+;; ;;;###autoload
+;; (defvar st-root nil
+;; "The root of the suffix tree.")
+
+;; ;;;###autoload
+;; (defvar st-position nil
+;; "The current position in the string.")
+
+;; ;;;###autoload
+;; (defvar st-wait-for-link nil
+;; "The node that is waiting to have a suffix link.")
+
+;; ;;;###autoload
+;; (defvar st-remain nil
+;; "The number of remaining suffixes to insert.")
+
+;; ;;;###autoload
+;; (defvar st-active-node nil
+;; "Where to insert a new suffix.")
+
+;; ;;;###autoload
+;; (defvar st-active-edge-index nil
+;; "The index of the active edge.")
+
+;; ;;;###autoload
+;; (defvar st-active-length nil
+;; "How far down is the active point down the active edge.")