;;; suffiex-tree.el --- Ukkonen algorithm for building a suffix tree -*- lexical-binding: t; -*- ;; Our node is represented as a list with the following elements: ;; start, ;; which is the starting index of the edge going from its parent ;; node ;; end, the index of the end index ;; suffix-link, the index of the node its suffix-link points to ;; children, ;; a hash-table of pairs of integers and indices of its ;; children ;; To compute the length of the edge going into NODE we use: ;; (- (min end (1+ st-position)) start) ;; which is actually how far the position is on that edge, ;; if it is on that edge. ;;;###autoload (defun st-min (&rest args) "Return the minimum among ARGS. If an argument is 'infty, then it is considered greater than every number." (apply #'min (delq nil (mapcar (lambda (arg) (cond ((number-or-marker-p arg) arg) ((eq arg 'infty) nil))) args)))) ;;;###autoload (defun st-edge-length (node position) "Return the length of the edge into NODE. See the comment above this function for the reason POSITION is here." (- (st-min (car (cdr node)) (1+ position)) (car node))) ;;;###autoload (defun st-new-node (tree last-added start &optional end) "Make a new node with START and END as the coming edge. Then add the new node to TREE. LAST-ADDED is the number of elements already in the TREE." (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf ) (suffix-link 0) ;; The suffix link is initially 0 (new-node (list start end suffix-link (make-hash-table)))) (puthash (1+ last-added) new-node tree) (1+ last-added))) ;;;###autoload (defun st-add-suffix-link (tree need-sl node) "If NEED-SL is positive, then add the suffix link. In particular, the node corresponding to NEED-SL in TREE gets a suffix link pointing to NODE. This always returns NODE." (cond ((> need-sl 0) (setcar (cdr (cdr (gethash need-sl tree))) node))) node) ;;;###autoload (defun st-canonize (tree node position active-edge-index active-length active-node) "Walk down TREE to find the correct active point." (let ((node-edge-length (st-edge-length (gethash node tree) position))) (cond ((>= active-length node-edge-length) (list t (+ active-edge-index node-edge-length) (- active-length node-edge-length) node)) (t (list nil active-edge-index active-length active-node))))) ;;; For review later ;;;###autoload ;; (defun st-init) ;; { ;; needSL = 0, last_added = 0, pos = -1, ;; remainder = 0, active_node = 0, active_e = 0, active_len = 0; ;; root = active_node = new_node(-1, -1); ;; } ;;;###autoload (defun st-extend-tree (tree last-added position remain active-node active-edge-index active-length character str) "Extend a tree by CHARACTER. The return value is (tree last-added remain active-node active-edge-index active-length)" (let* ((need-sl 0) (remain (1+ remain)) continue-p breakp) (while (and (not breakp) (> remain 0)) (cond ((= active-length 0) (setq active-edge-index position))) (let* ((actual-node (gethash active-node tree)) (nxt (cond (actual-node (gethash (aref str active-edge-index) (cadr (cdr (cdr actual-node)))))))) ;; (cond ((null actual-node) (user-error "active-node: %S" active-node))) ;; (message "aei: %S" active-edge-index) ;; (message "act: %S" (gethash active-node tree)) (cond ((null nxt) (let ((leaf (st-new-node tree last-added position))) (setq last-added leaf) (puthash (aref str active-edge-index) leaf (cadr (cdr (cdr (gethash active-node tree))))) (setq need-sl (st-add-suffix-link tree need-sl active-node)) ;; rule 2 )) (t (let* ((result (st-canonize tree nxt position active-edge-index active-length active-node))) (cond ((car result) ;; observation 2 (setq active-edge-index (cadr result)) (setq active-length (caddr result)) (setq active-node (cadr (cddr result))) (setq continue-p t)) (t (cond ((eq (aref str (+ active-length (car (gethash nxt tree)))) character) ;; observation 1 (setq active-length (1+ active-length)) (setq need-sl (st-add-suffix-link tree need-sl active-node)) (setq breakp t))) (cond (breakp) (t ;; splitting (let ((split (st-new-node tree last-added (car (gethash nxt tree)) (+ (car (gethash nxt tree)) active-length)))) (setq last-added split) (puthash (aref str active-edge-index) split (cadr (cdr (cdr (gethash active-node tree))))) (let ((leaf (st-new-node tree last-added position))) (setq last-added leaf) (puthash character leaf (cadddr (gethash split tree))) (setcar (gethash nxt tree) (+ (car (gethash nxt tree)) active-length)) (puthash (aref str (car (gethash nxt tree))) nxt (cadddr (gethash split tree))) ;; rule 2 (setq need-sl (st-add-suffix-link tree need-sl split))))))))))) (cond ((or continue-p breakp)) (t (setq remain (1- remain)) (cond ((and (eq active-node 1) ; root (> active-length 0)) (setq active-length (1- active-length)) (setq active-edge-index (1+ (- position remain)))) (t (setq active-node (let ((slink (caddr (gethash active-node tree)))) (cond ((> slink 0) slink) ; or root (t 1)))))))))) ;; (message "remain is %d" remain) (list tree last-added remain active-node active-edge-index active-length))) ;;;###autoload (defun st-build-for-str (str) "Build the suffix tree for STR." (let* ((position 0) (character (ignore-error 'error (aref str position))) (tree (make-hash-table)) (last-added 0) (remain 0) (active-node (st-new-node tree last-added -1 -1)) (active-edge-index 0) (active-length 0) result) (setq last-added active-node) (while character (setq result (st-extend-tree tree last-added position remain active-node active-edge-index active-length character str)) (setq tree (pop result)) (setq last-added (pop result)) (setq remain (pop result)) (setq active-node (pop result)) (setq active-edge-index (pop result)) (setq active-length (pop result)) (setq position (1+ position)) (setq character (ignore-error 'error (aref str position)))) tree)) ;;;###autoload (defun st-print-tree-for-str (str) "Print the suffix tree for STR." (let* ((symbol (make-symbol "test"))) (set symbol (st-build-for-str str)) (insert "suffix tree for \"" str "\"" "\n") (st-print-tree (symbol-value symbol) str))) ;;;###autoload (defun st-print-tree (tree str) "Print TREE with the aid of STR." (let* ((symbol-tree (make-symbol "test-tree"))) (set symbol-tree (hierarchy-new)) (maphash (lambda (key value) (hierarchy-add-tree (symbol-value symbol-tree) key nil (lambda (item) (hash-table-values (cadr (cdr (cdr (gethash item tree)))))))) tree) (hierarchy-print (symbol-value symbol-tree) (lambda (item) (cond ((= item 1) "root") (t (let ((node (gethash item tree))) (substring str (car node) (cond ((integerp (cadr node)) (cadr node))))))))))) ;;; Some printing functions (use-package "hierarchy" 'hierarchy) ;; ;;;###autoload ;; (defvar st-output-buffer-name "*suffix-tree*" ;; "The name of the buffer that contains the output.") ;; ;;;###autoload ;; (defun st-print-tree (tree node str prefix) ;; "Print TREE in the dedicated output buffer starting at NODE. ;; PREFIX is used to recursively call this function. ;; We need STR since the information we stored in the tree ;; only contains the index into STR, ;; for the sake of optimizations." ;; (with-temp-buffer-window st-output-buffer-name '((display-buffer-at-bottom)) nil ;; (st-print-tree-internal tree node str prefix))) ;; ;;;###autoload ;; (defun st-print-tree-internal (tree node str prefix) ;; "The function that really prints out the data." ;; (let* ((node-data (gethash node tree)) ;; (start (car node-data)) ;; (end (cadr node-data)) ;; (edges (cdddr node-data)) ;; (node-num (format "(%d)" node)) ;; (node-label (format "%s-%s-%s" ;; prefix node-num ;; (substring str start end)))) ;; (prin1 node-label) ;; ;; print the first edge ;; (let* ((prefix-base (concat ;; prefix ;; (make-string (length node-label) 32))) ;; ;; TODO ;; )))) (provide 'suffix-tree) ;;; suffiex-tree.el ends here. ;;; archive ;; ;;;###autoload ;; (defvar st-root nil ;; "The root of the suffix tree.") ;; ;;;###autoload ;; (defvar st-position nil ;; "The current position in the string.") ;; ;;;###autoload ;; (defvar st-wait-for-link nil ;; "The node that is waiting to have a suffix link.") ;; ;;;###autoload ;; (defvar st-remain nil ;; "The number of remaining suffixes to insert.") ;; ;;;###autoload ;; (defvar st-active-node nil ;; "Where to insert a new suffix.") ;; ;;;###autoload ;; (defvar st-active-edge-index nil ;; "The index of the active edge.") ;; ;;;###autoload ;; (defvar st-active-length nil ;; "How far down is the active point down the active edge.")