;;; suffiex-tree.el --- Ukkonen algorithm for building a suffix tree -*- lexical-binding: t; -*- ;;; Author: Durand ;;; Version: 0.0.1 ;;; Commentary: ;; Our node is represented as a list of the following elements: ;; start , which is the starting index of the edge going from its parent ;; node to this node ;; end , the index of the end of the above edge ;; suffix-link, the index of the node this node's suffix-link points to ;; children , a hash-table of pairs of integers and indices of its ;; children ;; To compute the length of the edge going into NODE we use: ;; (- (min end (1+ position)) start) ;; which is actually how far the position is on that edge, ;; if it is on that edge. ;;; Code: ;;;###autoload (defun st-min (&rest args) "Return the minimum among ARGS. If an argument is 'infty, then it is considered greater than every number." (apply #'min (delq nil (mapcar (lambda (arg) (cond ((number-or-marker-p arg) arg) ((eq arg 'infty) nil))) args)))) ;;;###autoload (defun st-edge-length (node position) "Return the length of the edge into NODE. See the comment above this function for the reason POSITION is here." (- (st-min (car (cdr node)) (1+ position)) (car node))) ;;;###autoload (defun st-new-node (tree last-added start &optional end) "Make a new node with START and END as the coming edge. Then add the new node to TREE. LAST-ADDED is the number of elements already in the TREE." (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf ) (suffix-link 0) ;; The suffix link is initially 0 (new-node (list start end suffix-link (make-hash-table)))) (puthash (1+ last-added) new-node tree) (1+ last-added))) ;;;###autoload (defun st-add-suffix-link (tree need-sl node) "If NEED-SL is positive, then add the suffix link. In particular, the node corresponding to NEED-SL in TREE gets a suffix link pointing to NODE. This always returns NODE." (cond ((and (> need-sl 1) (/= need-sl node)) (setcar (cdr (cdr (gethash need-sl tree))) node))) node) ;;;###autoload (defun st-canonize (tree node position active-edge-index active-length active-node) "Walk down TREE to find the correct active point." (let ((node-edge-length (st-edge-length (gethash node tree) position))) (cond ((>= active-length node-edge-length) (list t (+ active-edge-index node-edge-length) (- active-length node-edge-length) node)) (t (list nil active-edge-index active-length active-node))))) ;;;###autoload (defun st-extend-tree (tree last-added position remain active-node active-edge-index active-length character str) "Extend a tree by CHARACTER. The return value is (tree last-added remain active-node active-edge-index active-length)" (let* ((need-sl 0) (remain (1+ remain)) continue-p breakp) (while (and (not breakp) (> remain 0)) (setq continue-p nil breakp nil) (cond ((= active-length 0) (setq active-edge-index position))) (let* ((actual-node (gethash active-node tree)) (nxt (cond (actual-node (gethash (aref str active-edge-index) (cadr (cdr (cdr actual-node)))))))) (cond ((null nxt) (let ((leaf (st-new-node tree last-added position))) (setq last-added leaf) (puthash (aref str active-edge-index) leaf (cadr (cdr (cdr (gethash active-node tree))))) (setq need-sl (st-add-suffix-link tree need-sl active-node))) ;; rule 2 ) (t (let* ((result (st-canonize tree nxt position active-edge-index active-length active-node))) (cond ((car result) ;; observation 2 (setq active-edge-index (cadr result)) (setq active-length (caddr result)) (setq active-node (cadr (cddr result))) (setq continue-p t)) (t (cond ((eq (aref str (+ active-length (car (gethash nxt tree)))) character) ;; observation 1 (setq active-length (1+ active-length)) (setq need-sl (st-add-suffix-link tree need-sl active-node)) (setq breakp t))) (cond (breakp) (t ;; splitting (let ((split (st-new-node tree last-added (car (gethash nxt tree)) (+ (car (gethash nxt tree)) active-length)))) (setq last-added split) (puthash (aref str active-edge-index) split (cadr (cdr (cdr (gethash active-node tree))))) (let ((leaf (st-new-node tree last-added position))) (setq last-added leaf) (puthash character leaf (cadddr (gethash split tree))) (setcar (gethash nxt tree) (+ (car (gethash nxt tree)) active-length)) (puthash (aref str (car (gethash nxt tree))) nxt (cadddr (gethash split tree))) ;; rule 2 (setq need-sl (st-add-suffix-link tree need-sl split))))))))))) (cond ((or continue-p breakp)) (t (setq remain (1- remain)) (cond ((and (eq active-node 1) ; root (> active-length 0)) (setq active-length (1- active-length)) (setq active-edge-index (1+ (- position remain)))) (t (setq active-node (let ((slink (caddr (gethash active-node tree)))) (cond ((> slink 1) slink) ; or root (t 1)))))))))) (list tree last-added remain active-node active-edge-index active-length))) ;;;###autoload (defun st-build-for-str (str) "Build the suffix tree for STR." (let* ((position 0) (character (ignore-error 'error (aref str position))) (tree (make-hash-table)) (last-added 0) (remain 0) (active-node (st-new-node tree last-added -1 -1)) (active-edge-index 0) (active-length 0) result) (setq last-added active-node) (while character (setq result (st-extend-tree tree last-added position remain active-node active-edge-index active-length character str)) (setq tree (pop result)) (setq last-added (pop result)) (setq remain (pop result)) (setq active-node (pop result)) (setq active-edge-index (pop result)) (setq active-length (pop result)) (setq position (1+ position)) (setq character (ignore-error 'error (aref str position)))) tree)) ;;; Some printing functions (require 'hierarchy) ;;;###autoload (defun st-print-str (str) "Print generalized string" (mapc (lambda (char) (cond ((characterp char) (insert char)) (t (insert ", ")))) str)) ;;;###autoload (defun st-print-tree-for-str (str) "Print the suffix tree for STR." (let* ((symbol (make-symbol "test"))) (set symbol (st-build-for-str str)) (insert "suffix tree for strings: ") (st-print-str str) (insert "\n") (st-print-tree (symbol-value symbol) str))) ;;;###autoload (defun st-reduce-index (node strs) "Reduce the second element of the NODE." (let* ((lengths (mapcar #'length strs)) (start (car node)) (end (cadr node)) (first-greater-than-start (let ((sum -1) (index 0) done result) (while (and (not done) (< index (length strs))) (cond ((<= (+ sum (nth index lengths)) start) (setq index (1+ index)) (setq sum (+ sum (nth index lengths)))) (t (setq result (+ sum (nth index lengths))) (setq done t)))) (cond ((null result) (setq result (1- (apply #'+ lengths))))) result))) (st-min end first-greater-than-start)) ;; (let* ((start (car node)) ;; (end (cadr node)) ;; (index start) ;; result stop) ;; (while (and (not stop) (or (eq end 'infty) (<= index end))) ;; (cond ;; ((and (< index (length str)) (characterp (aref str index))) (setq index (1+ index))) ;; (t (setq stop t) ;; (setq result index)))) ;; (cond ((and (numberp end) (= index (1+ end))) (setq result end))) ;; (cond ((<= result start) (setq result 'end))) ;; result) ) ;; ;;;###autoload ;; (defun st-build-generalized-tree (strs) ;; (let* ((copy strs) ;; (long-str ;; (let ((index 1) result) ;; (while (consp copy) ;; (setq result (vconcat result ;; (car copy) ;; (vector (- index)))) ;; (setq index (1+ index)) ;; (setq copy (cdr copy))) ;; result)) ;; (tree (make-symbol "tree"))) ;; (set tree (st-build-for-str long-str)))) ;;;###autoload (defun st-print-tree (tree str) "Print TREE with the aid of STR." (let* ((symbol-tree (make-symbol "test-tree"))) (set symbol-tree (hierarchy-new)) (maphash (lambda (key value) (hierarchy-add-tree (symbol-value symbol-tree) key nil (lambda (item) (hash-table-values (cadr (cdr (cdr (gethash item tree)))))))) tree) (hierarchy-print (symbol-value symbol-tree) (lambda (item) (cond ((= item 1) "root") (t (let* ((node (gethash item tree)) ;; (reduced (st-reduce-index node str)) ) (substring str (car node) (cond ((integerp (cadr node)) (cadr node)))) ;; (cond ((eq reduced 'end) "$") ;; (t (apply #'string ;; (append ;; (seq-subseq str (car node) ;; reduced ;; ;; (cond ((integerp (cadr node)) (cadr node))) ;; ) ;; nil)))) ;; (format "%d): (%S, %S): (%d): \"%s\"" ;; item ;; (car node) ;; (cadr node) ;; (caddr node) ;; (substring str (car node) (cond ((integerp (cadr node)) (cadr node)))) ;; ) ))))))) (provide 'suffix-tree) ;;; suffiex-tree.el ends here ;;; archive ;; ;;;###autoload ;; (defvar st-root nil ;; "The root of the suffix tree.") ;; ;;;###autoload ;; (defvar st-position nil ;; "The current position in the string.") ;; ;;;###autoload ;; (defvar st-wait-for-link nil ;; "The node that is waiting to have a suffix link.") ;; ;;;###autoload ;; (defvar st-remain nil ;; "The number of remaining suffixes to insert.") ;; ;;;###autoload ;; (defvar st-active-node nil ;; "Where to insert a new suffix.") ;; ;;;###autoload ;; (defvar st-active-edge-index nil ;; "The index of the active edge.") ;; ;;;###autoload ;; (defvar st-active-length nil ;; "How far down is the active point down the active edge.") ;; (insert (format "after character %c, the tree becomes\n" character)) ;; (insert (format "string is %s\n" (substring str 0 (1+ position)))) ;; (insert (format "active-node: %d\n" active-node)) ;; (insert (format "active-edge: %c\n" (aref str active-edge-index))) ;; (insert (format "active-length: %d\n" active-length)) ;; (cond ((eq last-added 11) ;; (insert (format "slink: %d\n" (caddr (gethash active-node tree)))) ;; (insert (format "con: %S" continue-p)) ;; (insert (format "breakp: %S\n" breakp)))) ;; (st-print-tree tree (substring str 0 (1+ position))) ;; (insert "\n\n") ;; ;;;###autoload ;; (defvar st-output-buffer-name "*suffix-tree*" ;; "The name of the buffer that contains the output.") ;; ;;;###autoload ;; (defun st-print-tree (tree node str prefix) ;; "Print TREE in the dedicated output buffer starting at NODE. ;; PREFIX is used to recursively call this function. ;; We need STR since the information we stored in the tree ;; only contains the index into STR, ;; for the sake of optimizations." ;; (with-temp-buffer-window st-output-buffer-name '((display-buffer-at-bottom)) nil ;; (st-print-tree-internal tree node str prefix))) ;; ;;;###autoload ;; (defun st-print-tree-internal (tree node str prefix) ;; "The function that really prints out the data." ;; (let* ((node-data (gethash node tree)) ;; (start (car node-data)) ;; (end (cadr node-data)) ;; (edges (cdddr node-data)) ;; (node-num (format "(%d)" node)) ;; (node-label (format "%s-%s-%s" ;; prefix node-num ;; (substring str start end)))) ;; (prin1 node-label) ;; ;; print the first edge ;; (let* ((prefix-base (concat ;; prefix ;; (make-string (length node-label) 32))) ;; ;; TODO ;; ))))