From a2f7f2bf9077ba8acfc550575b9e21aa9ffe7bae Mon Sep 17 00:00:00 2001 From: JSDurand Date: Sat, 9 Jan 2021 14:14:33 +0800 Subject: A temporary successful result I can build the suffix trees for strings. But there are some problems when the string shows some hitherto unknown pattern. --- comb/suffix-tree.el | 338 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 338 insertions(+) create mode 100644 comb/suffix-tree.el (limited to 'comb/suffix-tree.el') diff --git a/comb/suffix-tree.el b/comb/suffix-tree.el new file mode 100644 index 0000000..f7149da --- /dev/null +++ b/comb/suffix-tree.el @@ -0,0 +1,338 @@ +;;; suffiex-tree.el --- Ukkonen algorithm for building a suffix tree -*- lexical-binding: t; -*- + + +;; Our node is represented as a list with the following elements: + +;; start, +;; which is the starting index of the edge going from its parent +;; node +;; end, the index of the end index +;; suffix-link, the index of the node its suffix-link points to +;; children, +;; a hash-table of pairs of integers and indices of its +;; children + +;; To compute the length of the edge going into NODE we use: +;; (- (min end (1+ st-position)) start) +;; which is actually how far the position is on that edge, +;; if it is on that edge. + +;;;###autoload +(defun st-min (&rest args) + "Return the minimum among ARGS. +If an argument is 'infty, then it is considered greater than +every number." + (apply #'min (delq nil + (mapcar + (lambda (arg) + (cond + ((number-or-marker-p arg) arg) + ((eq arg 'infty) nil))) + args)))) + +;;;###autoload +(defun st-edge-length (node position) + "Return the length of the edge into NODE. +See the comment above this function for the reason +POSITION is here." + (- (st-min (car (cdr node)) (1+ position)) + (car node))) + +;;;###autoload +(defun st-new-node (tree last-added start &optional end) + "Make a new node with START and END as the coming edge. +Then add the new node to TREE. +LAST-ADDED is the number of elements already in the TREE." + (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf + ) + (suffix-link 0) ;; The suffix link is initially 0 + (new-node + (list start end + suffix-link + (make-hash-table)))) + (puthash (1+ last-added) new-node tree) + (1+ last-added))) + +;;;###autoload +(defun st-add-suffix-link (tree need-sl node) + "If NEED-SL is positive, then add the suffix link. +In particular, the node corresponding to NEED-SL in TREE +gets a suffix link pointing to NODE. + +This always returns NODE." + (cond + ((> need-sl 0) + (setcar (cdr (cdr (gethash need-sl tree))) node))) + node) + +;;;###autoload +(defun st-canonize (tree node position active-edge-index active-length active-node) + "Walk down TREE to find the correct active point." + (let ((node-edge-length (st-edge-length (gethash node tree) position))) + (cond + ((>= active-length node-edge-length) + (list t + (+ active-edge-index + node-edge-length) + (- active-length + node-edge-length) + node)) + (t + (list nil + active-edge-index + active-length + active-node))))) + +;;; For review later +;;;###autoload +;; (defun st-init) +;; { +;; needSL = 0, last_added = 0, pos = -1, +;; remainder = 0, active_node = 0, active_e = 0, active_len = 0; +;; root = active_node = new_node(-1, -1); +;; } + +;;;###autoload +(defun st-extend-tree (tree last-added position remain + active-node active-edge-index + active-length character str) + "Extend a tree by CHARACTER. +The return value is +(tree last-added remain active-node + active-edge-index active-length)" + (let* ((need-sl 0) + (remain (1+ remain)) + continue-p breakp) + (while (and (not breakp) (> remain 0)) + (cond + ((= active-length 0) + (setq active-edge-index position))) + (let* ((actual-node (gethash active-node tree)) + (nxt (cond + (actual-node + (gethash (aref str active-edge-index) + (cadr (cdr (cdr actual-node)))))))) + ;; (cond ((null actual-node) (user-error "active-node: %S" active-node))) + ;; (message "aei: %S" active-edge-index) + ;; (message "act: %S" (gethash active-node tree)) + (cond + ((null nxt) + (let ((leaf (st-new-node tree last-added position))) + (setq last-added leaf) + (puthash (aref str active-edge-index) + leaf + (cadr (cdr (cdr (gethash active-node tree))))) + (setq need-sl (st-add-suffix-link tree need-sl active-node)) + ;; rule 2 + )) + (t + (let* ((result (st-canonize + tree nxt position active-edge-index + active-length active-node))) + (cond + ((car result) + ;; observation 2 + (setq active-edge-index (cadr result)) + (setq active-length (caddr result)) + (setq active-node (cadr (cddr result))) + (setq continue-p t)) + (t + (cond + ((eq (aref str (+ active-length + (car (gethash nxt tree)))) + character) + ;; observation 1 + (setq active-length (1+ active-length)) + (setq need-sl (st-add-suffix-link tree need-sl active-node)) + (setq breakp t))) + (cond + (breakp) + (t ;; splitting + (let ((split (st-new-node + tree last-added (car (gethash nxt tree)) + (+ (car (gethash nxt tree)) active-length)))) + (setq last-added split) + (puthash + (aref str active-edge-index) + split (cadr (cdr (cdr (gethash active-node tree))))) + (let ((leaf (st-new-node tree last-added position))) + (setq last-added leaf) + (puthash character leaf + (cadddr (gethash split tree))) + (setcar (gethash nxt tree) + (+ (car (gethash nxt tree)) + active-length)) + (puthash (aref str (car (gethash nxt tree))) nxt + (cadddr (gethash split tree))) + ;; rule 2 + (setq need-sl + (st-add-suffix-link tree need-sl split))))))))))) + (cond + ((or continue-p breakp)) + (t + (setq remain (1- remain)) + (cond + ((and (eq active-node 1) ; root + (> active-length 0)) + (setq active-length (1- active-length)) + (setq active-edge-index + (1+ (- position remain)))) + (t + (setq active-node + (let ((slink (caddr (gethash active-node tree)))) + (cond + ((> slink 0) slink) + ; or root + (t 1)))))))))) + ;; (message "remain is %d" remain) + (list tree last-added remain active-node + active-edge-index active-length))) + +;;;###autoload +(defun st-build-for-str (str) + "Build the suffix tree for STR." + (let* ((position 0) + (character (ignore-error 'error (aref str position))) + (tree (make-hash-table)) + (last-added 0) + (remain 0) + (active-node (st-new-node tree last-added -1 -1)) + (active-edge-index 0) + (active-length 0) + result) + (setq last-added active-node) + (while character + (setq result (st-extend-tree tree last-added position remain + active-node active-edge-index + active-length character str)) + (setq tree (pop result)) + (setq last-added (pop result)) + (setq remain (pop result)) + (setq active-node (pop result)) + (setq active-edge-index (pop result)) + (setq active-length (pop result)) + (setq position (1+ position)) + (setq character (ignore-error 'error (aref str position)))) + tree)) + +;;;###autoload +(defun st-print-tree-for-str (str) + "Print the suffix tree for STR." + (let* ((symbol (make-symbol "test"))) + (set symbol (st-build-for-str str)) + (insert "suffix tree for \"" str "\"" "\n") + (st-print-tree (symbol-value symbol) str))) + +;;;###autoload +(defun st-print-tree (tree str) + "Print TREE with the aid of STR." + (let* ((symbol-tree (make-symbol "test-tree"))) + (set symbol-tree (hierarchy-new)) + (maphash + (lambda (key value) + (hierarchy-add-tree + (symbol-value symbol-tree) key nil + (lambda (item) + (hash-table-values (cadr (cdr (cdr (gethash item tree)))))))) + tree) + (hierarchy-print (symbol-value symbol-tree) + (lambda (item) + (cond ((= item 1) "root") + (t (let ((node (gethash item tree))) + (substring str (car node) (cond ((integerp (cadr node)) (cadr node))))))))))) + + + + + + + + + + + + + + + + +;;; Some printing functions + +(use-package "hierarchy" 'hierarchy) + +;; ;;;###autoload +;; (defvar st-output-buffer-name "*suffix-tree*" +;; "The name of the buffer that contains the output.") + +;; ;;;###autoload +;; (defun st-print-tree (tree node str prefix) +;; "Print TREE in the dedicated output buffer starting at NODE. +;; PREFIX is used to recursively call this function. + +;; We need STR since the information we stored in the tree +;; only contains the index into STR, +;; for the sake of optimizations." +;; (with-temp-buffer-window st-output-buffer-name '((display-buffer-at-bottom)) nil +;; (st-print-tree-internal tree node str prefix))) + +;; ;;;###autoload +;; (defun st-print-tree-internal (tree node str prefix) +;; "The function that really prints out the data." +;; (let* ((node-data (gethash node tree)) +;; (start (car node-data)) +;; (end (cadr node-data)) +;; (edges (cdddr node-data)) +;; (node-num (format "(%d)" node)) +;; (node-label (format "%s-%s-%s" +;; prefix node-num +;; (substring str start end)))) +;; (prin1 node-label) +;; ;; print the first edge +;; (let* ((prefix-base (concat +;; prefix +;; (make-string (length node-label) 32))) +;; ;; TODO +;; )))) + + + + + + + + + + + +(provide 'suffix-tree) +;;; suffiex-tree.el ends here. + +;;; archive + +;; ;;;###autoload +;; (defvar st-root nil +;; "The root of the suffix tree.") + +;; ;;;###autoload +;; (defvar st-position nil +;; "The current position in the string.") + +;; ;;;###autoload +;; (defvar st-wait-for-link nil +;; "The node that is waiting to have a suffix link.") + +;; ;;;###autoload +;; (defvar st-remain nil +;; "The number of remaining suffixes to insert.") + +;; ;;;###autoload +;; (defvar st-active-node nil +;; "Where to insert a new suffix.") + +;; ;;;###autoload +;; (defvar st-active-edge-index nil +;; "The index of the active edge.") + +;; ;;;###autoload +;; (defvar st-active-length nil +;; "How far down is the active point down the active edge.") -- cgit v1.2.3-18-g5258