diff options
author | JSDurand <mmemmew@gmail.com> | 2021-01-13 13:01:34 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2021-01-13 13:01:34 +0800 |
commit | 3666deaed5b0baf0a74f14db5872105c9e7865f9 (patch) | |
tree | 3535c3f57ed9d5b1cd4e3e81831f627840b6e81b /suffix tree/suffix-tree.el | |
parent | 1700588e1a3cfb5fa45fb64393c68782bc35fc38 (diff) |
A temporary intermeidate step
Now I got almost every functionality that we need, including pdf,
mu4e, magit, et cetera.
Diffstat (limited to 'suffix tree/suffix-tree.el')
-rw-r--r-- | suffix tree/suffix-tree.el | 397 |
1 files changed, 397 insertions, 0 deletions
diff --git a/suffix tree/suffix-tree.el b/suffix tree/suffix-tree.el new file mode 100644 index 0000000..0d929d0 --- /dev/null +++ b/suffix tree/suffix-tree.el @@ -0,0 +1,397 @@ +;;; suffiex-tree.el --- Ukkonen algorithm for building a suffix tree -*- lexical-binding: t; -*- + +;;; Author: Durand +;;; Version: 0.0.1 + +;;; Commentary: + +;; Our node is represented as a list of the following elements: + +;; start , which is the starting index of the edge going from its parent +;; node to this node +;; end , the index of the end of the above edge +;; suffix-link, the index of the node this node's suffix-link points to +;; children , a hash-table of pairs of integers and indices of its +;; children + +;; To compute the length of the edge going into NODE we use: +;; (- (min end (1+ position)) start) +;; which is actually how far the position is on that edge, +;; if it is on that edge. + +;;; Code: + +;;;###autoload +(defun st-min (&rest args) + "Return the minimum among ARGS. +If an argument is 'infty, then it is considered greater than +every number." + (apply #'min (delq nil + (mapcar + (lambda (arg) + (cond + ((number-or-marker-p arg) arg) + ((eq arg 'infty) nil))) + args)))) + +;;;###autoload +(defun st-edge-length (node position) + "Return the length of the edge into NODE. +See the comment above this function for the reason +POSITION is here." + (- (st-min (car (cdr node)) (1+ position)) + (car node))) + +;;;###autoload +(defun st-new-node (tree last-added start &optional end) + "Make a new node with START and END as the coming edge. +Then add the new node to TREE. +LAST-ADDED is the number of elements already in the TREE." + (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf + ) + (suffix-link 0) ;; The suffix link is initially 0 + (new-node + (list start end + suffix-link + (make-hash-table)))) + (puthash (1+ last-added) new-node tree) + (1+ last-added))) + +;;;###autoload +(defun st-add-suffix-link (tree need-sl node) + "If NEED-SL is positive, then add the suffix link. +In particular, the node corresponding to NEED-SL in TREE +gets a suffix link pointing to NODE. + +This always returns NODE." + (cond + ((and (> need-sl 1) (/= need-sl node)) + (setcar (cdr (cdr (gethash need-sl tree))) node))) + node) + +;;;###autoload +(defun st-canonize (tree node position active-edge-index active-length active-node) + "Walk down TREE to find the correct active point." + (let ((node-edge-length (st-edge-length (gethash node tree) position))) + (cond + ((>= active-length node-edge-length) + (list t + (+ active-edge-index + node-edge-length) + (- active-length + node-edge-length) + node)) + (t + (list nil + active-edge-index + active-length + active-node))))) + +;;;###autoload +(defun st-extend-tree (tree last-added position remain + active-node active-edge-index + active-length character str) + "Extend a tree by CHARACTER. +The return value is +(tree last-added remain active-node + active-edge-index active-length)" + (let* ((need-sl 0) + (remain (1+ remain)) + continue-p breakp) + (while (and (not breakp) (> remain 0)) + (setq continue-p nil breakp nil) + (cond + ((= active-length 0) + (setq active-edge-index position))) + (let* ((actual-node (gethash active-node tree)) + (nxt (cond + (actual-node + (gethash (aref str active-edge-index) + (cadr (cdr (cdr actual-node)))))))) + (cond + ((null nxt) + (let ((leaf (st-new-node tree last-added position))) + (setq last-added leaf) + (puthash (aref str active-edge-index) + leaf + (cadr (cdr (cdr (gethash active-node tree))))) + (setq need-sl (st-add-suffix-link tree need-sl active-node))) + ;; rule 2 + ) + (t + (let* ((result (st-canonize + tree nxt position active-edge-index + active-length active-node))) + (cond + ((car result) + ;; observation 2 + (setq active-edge-index (cadr result)) + (setq active-length (caddr result)) + (setq active-node (cadr (cddr result))) + (setq continue-p t)) + (t + (cond + ((eq (aref str (+ active-length + (car (gethash nxt tree)))) + character) + ;; observation 1 + (setq active-length (1+ active-length)) + (setq need-sl (st-add-suffix-link tree need-sl active-node)) + (setq breakp t))) + (cond + (breakp) + (t ;; splitting + (let ((split (st-new-node + tree last-added (car (gethash nxt tree)) + (+ (car (gethash nxt tree)) active-length)))) + (setq last-added split) + (puthash + (aref str active-edge-index) + split (cadr (cdr (cdr (gethash active-node tree))))) + (let ((leaf (st-new-node tree last-added position))) + (setq last-added leaf) + (puthash character leaf + (cadddr (gethash split tree))) + (setcar (gethash nxt tree) + (+ (car (gethash nxt tree)) + active-length)) + (puthash (aref str (car (gethash nxt tree))) nxt + (cadddr (gethash split tree))) + ;; rule 2 + (setq need-sl + (st-add-suffix-link tree need-sl split))))))))))) + (cond + ((or continue-p breakp)) + (t + (setq remain (1- remain)) + (cond + ((and (eq active-node 1) ; root + (> active-length 0)) + (setq active-length (1- active-length)) + (setq active-edge-index + (1+ (- position remain)))) + (t + (setq active-node + (let ((slink (caddr (gethash active-node tree)))) + (cond + ((> slink 1) slink) + ; or root + (t 1)))))))))) + (list tree last-added remain active-node + active-edge-index active-length))) + +;;;###autoload +(defun st-build-for-str (str) + "Build the suffix tree for STR." + (let* ((position 0) + (character (ignore-error 'error (aref str position))) + (tree (make-hash-table)) + (last-added 0) + (remain 0) + (active-node (st-new-node tree last-added -1 -1)) + (active-edge-index 0) + (active-length 0) + result) + (setq last-added active-node) + (while character + (setq result (st-extend-tree tree last-added position remain + active-node active-edge-index + active-length character str)) + (setq tree (pop result)) + (setq last-added (pop result)) + (setq remain (pop result)) + (setq active-node (pop result)) + (setq active-edge-index (pop result)) + (setq active-length (pop result)) + (setq position (1+ position)) + (setq character (ignore-error 'error (aref str position)))) + tree)) + +;;; Some printing functions + +(require 'hierarchy) + +;;;###autoload +(defun st-print-str (str) + "Print generalized string" + (mapc (lambda (char) + (cond ((characterp char) (insert char)) + (t (insert ", ")))) + str)) + +;;;###autoload +(defun st-print-tree-for-str (str) + "Print the suffix tree for STR." + (let* ((symbol (make-symbol "test"))) + (set symbol (st-build-for-str str)) + (insert "suffix tree for strings: ") + (st-print-str str) + (insert "\n") + (st-print-tree (symbol-value symbol) str))) + +;;;###autoload +(defun st-reduce-index (node strs) + "Reduce the second element of the NODE." + (let* ((lengths (mapcar #'length strs)) + (start (car node)) + (end (cadr node)) + (first-greater-than-start + (let ((sum -1) (index 0) done result) + (while (and (not done) (< index (length strs))) + (cond + ((<= (+ sum (nth index lengths)) start) + (setq index (1+ index)) + (setq sum (+ sum (nth index lengths)))) + (t (setq result (+ sum (nth index lengths))) + (setq done t)))) + (cond ((null result) (setq result (1- (apply #'+ lengths))))) + result))) + (st-min end first-greater-than-start)) + ;; (let* ((start (car node)) + ;; (end (cadr node)) + ;; (index start) + ;; result stop) + ;; (while (and (not stop) (or (eq end 'infty) (<= index end))) + ;; (cond + ;; ((and (< index (length str)) (characterp (aref str index))) (setq index (1+ index))) + ;; (t (setq stop t) + ;; (setq result index)))) + ;; (cond ((and (numberp end) (= index (1+ end))) (setq result end))) + ;; (cond ((<= result start) (setq result 'end))) + ;; result) + ) + +;; ;;;###autoload +;; (defun st-build-generalized-tree (strs) +;; (let* ((copy strs) +;; (long-str +;; (let ((index 1) result) +;; (while (consp copy) +;; (setq result (vconcat result +;; (car copy) +;; (vector (- index)))) +;; (setq index (1+ index)) +;; (setq copy (cdr copy))) +;; result)) +;; (tree (make-symbol "tree"))) +;; (set tree (st-build-for-str long-str)))) + +;;;###autoload +(defun st-print-tree (tree str) + "Print TREE with the aid of STR." + (let* ((symbol-tree (make-symbol "test-tree"))) + (set symbol-tree (hierarchy-new)) + (maphash + (lambda (key value) + (hierarchy-add-tree + (symbol-value symbol-tree) key nil + (lambda (item) + (hash-table-values (cadr (cdr (cdr (gethash item tree)))))))) + tree) + (hierarchy-print (symbol-value symbol-tree) + (lambda (item) + (cond ((= item 1) "root") + (t (let* ((node (gethash item tree)) + ;; (reduced (st-reduce-index node str)) + ) + (substring str (car node) + (cond ((integerp (cadr node)) (cadr node)))) + ;; (cond ((eq reduced 'end) "$") + ;; (t (apply #'string + ;; (append + ;; (seq-subseq str (car node) + ;; reduced + ;; ;; (cond ((integerp (cadr node)) (cadr node))) + ;; ) + ;; nil)))) + + ;; (format "%d): (%S, %S): (%d): \"%s\"" + ;; item + ;; (car node) + ;; (cadr node) + ;; (caddr node) + ;; (substring str (car node) (cond ((integerp (cadr node)) (cadr node)))) + ;; ) + ))))))) + +(provide 'suffix-tree) +;;; suffiex-tree.el ends here + +;;; archive + +;; ;;;###autoload +;; (defvar st-root nil +;; "The root of the suffix tree.") + +;; ;;;###autoload +;; (defvar st-position nil +;; "The current position in the string.") + +;; ;;;###autoload +;; (defvar st-wait-for-link nil +;; "The node that is waiting to have a suffix link.") + +;; ;;;###autoload +;; (defvar st-remain nil +;; "The number of remaining suffixes to insert.") + +;; ;;;###autoload +;; (defvar st-active-node nil +;; "Where to insert a new suffix.") + +;; ;;;###autoload +;; (defvar st-active-edge-index nil +;; "The index of the active edge.") + +;; ;;;###autoload +;; (defvar st-active-length nil +;; "How far down is the active point down the active edge.") + +;; (insert (format "after character %c, the tree becomes\n" character)) +;; (insert (format "string is %s\n" (substring str 0 (1+ position)))) +;; (insert (format "active-node: %d\n" active-node)) +;; (insert (format "active-edge: %c\n" (aref str active-edge-index))) +;; (insert (format "active-length: %d\n" active-length)) +;; (cond ((eq last-added 11) +;; (insert (format "slink: %d\n" (caddr (gethash active-node tree)))) +;; (insert (format "con: %S" continue-p)) +;; (insert (format "breakp: %S\n" breakp)))) +;; (st-print-tree tree (substring str 0 (1+ position))) +;; (insert "\n\n") + + +;; ;;;###autoload +;; (defvar st-output-buffer-name "*suffix-tree*" +;; "The name of the buffer that contains the output.") + +;; ;;;###autoload +;; (defun st-print-tree (tree node str prefix) +;; "Print TREE in the dedicated output buffer starting at NODE. +;; PREFIX is used to recursively call this function. + +;; We need STR since the information we stored in the tree +;; only contains the index into STR, +;; for the sake of optimizations." +;; (with-temp-buffer-window st-output-buffer-name '((display-buffer-at-bottom)) nil +;; (st-print-tree-internal tree node str prefix))) + +;; ;;;###autoload +;; (defun st-print-tree-internal (tree node str prefix) +;; "The function that really prints out the data." +;; (let* ((node-data (gethash node tree)) +;; (start (car node-data)) +;; (end (cadr node-data)) +;; (edges (cdddr node-data)) +;; (node-num (format "(%d)" node)) +;; (node-label (format "%s-%s-%s" +;; prefix node-num +;; (substring str start end)))) +;; (prin1 node-label) +;; ;; print the first edge +;; (let* ((prefix-base (concat +;; prefix +;; (make-string (length node-label) 32))) +;; ;; TODO +;; )))) + + |