diff options
Diffstat (limited to 'suffix tree/generalized-suffix-tree.el')
-rw-r--r-- | suffix tree/generalized-suffix-tree.el | 465 |
1 files changed, 0 insertions, 465 deletions
diff --git a/suffix tree/generalized-suffix-tree.el b/suffix tree/generalized-suffix-tree.el deleted file mode 100644 index 0435614..0000000 --- a/suffix tree/generalized-suffix-tree.el +++ /dev/null @@ -1,465 +0,0 @@ -;;; generalized-suffiex-tree.el --- Building a generalized suffix tree -*- lexical-binding: t; -*- - -;;; Author: Durand -;;; Version: 0.0.2 - -;;; Commentary: - -;; Our node is represented as a list of the following elements: - -;; number , from which string this edge comes -;; start , which is the starting index of the edge going from its parent -;; node to this node -;; end , the index of the end of the above edge -;; suffix-link, the index of the node this node's suffix-link points to -;; children , a hash-table of pairs of integers and indices of its -;; children -;; leaf-labels, a list of pairs (number start), where -;; number means the number-th string and -;; start is the starting position of this edge. -;; This element is nil for the internal nodes. - -;; To compute the length of the edge going into NODE we use: -;; (- (min end (1+ position)) start) -;; which is actually how far the position is on that edge, -;; if it is on that edge. - -;; We use only one terminating symbol, and that is -1. - -;;; Code: - -;;;###autoload -(defun gst-min (&rest args) - "Return the minimum among ARGS. -If an argument is 'infty, then it is considered greater than -every number." - (apply #'min (delq nil - (mapcar - (lambda (arg) - (cond - ((number-or-marker-p arg) arg) - ((eq arg 'infty) nil))) - args)))) - -;;;###autoload -(defun gst-edge-length (node position num str-lens) - "Return the length of the edge into NODE. -See the comment above this function for the reason -POSITION and NUM are here. - -And STR-LENS is here so that we know the lengths of previous strings." - (let* ((node-active-num (car node)) - (leafp (eq (car (cdr (cdr node))) 'infty)) - (leaf-labels (cond (leafp (car (nthcdr 5 node)))))) - (- (cond - ((and (/= node-active-num num) - leafp) - (nth node-active-num str-lens)) - ((/= node-active-num num) - (caddr node)) - (t (gst-min (caddr node) (1+ position)))) - (cond - ((and (/= node-active-num num) - leafp) - (or (cadr (assoc num leaf-labels #'eq)) - (cadr node))) - (t (cadr node)))))) - -;;;###autoload -(defun gst-new-node (tree last-added number start &optional end suffix-start) - "Make a new node with START and END as the coming edge. -NUMBER is the number of string the label belongs to. -Then add the new node to TREE. -LAST-ADDED is the number of elements already in the TREE." - (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf - ) - (suffix-link 0) ;; The suffix link is initially 0 - (new-node - (cond ((eq end 'infty) - (list number start end suffix-link (make-hash-table) - (list (list number suffix-start)))) - (t - (list number start end suffix-link (make-hash-table)))))) - (puthash (1+ last-added) new-node tree) - (1+ last-added))) - -;;;###autoload -(defun gst-add-leaf-label (tree leaf number start) - "Add a label to LEAF in TREE. -NUMBER and START represent the label." - (let* ((actual-node (gethash leaf tree)) - (leaf-labels (cdr (cdr (cdr (cdr (cdr actual-node))))))) - (cond ((and (consp leaf-labels) - (not (eq (caaar leaf-labels) number))) - (setcar leaf-labels - (cons (list number start) - (car leaf-labels))))))) - -;;;###autoload -(defun gst-add-suffix-link (tree need-sl node) - "If NEED-SL is positive, then add the suffix link. -In particular, the node corresponding to NEED-SL in TREE -gets a suffix link pointing to NODE. - -This always returns NODE." - (let ((suffix-link-cdr (nthcdr 3 (gethash need-sl tree)))) - (cond - ((and (> need-sl 1) (/= need-sl node) - (= (car suffix-link-cdr) 0)) - (setcar suffix-link-cdr node)))) - node) - -;;;###autoload -(defun gst-canonize (tree node position active-number active-edge-index active-length active-node - str-lens) - "Walk down TREE to find the correct active point. - -To be precise, we start from NODE, use POSITION, ACTIVE-NUMBER, -and STR-LENS to calculate the length of the edge, and compare -with ACTIVE-LENGTH. If it is less than that, then that means we -shall continue walking down the tree, so we return t in the first -element, while setting other variables to the correct values so -that the caller of this function can use the updated values. - -ACTIVE-EDGE-INDEX is present solely for the purpose of updating." - (let* ((actual-node (gethash node tree)) - (node-edge-length (gst-edge-length actual-node position active-number - str-lens))) - (cond - ((>= active-length node-edge-length) - (list t - active-number - (+ active-edge-index - node-edge-length) - (- active-length - node-edge-length) - node)) - (t - (list nil - active-number - active-edge-index - active-length - active-node))))) - -;;;###autoload -(defsubst gst-aref (strs num index) - "Return the INDEX th element in NUM th element of STRS." - (aref (nth num strs) index)) - -;;;###autoload -(defun gst-extend-tree (tree last-added position remain - active-node active-number active-edge-index - active-length character strs num str-lens) - "Extend TREE by CHARACTER in NUM-th string of STRS. -The return value is -\(tree - last-added remain active-node - active-number active-edge-index active-length) - -Other parameters: LAST-ADDED, POSITION, REMAIN, ACTIVE-NODE, -ACTIVE-NUMBER, ACTIVE-EDGE-INDEX, and ACTIVE-LENGTH have special -meanings in the algorithm. - -STR-LENS can be supplied so that we don't have to calculate the -lengths repeatedly." - (let* ((need-sl 0) - (remain (1+ remain)) - continue-p breakp terminating-no-split) - (while (and (not breakp) (> remain 0)) - (setq continue-p nil breakp nil) - (setq terminating-no-split nil) - (cond - ((= active-length 0) - (setq active-edge-index position) - (setq active-number num))) - (let* ((actual-node (gethash active-node tree)) - (nxt (cond - (actual-node - (gethash (gst-aref strs active-number active-edge-index) - (car (nthcdr 4 actual-node))))))) - (cond - ((null nxt) - ;; We don't want many terminating characters branches. - (cond - ((and (eq character -1) - (eq (caddr actual-node) 'infty) - (eq (gst-aref strs (car actual-node) - (cadr actual-node)) - -1))) - (t - (let ((leaf (gst-new-node tree last-added num position nil - (- position remain -1)))) - (setq last-added leaf) - (puthash (gst-aref strs active-number active-edge-index) - leaf - (car (nthcdr 4 (gethash active-node tree)))) - (setq need-sl (gst-add-suffix-link tree need-sl active-node))))) - ;; rule 2 - ) - (t - (let* ((result (gst-canonize - tree nxt position active-number - active-edge-index active-length active-node - str-lens))) - (cond - ((car result) - ;; observation 2 - (setq active-number (car (cdr result))) - (setq active-edge-index (car (cdr (cdr result)))) - (setq active-length (car (cdr (cdr (cdr result))))) - (setq active-node (car (cdr (cdr (cdr (cdr result)))))) - (setq continue-p t)) - (t - (cond - ((eq (gst-aref strs - (car (gethash nxt tree)) - (+ active-length - (cadr (gethash nxt tree)))) - character) - ;; observation 1 - (setq active-length (1+ active-length)) - (setq need-sl (gst-add-suffix-link tree need-sl active-node)) - ;; (setq breakp t) - ;; terminating symbol special handling - (cond - ((eq character -1) ; terminating symbol - (gst-add-leaf-label tree nxt num (- position remain -1)) - (setq active-length (1- active-length)) - ;; We don't want to split since this is a match. But - ;; we don't want to break since this is a - ;; terminating symbol. So we invent a new variable - ;; to do this. - (setq terminating-no-split t) - ;; But if remain = 1 then we do want to break. - (cond - ((eq remain 1) - (setq breakp t))) - ) - (t - (setq breakp t))))) - (cond - ((or breakp terminating-no-split)) - (t ; splitting - (let ((split (gst-new-node - tree last-added (car (gethash nxt tree)) (cadr (gethash nxt tree)) - (+ (cadr (gethash nxt tree)) active-length)))) - (setq last-added split) - (puthash - (gst-aref strs active-number active-edge-index) - split (cadr (cdr (cdr (cdr (gethash active-node tree)))))) - (let ((leaf (gst-new-node tree last-added num position - nil (- position remain -1)))) - (setq last-added leaf) - (puthash character - leaf - (cadddr (cdr (gethash split tree)))) - (setcar (cdr (gethash nxt tree)) - (min - (+ (cadr (gethash nxt tree)) - active-length) - (1- (nth (car (gethash nxt tree)) str-lens)))) - (puthash (gst-aref strs - (car (gethash nxt tree)) - (cadr (gethash nxt tree))) - nxt - (cadddr (cdr (gethash split tree)))) - ;; rule 2 - (setq need-sl - (gst-add-suffix-link tree need-sl split))))))))))) - (cond - ((or continue-p breakp)) - (t - (setq remain (1- remain)) - (cond - ((and (eq active-node 1) ; root - (> active-length 0)) - (setq active-length (1- active-length)) - (setq active-edge-index - (1+ (- position remain)))) - (t - (setq active-node - (let ((slink (cadddr (gethash active-node tree)))) - (cond ; follow the suffix link or go to root - ((> slink 1) slink) - (t 1))))))))) - ;; (cond ((eq character -1) - ;; (insert (format "Extending terminating character\n")) - ;; (insert "strs: " (format "%S" strs) "\n") - ;; (lcs-debug remain active-node active-edge-index - ;; active-number active-length num - ;; continue-p breakp terminating-no-split) - ;; (gst-print-tree tree strs))) - ) - (list tree last-added remain active-node - active-number active-edge-index active-length))) - -;;;###autoload -(defun gst-build-for-strs (strs &optional str-lens) - "Build the generalized suffix tree for STRS. -One can optionally provide STR-LENS to avoid calculating the -lengths again." - (let* ((len (length strs)) - (index 0) - (tree (make-hash-table)) - (last-added 0) - (active-node (gst-new-node tree last-added index -1 -1)) - (str-lens (or str-lens (mapcar #'length strs)))) - (setq last-added active-node) - (while (< index len) - (let* ((position 0) - (character (gst-aref strs index position)) - (remain 0) - (active-node 1) ; start from the root - (active-number index) ; root comes - (active-edge-index 0) - (active-length 0) - result) ; temporary holder - (while (and (< position (nth index str-lens)) character) - ;; (setq old-character character) - (setq result (gst-extend-tree tree last-added position remain - active-node active-number active-edge-index - active-length character strs index - str-lens)) - (setq tree (pop result)) - (setq last-added (pop result)) - (setq remain (pop result)) - (setq active-node (pop result)) - (setq active-number (pop result)) - (setq active-edge-index (pop result)) - (setq active-length (pop result)) - (setq position (1+ position)) - (setq character (ignore-errors (gst-aref strs index position))) - ;; (cond - ;; ((characterp old-character) - ;; (insert (format "After adding character %c:\n" old-character)) - ;; (gst-print-tree tree - ;; (append (seq-take strs index) - ;; (list (concat - ;; (seq-take (nth index strs) position) - ;; "$")))) - ;; (insert "\n\n")) - ;; ((= old-character -1) - ;; (insert (format "After adding character -1:\n")) - ;; (gst-print-tree tree - ;; (seq-take strs (1+ index))) - ;; (insert "\n\n"))) - )) - (setq index (1+ index)) - ;; (insert "\n\n") - ) - tree)) - -;;; Printing - -;; This section is not necessary. It is here just to print the trees -;; to make sure we don't make some strange errors. - -(require 'hierarchy) -(require 'seq) - -;;;###autoload -(defun gst-print-strs (strs) - "Print strings STRS nicely." - (mapc (lambda (str) - (mapc (lambda (char) - (cond ((characterp char) (insert char)) - (t (insert ", ")))) - str)) - strs)) - -;;;###autoload -(defun gst-pretty-hash-table (table) - "Only return the useful parts from TABLE. - -To be precise, this returns the keys and values of the -hash-table, in the form of an alist." - (let (keys-values-alist) - (maphash - (lambda (key value) - (setq keys-values-alist - (cons - (cons key value) - keys-values-alist))) - table) - keys-values-alist)) - -;;;###autoload -(defun gst-pretty-node (node) - "Outputs a prettified NODE." - (format "(%d %d %s %d %s %S)" - (car node) (cadr node) - (caddr node) (cadddr node) - (gst-pretty-hash-table (car (nthcdr 4 node))) - (car (nthcdr 5 node)))) - -;;;###autoload -(defun gst-print-tree-for-strs (strs) - "Print the generalized suffix tree for STRS." - (let* ((strs (mapcar (lambda (str) - (vconcat str (list -1))) - strs)) - (symbol (make-symbol "gtree"))) - (set symbol (gst-build-for-strs strs)) - (insert "Generalized suffix tree for: ") - (gst-print-strs strs) - (delete-region (- (point) 2) (point)) - (insert ":\n") - (gst-print-tree (symbol-value symbol) strs))) - -;;;###autoload -(defvar gst-full-error-p nil - "Whether to print full nodes in the output.") - -;;;###autoload -(defun gst-print-tree (tree strs) - "Print TREE with the aid of STRS." - (let* ((symbol-tree (make-symbol "new-hierarchy")) - (strs (mapcar (lambda (str) - (mapcar (lambda (c) - (cond ((eq c -1) ?$) - (c))) - str)) - strs))) - (set symbol-tree (hierarchy-new)) - (maphash - (lambda (key value) - (hierarchy-add-tree - (symbol-value symbol-tree) key nil - (lambda (item) - (hash-table-values (car (nthcdr 4 (gethash item tree))))))) - tree) - (hierarchy-print - (symbol-value symbol-tree) - (lambda (item) - (cond ((= item 1) "root") - (t (let ((actual-str - (let* ((node (gethash item tree)) - (leafp (eq (caddr node) 'infty)) - (leaf-labels - (cond (leafp (car (nthcdr 5 node)))))) - (concat - (seq-subseq - (nth (car node) strs) - (min (cadr node) (1- (length (nth (car node) strs)))) - (cond (leafp -1) - (t (caddr node)))) - (cond - (leafp - (apply - #'concat - (mapcar - (lambda (label) - (let ((first (car label)) - (second (cadr label))) - (format "$ (%d : %d)" - first second))) - leaf-labels)))))))) - (cond (gst-full-error-p - (concat - (format "key: %d, %s, %s" item actual-str - (gst-pretty-node (gethash item tree))))) - (t actual-str))))))))) - -(provide 'generalized-suffix-tree) - -;;; generalized-suffix-tree.el ends here |