summaryrefslogtreecommitdiff
path: root/suffix tree/generalized-suffix-tree.el
diff options
context:
space:
mode:
Diffstat (limited to 'suffix tree/generalized-suffix-tree.el')
-rw-r--r--suffix tree/generalized-suffix-tree.el465
1 files changed, 0 insertions, 465 deletions
diff --git a/suffix tree/generalized-suffix-tree.el b/suffix tree/generalized-suffix-tree.el
deleted file mode 100644
index 0435614..0000000
--- a/suffix tree/generalized-suffix-tree.el
+++ /dev/null
@@ -1,465 +0,0 @@
-;;; generalized-suffiex-tree.el --- Building a generalized suffix tree -*- lexical-binding: t; -*-
-
-;;; Author: Durand
-;;; Version: 0.0.2
-
-;;; Commentary:
-
-;; Our node is represented as a list of the following elements:
-
-;; number , from which string this edge comes
-;; start , which is the starting index of the edge going from its parent
-;; node to this node
-;; end , the index of the end of the above edge
-;; suffix-link, the index of the node this node's suffix-link points to
-;; children , a hash-table of pairs of integers and indices of its
-;; children
-;; leaf-labels, a list of pairs (number start), where
-;; number means the number-th string and
-;; start is the starting position of this edge.
-;; This element is nil for the internal nodes.
-
-;; To compute the length of the edge going into NODE we use:
-;; (- (min end (1+ position)) start)
-;; which is actually how far the position is on that edge,
-;; if it is on that edge.
-
-;; We use only one terminating symbol, and that is -1.
-
-;;; Code:
-
-;;;###autoload
-(defun gst-min (&rest args)
- "Return the minimum among ARGS.
-If an argument is 'infty, then it is considered greater than
-every number."
- (apply #'min (delq nil
- (mapcar
- (lambda (arg)
- (cond
- ((number-or-marker-p arg) arg)
- ((eq arg 'infty) nil)))
- args))))
-
-;;;###autoload
-(defun gst-edge-length (node position num str-lens)
- "Return the length of the edge into NODE.
-See the comment above this function for the reason
-POSITION and NUM are here.
-
-And STR-LENS is here so that we know the lengths of previous strings."
- (let* ((node-active-num (car node))
- (leafp (eq (car (cdr (cdr node))) 'infty))
- (leaf-labels (cond (leafp (car (nthcdr 5 node))))))
- (- (cond
- ((and (/= node-active-num num)
- leafp)
- (nth node-active-num str-lens))
- ((/= node-active-num num)
- (caddr node))
- (t (gst-min (caddr node) (1+ position))))
- (cond
- ((and (/= node-active-num num)
- leafp)
- (or (cadr (assoc num leaf-labels #'eq))
- (cadr node)))
- (t (cadr node))))))
-
-;;;###autoload
-(defun gst-new-node (tree last-added number start &optional end suffix-start)
- "Make a new node with START and END as the coming edge.
-NUMBER is the number of string the label belongs to.
-Then add the new node to TREE.
-LAST-ADDED is the number of elements already in the TREE."
- (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf
- )
- (suffix-link 0) ;; The suffix link is initially 0
- (new-node
- (cond ((eq end 'infty)
- (list number start end suffix-link (make-hash-table)
- (list (list number suffix-start))))
- (t
- (list number start end suffix-link (make-hash-table))))))
- (puthash (1+ last-added) new-node tree)
- (1+ last-added)))
-
-;;;###autoload
-(defun gst-add-leaf-label (tree leaf number start)
- "Add a label to LEAF in TREE.
-NUMBER and START represent the label."
- (let* ((actual-node (gethash leaf tree))
- (leaf-labels (cdr (cdr (cdr (cdr (cdr actual-node)))))))
- (cond ((and (consp leaf-labels)
- (not (eq (caaar leaf-labels) number)))
- (setcar leaf-labels
- (cons (list number start)
- (car leaf-labels)))))))
-
-;;;###autoload
-(defun gst-add-suffix-link (tree need-sl node)
- "If NEED-SL is positive, then add the suffix link.
-In particular, the node corresponding to NEED-SL in TREE
-gets a suffix link pointing to NODE.
-
-This always returns NODE."
- (let ((suffix-link-cdr (nthcdr 3 (gethash need-sl tree))))
- (cond
- ((and (> need-sl 1) (/= need-sl node)
- (= (car suffix-link-cdr) 0))
- (setcar suffix-link-cdr node))))
- node)
-
-;;;###autoload
-(defun gst-canonize (tree node position active-number active-edge-index active-length active-node
- str-lens)
- "Walk down TREE to find the correct active point.
-
-To be precise, we start from NODE, use POSITION, ACTIVE-NUMBER,
-and STR-LENS to calculate the length of the edge, and compare
-with ACTIVE-LENGTH. If it is less than that, then that means we
-shall continue walking down the tree, so we return t in the first
-element, while setting other variables to the correct values so
-that the caller of this function can use the updated values.
-
-ACTIVE-EDGE-INDEX is present solely for the purpose of updating."
- (let* ((actual-node (gethash node tree))
- (node-edge-length (gst-edge-length actual-node position active-number
- str-lens)))
- (cond
- ((>= active-length node-edge-length)
- (list t
- active-number
- (+ active-edge-index
- node-edge-length)
- (- active-length
- node-edge-length)
- node))
- (t
- (list nil
- active-number
- active-edge-index
- active-length
- active-node)))))
-
-;;;###autoload
-(defsubst gst-aref (strs num index)
- "Return the INDEX th element in NUM th element of STRS."
- (aref (nth num strs) index))
-
-;;;###autoload
-(defun gst-extend-tree (tree last-added position remain
- active-node active-number active-edge-index
- active-length character strs num str-lens)
- "Extend TREE by CHARACTER in NUM-th string of STRS.
-The return value is
-\(tree
- last-added remain active-node
- active-number active-edge-index active-length)
-
-Other parameters: LAST-ADDED, POSITION, REMAIN, ACTIVE-NODE,
-ACTIVE-NUMBER, ACTIVE-EDGE-INDEX, and ACTIVE-LENGTH have special
-meanings in the algorithm.
-
-STR-LENS can be supplied so that we don't have to calculate the
-lengths repeatedly."
- (let* ((need-sl 0)
- (remain (1+ remain))
- continue-p breakp terminating-no-split)
- (while (and (not breakp) (> remain 0))
- (setq continue-p nil breakp nil)
- (setq terminating-no-split nil)
- (cond
- ((= active-length 0)
- (setq active-edge-index position)
- (setq active-number num)))
- (let* ((actual-node (gethash active-node tree))
- (nxt (cond
- (actual-node
- (gethash (gst-aref strs active-number active-edge-index)
- (car (nthcdr 4 actual-node)))))))
- (cond
- ((null nxt)
- ;; We don't want many terminating characters branches.
- (cond
- ((and (eq character -1)
- (eq (caddr actual-node) 'infty)
- (eq (gst-aref strs (car actual-node)
- (cadr actual-node))
- -1)))
- (t
- (let ((leaf (gst-new-node tree last-added num position nil
- (- position remain -1))))
- (setq last-added leaf)
- (puthash (gst-aref strs active-number active-edge-index)
- leaf
- (car (nthcdr 4 (gethash active-node tree))))
- (setq need-sl (gst-add-suffix-link tree need-sl active-node)))))
- ;; rule 2
- )
- (t
- (let* ((result (gst-canonize
- tree nxt position active-number
- active-edge-index active-length active-node
- str-lens)))
- (cond
- ((car result)
- ;; observation 2
- (setq active-number (car (cdr result)))
- (setq active-edge-index (car (cdr (cdr result))))
- (setq active-length (car (cdr (cdr (cdr result)))))
- (setq active-node (car (cdr (cdr (cdr (cdr result))))))
- (setq continue-p t))
- (t
- (cond
- ((eq (gst-aref strs
- (car (gethash nxt tree))
- (+ active-length
- (cadr (gethash nxt tree))))
- character)
- ;; observation 1
- (setq active-length (1+ active-length))
- (setq need-sl (gst-add-suffix-link tree need-sl active-node))
- ;; (setq breakp t)
- ;; terminating symbol special handling
- (cond
- ((eq character -1) ; terminating symbol
- (gst-add-leaf-label tree nxt num (- position remain -1))
- (setq active-length (1- active-length))
- ;; We don't want to split since this is a match. But
- ;; we don't want to break since this is a
- ;; terminating symbol. So we invent a new variable
- ;; to do this.
- (setq terminating-no-split t)
- ;; But if remain = 1 then we do want to break.
- (cond
- ((eq remain 1)
- (setq breakp t)))
- )
- (t
- (setq breakp t)))))
- (cond
- ((or breakp terminating-no-split))
- (t ; splitting
- (let ((split (gst-new-node
- tree last-added (car (gethash nxt tree)) (cadr (gethash nxt tree))
- (+ (cadr (gethash nxt tree)) active-length))))
- (setq last-added split)
- (puthash
- (gst-aref strs active-number active-edge-index)
- split (cadr (cdr (cdr (cdr (gethash active-node tree))))))
- (let ((leaf (gst-new-node tree last-added num position
- nil (- position remain -1))))
- (setq last-added leaf)
- (puthash character
- leaf
- (cadddr (cdr (gethash split tree))))
- (setcar (cdr (gethash nxt tree))
- (min
- (+ (cadr (gethash nxt tree))
- active-length)
- (1- (nth (car (gethash nxt tree)) str-lens))))
- (puthash (gst-aref strs
- (car (gethash nxt tree))
- (cadr (gethash nxt tree)))
- nxt
- (cadddr (cdr (gethash split tree))))
- ;; rule 2
- (setq need-sl
- (gst-add-suffix-link tree need-sl split)))))))))))
- (cond
- ((or continue-p breakp))
- (t
- (setq remain (1- remain))
- (cond
- ((and (eq active-node 1) ; root
- (> active-length 0))
- (setq active-length (1- active-length))
- (setq active-edge-index
- (1+ (- position remain))))
- (t
- (setq active-node
- (let ((slink (cadddr (gethash active-node tree))))
- (cond ; follow the suffix link or go to root
- ((> slink 1) slink)
- (t 1)))))))))
- ;; (cond ((eq character -1)
- ;; (insert (format "Extending terminating character\n"))
- ;; (insert "strs: " (format "%S" strs) "\n")
- ;; (lcs-debug remain active-node active-edge-index
- ;; active-number active-length num
- ;; continue-p breakp terminating-no-split)
- ;; (gst-print-tree tree strs)))
- )
- (list tree last-added remain active-node
- active-number active-edge-index active-length)))
-
-;;;###autoload
-(defun gst-build-for-strs (strs &optional str-lens)
- "Build the generalized suffix tree for STRS.
-One can optionally provide STR-LENS to avoid calculating the
-lengths again."
- (let* ((len (length strs))
- (index 0)
- (tree (make-hash-table))
- (last-added 0)
- (active-node (gst-new-node tree last-added index -1 -1))
- (str-lens (or str-lens (mapcar #'length strs))))
- (setq last-added active-node)
- (while (< index len)
- (let* ((position 0)
- (character (gst-aref strs index position))
- (remain 0)
- (active-node 1) ; start from the root
- (active-number index) ; root comes
- (active-edge-index 0)
- (active-length 0)
- result) ; temporary holder
- (while (and (< position (nth index str-lens)) character)
- ;; (setq old-character character)
- (setq result (gst-extend-tree tree last-added position remain
- active-node active-number active-edge-index
- active-length character strs index
- str-lens))
- (setq tree (pop result))
- (setq last-added (pop result))
- (setq remain (pop result))
- (setq active-node (pop result))
- (setq active-number (pop result))
- (setq active-edge-index (pop result))
- (setq active-length (pop result))
- (setq position (1+ position))
- (setq character (ignore-errors (gst-aref strs index position)))
- ;; (cond
- ;; ((characterp old-character)
- ;; (insert (format "After adding character %c:\n" old-character))
- ;; (gst-print-tree tree
- ;; (append (seq-take strs index)
- ;; (list (concat
- ;; (seq-take (nth index strs) position)
- ;; "$"))))
- ;; (insert "\n\n"))
- ;; ((= old-character -1)
- ;; (insert (format "After adding character -1:\n"))
- ;; (gst-print-tree tree
- ;; (seq-take strs (1+ index)))
- ;; (insert "\n\n")))
- ))
- (setq index (1+ index))
- ;; (insert "\n \n")
- )
- tree))
-
-;;; Printing
-
-;; This section is not necessary. It is here just to print the trees
-;; to make sure we don't make some strange errors.
-
-(require 'hierarchy)
-(require 'seq)
-
-;;;###autoload
-(defun gst-print-strs (strs)
- "Print strings STRS nicely."
- (mapc (lambda (str)
- (mapc (lambda (char)
- (cond ((characterp char) (insert char))
- (t (insert ", "))))
- str))
- strs))
-
-;;;###autoload
-(defun gst-pretty-hash-table (table)
- "Only return the useful parts from TABLE.
-
-To be precise, this returns the keys and values of the
-hash-table, in the form of an alist."
- (let (keys-values-alist)
- (maphash
- (lambda (key value)
- (setq keys-values-alist
- (cons
- (cons key value)
- keys-values-alist)))
- table)
- keys-values-alist))
-
-;;;###autoload
-(defun gst-pretty-node (node)
- "Outputs a prettified NODE."
- (format "(%d %d %s %d %s %S)"
- (car node) (cadr node)
- (caddr node) (cadddr node)
- (gst-pretty-hash-table (car (nthcdr 4 node)))
- (car (nthcdr 5 node))))
-
-;;;###autoload
-(defun gst-print-tree-for-strs (strs)
- "Print the generalized suffix tree for STRS."
- (let* ((strs (mapcar (lambda (str)
- (vconcat str (list -1)))
- strs))
- (symbol (make-symbol "gtree")))
- (set symbol (gst-build-for-strs strs))
- (insert "Generalized suffix tree for: ")
- (gst-print-strs strs)
- (delete-region (- (point) 2) (point))
- (insert ":\n")
- (gst-print-tree (symbol-value symbol) strs)))
-
-;;;###autoload
-(defvar gst-full-error-p nil
- "Whether to print full nodes in the output.")
-
-;;;###autoload
-(defun gst-print-tree (tree strs)
- "Print TREE with the aid of STRS."
- (let* ((symbol-tree (make-symbol "new-hierarchy"))
- (strs (mapcar (lambda (str)
- (mapcar (lambda (c)
- (cond ((eq c -1) ?$)
- (c)))
- str))
- strs)))
- (set symbol-tree (hierarchy-new))
- (maphash
- (lambda (key value)
- (hierarchy-add-tree
- (symbol-value symbol-tree) key nil
- (lambda (item)
- (hash-table-values (car (nthcdr 4 (gethash item tree)))))))
- tree)
- (hierarchy-print
- (symbol-value symbol-tree)
- (lambda (item)
- (cond ((= item 1) "root")
- (t (let ((actual-str
- (let* ((node (gethash item tree))
- (leafp (eq (caddr node) 'infty))
- (leaf-labels
- (cond (leafp (car (nthcdr 5 node))))))
- (concat
- (seq-subseq
- (nth (car node) strs)
- (min (cadr node) (1- (length (nth (car node) strs))))
- (cond (leafp -1)
- (t (caddr node))))
- (cond
- (leafp
- (apply
- #'concat
- (mapcar
- (lambda (label)
- (let ((first (car label))
- (second (cadr label)))
- (format "$ (%d : %d)"
- first second)))
- leaf-labels))))))))
- (cond (gst-full-error-p
- (concat
- (format "key: %d, %s, %s" item actual-str
- (gst-pretty-node (gethash item tree)))))
- (t actual-str)))))))))
-
-(provide 'generalized-suffix-tree)
-
-;;; generalized-suffix-tree.el ends here