summaryrefslogtreecommitdiff
path: root/suffix tree/generalized-suffix-tree.el
diff options
context:
space:
mode:
Diffstat (limited to 'suffix tree/generalized-suffix-tree.el')
-rw-r--r--suffix tree/generalized-suffix-tree.el122
1 files changed, 83 insertions, 39 deletions
diff --git a/suffix tree/generalized-suffix-tree.el b/suffix tree/generalized-suffix-tree.el
index 227aea9..07d9b18 100644
--- a/suffix tree/generalized-suffix-tree.el
+++ b/suffix tree/generalized-suffix-tree.el
@@ -45,7 +45,9 @@ every number."
(defun gst-edge-length (node position num str-lens)
"Return the length of the edge into NODE.
See the comment above this function for the reason
-POSITION and NUM are here."
+POSITION and NUM are here.
+
+And STR-LENS is here so that we know the lengths of previous strings."
(let* ((node-active-num (car node))
(leafp (eq (car (cdr (cdr node))) 'infty))
(leaf-labels (cond (leafp (car (nthcdr 5 node))))))
@@ -83,10 +85,12 @@ LAST-ADDED is the number of elements already in the TREE."
;;;###autoload
(defun gst-add-leaf-label (tree leaf number start)
- "Add a label to LEAF."
+ "Add a label to LEAF in TREE.
+NUMBER and START represent the label."
(let* ((actual-node (gethash leaf tree))
(leaf-labels (cdr (cdr (cdr (cdr (cdr actual-node)))))))
- (cond ((consp leaf-labels)
+ (cond ((and (consp leaf-labels)
+ (not (eq (caaar leaf-labels) number)))
(setcar leaf-labels
(cons (list number start)
(car leaf-labels)))))))
@@ -106,7 +110,16 @@ This always returns NODE."
;;;###autoload
(defun gst-canonize (tree node position active-number active-edge-index active-length active-node
str-lens)
- "Walk down TREE to find the correct active point."
+ "Walk down TREE to find the correct active point.
+
+To be precise, we start from NODE, use POSITION, ACTIVE-NUMBER,
+and STR-LENS to calculate the length of the edge, and compare
+with ACTIVE-LENGTH. If it is less than that, then that means we
+shall continue walking down the tree, so we return t in the first
+element, while setting other variables to the correct values so
+that the caller of this function can use the updated values.
+
+ACTIVE-EDGE-INDEX is for the purpose of updating solely."
(let* ((actual-node (gethash node tree))
(node-edge-length (gst-edge-length actual-node position active-number
str-lens)))
@@ -128,23 +141,31 @@ This always returns NODE."
;;;###autoload
(defsubst gst-aref (strs num index)
- "Return the INDEX-th element in NUM-th element of STRS."
+ "Return the INDEX th element in NUM th element of STRS."
(aref (nth num strs) index))
;;;###autoload
(defun gst-extend-tree (tree last-added position remain
active-node active-number active-edge-index
active-length character strs num str-lens)
- "Extend a tree by CHARACTER in NUM-th string of STRS.
+ "Extend TREE by CHARACTER in NUM-th string of STRS.
The return value is
-(tree
+\(tree
last-added remain active-node
- active-number active-edge-index active-length)"
+ active-number active-edge-index active-length)
+
+Other parameters: LAST-ADDED, POSITION, REMAIN, ACTIVE-NODE,
+ACTIVE-NUMBER, ACTIVE-EDGE-INDEX, and ACTIVE-LENGTH have special
+meanings in the algorithm.
+
+STR-LENS can be supplied so that we don't have to calculate the
+lengths repeatedly."
(let* ((need-sl 0)
(remain (1+ remain))
- continue-p breakp)
+ continue-p breakp terminating-no-split)
(while (and (not breakp) (> remain 0))
(setq continue-p nil breakp nil)
+ (setq terminating-no-split nil)
(cond
((= active-length 0)
(setq active-edge-index position)
@@ -153,7 +174,7 @@ The return value is
(nxt (cond
(actual-node
(gethash (gst-aref strs active-number active-edge-index)
- (cadr (cdr (cdr (cdr actual-node)))))))))
+ (car (nthcdr 4 actual-node)))))))
(cond
((null nxt)
(let ((leaf (gst-new-node tree last-added num position nil
@@ -161,7 +182,7 @@ The return value is
(setq last-added leaf)
(puthash (gst-aref strs active-number active-edge-index)
leaf
- (cadr (cdr (cdr (cdr (gethash active-node tree))))))
+ (car (nthcdr 4 (gethash active-node tree))))
(setq need-sl (gst-add-suffix-link tree need-sl active-node)))
;; rule 2
)
@@ -188,14 +209,22 @@ The return value is
;; observation 1
(setq active-length (1+ active-length))
(setq need-sl (gst-add-suffix-link tree need-sl active-node))
- (setq breakp t)
- ;; add a label
+ ;; (setq breakp t)
+ ;; terminating symbol special handling
(cond
((eq character -1) ; terminating symbol
- (gst-add-leaf-label tree nxt num (- position remain -1))))))
+ (gst-add-leaf-label tree nxt num (- position remain -1))
+ (setq active-length (1- active-length))
+ ;; We don't want to split since this is a match. But
+ ;; we don't want to break since this is a
+ ;; terminating symbol. So we invent a new variable
+ ;; to do this.
+ (setq terminating-no-split t))
+ (t
+ (setq breakp t)))))
(cond
- (breakp)
- (t ;; splitting
+ ((or breakp terminating-no-split))
+ (t ; splitting
(let ((split (gst-new-node
tree last-added (car (gethash nxt tree)) (cadr (gethash nxt tree))
(+ (cadr (gethash nxt tree)) active-length))))
@@ -206,11 +235,14 @@ The return value is
(let ((leaf (gst-new-node tree last-added num position
nil (- position remain -1))))
(setq last-added leaf)
- (puthash character leaf
+ (puthash character
+ leaf
(cadddr (cdr (gethash split tree))))
(setcar (cdr (gethash nxt tree))
- (+ (cadr (gethash nxt tree))
- active-length))
+ (min
+ (+ (cadr (gethash nxt tree))
+ active-length)
+ (1- (nth (car (gethash nxt tree)) str-lens))))
(puthash (gst-aref strs
(car (gethash nxt tree))
(cadr (gethash nxt tree)))
@@ -258,9 +290,9 @@ lengths again."
(active-number index)
(active-edge-index 0)
(active-length 0)
- old-character result) ; temporary holder
+ result) ; temporary holder
(while character
- (setq old-character character)
+ ;; (setq old-character character)
(setq result (gst-extend-tree tree last-added position remain
active-node active-number active-edge-index
active-length character strs index
@@ -274,22 +306,24 @@ lengths again."
(setq active-length (pop result))
(setq position (1+ position))
(setq character (ignore-errors (gst-aref strs index position)))
- (cond
- ((characterp old-character)
- (insert (format "After adding character %c:\n" old-character))
- (gst-print-tree tree
- (append (seq-take strs index)
- (list (concat
- (seq-take (nth index strs) position)
- "$"))))
- (insert "\n\n"))
- ((= old-character -1)
- (insert (format "After adding character -1:\n"))
- (gst-print-tree tree
- (seq-take strs (1+ index)))
- (insert "\n\n")))))
+ ;; (cond
+ ;; ((characterp old-character)
+ ;; (insert (format "After adding character %c:\n" old-character))
+ ;; (gst-print-tree tree
+ ;; (append (seq-take strs index)
+ ;; (list (concat
+ ;; (seq-take (nth index strs) position)
+ ;; "$"))))
+ ;; (insert "\n\n"))
+ ;; ((= old-character -1)
+ ;; (insert (format "After adding character -1:\n"))
+ ;; (gst-print-tree tree
+ ;; (seq-take strs (1+ index)))
+ ;; (insert "\n\n")))
+ ))
(setq index (1+ index))
- (insert "\n \n"))
+ ;; (insert "\n \n")
+ )
tree))
;;; Printing
@@ -312,7 +346,10 @@ lengths again."
;;;###autoload
(defun gst-pretty-hash-table (table)
- "Only returns the useful parts from a table."
+ "Only return the useful parts from TABLE.
+
+To be precise, this returns the keys and values of the
+hash-table, in the form of an alist."
(let (keys-values-alist)
(maphash
(lambda (key value)
@@ -353,7 +390,13 @@ lengths again."
;;;###autoload
(defun gst-print-tree (tree strs)
"Print TREE with the aid of STRS."
- (let* ((symbol-tree (make-symbol "new-hierarchy")))
+ (let* ((symbol-tree (make-symbol "new-hierarchy"))
+ (strs (mapcar (lambda (str)
+ (mapcar (lambda (c)
+ (cond ((eq c -1) ?$)
+ (c)))
+ str))
+ strs)))
(set symbol-tree (hierarchy-new))
(maphash
(lambda (key value)
@@ -395,4 +438,5 @@ lengths again."
(t actual-str)))))))))
(provide 'generalized-suffix-tree)
-;;; generalized-suffiex-tree.el ends here
+
+;;; generalized-suffix-tree.el ends here