diff options
Diffstat (limited to 'suffix tree/generalized-suffix-tree.el')
-rw-r--r-- | suffix tree/generalized-suffix-tree.el | 122 |
1 files changed, 83 insertions, 39 deletions
diff --git a/suffix tree/generalized-suffix-tree.el b/suffix tree/generalized-suffix-tree.el index 227aea9..07d9b18 100644 --- a/suffix tree/generalized-suffix-tree.el +++ b/suffix tree/generalized-suffix-tree.el @@ -45,7 +45,9 @@ every number." (defun gst-edge-length (node position num str-lens) "Return the length of the edge into NODE. See the comment above this function for the reason -POSITION and NUM are here." +POSITION and NUM are here. + +And STR-LENS is here so that we know the lengths of previous strings." (let* ((node-active-num (car node)) (leafp (eq (car (cdr (cdr node))) 'infty)) (leaf-labels (cond (leafp (car (nthcdr 5 node)))))) @@ -83,10 +85,12 @@ LAST-ADDED is the number of elements already in the TREE." ;;;###autoload (defun gst-add-leaf-label (tree leaf number start) - "Add a label to LEAF." + "Add a label to LEAF in TREE. +NUMBER and START represent the label." (let* ((actual-node (gethash leaf tree)) (leaf-labels (cdr (cdr (cdr (cdr (cdr actual-node))))))) - (cond ((consp leaf-labels) + (cond ((and (consp leaf-labels) + (not (eq (caaar leaf-labels) number))) (setcar leaf-labels (cons (list number start) (car leaf-labels))))))) @@ -106,7 +110,16 @@ This always returns NODE." ;;;###autoload (defun gst-canonize (tree node position active-number active-edge-index active-length active-node str-lens) - "Walk down TREE to find the correct active point." + "Walk down TREE to find the correct active point. + +To be precise, we start from NODE, use POSITION, ACTIVE-NUMBER, +and STR-LENS to calculate the length of the edge, and compare +with ACTIVE-LENGTH. If it is less than that, then that means we +shall continue walking down the tree, so we return t in the first +element, while setting other variables to the correct values so +that the caller of this function can use the updated values. + +ACTIVE-EDGE-INDEX is for the purpose of updating solely." (let* ((actual-node (gethash node tree)) (node-edge-length (gst-edge-length actual-node position active-number str-lens))) @@ -128,23 +141,31 @@ This always returns NODE." ;;;###autoload (defsubst gst-aref (strs num index) - "Return the INDEX-th element in NUM-th element of STRS." + "Return the INDEX th element in NUM th element of STRS." (aref (nth num strs) index)) ;;;###autoload (defun gst-extend-tree (tree last-added position remain active-node active-number active-edge-index active-length character strs num str-lens) - "Extend a tree by CHARACTER in NUM-th string of STRS. + "Extend TREE by CHARACTER in NUM-th string of STRS. The return value is -(tree +\(tree last-added remain active-node - active-number active-edge-index active-length)" + active-number active-edge-index active-length) + +Other parameters: LAST-ADDED, POSITION, REMAIN, ACTIVE-NODE, +ACTIVE-NUMBER, ACTIVE-EDGE-INDEX, and ACTIVE-LENGTH have special +meanings in the algorithm. + +STR-LENS can be supplied so that we don't have to calculate the +lengths repeatedly." (let* ((need-sl 0) (remain (1+ remain)) - continue-p breakp) + continue-p breakp terminating-no-split) (while (and (not breakp) (> remain 0)) (setq continue-p nil breakp nil) + (setq terminating-no-split nil) (cond ((= active-length 0) (setq active-edge-index position) @@ -153,7 +174,7 @@ The return value is (nxt (cond (actual-node (gethash (gst-aref strs active-number active-edge-index) - (cadr (cdr (cdr (cdr actual-node))))))))) + (car (nthcdr 4 actual-node))))))) (cond ((null nxt) (let ((leaf (gst-new-node tree last-added num position nil @@ -161,7 +182,7 @@ The return value is (setq last-added leaf) (puthash (gst-aref strs active-number active-edge-index) leaf - (cadr (cdr (cdr (cdr (gethash active-node tree)))))) + (car (nthcdr 4 (gethash active-node tree)))) (setq need-sl (gst-add-suffix-link tree need-sl active-node))) ;; rule 2 ) @@ -188,14 +209,22 @@ The return value is ;; observation 1 (setq active-length (1+ active-length)) (setq need-sl (gst-add-suffix-link tree need-sl active-node)) - (setq breakp t) - ;; add a label + ;; (setq breakp t) + ;; terminating symbol special handling (cond ((eq character -1) ; terminating symbol - (gst-add-leaf-label tree nxt num (- position remain -1)))))) + (gst-add-leaf-label tree nxt num (- position remain -1)) + (setq active-length (1- active-length)) + ;; We don't want to split since this is a match. But + ;; we don't want to break since this is a + ;; terminating symbol. So we invent a new variable + ;; to do this. + (setq terminating-no-split t)) + (t + (setq breakp t))))) (cond - (breakp) - (t ;; splitting + ((or breakp terminating-no-split)) + (t ; splitting (let ((split (gst-new-node tree last-added (car (gethash nxt tree)) (cadr (gethash nxt tree)) (+ (cadr (gethash nxt tree)) active-length)))) @@ -206,11 +235,14 @@ The return value is (let ((leaf (gst-new-node tree last-added num position nil (- position remain -1)))) (setq last-added leaf) - (puthash character leaf + (puthash character + leaf (cadddr (cdr (gethash split tree)))) (setcar (cdr (gethash nxt tree)) - (+ (cadr (gethash nxt tree)) - active-length)) + (min + (+ (cadr (gethash nxt tree)) + active-length) + (1- (nth (car (gethash nxt tree)) str-lens)))) (puthash (gst-aref strs (car (gethash nxt tree)) (cadr (gethash nxt tree))) @@ -258,9 +290,9 @@ lengths again." (active-number index) (active-edge-index 0) (active-length 0) - old-character result) ; temporary holder + result) ; temporary holder (while character - (setq old-character character) + ;; (setq old-character character) (setq result (gst-extend-tree tree last-added position remain active-node active-number active-edge-index active-length character strs index @@ -274,22 +306,24 @@ lengths again." (setq active-length (pop result)) (setq position (1+ position)) (setq character (ignore-errors (gst-aref strs index position))) - (cond - ((characterp old-character) - (insert (format "After adding character %c:\n" old-character)) - (gst-print-tree tree - (append (seq-take strs index) - (list (concat - (seq-take (nth index strs) position) - "$")))) - (insert "\n\n")) - ((= old-character -1) - (insert (format "After adding character -1:\n")) - (gst-print-tree tree - (seq-take strs (1+ index))) - (insert "\n\n"))))) + ;; (cond + ;; ((characterp old-character) + ;; (insert (format "After adding character %c:\n" old-character)) + ;; (gst-print-tree tree + ;; (append (seq-take strs index) + ;; (list (concat + ;; (seq-take (nth index strs) position) + ;; "$")))) + ;; (insert "\n\n")) + ;; ((= old-character -1) + ;; (insert (format "After adding character -1:\n")) + ;; (gst-print-tree tree + ;; (seq-take strs (1+ index))) + ;; (insert "\n\n"))) + )) (setq index (1+ index)) - (insert "\n\n")) + ;; (insert "\n\n") + ) tree)) ;;; Printing @@ -312,7 +346,10 @@ lengths again." ;;;###autoload (defun gst-pretty-hash-table (table) - "Only returns the useful parts from a table." + "Only return the useful parts from TABLE. + +To be precise, this returns the keys and values of the +hash-table, in the form of an alist." (let (keys-values-alist) (maphash (lambda (key value) @@ -353,7 +390,13 @@ lengths again." ;;;###autoload (defun gst-print-tree (tree strs) "Print TREE with the aid of STRS." - (let* ((symbol-tree (make-symbol "new-hierarchy"))) + (let* ((symbol-tree (make-symbol "new-hierarchy")) + (strs (mapcar (lambda (str) + (mapcar (lambda (c) + (cond ((eq c -1) ?$) + (c))) + str)) + strs))) (set symbol-tree (hierarchy-new)) (maphash (lambda (key value) @@ -395,4 +438,5 @@ lengths again." (t actual-str))))))))) (provide 'generalized-suffix-tree) -;;; generalized-suffiex-tree.el ends here + +;;; generalized-suffix-tree.el ends here |