diff options
Diffstat (limited to 'suffix tree/LCS.el')
-rw-r--r-- | suffix tree/LCS.el | 327 |
1 files changed, 0 insertions, 327 deletions
diff --git a/suffix tree/LCS.el b/suffix tree/LCS.el deleted file mode 100644 index 41bcec0..0000000 --- a/suffix tree/LCS.el +++ /dev/null @@ -1,327 +0,0 @@ -;;; LCS.el --- Longest Common Substrings of multiple strings -*- lexical-binding: t; -*- - -;;; Author: Durand -;;; Version: 0.0.1 - -;;; Commentary: - -;; First generate a generalized suffix tree of the strings, then -;; traverses the tree in a depth-first manner to find the nodes under -;; which there are suffixes from each and every string in question. -;; Finally collect and return the list of all substrings which are -;; common to all input strings and which have the longest length. - -;;; Code: - -;;; Our building block -(require 'generalized-suffix-tree) - -;;;###autoload -(defun lcs-string (strs) - "Return the longest common substring of STRS." - (lcs-interpret (lcs strs) strs)) - -;;;###autoload -(defun lcs-edge-length (node str-lens) - "Return the length of the edge going into NODE. - -Unlike `gst-edge-length', this does not need to know about -POSITION and NUM, since it is assumed that the tree is already -built before calling this function. - -If the node is a leaf, its end is denoted by infty, and we -wouldn't know the exact length if we don't know the length of the -string it corresponds to, so we need the argument STR-LENS to -provide that information." - (let ((start (car (cdr node))) - (end (cond - ((eq (car (cdr (cdr node))) 'infty) - (nth (car node) str-lens)) - (t (car (cdr (cdr node))))))) - (- end start))) - -;;;###autoload -(defun lcs-get-parent-length (parents) - "Return the length of the parent strings PARENTS." - (let* ((ls parents) - (result 0)) - (while (consp ls) - (setq result (+ result (- (caddr (car ls)) - (cadr (car ls))))) - (setq ls (cdr ls))) - result)) - -;;;###autoload -(defun lcs-get-env (node str-lens) - "Return the information we want to keep from NODE on the stack. - -STR-LENS is here to provide information about the ending index of -NODE if it is a lead. See the documentation for `lcs-edge-length' -for more." - (let ((num (car node)) - (start (car (cdr node))) - (end (cond - ((eq (car (cdr (cdr node))) 'infty) - (nth (car node) str-lens)) - (t (car (cdr (cdr node)))))) - (children (car (nthcdr 4 node)))) - (list num start end children))) - -;;; For debug purposes - -;;;###autoload -(defmacro lcs-debug (&rest vars) - "Print the names and the values of VARS." - `(insert - (mapconcat 'identity - (list - ,@(mapcar - (lambda (var) - `(format "%s: %S" - ,(symbol-name var) - ,var)) - vars)) - "\n") - "\n")) - -;;;###autoload -(defun lcs-pretty-stack-element (element) - "Return a pretty representation of ELEMENT." - (concat - "(" - (mapconcat - (lambda (slot) - (format "%S" - (cond - ((hash-table-p slot) - (gst-pretty-hash-table slot)) - ((consp slot) - (mapconcat #'lcs-pretty-stack-element slot ", ")) - (t slot)))) - element ", ") - ")")) - -;;;###autoload -(defun lcs-pretty-stack (stack) - "Prettify STACK. - -Apply `lcs-pretty-stack-element' to each element, and concatenate -them in a clean way." - (concat - "[ " - (mapconcat #'lcs-pretty-stack-element - stack - ",") - " ]" - "\n")) - -;;;###autoload -(defun lcs-prepare-env (long-env) - "Take the first three elements out of LONG-ENV." - (cons (car long-env) - (cons (cadr long-env) - (cons (caddr long-env) - nil)))) - -;;; Interpreting the result - -;;;###autoload -(defun lcs-interpret (result strs &optional just-length) - "Since the raw RESULT of `lcs' is not human-readable, we interpret it. -If JUST-LENGTH is non-nil, then this only returns the length of -the longest common substrings. Otherwise, it returns the longest -common substrings themselves. - -Of course, without STRS, we don't even know what our strings are." - (cond - (just-length (car result)) - (t (mapcar - (lambda (one-result) - (let ((chain one-result) - temp res-str) - (while (consp one-result) - (setq temp (car one-result)) - (setq res-str - (concat - (let ((nstr (nth (car temp) strs))) - (substring nstr - (cadr temp) - (min (caddr temp) (length nstr)))) - res-str)) - (setq one-result (cdr one-result))) - res-str)) - (cadr result))))) - -;;; The main engine - -;;;###autoload -(defun lcs (strs &optional just-length) - "Return the longest common substring of a list STRS of strings. - -If JUST-LENGTH is non-nil, then this only returns the length of the -longest common substring. Otherwise, it returns the length as well as the -longest common substring itself." - (let* ((strs (mapcar (lambda (str) - (vconcat str (list -1))) - strs)) - (str-lens (mapcar #'length strs)) - (tree (gst-build-for-strs strs str-lens)) - (max-height 0) - (bits-table (make-hash-table)) - (discovered-table (make-hash-table)) - (all-common (1- (expt 2 (length strs)))) - max-env stack current temp) - ;; The format of elements on the stack is as follows. - ;; (number start end children node-number parents) - - ;; Here number identifies the string this comes from. start, end - ;; and children have the same meaning as in the tree. Node-number - ;; is the number of the node that is recorded in the hash-table - ;; tree. And finally parents is a list of triples that represents - ;; parent segments. - (setq stack (progn - (maphash - (lambda (letter node) - (setq temp - (cons - (append - (lcs-get-env (gethash node tree) str-lens) - (list node nil ; parent strings represented as a triple of integers - )) - temp))) - (car (nthcdr 4 (gethash 1 tree)))) - temp)) - (setq temp nil) - ;; (insert (format "count: %d\n" count)) - ;; (lcs-debug all-common) - ;; (insert "stack: " (lcs-pretty-stack stack)) - (while (consp stack) - ;; (insert "stack: " (lcs-pretty-stack stack)) - (setq current (car stack)) - (let ((current-length (+ (lcs-get-parent-length (car (nthcdr 5 current))) - (- (min (caddr current) - ;; The final character does - ;; not count! - (1- (nth (car current) - str-lens))) - (cadr current)))) - (current-env (list (car current) - (cadr current) - (caddr current) - (car (nthcdr 3 current)))) - (current-children (car (nthcdr 3 current)))) - ;; (insert (format "\n\ncount: %d\n" count)) - ;; (insert "current: " (lcs-pretty-stack-element current) "\n") - ;; (lcs-debug current-length max-env max-height) - ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children))) - ;; (insert (format - ;; "discovered: %S\n" - ;; (gst-pretty-hash-table discovered-table))) - ;; (insert (format - ;; "bits: %S\n" - ;; (gst-pretty-hash-table bits-table))) - (cond - ((not (hash-table-empty-p current-children)) ; There are children - (cond - ((gethash (car (nthcdr 4 current)) discovered-table) - ;; This node is already traversed - ;; This means we have already given bits to its children - (let ((bit (progn - (setq temp 0) - (maphash (lambda (child-letter child-num) - (setq - temp - (logior temp - (gethash child-num bits-table)))) - current-children) - temp) - ;; An alternative - ;; (apply #'logior - ;; (mapcar (lambda (num) - ;; (gethash num bits-table)) - ;; (hash-table-values current-children))) - )) - (setq temp nil) - (puthash (car (nthcdr 4 current)) bit bits-table) - ;; (lcs-debug bit) - ;; (insert (format - ;; "bits: %S\n" - ;; (gst-pretty-hash-table bits-table))) - (cond - ((= bit all-common) ; a common substring - (cond - ((> current-length max-height) - (setq max-height current-length) - (setq max-env (list - (cons - (list (car current) - (cadr current) - (caddr current)) - (mapcar #'lcs-prepare-env - (car (nthcdr 5 current))))))) - ((= current-length max-height) - (setq max-env - (cons (cons - (list (car current) - (cadr current) - (caddr current)) - (mapcar #'lcs-prepare-env - (car (nthcdr 5 current)))) - max-env))))))) - (setq stack (cdr stack))) - (t ; a new node! - (setq stack - (append - (progn - (maphash - (lambda (letter node) - (setq temp - (cons - (append - (lcs-get-env (gethash node tree) str-lens) - (list node - (cons current-env (car (nthcdr 5 current))))) - temp))) - current-children) - temp) - stack)) - (puthash (car (nthcdr 4 current)) 1 discovered-table) - ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children))) - ;; (insert "stack: " (lcs-pretty-stack stack)) - (setq temp nil)))) - (t ; a leaf - (let* ((node-number (car (nthcdr 4 current))) - (leaf-labels (car (nthcdr 5 (gethash node-number tree)))) - (bit (apply #'logior - (mapcar (lambda (label) - (ash 1 (car label))) - leaf-labels)))) - ;; (insert "leaf") - ;; (lcs-debug node-number leaf-labels bit) - (puthash node-number bit bits-table) - (cond - ((= bit all-common) ; a common substring - (cond - ((> current-length max-height) - (setq max-height current-length) - (setq max-env (list - (cons - (list (car current) - (cadr current) - (caddr current)) - (mapcar #'lcs-prepare-env - (car (nthcdr 5 current))))))) - ((= current-length max-height) - (setq max-env - (cons (cons - (list (car current) - (cadr current) - (caddr current)) - (mapcar #'lcs-prepare-env - (car (nthcdr 5 current)))) - max-env)))))) - (setq stack (cdr stack))))))) - (list max-height max-env))) - -(provide 'lcs) -;;; LCS.el ends here |