summaryrefslogtreecommitdiff
path: root/suffix tree/LCS.el
diff options
context:
space:
mode:
Diffstat (limited to 'suffix tree/LCS.el')
-rw-r--r--suffix tree/LCS.el314
1 files changed, 314 insertions, 0 deletions
diff --git a/suffix tree/LCS.el b/suffix tree/LCS.el
new file mode 100644
index 0000000..989c25e
--- /dev/null
+++ b/suffix tree/LCS.el
@@ -0,0 +1,314 @@
+;;; LCS.el --- Longest Common Substrings of multiple strings -*- lexical-binding: t; -*-
+
+;;; Author: Durand
+;;; Version: 0.0.1
+
+;;; Commentary:
+
+;; First generate a generalized suffix tree of the strings, then
+;; traverses the tree in a depth-first manner to find the nodes under
+;; which there are suffixes from each and every string in question.
+;; Finally collect and return the list of all substrings which are
+;; common to all input strings and which have the longest length.
+
+;;; Code:
+
+;;; Our building block
+(require 'generalized-suffix-tree)
+
+;;;###autoload
+(defun lcs-string (strs)
+ "Return the longest common substring of STRS."
+ (lcs-interpret (lcs strs) strs))
+
+;;;###autoload
+(defun lcs-edge-length (node str-lens)
+ "Return the length of the edge going into NODE.
+Unlike `gst-edge-length', this does not need to know about POSITION
+and NUM, since it is assumed that the tree is already built
+before calling this function."
+ (let ((start (car (cdr node)))
+ (end (cond
+ ((eq (car (cdr (cdr node))) 'infty)
+ (nth (car node) str-lens))
+ (t (car (cdr (cdr node)))))))
+ (- end start)))
+
+;;;###autoload
+(defun lcs-get-parent-length (parents)
+ "Return the length of the parent strings PARENTS."
+ (let* ((ls parents)
+ (result 0))
+ (while (consp ls)
+ (setq result (+ result (- (caddr (car ls))
+ (cadr (car ls)))))
+ (setq ls (cdr ls)))
+ result))
+
+;;;###autoload
+(defun lcs-get-env (node str-lens)
+ "Return the information we want to keep on the stack."
+ (let ((num (car node))
+ (start (car (cdr node)))
+ (end (cond
+ ((eq (car (cdr (cdr node))) 'infty)
+ (nth (car node) str-lens))
+ (t (car (cdr (cdr node))))))
+ (children (car (nthcdr 4 node))))
+ (list num start end children)))
+
+;;; For debug purposes
+
+;;;###autoload
+(defmacro lcs-debug (&rest vars)
+ "Print the names and the values of VARS."
+ `(insert
+ (mapconcat 'identity
+ (list
+ ,@(mapcar
+ (lambda (var)
+ `(format "%s: %S"
+ ,(symbol-name var)
+ ,var))
+ vars))
+ "\n")
+ "\n"))
+
+;;;###autoload
+(defun lcs-pretty-stack-element (element)
+ "Return a pretty representation of ELEMENT."
+ (concat
+ "("
+ (mapconcat
+ (lambda (slot)
+ (format "%S"
+ (cond
+ ((hash-table-p slot)
+ (gst-pretty-hash-table slot))
+ ((consp slot)
+ (mapconcat #'lcs-pretty-stack-element slot ", "))
+ (t slot))))
+ element ", ")
+ ")"))
+
+;;;###autoload
+(defun lcs-pretty-stack (stack)
+ "Prettify a stack."
+ (concat
+ "[ "
+ (mapconcat #'lcs-pretty-stack-element
+ stack
+ ",")
+ " ]"
+ "\n"))
+
+;;;###autoload
+(defun lcs-prepare-env (long-env)
+ "Take the first three elements out of LONG-ENV."
+ (cons (car long-env)
+ (cons (cadr long-env)
+ (cons (caddr long-env)
+ nil))))
+
+;;; Interpreting the result
+
+;;;###autoload
+(defun lcs-interpret (result strs &optional just-length)
+ "Since the raw RESULT of `lcs' is not human-readable, we interpret it.
+If JUST-LENGTH is non-nil, then this only returns the length of
+the longest common substrings. Otherwise, it returns the longest
+common substrings themselves.
+
+Of course, without STRS, we don't even know what our strings are."
+ (cond
+ (just-length (car result))
+ (t (mapcar
+ (lambda (one-result)
+ (let ((chain one-result)
+ temp res-str)
+ (while (consp one-result)
+ (setq temp (car one-result))
+ (setq res-str
+ (concat
+ (let ((nstr (nth (car temp) strs)))
+ (substring nstr
+ (cadr temp)
+ (min (caddr temp) (length nstr))))
+ res-str))
+ (setq one-result (cdr one-result)))
+ res-str))
+ (cadr result)))))
+
+;;; The main engine
+
+;;;###autoload
+(defun lcs (strs &optional just-length)
+ "Return the longest common substring of a list STRS of strings.
+
+If JUST-LENGTH is non-nil, then this only returns the length of the
+longest common substring. Otherwise, it returns the length as well as the
+longest common substring itself."
+ (let* ((strs (mapcar (lambda (str)
+ (vconcat str (list -1)))
+ strs))
+ (str-lens (mapcar #'length strs))
+ (tree (gst-build-for-strs strs str-lens))
+ (max-height 0)
+ (bits-table (make-hash-table))
+ (discovered-table (make-hash-table))
+ (all-common (1- (expt 2 (length strs))))
+ max-env stack current temp)
+ ;; The format of elements on the stack is as follows.
+ ;; (number start end children node-number parents)
+
+ ;; Here number identifies the string this comes from. start, end
+ ;; and children have the same meaning as in the tree. Node-number
+ ;; is the number of the node that is recorded in the hash-table
+ ;; tree. And finally parents is a list of triples that represents
+ ;; parent segments.
+ (setq stack (progn
+ (maphash
+ (lambda (letter node)
+ (setq temp
+ (cons
+ (append
+ (lcs-get-env (gethash node tree) str-lens)
+ (list node nil ; parent strings represented as a triple of integers
+ ))
+ temp)))
+ (car (nthcdr 4 (gethash 1 tree))))
+ temp))
+ (setq temp nil)
+ ;; (insert (format "count: %d\n" count))
+ ;; (lcs-debug all-common)
+ ;; (insert "stack: " (lcs-pretty-stack stack))
+ (while (consp stack)
+ ;; (insert "stack: " (lcs-pretty-stack stack))
+ (setq current (car stack))
+ (let ((current-length (+ (lcs-get-parent-length (car (nthcdr 5 current)))
+ (- (min (caddr current)
+ ;; The final character does
+ ;; not count!
+ (1- (nth (car current)
+ str-lens)))
+ (cadr current))))
+ (current-env (list (car current)
+ (cadr current)
+ (caddr current)
+ (car (nthcdr 3 current))))
+ (current-children (car (nthcdr 3 current))))
+ ;; (insert (format "\n\ncount: %d\n" count))
+ ;; (insert "current: " (lcs-pretty-stack-element current) "\n")
+ ;; (lcs-debug current-length max-env max-height)
+ ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children)))
+ ;; (insert (format
+ ;; "discovered: %S\n"
+ ;; (gst-pretty-hash-table discovered-table)))
+ ;; (insert (format
+ ;; "bits: %S\n"
+ ;; (gst-pretty-hash-table bits-table)))
+ (cond
+ ((not (hash-table-empty-p current-children)) ; There are children
+ (cond
+ ((gethash (car (nthcdr 4 current)) discovered-table)
+ ;; This node is already traversed
+ ;; This means we have already given bits to its children
+ (let ((bit (progn
+ (setq temp 0)
+ (maphash (lambda (child-letter child-num)
+ (setq
+ temp
+ (logior temp
+ (gethash child-num bits-table))))
+ current-children)
+ temp)
+ ;; An alternative
+ ;; (apply #'logior
+ ;; (mapcar (lambda (num)
+ ;; (gethash num bits-table))
+ ;; (hash-table-values current-children)))
+ ))
+ (setq temp nil)
+ (puthash (car (nthcdr 4 current)) bit bits-table)
+ ;; (lcs-debug bit)
+ ;; (insert (format
+ ;; "bits: %S\n"
+ ;; (gst-pretty-hash-table bits-table)))
+ (cond
+ ((= bit all-common) ; a common substring
+ (cond
+ ((> current-length max-height)
+ (setq max-height current-length)
+ (setq max-env (list
+ (cons
+ (list (car current)
+ (cadr current)
+ (caddr current))
+ (mapcar #'lcs-prepare-env
+ (car (nthcdr 5 current)))))))
+ ((= current-length max-height)
+ (setq max-env
+ (cons (cons
+ (list (car current)
+ (cadr current)
+ (caddr current))
+ (mapcar #'lcs-prepare-env
+ (car (nthcdr 5 current))))
+ max-env)))))))
+ (setq stack (cdr stack)))
+ (t ; a new node!
+ (setq stack
+ (append
+ (progn
+ (maphash
+ (lambda (letter node)
+ (setq temp
+ (cons
+ (append
+ (lcs-get-env (gethash node tree) str-lens)
+ (list node
+ (cons current-env (car (nthcdr 5 current)))))
+ temp)))
+ current-children)
+ temp)
+ stack))
+ (puthash (car (nthcdr 4 current)) 1 discovered-table)
+ ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children)))
+ ;; (insert "stack: " (lcs-pretty-stack stack))
+ (setq temp nil))))
+ (t ; a leaf
+ (let* ((node-number (car (nthcdr 4 current)))
+ (leaf-labels (car (nthcdr 5 (gethash node-number tree))))
+ (bit (apply #'logior
+ (mapcar (lambda (label)
+ (ash 1 (car label)))
+ leaf-labels))))
+ ;; (insert "leaf")
+ ;; (lcs-debug node-number leaf-labels bit)
+ (puthash node-number bit bits-table)
+ (cond
+ ((= bit all-common) ; a common substring
+ (cond
+ ((> current-length max-height)
+ (setq max-height current-length)
+ (setq max-env (list
+ (cons
+ (list (car current)
+ (cadr current)
+ (caddr current))
+ (mapcar #'lcs-prepare-env
+ (car (nthcdr 5 current)))))))
+ ((= current-length max-height)
+ (setq max-env
+ (cons (cons
+ (list (car current)
+ (cadr current)
+ (caddr current))
+ (mapcar #'lcs-prepare-env
+ (car (nthcdr 5 current))))
+ max-env))))))
+ (setq stack (cdr stack)))))))
+ (list max-height max-env)))
+
+(provide 'lcs)
+;;; LCS.el ends here