summaryrefslogtreecommitdiff
path: root/suffix tree/LCS.el
diff options
context:
space:
mode:
Diffstat (limited to 'suffix tree/LCS.el')
-rw-r--r--suffix tree/LCS.el327
1 files changed, 0 insertions, 327 deletions
diff --git a/suffix tree/LCS.el b/suffix tree/LCS.el
deleted file mode 100644
index 41bcec0..0000000
--- a/suffix tree/LCS.el
+++ /dev/null
@@ -1,327 +0,0 @@
-;;; LCS.el --- Longest Common Substrings of multiple strings -*- lexical-binding: t; -*-
-
-;;; Author: Durand
-;;; Version: 0.0.1
-
-;;; Commentary:
-
-;; First generate a generalized suffix tree of the strings, then
-;; traverses the tree in a depth-first manner to find the nodes under
-;; which there are suffixes from each and every string in question.
-;; Finally collect and return the list of all substrings which are
-;; common to all input strings and which have the longest length.
-
-;;; Code:
-
-;;; Our building block
-(require 'generalized-suffix-tree)
-
-;;;###autoload
-(defun lcs-string (strs)
- "Return the longest common substring of STRS."
- (lcs-interpret (lcs strs) strs))
-
-;;;###autoload
-(defun lcs-edge-length (node str-lens)
- "Return the length of the edge going into NODE.
-
-Unlike `gst-edge-length', this does not need to know about
-POSITION and NUM, since it is assumed that the tree is already
-built before calling this function.
-
-If the node is a leaf, its end is denoted by infty, and we
-wouldn't know the exact length if we don't know the length of the
-string it corresponds to, so we need the argument STR-LENS to
-provide that information."
- (let ((start (car (cdr node)))
- (end (cond
- ((eq (car (cdr (cdr node))) 'infty)
- (nth (car node) str-lens))
- (t (car (cdr (cdr node)))))))
- (- end start)))
-
-;;;###autoload
-(defun lcs-get-parent-length (parents)
- "Return the length of the parent strings PARENTS."
- (let* ((ls parents)
- (result 0))
- (while (consp ls)
- (setq result (+ result (- (caddr (car ls))
- (cadr (car ls)))))
- (setq ls (cdr ls)))
- result))
-
-;;;###autoload
-(defun lcs-get-env (node str-lens)
- "Return the information we want to keep from NODE on the stack.
-
-STR-LENS is here to provide information about the ending index of
-NODE if it is a lead. See the documentation for `lcs-edge-length'
-for more."
- (let ((num (car node))
- (start (car (cdr node)))
- (end (cond
- ((eq (car (cdr (cdr node))) 'infty)
- (nth (car node) str-lens))
- (t (car (cdr (cdr node))))))
- (children (car (nthcdr 4 node))))
- (list num start end children)))
-
-;;; For debug purposes
-
-;;;###autoload
-(defmacro lcs-debug (&rest vars)
- "Print the names and the values of VARS."
- `(insert
- (mapconcat 'identity
- (list
- ,@(mapcar
- (lambda (var)
- `(format "%s: %S"
- ,(symbol-name var)
- ,var))
- vars))
- "\n")
- "\n"))
-
-;;;###autoload
-(defun lcs-pretty-stack-element (element)
- "Return a pretty representation of ELEMENT."
- (concat
- "("
- (mapconcat
- (lambda (slot)
- (format "%S"
- (cond
- ((hash-table-p slot)
- (gst-pretty-hash-table slot))
- ((consp slot)
- (mapconcat #'lcs-pretty-stack-element slot ", "))
- (t slot))))
- element ", ")
- ")"))
-
-;;;###autoload
-(defun lcs-pretty-stack (stack)
- "Prettify STACK.
-
-Apply `lcs-pretty-stack-element' to each element, and concatenate
-them in a clean way."
- (concat
- "[ "
- (mapconcat #'lcs-pretty-stack-element
- stack
- ",")
- " ]"
- "\n"))
-
-;;;###autoload
-(defun lcs-prepare-env (long-env)
- "Take the first three elements out of LONG-ENV."
- (cons (car long-env)
- (cons (cadr long-env)
- (cons (caddr long-env)
- nil))))
-
-;;; Interpreting the result
-
-;;;###autoload
-(defun lcs-interpret (result strs &optional just-length)
- "Since the raw RESULT of `lcs' is not human-readable, we interpret it.
-If JUST-LENGTH is non-nil, then this only returns the length of
-the longest common substrings. Otherwise, it returns the longest
-common substrings themselves.
-
-Of course, without STRS, we don't even know what our strings are."
- (cond
- (just-length (car result))
- (t (mapcar
- (lambda (one-result)
- (let ((chain one-result)
- temp res-str)
- (while (consp one-result)
- (setq temp (car one-result))
- (setq res-str
- (concat
- (let ((nstr (nth (car temp) strs)))
- (substring nstr
- (cadr temp)
- (min (caddr temp) (length nstr))))
- res-str))
- (setq one-result (cdr one-result)))
- res-str))
- (cadr result)))))
-
-;;; The main engine
-
-;;;###autoload
-(defun lcs (strs &optional just-length)
- "Return the longest common substring of a list STRS of strings.
-
-If JUST-LENGTH is non-nil, then this only returns the length of the
-longest common substring. Otherwise, it returns the length as well as the
-longest common substring itself."
- (let* ((strs (mapcar (lambda (str)
- (vconcat str (list -1)))
- strs))
- (str-lens (mapcar #'length strs))
- (tree (gst-build-for-strs strs str-lens))
- (max-height 0)
- (bits-table (make-hash-table))
- (discovered-table (make-hash-table))
- (all-common (1- (expt 2 (length strs))))
- max-env stack current temp)
- ;; The format of elements on the stack is as follows.
- ;; (number start end children node-number parents)
-
- ;; Here number identifies the string this comes from. start, end
- ;; and children have the same meaning as in the tree. Node-number
- ;; is the number of the node that is recorded in the hash-table
- ;; tree. And finally parents is a list of triples that represents
- ;; parent segments.
- (setq stack (progn
- (maphash
- (lambda (letter node)
- (setq temp
- (cons
- (append
- (lcs-get-env (gethash node tree) str-lens)
- (list node nil ; parent strings represented as a triple of integers
- ))
- temp)))
- (car (nthcdr 4 (gethash 1 tree))))
- temp))
- (setq temp nil)
- ;; (insert (format "count: %d\n" count))
- ;; (lcs-debug all-common)
- ;; (insert "stack: " (lcs-pretty-stack stack))
- (while (consp stack)
- ;; (insert "stack: " (lcs-pretty-stack stack))
- (setq current (car stack))
- (let ((current-length (+ (lcs-get-parent-length (car (nthcdr 5 current)))
- (- (min (caddr current)
- ;; The final character does
- ;; not count!
- (1- (nth (car current)
- str-lens)))
- (cadr current))))
- (current-env (list (car current)
- (cadr current)
- (caddr current)
- (car (nthcdr 3 current))))
- (current-children (car (nthcdr 3 current))))
- ;; (insert (format "\n\ncount: %d\n" count))
- ;; (insert "current: " (lcs-pretty-stack-element current) "\n")
- ;; (lcs-debug current-length max-env max-height)
- ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children)))
- ;; (insert (format
- ;; "discovered: %S\n"
- ;; (gst-pretty-hash-table discovered-table)))
- ;; (insert (format
- ;; "bits: %S\n"
- ;; (gst-pretty-hash-table bits-table)))
- (cond
- ((not (hash-table-empty-p current-children)) ; There are children
- (cond
- ((gethash (car (nthcdr 4 current)) discovered-table)
- ;; This node is already traversed
- ;; This means we have already given bits to its children
- (let ((bit (progn
- (setq temp 0)
- (maphash (lambda (child-letter child-num)
- (setq
- temp
- (logior temp
- (gethash child-num bits-table))))
- current-children)
- temp)
- ;; An alternative
- ;; (apply #'logior
- ;; (mapcar (lambda (num)
- ;; (gethash num bits-table))
- ;; (hash-table-values current-children)))
- ))
- (setq temp nil)
- (puthash (car (nthcdr 4 current)) bit bits-table)
- ;; (lcs-debug bit)
- ;; (insert (format
- ;; "bits: %S\n"
- ;; (gst-pretty-hash-table bits-table)))
- (cond
- ((= bit all-common) ; a common substring
- (cond
- ((> current-length max-height)
- (setq max-height current-length)
- (setq max-env (list
- (cons
- (list (car current)
- (cadr current)
- (caddr current))
- (mapcar #'lcs-prepare-env
- (car (nthcdr 5 current)))))))
- ((= current-length max-height)
- (setq max-env
- (cons (cons
- (list (car current)
- (cadr current)
- (caddr current))
- (mapcar #'lcs-prepare-env
- (car (nthcdr 5 current))))
- max-env)))))))
- (setq stack (cdr stack)))
- (t ; a new node!
- (setq stack
- (append
- (progn
- (maphash
- (lambda (letter node)
- (setq temp
- (cons
- (append
- (lcs-get-env (gethash node tree) str-lens)
- (list node
- (cons current-env (car (nthcdr 5 current)))))
- temp)))
- current-children)
- temp)
- stack))
- (puthash (car (nthcdr 4 current)) 1 discovered-table)
- ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children)))
- ;; (insert "stack: " (lcs-pretty-stack stack))
- (setq temp nil))))
- (t ; a leaf
- (let* ((node-number (car (nthcdr 4 current)))
- (leaf-labels (car (nthcdr 5 (gethash node-number tree))))
- (bit (apply #'logior
- (mapcar (lambda (label)
- (ash 1 (car label)))
- leaf-labels))))
- ;; (insert "leaf")
- ;; (lcs-debug node-number leaf-labels bit)
- (puthash node-number bit bits-table)
- (cond
- ((= bit all-common) ; a common substring
- (cond
- ((> current-length max-height)
- (setq max-height current-length)
- (setq max-env (list
- (cons
- (list (car current)
- (cadr current)
- (caddr current))
- (mapcar #'lcs-prepare-env
- (car (nthcdr 5 current)))))))
- ((= current-length max-height)
- (setq max-env
- (cons (cons
- (list (car current)
- (cadr current)
- (caddr current))
- (mapcar #'lcs-prepare-env
- (car (nthcdr 5 current))))
- max-env))))))
- (setq stack (cdr stack)))))))
- (list max-height max-env)))
-
-(provide 'lcs)
-;;; LCS.el ends here