;;; LCS.el --- Longest Common Substrings of multiple strings -*- lexical-binding: t; -*- ;;; Author: Durand ;;; Version: 0.0.1 ;;; Commentary: ;; First generate a generalized suffix tree of the strings, then ;; traverses the tree in a depth-first manner to find the nodes under ;; which there are suffixes from each and every string in question. ;; Finally collect and return the list of all substrings which are ;; common to all input strings and which have the longest length. ;;; Code: ;;; Our building block (require 'generalized-suffix-tree) ;;;###autoload (defun lcs-string (strs) "Return the longest common substring of STRS." (lcs-interpret (lcs strs) strs)) ;;;###autoload (defun lcs-edge-length (node str-lens) "Return the length of the edge going into NODE. Unlike `gst-edge-length', this does not need to know about POSITION and NUM, since it is assumed that the tree is already built before calling this function. If the node is a leaf, its end is denoted by infty, and we wouldn't know the exact length if we don't know the length of the string it corresponds to, so we need the argument STR-LENS to provide that information." (let ((start (car (cdr node))) (end (cond ((eq (car (cdr (cdr node))) 'infty) (nth (car node) str-lens)) (t (car (cdr (cdr node))))))) (- end start))) ;;;###autoload (defun lcs-get-parent-length (parents) "Return the length of the parent strings PARENTS." (let* ((ls parents) (result 0)) (while (consp ls) (setq result (+ result (- (caddr (car ls)) (cadr (car ls))))) (setq ls (cdr ls))) result)) ;;;###autoload (defun lcs-get-env (node str-lens) "Return the information we want to keep from NODE on the stack. STR-LENS is here to provide information about the ending index of NODE if it is a lead. See the documentation for `lcs-edge-length' for more." (let ((num (car node)) (start (car (cdr node))) (end (cond ((eq (car (cdr (cdr node))) 'infty) (nth (car node) str-lens)) (t (car (cdr (cdr node)))))) (children (car (nthcdr 4 node)))) (list num start end children))) ;;; For debug purposes ;;;###autoload (defmacro lcs-debug (&rest vars) "Print the names and the values of VARS." `(insert (mapconcat 'identity (list ,@(mapcar (lambda (var) `(format "%s: %S" ,(symbol-name var) ,var)) vars)) "\n") "\n")) ;;;###autoload (defun lcs-pretty-stack-element (element) "Return a pretty representation of ELEMENT." (concat "(" (mapconcat (lambda (slot) (format "%S" (cond ((hash-table-p slot) (gst-pretty-hash-table slot)) ((consp slot) (mapconcat #'lcs-pretty-stack-element slot ", ")) (t slot)))) element ", ") ")")) ;;;###autoload (defun lcs-pretty-stack (stack) "Prettify STACK. Apply `lcs-pretty-stack-element' to each element, and concatenate them in a clean way." (concat "[ " (mapconcat #'lcs-pretty-stack-element stack ",") " ]" "\n")) ;;;###autoload (defun lcs-prepare-env (long-env) "Take the first three elements out of LONG-ENV." (cons (car long-env) (cons (cadr long-env) (cons (caddr long-env) nil)))) ;;; Interpreting the result ;;;###autoload (defun lcs-interpret (result strs &optional just-length) "Since the raw RESULT of `lcs' is not human-readable, we interpret it. If JUST-LENGTH is non-nil, then this only returns the length of the longest common substrings. Otherwise, it returns the longest common substrings themselves. Of course, without STRS, we don't even know what our strings are." (cond (just-length (car result)) (t (mapcar (lambda (one-result) (let ((chain one-result) temp res-str) (while (consp one-result) (setq temp (car one-result)) (setq res-str (concat (let ((nstr (nth (car temp) strs))) (substring nstr (cadr temp) (min (caddr temp) (length nstr)))) res-str)) (setq one-result (cdr one-result))) res-str)) (cadr result))))) ;;; The main engine ;;;###autoload (defun lcs (strs &optional just-length) "Return the longest common substring of a list STRS of strings. If JUST-LENGTH is non-nil, then this only returns the length of the longest common substring. Otherwise, it returns the length as well as the longest common substring itself." (let* ((strs (mapcar (lambda (str) (vconcat str (list -1))) strs)) (str-lens (mapcar #'length strs)) (tree (gst-build-for-strs strs str-lens)) (max-height 0) (bits-table (make-hash-table)) (discovered-table (make-hash-table)) (all-common (1- (expt 2 (length strs)))) max-env stack current temp) ;; The format of elements on the stack is as follows. ;; (number start end children node-number parents) ;; Here number identifies the string this comes from. start, end ;; and children have the same meaning as in the tree. Node-number ;; is the number of the node that is recorded in the hash-table ;; tree. And finally parents is a list of triples that represents ;; parent segments. (setq stack (progn (maphash (lambda (letter node) (setq temp (cons (append (lcs-get-env (gethash node tree) str-lens) (list node nil ; parent strings represented as a triple of integers )) temp))) (car (nthcdr 4 (gethash 1 tree)))) temp)) (setq temp nil) ;; (insert (format "count: %d\n" count)) ;; (lcs-debug all-common) ;; (insert "stack: " (lcs-pretty-stack stack)) (while (consp stack) ;; (insert "stack: " (lcs-pretty-stack stack)) (setq current (car stack)) (let ((current-length (+ (lcs-get-parent-length (car (nthcdr 5 current))) (- (min (caddr current) ;; The final character does ;; not count! (1- (nth (car current) str-lens))) (cadr current)))) (current-env (list (car current) (cadr current) (caddr current) (car (nthcdr 3 current)))) (current-children (car (nthcdr 3 current)))) ;; (insert (format "\n\ncount: %d\n" count)) ;; (insert "current: " (lcs-pretty-stack-element current) "\n") ;; (lcs-debug current-length max-env max-height) ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children))) ;; (insert (format ;; "discovered: %S\n" ;; (gst-pretty-hash-table discovered-table))) ;; (insert (format ;; "bits: %S\n" ;; (gst-pretty-hash-table bits-table))) (cond ((not (hash-table-empty-p current-children)) ; There are children (cond ((gethash (car (nthcdr 4 current)) discovered-table) ;; This node is already traversed ;; This means we have already given bits to its children (let ((bit (progn (setq temp 0) (maphash (lambda (child-letter child-num) (setq temp (logior temp (gethash child-num bits-table)))) current-children) temp) ;; An alternative ;; (apply #'logior ;; (mapcar (lambda (num) ;; (gethash num bits-table)) ;; (hash-table-values current-children))) )) (setq temp nil) (puthash (car (nthcdr 4 current)) bit bits-table) ;; (lcs-debug bit) ;; (insert (format ;; "bits: %S\n" ;; (gst-pretty-hash-table bits-table))) (cond ((= bit all-common) ; a common substring (cond ((> current-length max-height) (setq max-height current-length) (setq max-env (list (cons (list (car current) (cadr current) (caddr current)) (mapcar #'lcs-prepare-env (car (nthcdr 5 current))))))) ((= current-length max-height) (setq max-env (cons (cons (list (car current) (cadr current) (caddr current)) (mapcar #'lcs-prepare-env (car (nthcdr 5 current)))) max-env))))))) (setq stack (cdr stack))) (t ; a new node! (setq stack (append (progn (maphash (lambda (letter node) (setq temp (cons (append (lcs-get-env (gethash node tree) str-lens) (list node (cons current-env (car (nthcdr 5 current))))) temp))) current-children) temp) stack)) (puthash (car (nthcdr 4 current)) 1 discovered-table) ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children))) ;; (insert "stack: " (lcs-pretty-stack stack)) (setq temp nil)))) (t ; a leaf (let* ((node-number (car (nthcdr 4 current))) (leaf-labels (car (nthcdr 5 (gethash node-number tree)))) (bit (apply #'logior (mapcar (lambda (label) (ash 1 (car label))) leaf-labels)))) ;; (insert "leaf") ;; (lcs-debug node-number leaf-labels bit) (puthash node-number bit bits-table) (cond ((= bit all-common) ; a common substring (cond ((> current-length max-height) (setq max-height current-length) (setq max-env (list (cons (list (car current) (cadr current) (caddr current)) (mapcar #'lcs-prepare-env (car (nthcdr 5 current))))))) ((= current-length max-height) (setq max-env (cons (cons (list (car current) (cadr current) (caddr current)) (mapcar #'lcs-prepare-env (car (nthcdr 5 current)))) max-env)))))) (setq stack (cdr stack))))))) (list max-height max-env))) (provide 'lcs) ;;; LCS.el ends here