From 3666deaed5b0baf0a74f14db5872105c9e7865f9 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Wed, 13 Jan 2021 13:01:34 +0800 Subject: A temporary intermeidate step Now I got almost every functionality that we need, including pdf, mu4e, magit, et cetera. --- suffix tree/LCS.el | 314 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 suffix tree/LCS.el (limited to 'suffix tree/LCS.el') diff --git a/suffix tree/LCS.el b/suffix tree/LCS.el new file mode 100644 index 0000000..989c25e --- /dev/null +++ b/suffix tree/LCS.el @@ -0,0 +1,314 @@ +;;; LCS.el --- Longest Common Substrings of multiple strings -*- lexical-binding: t; -*- + +;;; Author: Durand +;;; Version: 0.0.1 + +;;; Commentary: + +;; First generate a generalized suffix tree of the strings, then +;; traverses the tree in a depth-first manner to find the nodes under +;; which there are suffixes from each and every string in question. +;; Finally collect and return the list of all substrings which are +;; common to all input strings and which have the longest length. + +;;; Code: + +;;; Our building block +(require 'generalized-suffix-tree) + +;;;###autoload +(defun lcs-string (strs) + "Return the longest common substring of STRS." + (lcs-interpret (lcs strs) strs)) + +;;;###autoload +(defun lcs-edge-length (node str-lens) + "Return the length of the edge going into NODE. +Unlike `gst-edge-length', this does not need to know about POSITION +and NUM, since it is assumed that the tree is already built +before calling this function." + (let ((start (car (cdr node))) + (end (cond + ((eq (car (cdr (cdr node))) 'infty) + (nth (car node) str-lens)) + (t (car (cdr (cdr node))))))) + (- end start))) + +;;;###autoload +(defun lcs-get-parent-length (parents) + "Return the length of the parent strings PARENTS." + (let* ((ls parents) + (result 0)) + (while (consp ls) + (setq result (+ result (- (caddr (car ls)) + (cadr (car ls))))) + (setq ls (cdr ls))) + result)) + +;;;###autoload +(defun lcs-get-env (node str-lens) + "Return the information we want to keep on the stack." + (let ((num (car node)) + (start (car (cdr node))) + (end (cond + ((eq (car (cdr (cdr node))) 'infty) + (nth (car node) str-lens)) + (t (car (cdr (cdr node)))))) + (children (car (nthcdr 4 node)))) + (list num start end children))) + +;;; For debug purposes + +;;;###autoload +(defmacro lcs-debug (&rest vars) + "Print the names and the values of VARS." + `(insert + (mapconcat 'identity + (list + ,@(mapcar + (lambda (var) + `(format "%s: %S" + ,(symbol-name var) + ,var)) + vars)) + "\n") + "\n")) + +;;;###autoload +(defun lcs-pretty-stack-element (element) + "Return a pretty representation of ELEMENT." + (concat + "(" + (mapconcat + (lambda (slot) + (format "%S" + (cond + ((hash-table-p slot) + (gst-pretty-hash-table slot)) + ((consp slot) + (mapconcat #'lcs-pretty-stack-element slot ", ")) + (t slot)))) + element ", ") + ")")) + +;;;###autoload +(defun lcs-pretty-stack (stack) + "Prettify a stack." + (concat + "[ " + (mapconcat #'lcs-pretty-stack-element + stack + ",") + " ]" + "\n")) + +;;;###autoload +(defun lcs-prepare-env (long-env) + "Take the first three elements out of LONG-ENV." + (cons (car long-env) + (cons (cadr long-env) + (cons (caddr long-env) + nil)))) + +;;; Interpreting the result + +;;;###autoload +(defun lcs-interpret (result strs &optional just-length) + "Since the raw RESULT of `lcs' is not human-readable, we interpret it. +If JUST-LENGTH is non-nil, then this only returns the length of +the longest common substrings. Otherwise, it returns the longest +common substrings themselves. + +Of course, without STRS, we don't even know what our strings are." + (cond + (just-length (car result)) + (t (mapcar + (lambda (one-result) + (let ((chain one-result) + temp res-str) + (while (consp one-result) + (setq temp (car one-result)) + (setq res-str + (concat + (let ((nstr (nth (car temp) strs))) + (substring nstr + (cadr temp) + (min (caddr temp) (length nstr)))) + res-str)) + (setq one-result (cdr one-result))) + res-str)) + (cadr result))))) + +;;; The main engine + +;;;###autoload +(defun lcs (strs &optional just-length) + "Return the longest common substring of a list STRS of strings. + +If JUST-LENGTH is non-nil, then this only returns the length of the +longest common substring. Otherwise, it returns the length as well as the +longest common substring itself." + (let* ((strs (mapcar (lambda (str) + (vconcat str (list -1))) + strs)) + (str-lens (mapcar #'length strs)) + (tree (gst-build-for-strs strs str-lens)) + (max-height 0) + (bits-table (make-hash-table)) + (discovered-table (make-hash-table)) + (all-common (1- (expt 2 (length strs)))) + max-env stack current temp) + ;; The format of elements on the stack is as follows. + ;; (number start end children node-number parents) + + ;; Here number identifies the string this comes from. start, end + ;; and children have the same meaning as in the tree. Node-number + ;; is the number of the node that is recorded in the hash-table + ;; tree. And finally parents is a list of triples that represents + ;; parent segments. + (setq stack (progn + (maphash + (lambda (letter node) + (setq temp + (cons + (append + (lcs-get-env (gethash node tree) str-lens) + (list node nil ; parent strings represented as a triple of integers + )) + temp))) + (car (nthcdr 4 (gethash 1 tree)))) + temp)) + (setq temp nil) + ;; (insert (format "count: %d\n" count)) + ;; (lcs-debug all-common) + ;; (insert "stack: " (lcs-pretty-stack stack)) + (while (consp stack) + ;; (insert "stack: " (lcs-pretty-stack stack)) + (setq current (car stack)) + (let ((current-length (+ (lcs-get-parent-length (car (nthcdr 5 current))) + (- (min (caddr current) + ;; The final character does + ;; not count! + (1- (nth (car current) + str-lens))) + (cadr current)))) + (current-env (list (car current) + (cadr current) + (caddr current) + (car (nthcdr 3 current)))) + (current-children (car (nthcdr 3 current)))) + ;; (insert (format "\n\ncount: %d\n" count)) + ;; (insert "current: " (lcs-pretty-stack-element current) "\n") + ;; (lcs-debug current-length max-env max-height) + ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children))) + ;; (insert (format + ;; "discovered: %S\n" + ;; (gst-pretty-hash-table discovered-table))) + ;; (insert (format + ;; "bits: %S\n" + ;; (gst-pretty-hash-table bits-table))) + (cond + ((not (hash-table-empty-p current-children)) ; There are children + (cond + ((gethash (car (nthcdr 4 current)) discovered-table) + ;; This node is already traversed + ;; This means we have already given bits to its children + (let ((bit (progn + (setq temp 0) + (maphash (lambda (child-letter child-num) + (setq + temp + (logior temp + (gethash child-num bits-table)))) + current-children) + temp) + ;; An alternative + ;; (apply #'logior + ;; (mapcar (lambda (num) + ;; (gethash num bits-table)) + ;; (hash-table-values current-children))) + )) + (setq temp nil) + (puthash (car (nthcdr 4 current)) bit bits-table) + ;; (lcs-debug bit) + ;; (insert (format + ;; "bits: %S\n" + ;; (gst-pretty-hash-table bits-table))) + (cond + ((= bit all-common) ; a common substring + (cond + ((> current-length max-height) + (setq max-height current-length) + (setq max-env (list + (cons + (list (car current) + (cadr current) + (caddr current)) + (mapcar #'lcs-prepare-env + (car (nthcdr 5 current))))))) + ((= current-length max-height) + (setq max-env + (cons (cons + (list (car current) + (cadr current) + (caddr current)) + (mapcar #'lcs-prepare-env + (car (nthcdr 5 current)))) + max-env))))))) + (setq stack (cdr stack))) + (t ; a new node! + (setq stack + (append + (progn + (maphash + (lambda (letter node) + (setq temp + (cons + (append + (lcs-get-env (gethash node tree) str-lens) + (list node + (cons current-env (car (nthcdr 5 current))))) + temp))) + current-children) + temp) + stack)) + (puthash (car (nthcdr 4 current)) 1 discovered-table) + ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children))) + ;; (insert "stack: " (lcs-pretty-stack stack)) + (setq temp nil)))) + (t ; a leaf + (let* ((node-number (car (nthcdr 4 current))) + (leaf-labels (car (nthcdr 5 (gethash node-number tree)))) + (bit (apply #'logior + (mapcar (lambda (label) + (ash 1 (car label))) + leaf-labels)))) + ;; (insert "leaf") + ;; (lcs-debug node-number leaf-labels bit) + (puthash node-number bit bits-table) + (cond + ((= bit all-common) ; a common substring + (cond + ((> current-length max-height) + (setq max-height current-length) + (setq max-env (list + (cons + (list (car current) + (cadr current) + (caddr current)) + (mapcar #'lcs-prepare-env + (car (nthcdr 5 current))))))) + ((= current-length max-height) + (setq max-env + (cons (cons + (list (car current) + (cadr current) + (caddr current)) + (mapcar #'lcs-prepare-env + (car (nthcdr 5 current)))) + max-env)))))) + (setq stack (cdr stack))))))) + (list max-height max-env))) + +(provide 'lcs) +;;; LCS.el ends here -- cgit v1.2.3-18-g5258