diff options
Diffstat (limited to 'comb')
-rw-r--r-- | comb/orderless-conf.el | 39 | ||||
-rw-r--r-- | comb/suffiex-tree.txt | 217 | ||||
-rw-r--r-- | comb/suffix-tree.el | 328 |
3 files changed, 39 insertions, 545 deletions
diff --git a/comb/orderless-conf.el b/comb/orderless-conf.el new file mode 100644 index 0000000..8af75e4 --- /dev/null +++ b/comb/orderless-conf.el @@ -0,0 +1,39 @@ +;;; -*- lexical-binding: t; -*- + +(use-package "orderless" 'orderless + (setq completion-styles '(orderless partial-completion) + completion-category-defaults nil + completion-flex-nospace nil + completion-pcm-complete-word-inserts-delimiters t + completion-show-help nil + completion-ignore-case t + read-file-name-completion-ignore-case t + read-buffer-completion-ignore-case t + resize-mini-windows t) + + (setq orderless-component-separator " +" + orderless-matching-styles '(orderless-prefixes + orderless-literal + orderless-regexp + orderless-strict-leading-initialism) + orderless-style-dispatchers '(equal-means-literal-dispatcher + comma-means-initial-dispatcher)) + + ;; unbind space in the minibuffer + (define-key minibuffer-local-completion-map (vector 32) nil) + + ;;; dispatchers + +;;;###autoload + (defun equal-means-literal-dispatcher (pattern _index _total) + "If a pattern begins with an equal sign, then we use literals." + (cond + ((= (aref pattern 0) ?=) + (cons 'orderless-literal (substring pattern 1))))) + +;;;###autoload + (defun comma-means-initial-dispatcher (pattern _index _total) + "If a pattern begins with a comma, then we use initials to match." + (cond + ((= (aref pattern 0) ?,) + (cons 'orderless-strict-leading-initialism (substring pattern 1)))))) diff --git a/comb/suffiex-tree.txt b/comb/suffiex-tree.txt deleted file mode 100644 index 51908a3..0000000 --- a/comb/suffiex-tree.txt +++ /dev/null @@ -1,217 +0,0 @@ -Title: Suffix Trees -Author: JSDurand -Created: 2020-01-03 -------------------- - -====================================================================== - Motivation to implement this algorithm in Emacs -====================================================================== - -The reason I want to implement this algorithm is a problem I -encountered in using the package "orderless", which provides a -completion-style for the built-in completion system in Emacs. The -problem is that the function "orderless-try-completion" does not -handle completion aggressively in a way that conforms to the -documentation of "try-completion". To be more precise, if there is -only one match for the current text input, then this function returns -that match (rather than comforming to the requirement in the -documentation of "try-completion" by returning t, since it wants to -highlight the matches by itself). This isn't a problem from the -perspective of the user, and if there are no matches, then this -correctly returns nil. But in any other cases, this function returns -the original string, instead of the longest common substring, as a -user might desire. - -Initially, this does not seem like a serious concern: the user could -still select a completion from the "*Completions*" buffer once the -list of candidates becomes small enough to easily manage. But as time -goes by, this starts to frustrate me. For example, when there are only -two candidates left, but one is a substring of another, then I cannot -use the completion feature to quickly select (or "complete to") the -shorter candidate, unless I type out the full candidate string, or to -choose from the "*Completions*" buffer. For me this kind of defeats -the main purpose of the built-in completion system. - -So I start wondering, is it possible to fix the problem by finding the -longest common substring in all the matches? From a naïve first -impression, this seems to be what the user might expect in most cases. - - - -====================================================================== - Choice of the algorithm -====================================================================== - -After some thinking and searching through the internet, I found that -perhaps the most flexible and performant solution to the problem of -finding the longest common substring(s) of multiple strings is to -build a "generalized suffix tree" of them, and then use a tree -traversal to find the longest common substring(s). Well, this is all -fun and great. The only problem is that it is difficult to build a -suffix tree (or a generalized one). - -So I decide to implement the seemingly fastest algorithm to construct -a suffix tree of strings and hope that this can not only solve my -problem but also help others out in some other problems in the future. - - - -====================================================================== - Definitions -====================================================================== - -I describe the basic definition of a suffix tree briefly below. - -In this document a string is a sequence of alphabets. In particular, -for the case of Emacs, these alphabets are just numbers. We begin by -considering a string S of length n. A suffix of S is a substring of S -of the form S[i..n] for some i from 1 to n. And we also say the empty -string is a suffix of S. A suffix tree of S is defined as a rooted -tree T (so it has a node called "root" that is the ancestor of every -other node) whose every edge has a label which is a substring of S -that satisfies the following conditions: - -- Starting from the root of T, walking down any path to a leaf, and - concatenating the labels along the way, then we will get a suffix of - S. And every suffix of S is obtained in this way as well. -- Every node has at least two out-going edges. -- For every node, every two out-going edges cannot have labels that - start with the same letter. (So two edges with labels both starting - with 'a' cannot emanate from the same node.) - -Intuitively speaking, this is to list all suffixes of S as an edge -from the root to a leaf, and then "merge" these suffixes so that any -common prefix among some of them is in only one edge. - - - -====================================================================== - Description of the algorithm -====================================================================== - -Below is a breif description of the algorithm. For a description in -"plain English", see the accepted answer to this Stack Overflow post. - -https://stackoverflow.com/questions/9452701/ - -For a more detailed survey on the principle behind the algorithm and -on many other related topics, see the book "Algorithms on Strings, -Trees, and Sequences: Computer Science and Computational Biology" by -Dan Gusfield, or if you prefer reading the original paper, then the -original paper by Ukkonen is as follows. - -https://link.springer.com/article/10.1007/BF01206331 - -(I am fortunate enough to be able to access the article. If you want -to read that PDF and don't want to pay Springer, let me know and I can -send you the file.) - -Given a string S of length n, we will first append a symbol that is -not present anywhere in S, in order to ensure that no suffix of S is -also the prefix of another suffix of S; otherwise S cannot have a -suffix tree. I refer to this terminating symbol as $. - -Also an edge of the tree is not labelled explicitly by strings. To do -so would violate already the linear time constraint. Instead, we -represent each edge with a pair of integers, interpreted as the -indices of the starting and the ending points of the associated -substring in T. - -The algorithm has n + 1 iterations. We start with the following -variables: - -- s = root -- k = 1 -- i = 0 -# - A root node. -# - An "active point" with value (root, nil, 0). -# - "remainder" with value 1. - -Then we want to add n + 1 symbols to the tree iteratively. - -In the i-th iteration we look to add the i-th letter S (i) of S. In -Emacs Lisp this is expressed as (aref S i). - -# substring -# S[(i-remainder+1)..i] - -And in each iteration we do the following things: - -- (setq i (+ i 1)) -- (let ((result (update s k i))) - (setq s (car result)) - (setq k (cadr result))) - We update by adding the i-th letter S(i) to the current active point - indicated by (s, k, i - 1), see below. -- (let ((result (canonize s k i))) - (setq s (car result)) - (setq k (cadr result))) - We canonize the new active point returned by the update function. - The process of canonization is to find the closest node to the - point. - -Then we repeat until S(i) equals the terminating symbol (this way we -avoid calculating the length of the string beforehand, since in Emacs -Lisp the string does not have the length pre-calculated. Though this -does not affect the overall time complexity, it might affect the -practical performance. - -The function "update is described below. - -It taks three arguments: s, k, and i. First let oldr = root. Then let -(end-p, r) be the result of (test-and-split s k (- i 1) S(i)). - - - - - - - - - - - - - -# begin a "remainder loop". Whenever the remainder -# loop ends, we go to the next iteration. - -We compare the letter S(i) with the labels of edges going out from the -active point. That is, if the active point is (node, edge_label, m), -then this is the length m point in the edge going out from node with -the first letter of label given as edge_label. - -If the letter is the prefix of some edge label, then we set the -active point to (node, edge_label, m+1) and increment remainder by 1. -If m+1 is greater than or equal to the length of the current edge, -then set the active point to follow that edge to the new point. - -Then we end the remainder loop and skip to the next iteration. - ---- - -If the letter is not the prefix of any edge label emanating from the -active point, and m > 0, then we split the point (node, edge_label, m) -into a node and add a new leaf to that node, with label S(i). And then -we decrement remainder by 1. - -If this is not the first time we split a node in the current -iteration, then we add a "suffix link" from the previously created -node to the newly created node. - -If the "node" in the specification of the active point is the root, -then we set the active point to (root, S(i-remainder+1), m-1). - -If the "node" is not the root, then if the node has a suffix link to -other_node, then we set the active point to the following: -(other_node, S(i-remainder+1), m-1) - -If the node has no suffix link, then we set the active point to the -following: (root, S(i-remainder+1), m-1) - ---- - -If the letter is not the prefix of any edge label emanating from the -active point, and if m = 0, then we add a new leaf with label S(i) to -node, and decrement remainder by 1. Then we follow the same rules to -reset the active point. diff --git a/comb/suffix-tree.el b/comb/suffix-tree.el deleted file mode 100644 index af55ba9..0000000 --- a/comb/suffix-tree.el +++ /dev/null @@ -1,328 +0,0 @@ -;;; suffiex-tree.el --- Ukkonen algorithm for building a suffix tree -*- lexical-binding: t; -*- - -;;; Author: Durand -;;; Version: 0.0.1 - -;;; Commentary: - -;; Our node is represented as a list of the following elements: - -;; start , which is the starting index of the edge going from its parent -;; node to this node -;; end , the index of the end of the above edge -;; suffix-link, the index of the node this node's suffix-link points to -;; children , a hash-table of pairs of integers and indices of its -;; children - -;; To compute the length of the edge going into NODE we use: -;; (- (min end (1+ st-position)) start) -;; which is actually how far the position is on that edge, -;; if it is on that edge. - -;;; Code: - -;;;###autoload -(defun st-min (&rest args) - "Return the minimum among ARGS. -If an argument is 'infty, then it is considered greater than -every number." - (apply #'min (delq nil - (mapcar - (lambda (arg) - (cond - ((number-or-marker-p arg) arg) - ((eq arg 'infty) nil))) - args)))) - -;;;###autoload -(defun st-edge-length (node position) - "Return the length of the edge into NODE. -See the comment above this function for the reason -POSITION is here." - (- (st-min (car (cdr node)) (1+ position)) - (car node))) - -;;;###autoload -(defun st-new-node (tree last-added start &optional end) - "Make a new node with START and END as the coming edge. -Then add the new node to TREE. -LAST-ADDED is the number of elements already in the TREE." - (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf - ) - (suffix-link 0) ;; The suffix link is initially 0 - (new-node - (list start end - suffix-link - (make-hash-table)))) - (puthash (1+ last-added) new-node tree) - (1+ last-added))) - -;;;###autoload -(defun st-add-suffix-link (tree need-sl node) - "If NEED-SL is positive, then add the suffix link. -In particular, the node corresponding to NEED-SL in TREE -gets a suffix link pointing to NODE. - -This always returns NODE." - (cond - ((and (> need-sl 1) (/= need-sl node)) - (setcar (cdr (cdr (gethash need-sl tree))) node))) - node) - -;;;###autoload -(defun st-canonize (tree node position active-edge-index active-length active-node) - "Walk down TREE to find the correct active point." - (let ((node-edge-length (st-edge-length (gethash node tree) position))) - (cond - ((>= active-length node-edge-length) - (list t - (+ active-edge-index - node-edge-length) - (- active-length - node-edge-length) - node)) - (t - (list nil - active-edge-index - active-length - active-node))))) - -;;;###autoload -(defun st-extend-tree (tree last-added position remain - active-node active-edge-index - active-length character str) - "Extend a tree by CHARACTER. -The return value is -(tree last-added remain active-node - active-edge-index active-length)" - (let* ((need-sl 0) - (remain (1+ remain)) - continue-p breakp) - (while (and (not breakp) (> remain 0)) - (setq continue-p nil breakp nil) - (cond - ((= active-length 0) - (setq active-edge-index position))) - (let* ((actual-node (gethash active-node tree)) - (nxt (cond - (actual-node - (gethash (aref str active-edge-index) - (cadr (cdr (cdr actual-node)))))))) - (cond - ((null nxt) - (let ((leaf (st-new-node tree last-added position))) - (setq last-added leaf) - (puthash (aref str active-edge-index) - leaf - (cadr (cdr (cdr (gethash active-node tree))))) - (setq need-sl (st-add-suffix-link tree need-sl active-node))) - ;; rule 2 - ) - (t - (let* ((result (st-canonize - tree nxt position active-edge-index - active-length active-node))) - (cond - ((car result) - ;; observation 2 - (setq active-edge-index (cadr result)) - (setq active-length (caddr result)) - (setq active-node (cadr (cddr result))) - (setq continue-p t)) - (t - (cond - ((eq (aref str (+ active-length - (car (gethash nxt tree)))) - character) - ;; observation 1 - (setq active-length (1+ active-length)) - (setq need-sl (st-add-suffix-link tree need-sl active-node)) - (setq breakp t))) - (cond - (breakp) - (t ;; splitting - (let ((split (st-new-node - tree last-added (car (gethash nxt tree)) - (+ (car (gethash nxt tree)) active-length)))) - (setq last-added split) - (puthash - (aref str active-edge-index) - split (cadr (cdr (cdr (gethash active-node tree))))) - (let ((leaf (st-new-node tree last-added position))) - (setq last-added leaf) - (puthash character leaf - (cadddr (gethash split tree))) - (setcar (gethash nxt tree) - (+ (car (gethash nxt tree)) - active-length)) - (puthash (aref str (car (gethash nxt tree))) nxt - (cadddr (gethash split tree))) - ;; rule 2 - (setq need-sl - (st-add-suffix-link tree need-sl split))))))))))) - (cond - ((or continue-p breakp)) - (t - (setq remain (1- remain)) - (cond - ((and (eq active-node 1) ; root - (> active-length 0)) - (setq active-length (1- active-length)) - (setq active-edge-index - (1+ (- position remain)))) - (t - (setq active-node - (let ((slink (caddr (gethash active-node tree)))) - (cond - ((> slink 1) slink) - ; or root - (t 1)))))))))) - (list tree last-added remain active-node - active-edge-index active-length))) - -;;;###autoload -(defun st-build-for-str (str) - "Build the suffix tree for STR." - (let* ((position 0) - (character (ignore-error 'error (aref str position))) - (tree (make-hash-table)) - (last-added 0) - (remain 0) - (active-node (st-new-node tree last-added -1 -1)) - (active-edge-index 0) - (active-length 0) - result) - (setq last-added active-node) - (while character - (setq result (st-extend-tree tree last-added position remain - active-node active-edge-index - active-length character str)) - (setq tree (pop result)) - (setq last-added (pop result)) - (setq remain (pop result)) - (setq active-node (pop result)) - (setq active-edge-index (pop result)) - (setq active-length (pop result)) - (setq position (1+ position)) - (setq character (ignore-error 'error (aref str position)))) - tree)) - -;;;###autoload -(defun st-print-tree-for-str (str) - "Print the suffix tree for STR." - (let* ((symbol (make-symbol "test"))) - (set symbol (st-build-for-str str)) - (insert "suffix tree for \"" str "\"" "\n") - (st-print-tree (symbol-value symbol) str))) - -;;;###autoload -(defun st-print-tree (tree str) - "Print TREE with the aid of STR." - (let* ((symbol-tree (make-symbol "test-tree"))) - (set symbol-tree (hierarchy-new)) - (maphash - (lambda (key value) - (hierarchy-add-tree - (symbol-value symbol-tree) key nil - (lambda (item) - (hash-table-values (cadr (cdr (cdr (gethash item tree)))))))) - tree) - (hierarchy-print (symbol-value symbol-tree) - (lambda (item) - (cond ((= item 1) "root") - (t (let ((node (gethash item tree))) - (substring str (car node) (cond ((integerp (cadr node)) (cadr node)))) - ;; (format "%d): (%S, %S): (%d): \"%s\"" - ;; item - ;; (car node) - ;; (cadr node) - ;; (caddr node) - ;; (substring str (car node) (cond ((integerp (cadr node)) (cadr node)))) - ;; ) - ))))))) - -;;; Some printing functions - -(use-package "hierarchy" 'hierarchy) - -(provide 'suffix-tree) -;;; suffiex-tree.el ends here - -;;; archive - -;; ;;;###autoload -;; (defvar st-root nil -;; "The root of the suffix tree.") - -;; ;;;###autoload -;; (defvar st-position nil -;; "The current position in the string.") - -;; ;;;###autoload -;; (defvar st-wait-for-link nil -;; "The node that is waiting to have a suffix link.") - -;; ;;;###autoload -;; (defvar st-remain nil -;; "The number of remaining suffixes to insert.") - -;; ;;;###autoload -;; (defvar st-active-node nil -;; "Where to insert a new suffix.") - -;; ;;;###autoload -;; (defvar st-active-edge-index nil -;; "The index of the active edge.") - -;; ;;;###autoload -;; (defvar st-active-length nil -;; "How far down is the active point down the active edge.") - -;; (insert (format "after character %c, the tree becomes\n" character)) -;; (insert (format "string is %s\n" (substring str 0 (1+ position)))) -;; (insert (format "active-node: %d\n" active-node)) -;; (insert (format "active-edge: %c\n" (aref str active-edge-index))) -;; (insert (format "active-length: %d\n" active-length)) -;; (cond ((eq last-added 11) -;; (insert (format "slink: %d\n" (caddr (gethash active-node tree)))) -;; (insert (format "con: %S" continue-p)) -;; (insert (format "breakp: %S\n" breakp)))) -;; (st-print-tree tree (substring str 0 (1+ position))) -;; (insert "\n\n") - - -;; ;;;###autoload -;; (defvar st-output-buffer-name "*suffix-tree*" -;; "The name of the buffer that contains the output.") - -;; ;;;###autoload -;; (defun st-print-tree (tree node str prefix) -;; "Print TREE in the dedicated output buffer starting at NODE. -;; PREFIX is used to recursively call this function. - -;; We need STR since the information we stored in the tree -;; only contains the index into STR, -;; for the sake of optimizations." -;; (with-temp-buffer-window st-output-buffer-name '((display-buffer-at-bottom)) nil -;; (st-print-tree-internal tree node str prefix))) - -;; ;;;###autoload -;; (defun st-print-tree-internal (tree node str prefix) -;; "The function that really prints out the data." -;; (let* ((node-data (gethash node tree)) -;; (start (car node-data)) -;; (end (cadr node-data)) -;; (edges (cdddr node-data)) -;; (node-num (format "(%d)" node)) -;; (node-label (format "%s-%s-%s" -;; prefix node-num -;; (substring str start end)))) -;; (prin1 node-label) -;; ;; print the first edge -;; (let* ((prefix-base (concat -;; prefix -;; (make-string (length node-label) 32))) -;; ;; TODO -;; )))) - - |