summaryrefslogtreecommitdiff
path: root/comb
diff options
context:
space:
mode:
Diffstat (limited to 'comb')
-rw-r--r--comb/orderless-conf.el39
-rw-r--r--comb/suffiex-tree.txt217
-rw-r--r--comb/suffix-tree.el328
3 files changed, 39 insertions, 545 deletions
diff --git a/comb/orderless-conf.el b/comb/orderless-conf.el
new file mode 100644
index 0000000..8af75e4
--- /dev/null
+++ b/comb/orderless-conf.el
@@ -0,0 +1,39 @@
+;;; -*- lexical-binding: t; -*-
+
+(use-package "orderless" 'orderless
+ (setq completion-styles '(orderless partial-completion)
+ completion-category-defaults nil
+ completion-flex-nospace nil
+ completion-pcm-complete-word-inserts-delimiters t
+ completion-show-help nil
+ completion-ignore-case t
+ read-file-name-completion-ignore-case t
+ read-buffer-completion-ignore-case t
+ resize-mini-windows t)
+
+ (setq orderless-component-separator " +"
+ orderless-matching-styles '(orderless-prefixes
+ orderless-literal
+ orderless-regexp
+ orderless-strict-leading-initialism)
+ orderless-style-dispatchers '(equal-means-literal-dispatcher
+ comma-means-initial-dispatcher))
+
+ ;; unbind space in the minibuffer
+ (define-key minibuffer-local-completion-map (vector 32) nil)
+
+ ;;; dispatchers
+
+;;;###autoload
+ (defun equal-means-literal-dispatcher (pattern _index _total)
+ "If a pattern begins with an equal sign, then we use literals."
+ (cond
+ ((= (aref pattern 0) ?=)
+ (cons 'orderless-literal (substring pattern 1)))))
+
+;;;###autoload
+ (defun comma-means-initial-dispatcher (pattern _index _total)
+ "If a pattern begins with a comma, then we use initials to match."
+ (cond
+ ((= (aref pattern 0) ?,)
+ (cons 'orderless-strict-leading-initialism (substring pattern 1))))))
diff --git a/comb/suffiex-tree.txt b/comb/suffiex-tree.txt
deleted file mode 100644
index 51908a3..0000000
--- a/comb/suffiex-tree.txt
+++ /dev/null
@@ -1,217 +0,0 @@
-Title: Suffix Trees
-Author: JSDurand
-Created: 2020-01-03
--------------------
-
-======================================================================
- Motivation to implement this algorithm in Emacs
-======================================================================
-
-The reason I want to implement this algorithm is a problem I
-encountered in using the package "orderless", which provides a
-completion-style for the built-in completion system in Emacs. The
-problem is that the function "orderless-try-completion" does not
-handle completion aggressively in a way that conforms to the
-documentation of "try-completion". To be more precise, if there is
-only one match for the current text input, then this function returns
-that match (rather than comforming to the requirement in the
-documentation of "try-completion" by returning t, since it wants to
-highlight the matches by itself). This isn't a problem from the
-perspective of the user, and if there are no matches, then this
-correctly returns nil. But in any other cases, this function returns
-the original string, instead of the longest common substring, as a
-user might desire.
-
-Initially, this does not seem like a serious concern: the user could
-still select a completion from the "*Completions*" buffer once the
-list of candidates becomes small enough to easily manage. But as time
-goes by, this starts to frustrate me. For example, when there are only
-two candidates left, but one is a substring of another, then I cannot
-use the completion feature to quickly select (or "complete to") the
-shorter candidate, unless I type out the full candidate string, or to
-choose from the "*Completions*" buffer. For me this kind of defeats
-the main purpose of the built-in completion system.
-
-So I start wondering, is it possible to fix the problem by finding the
-longest common substring in all the matches? From a naïve first
-impression, this seems to be what the user might expect in most cases.
-
-
-
-======================================================================
- Choice of the algorithm
-======================================================================
-
-After some thinking and searching through the internet, I found that
-perhaps the most flexible and performant solution to the problem of
-finding the longest common substring(s) of multiple strings is to
-build a "generalized suffix tree" of them, and then use a tree
-traversal to find the longest common substring(s). Well, this is all
-fun and great. The only problem is that it is difficult to build a
-suffix tree (or a generalized one).
-
-So I decide to implement the seemingly fastest algorithm to construct
-a suffix tree of strings and hope that this can not only solve my
-problem but also help others out in some other problems in the future.
-
-
-
-======================================================================
- Definitions
-======================================================================
-
-I describe the basic definition of a suffix tree briefly below.
-
-In this document a string is a sequence of alphabets. In particular,
-for the case of Emacs, these alphabets are just numbers. We begin by
-considering a string S of length n. A suffix of S is a substring of S
-of the form S[i..n] for some i from 1 to n. And we also say the empty
-string is a suffix of S. A suffix tree of S is defined as a rooted
-tree T (so it has a node called "root" that is the ancestor of every
-other node) whose every edge has a label which is a substring of S
-that satisfies the following conditions:
-
-- Starting from the root of T, walking down any path to a leaf, and
- concatenating the labels along the way, then we will get a suffix of
- S. And every suffix of S is obtained in this way as well.
-- Every node has at least two out-going edges.
-- For every node, every two out-going edges cannot have labels that
- start with the same letter. (So two edges with labels both starting
- with 'a' cannot emanate from the same node.)
-
-Intuitively speaking, this is to list all suffixes of S as an edge
-from the root to a leaf, and then "merge" these suffixes so that any
-common prefix among some of them is in only one edge.
-
-
-
-======================================================================
- Description of the algorithm
-======================================================================
-
-Below is a breif description of the algorithm. For a description in
-"plain English", see the accepted answer to this Stack Overflow post.
-
-https://stackoverflow.com/questions/9452701/
-
-For a more detailed survey on the principle behind the algorithm and
-on many other related topics, see the book "Algorithms on Strings,
-Trees, and Sequences: Computer Science and Computational Biology" by
-Dan Gusfield, or if you prefer reading the original paper, then the
-original paper by Ukkonen is as follows.
-
-https://link.springer.com/article/10.1007/BF01206331
-
-(I am fortunate enough to be able to access the article. If you want
-to read that PDF and don't want to pay Springer, let me know and I can
-send you the file.)
-
-Given a string S of length n, we will first append a symbol that is
-not present anywhere in S, in order to ensure that no suffix of S is
-also the prefix of another suffix of S; otherwise S cannot have a
-suffix tree. I refer to this terminating symbol as $.
-
-Also an edge of the tree is not labelled explicitly by strings. To do
-so would violate already the linear time constraint. Instead, we
-represent each edge with a pair of integers, interpreted as the
-indices of the starting and the ending points of the associated
-substring in T.
-
-The algorithm has n + 1 iterations. We start with the following
-variables:
-
-- s = root
-- k = 1
-- i = 0
-# - A root node.
-# - An "active point" with value (root, nil, 0).
-# - "remainder" with value 1.
-
-Then we want to add n + 1 symbols to the tree iteratively.
-
-In the i-th iteration we look to add the i-th letter S (i) of S. In
-Emacs Lisp this is expressed as (aref S i).
-
-# substring
-# S[(i-remainder+1)..i]
-
-And in each iteration we do the following things:
-
-- (setq i (+ i 1))
-- (let ((result (update s k i)))
- (setq s (car result))
- (setq k (cadr result)))
- We update by adding the i-th letter S(i) to the current active point
- indicated by (s, k, i - 1), see below.
-- (let ((result (canonize s k i)))
- (setq s (car result))
- (setq k (cadr result)))
- We canonize the new active point returned by the update function.
- The process of canonization is to find the closest node to the
- point.
-
-Then we repeat until S(i) equals the terminating symbol (this way we
-avoid calculating the length of the string beforehand, since in Emacs
-Lisp the string does not have the length pre-calculated. Though this
-does not affect the overall time complexity, it might affect the
-practical performance.
-
-The function "update is described below.
-
-It taks three arguments: s, k, and i. First let oldr = root. Then let
-(end-p, r) be the result of (test-and-split s k (- i 1) S(i)).
-
-
-
-
-
-
-
-
-
-
-
-
-
-# begin a "remainder loop". Whenever the remainder
-# loop ends, we go to the next iteration.
-
-We compare the letter S(i) with the labels of edges going out from the
-active point. That is, if the active point is (node, edge_label, m),
-then this is the length m point in the edge going out from node with
-the first letter of label given as edge_label.
-
-If the letter is the prefix of some edge label, then we set the
-active point to (node, edge_label, m+1) and increment remainder by 1.
-If m+1 is greater than or equal to the length of the current edge,
-then set the active point to follow that edge to the new point.
-
-Then we end the remainder loop and skip to the next iteration.
-
----
-
-If the letter is not the prefix of any edge label emanating from the
-active point, and m > 0, then we split the point (node, edge_label, m)
-into a node and add a new leaf to that node, with label S(i). And then
-we decrement remainder by 1.
-
-If this is not the first time we split a node in the current
-iteration, then we add a "suffix link" from the previously created
-node to the newly created node.
-
-If the "node" in the specification of the active point is the root,
-then we set the active point to (root, S(i-remainder+1), m-1).
-
-If the "node" is not the root, then if the node has a suffix link to
-other_node, then we set the active point to the following:
-(other_node, S(i-remainder+1), m-1)
-
-If the node has no suffix link, then we set the active point to the
-following: (root, S(i-remainder+1), m-1)
-
----
-
-If the letter is not the prefix of any edge label emanating from the
-active point, and if m = 0, then we add a new leaf with label S(i) to
-node, and decrement remainder by 1. Then we follow the same rules to
-reset the active point.
diff --git a/comb/suffix-tree.el b/comb/suffix-tree.el
deleted file mode 100644
index af55ba9..0000000
--- a/comb/suffix-tree.el
+++ /dev/null
@@ -1,328 +0,0 @@
-;;; suffiex-tree.el --- Ukkonen algorithm for building a suffix tree -*- lexical-binding: t; -*-
-
-;;; Author: Durand
-;;; Version: 0.0.1
-
-;;; Commentary:
-
-;; Our node is represented as a list of the following elements:
-
-;; start , which is the starting index of the edge going from its parent
-;; node to this node
-;; end , the index of the end of the above edge
-;; suffix-link, the index of the node this node's suffix-link points to
-;; children , a hash-table of pairs of integers and indices of its
-;; children
-
-;; To compute the length of the edge going into NODE we use:
-;; (- (min end (1+ st-position)) start)
-;; which is actually how far the position is on that edge,
-;; if it is on that edge.
-
-;;; Code:
-
-;;;###autoload
-(defun st-min (&rest args)
- "Return the minimum among ARGS.
-If an argument is 'infty, then it is considered greater than
-every number."
- (apply #'min (delq nil
- (mapcar
- (lambda (arg)
- (cond
- ((number-or-marker-p arg) arg)
- ((eq arg 'infty) nil)))
- args))))
-
-;;;###autoload
-(defun st-edge-length (node position)
- "Return the length of the edge into NODE.
-See the comment above this function for the reason
-POSITION is here."
- (- (st-min (car (cdr node)) (1+ position))
- (car node)))
-
-;;;###autoload
-(defun st-new-node (tree last-added start &optional end)
- "Make a new node with START and END as the coming edge.
-Then add the new node to TREE.
-LAST-ADDED is the number of elements already in the TREE."
- (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf
- )
- (suffix-link 0) ;; The suffix link is initially 0
- (new-node
- (list start end
- suffix-link
- (make-hash-table))))
- (puthash (1+ last-added) new-node tree)
- (1+ last-added)))
-
-;;;###autoload
-(defun st-add-suffix-link (tree need-sl node)
- "If NEED-SL is positive, then add the suffix link.
-In particular, the node corresponding to NEED-SL in TREE
-gets a suffix link pointing to NODE.
-
-This always returns NODE."
- (cond
- ((and (> need-sl 1) (/= need-sl node))
- (setcar (cdr (cdr (gethash need-sl tree))) node)))
- node)
-
-;;;###autoload
-(defun st-canonize (tree node position active-edge-index active-length active-node)
- "Walk down TREE to find the correct active point."
- (let ((node-edge-length (st-edge-length (gethash node tree) position)))
- (cond
- ((>= active-length node-edge-length)
- (list t
- (+ active-edge-index
- node-edge-length)
- (- active-length
- node-edge-length)
- node))
- (t
- (list nil
- active-edge-index
- active-length
- active-node)))))
-
-;;;###autoload
-(defun st-extend-tree (tree last-added position remain
- active-node active-edge-index
- active-length character str)
- "Extend a tree by CHARACTER.
-The return value is
-(tree last-added remain active-node
- active-edge-index active-length)"
- (let* ((need-sl 0)
- (remain (1+ remain))
- continue-p breakp)
- (while (and (not breakp) (> remain 0))
- (setq continue-p nil breakp nil)
- (cond
- ((= active-length 0)
- (setq active-edge-index position)))
- (let* ((actual-node (gethash active-node tree))
- (nxt (cond
- (actual-node
- (gethash (aref str active-edge-index)
- (cadr (cdr (cdr actual-node))))))))
- (cond
- ((null nxt)
- (let ((leaf (st-new-node tree last-added position)))
- (setq last-added leaf)
- (puthash (aref str active-edge-index)
- leaf
- (cadr (cdr (cdr (gethash active-node tree)))))
- (setq need-sl (st-add-suffix-link tree need-sl active-node)))
- ;; rule 2
- )
- (t
- (let* ((result (st-canonize
- tree nxt position active-edge-index
- active-length active-node)))
- (cond
- ((car result)
- ;; observation 2
- (setq active-edge-index (cadr result))
- (setq active-length (caddr result))
- (setq active-node (cadr (cddr result)))
- (setq continue-p t))
- (t
- (cond
- ((eq (aref str (+ active-length
- (car (gethash nxt tree))))
- character)
- ;; observation 1
- (setq active-length (1+ active-length))
- (setq need-sl (st-add-suffix-link tree need-sl active-node))
- (setq breakp t)))
- (cond
- (breakp)
- (t ;; splitting
- (let ((split (st-new-node
- tree last-added (car (gethash nxt tree))
- (+ (car (gethash nxt tree)) active-length))))
- (setq last-added split)
- (puthash
- (aref str active-edge-index)
- split (cadr (cdr (cdr (gethash active-node tree)))))
- (let ((leaf (st-new-node tree last-added position)))
- (setq last-added leaf)
- (puthash character leaf
- (cadddr (gethash split tree)))
- (setcar (gethash nxt tree)
- (+ (car (gethash nxt tree))
- active-length))
- (puthash (aref str (car (gethash nxt tree))) nxt
- (cadddr (gethash split tree)))
- ;; rule 2
- (setq need-sl
- (st-add-suffix-link tree need-sl split)))))))))))
- (cond
- ((or continue-p breakp))
- (t
- (setq remain (1- remain))
- (cond
- ((and (eq active-node 1) ; root
- (> active-length 0))
- (setq active-length (1- active-length))
- (setq active-edge-index
- (1+ (- position remain))))
- (t
- (setq active-node
- (let ((slink (caddr (gethash active-node tree))))
- (cond
- ((> slink 1) slink)
- ; or root
- (t 1))))))))))
- (list tree last-added remain active-node
- active-edge-index active-length)))
-
-;;;###autoload
-(defun st-build-for-str (str)
- "Build the suffix tree for STR."
- (let* ((position 0)
- (character (ignore-error 'error (aref str position)))
- (tree (make-hash-table))
- (last-added 0)
- (remain 0)
- (active-node (st-new-node tree last-added -1 -1))
- (active-edge-index 0)
- (active-length 0)
- result)
- (setq last-added active-node)
- (while character
- (setq result (st-extend-tree tree last-added position remain
- active-node active-edge-index
- active-length character str))
- (setq tree (pop result))
- (setq last-added (pop result))
- (setq remain (pop result))
- (setq active-node (pop result))
- (setq active-edge-index (pop result))
- (setq active-length (pop result))
- (setq position (1+ position))
- (setq character (ignore-error 'error (aref str position))))
- tree))
-
-;;;###autoload
-(defun st-print-tree-for-str (str)
- "Print the suffix tree for STR."
- (let* ((symbol (make-symbol "test")))
- (set symbol (st-build-for-str str))
- (insert "suffix tree for \"" str "\"" "\n")
- (st-print-tree (symbol-value symbol) str)))
-
-;;;###autoload
-(defun st-print-tree (tree str)
- "Print TREE with the aid of STR."
- (let* ((symbol-tree (make-symbol "test-tree")))
- (set symbol-tree (hierarchy-new))
- (maphash
- (lambda (key value)
- (hierarchy-add-tree
- (symbol-value symbol-tree) key nil
- (lambda (item)
- (hash-table-values (cadr (cdr (cdr (gethash item tree))))))))
- tree)
- (hierarchy-print (symbol-value symbol-tree)
- (lambda (item)
- (cond ((= item 1) "root")
- (t (let ((node (gethash item tree)))
- (substring str (car node) (cond ((integerp (cadr node)) (cadr node))))
- ;; (format "%d): (%S, %S): (%d): \"%s\""
- ;; item
- ;; (car node)
- ;; (cadr node)
- ;; (caddr node)
- ;; (substring str (car node) (cond ((integerp (cadr node)) (cadr node))))
- ;; )
- )))))))
-
-;;; Some printing functions
-
-(use-package "hierarchy" 'hierarchy)
-
-(provide 'suffix-tree)
-;;; suffiex-tree.el ends here
-
-;;; archive
-
-;; ;;;###autoload
-;; (defvar st-root nil
-;; "The root of the suffix tree.")
-
-;; ;;;###autoload
-;; (defvar st-position nil
-;; "The current position in the string.")
-
-;; ;;;###autoload
-;; (defvar st-wait-for-link nil
-;; "The node that is waiting to have a suffix link.")
-
-;; ;;;###autoload
-;; (defvar st-remain nil
-;; "The number of remaining suffixes to insert.")
-
-;; ;;;###autoload
-;; (defvar st-active-node nil
-;; "Where to insert a new suffix.")
-
-;; ;;;###autoload
-;; (defvar st-active-edge-index nil
-;; "The index of the active edge.")
-
-;; ;;;###autoload
-;; (defvar st-active-length nil
-;; "How far down is the active point down the active edge.")
-
-;; (insert (format "after character %c, the tree becomes\n" character))
-;; (insert (format "string is %s\n" (substring str 0 (1+ position))))
-;; (insert (format "active-node: %d\n" active-node))
-;; (insert (format "active-edge: %c\n" (aref str active-edge-index)))
-;; (insert (format "active-length: %d\n" active-length))
-;; (cond ((eq last-added 11)
-;; (insert (format "slink: %d\n" (caddr (gethash active-node tree))))
-;; (insert (format "con: %S" continue-p))
-;; (insert (format "breakp: %S\n" breakp))))
-;; (st-print-tree tree (substring str 0 (1+ position)))
-;; (insert "\n\n")
-
-
-;; ;;;###autoload
-;; (defvar st-output-buffer-name "*suffix-tree*"
-;; "The name of the buffer that contains the output.")
-
-;; ;;;###autoload
-;; (defun st-print-tree (tree node str prefix)
-;; "Print TREE in the dedicated output buffer starting at NODE.
-;; PREFIX is used to recursively call this function.
-
-;; We need STR since the information we stored in the tree
-;; only contains the index into STR,
-;; for the sake of optimizations."
-;; (with-temp-buffer-window st-output-buffer-name '((display-buffer-at-bottom)) nil
-;; (st-print-tree-internal tree node str prefix)))
-
-;; ;;;###autoload
-;; (defun st-print-tree-internal (tree node str prefix)
-;; "The function that really prints out the data."
-;; (let* ((node-data (gethash node tree))
-;; (start (car node-data))
-;; (end (cadr node-data))
-;; (edges (cdddr node-data))
-;; (node-num (format "(%d)" node))
-;; (node-label (format "%s-%s-%s"
-;; prefix node-num
-;; (substring str start end))))
-;; (prin1 node-label)
-;; ;; print the first edge
-;; (let* ((prefix-base (concat
-;; prefix
-;; (make-string (length node-label) 32)))
-;; ;; TODO
-;; ))))
-
-