summaryrefslogtreecommitdiff
path: root/suffix tree/LCS.el
blob: 41bcec0941fa24ee6d96f9228161944b6ae93b87 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
;;; LCS.el --- Longest Common Substrings of multiple strings -*- lexical-binding: t; -*-

;;; Author: Durand
;;; Version: 0.0.1

;;; Commentary:

;; First generate a generalized suffix tree of the strings, then
;; traverses the tree in a depth-first manner to find the nodes under
;; which there are suffixes from each and every string in question.
;; Finally collect and return the list of all substrings which are
;; common to all input strings and which have the longest length.

;;; Code:

;;; Our building block
(require 'generalized-suffix-tree)

;;;###autoload
(defun lcs-string (strs)
  "Return the longest common substring of STRS."
  (lcs-interpret (lcs strs) strs))

;;;###autoload
(defun lcs-edge-length (node str-lens)
  "Return the length of the edge going into NODE.

Unlike `gst-edge-length', this does not need to know about
POSITION and NUM, since it is assumed that the tree is already
built before calling this function.

If the node is a leaf, its end is denoted by infty, and we
wouldn't know the exact length if we don't know the length of the
string it corresponds to, so we need the argument STR-LENS to
provide that information."
  (let ((start (car (cdr node)))
        (end (cond
              ((eq (car (cdr (cdr node))) 'infty)
               (nth (car node) str-lens))
              (t (car (cdr (cdr node)))))))
    (- end start)))

;;;###autoload
(defun lcs-get-parent-length (parents)
  "Return the length of the parent strings PARENTS."
  (let* ((ls parents)
         (result 0))
    (while (consp ls)
      (setq result (+ result (- (caddr (car ls))
                                (cadr (car ls)))))
      (setq ls (cdr ls)))
    result))

;;;###autoload
(defun lcs-get-env (node str-lens)
  "Return the information we want to keep from NODE on the stack.

STR-LENS is here to provide information about the ending index of
NODE if it is a lead. See the documentation for `lcs-edge-length'
for more."
  (let ((num (car node))
        (start (car (cdr node)))
        (end (cond
              ((eq (car (cdr (cdr node))) 'infty)
               (nth (car node) str-lens))
              (t (car (cdr (cdr node))))))
        (children (car (nthcdr 4 node))))
    (list num start end children)))

;;; For debug purposes

;;;###autoload
(defmacro lcs-debug (&rest vars)
  "Print the names and the values of VARS."
  `(insert
    (mapconcat 'identity
     (list
      ,@(mapcar
         (lambda (var)
           `(format "%s: %S"
             ,(symbol-name var)
             ,var))
         vars))
     "\n")
    "\n"))

;;;###autoload
(defun lcs-pretty-stack-element (element)
  "Return a pretty representation of ELEMENT."
  (concat
   "("
   (mapconcat
    (lambda (slot)
      (format "%S"
              (cond
               ((hash-table-p slot)
                (gst-pretty-hash-table slot))
               ((consp slot)
                (mapconcat #'lcs-pretty-stack-element slot ", "))
               (t slot))))
    element ", ")
   ")"))

;;;###autoload
(defun lcs-pretty-stack (stack)
  "Prettify STACK.

Apply `lcs-pretty-stack-element' to each element, and concatenate
them in a clean way."
  (concat
   "[ "
   (mapconcat #'lcs-pretty-stack-element
              stack
              ",")
   " ]"
   "\n"))

;;;###autoload
(defun lcs-prepare-env (long-env)
  "Take the first three elements out of LONG-ENV."
  (cons (car long-env)
        (cons (cadr long-env)
              (cons (caddr long-env)
                    nil))))

;;; Interpreting the result

;;;###autoload
(defun lcs-interpret (result strs &optional just-length)
  "Since the raw RESULT of `lcs' is not human-readable, we interpret it.
If JUST-LENGTH is non-nil, then this only returns the length of
the longest common substrings. Otherwise, it returns the longest
common substrings themselves.

Of course, without STRS, we don't even know what our strings are."
  (cond
   (just-length (car result))
   (t (mapcar
       (lambda (one-result)
         (let ((chain one-result)
               temp res-str)
           (while (consp one-result)
             (setq temp (car one-result))
             (setq res-str
                   (concat
                    (let ((nstr (nth (car temp) strs)))
                      (substring nstr
                                 (cadr temp)
                                 (min (caddr temp) (length nstr))))
                    res-str))
             (setq one-result (cdr one-result)))
           res-str))
       (cadr result)))))

;;; The main engine

;;;###autoload
(defun lcs (strs &optional just-length)
  "Return the longest common substring of a list STRS of strings.

If JUST-LENGTH is non-nil, then this only returns the length of the
longest common substring. Otherwise, it returns the length as well as the
longest common substring itself."
  (let* ((strs (mapcar (lambda (str)
                         (vconcat str (list -1)))
                       strs))
         (str-lens (mapcar #'length strs))
         (tree (gst-build-for-strs strs str-lens))
         (max-height 0)
         (bits-table (make-hash-table))
         (discovered-table (make-hash-table))
         (all-common (1- (expt 2 (length strs))))
         max-env stack current temp)
    ;; The format of elements on the stack is as follows.
    ;; (number start end children node-number parents)
    
    ;; Here number identifies the string this comes from. start, end
    ;; and children have the same meaning as in the tree. Node-number
    ;; is the number of the node that is recorded in the hash-table
    ;; tree. And finally parents is a list of triples that represents
    ;; parent segments.
    (setq stack (progn
                  (maphash
                   (lambda (letter node)
                     (setq temp
                           (cons
                            (append
                             (lcs-get-env (gethash node tree) str-lens)
                             (list node nil ; parent strings represented as a triple of integers
                                   ))
                            temp)))
                   (car (nthcdr 4 (gethash 1 tree))))
                  temp))
    (setq temp nil)
    ;; (insert (format "count: %d\n" count))
    ;; (lcs-debug all-common)
    ;; (insert "stack: " (lcs-pretty-stack stack))
    (while (consp stack)
      ;; (insert "stack: " (lcs-pretty-stack stack))
      (setq current (car stack))
      (let ((current-length (+ (lcs-get-parent-length (car (nthcdr 5 current)))
                               (- (min (caddr current)
                                       ;; The final character does
                                       ;; not count!
                                       (1- (nth (car current)
                                                str-lens)))
                                  (cadr current))))
            (current-env (list (car current)
                               (cadr current)
                               (caddr current)
                               (car (nthcdr 3 current))))
            (current-children (car (nthcdr 3 current))))
        ;; (insert (format "\n\ncount: %d\n" count))
        ;; (insert "current: " (lcs-pretty-stack-element current) "\n")
        ;; (lcs-debug current-length max-env max-height)
        ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children)))
        ;; (insert (format
        ;;          "discovered: %S\n"
        ;;          (gst-pretty-hash-table discovered-table)))
        ;; (insert (format
        ;;          "bits: %S\n"
        ;;          (gst-pretty-hash-table bits-table)))
        (cond
         ((not (hash-table-empty-p current-children))      ; There are children
          (cond
           ((gethash (car (nthcdr 4 current)) discovered-table)
            ;; This node is already traversed
            ;; This means we have already given bits to its children
            (let ((bit (progn
                         (setq temp 0)
                         (maphash (lambda (child-letter child-num)
                                    (setq
                                     temp
                                     (logior temp
                                             (gethash child-num bits-table))))
                                  current-children)
                         temp)
                       ;; An alternative
                       ;; (apply #'logior
                       ;;        (mapcar (lambda (num)
                       ;;                  (gethash num bits-table))
                       ;;                (hash-table-values current-children)))
                       ))
              (setq temp nil)
              (puthash (car (nthcdr 4 current)) bit bits-table)
              ;; (lcs-debug bit)
              ;; (insert (format
              ;;          "bits: %S\n"
              ;;          (gst-pretty-hash-table bits-table)))
              (cond
               ((= bit all-common)      ; a common substring
                (cond
                 ((> current-length max-height)
                  (setq max-height current-length)
                  (setq max-env (list
                                 (cons
                                  (list (car current)
                                        (cadr current)
                                        (caddr current))
                                  (mapcar #'lcs-prepare-env
                                          (car (nthcdr 5 current)))))))
                 ((= current-length max-height)
                  (setq max-env
                        (cons (cons
                               (list (car current)
                                     (cadr current)
                                     (caddr current))
                               (mapcar #'lcs-prepare-env
                                       (car (nthcdr 5 current))))
                              max-env)))))))
            (setq stack (cdr stack)))
           (t                           ; a new node!
            (setq stack
                  (append
                   (progn
                     (maphash
                      (lambda (letter node)
                        (setq temp
                              (cons
                               (append
                                (lcs-get-env (gethash node tree) str-lens)
                                (list node
                                      (cons current-env (car (nthcdr 5 current)))))
                               temp)))
                      current-children)
                     temp)
                   stack))
            (puthash (car (nthcdr 4 current)) 1 discovered-table)
            ;; (insert (format "current children: %S\n" (gst-pretty-hash-table current-children)))
            ;; (insert "stack: " (lcs-pretty-stack stack))
            (setq temp nil))))
         (t                             ; a leaf
          (let* ((node-number (car (nthcdr 4 current)))
                 (leaf-labels (car (nthcdr 5 (gethash node-number tree))))
                 (bit (apply #'logior
                             (mapcar (lambda (label)
                                       (ash 1 (car label)))
                                     leaf-labels))))
            ;; (insert "leaf")
            ;; (lcs-debug node-number leaf-labels bit)
            (puthash node-number bit bits-table)
            (cond
             ((= bit all-common)      ; a common substring
              (cond
               ((> current-length max-height)
                (setq max-height current-length)
                (setq max-env (list
                               (cons
                                (list (car current)
                                      (cadr current)
                                      (caddr current))
                                (mapcar #'lcs-prepare-env
                                        (car (nthcdr 5 current)))))))
               ((= current-length max-height)
                (setq max-env
                      (cons (cons
                             (list (car current)
                                   (cadr current)
                                   (caddr current))
                             (mapcar #'lcs-prepare-env
                                     (car (nthcdr 5 current))))
                            max-env))))))
            (setq stack (cdr stack)))))))
    (list max-height max-env)))

(provide 'lcs)
;;; LCS.el ends here