summaryrefslogtreecommitdiff
path: root/suffix tree/generalized-suffix-tree.el
blob: 043561490cd94a16a826454921cd306dd33a0566 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
;;; generalized-suffiex-tree.el --- Building a generalized suffix tree -*- lexical-binding: t; -*-

;;; Author: Durand
;;; Version: 0.0.2

;;; Commentary:

;; Our node is represented as a list of the following elements:

;; number     , from which string this edge comes
;; start      , which is the starting index of the edge going from its parent
;;              node to this node
;; end        , the index of the end of the above edge
;; suffix-link, the index of the node this node's suffix-link points to
;; children   , a hash-table of pairs of integers and indices of its
;;              children
;; leaf-labels, a list of pairs (number start), where
;;              number means the number-th string and
;;              start is the starting position of this edge.
;;              This element is nil for the internal nodes.

;; To compute the length of the edge going into NODE we use:
;; (- (min end (1+ position)) start)
;; which is actually how far the position is on that edge,
;; if it is on that edge.

;; We use only one terminating symbol, and that is -1.

;;; Code:

;;;###autoload
(defun gst-min (&rest args)
  "Return the minimum among ARGS.
If an argument is 'infty, then it is considered greater than
every number."
  (apply #'min (delq nil
                     (mapcar
                      (lambda (arg)
                        (cond
                         ((number-or-marker-p arg) arg)
                         ((eq arg 'infty) nil)))
                      args))))

;;;###autoload
(defun gst-edge-length (node position num str-lens)
  "Return the length of the edge into NODE.
See the comment above this function for the reason
POSITION and NUM are here.

And STR-LENS is here so that we know the lengths of previous strings."
  (let* ((node-active-num (car node))
         (leafp (eq (car (cdr (cdr node))) 'infty))
         (leaf-labels (cond (leafp (car (nthcdr 5 node))))))
    (- (cond
        ((and (/= node-active-num num)
              leafp)
         (nth node-active-num str-lens))
        ((/= node-active-num num)
         (caddr node))
        (t (gst-min (caddr node) (1+ position))))
       (cond
        ((and (/= node-active-num num)
              leafp)
         (or (cadr (assoc num leaf-labels #'eq))
             (cadr node)))
        (t (cadr node))))))

;;;###autoload
(defun gst-new-node (tree last-added number start &optional end suffix-start)
  "Make a new node with START and END as the coming edge.
NUMBER is the number of string the label belongs to.
Then add the new node to TREE.
LAST-ADDED is the number of elements already in the TREE."
  (let* ((end (or end 'infty) ;; 'infty represents the index of a leaf
              )
         (suffix-link 0) ;; The suffix link is initially 0
         (new-node
          (cond ((eq end 'infty)
                 (list number start end suffix-link (make-hash-table)
                       (list (list number suffix-start))))
                (t
                 (list number start end suffix-link (make-hash-table))))))
    (puthash (1+ last-added) new-node tree)
    (1+ last-added)))

;;;###autoload
(defun gst-add-leaf-label (tree leaf number start)
  "Add a label to LEAF in TREE.
NUMBER and START represent the label."
  (let* ((actual-node (gethash leaf tree))
         (leaf-labels (cdr (cdr (cdr (cdr (cdr actual-node)))))))
    (cond ((and (consp leaf-labels)
                (not (eq (caaar leaf-labels) number)))
           (setcar leaf-labels
                   (cons (list number start)
                         (car leaf-labels)))))))

;;;###autoload
(defun gst-add-suffix-link (tree need-sl node)
  "If NEED-SL is positive, then add the suffix link.
In particular, the node corresponding to NEED-SL in TREE
gets a suffix link pointing to NODE.

This always returns NODE."
  (let ((suffix-link-cdr (nthcdr 3 (gethash need-sl tree))))
    (cond
     ((and (> need-sl 1) (/= need-sl node)
           (= (car suffix-link-cdr) 0))
      (setcar suffix-link-cdr node))))
  node)

;;;###autoload
(defun gst-canonize (tree node position active-number active-edge-index active-length active-node
                          str-lens)
  "Walk down TREE to find the correct active point.

To be precise, we start from NODE, use POSITION, ACTIVE-NUMBER,
and STR-LENS to calculate the length of the edge, and compare
with ACTIVE-LENGTH. If it is less than that, then that means we
shall continue walking down the tree, so we return t in the first
element, while setting other variables to the correct values so
that the caller of this function can use the updated values.

ACTIVE-EDGE-INDEX is present solely for the purpose of updating."
  (let* ((actual-node (gethash node tree))
         (node-edge-length (gst-edge-length actual-node position active-number
                                            str-lens)))
    (cond
     ((>= active-length node-edge-length)
      (list t
            active-number
            (+ active-edge-index
               node-edge-length)
            (- active-length
               node-edge-length)
            node))
     (t
      (list nil
            active-number
            active-edge-index
            active-length
            active-node)))))

;;;###autoload
(defsubst gst-aref (strs num index)
  "Return the INDEX th element in NUM th element of STRS."
  (aref (nth num strs) index))

;;;###autoload
(defun gst-extend-tree (tree last-added position remain
                             active-node active-number active-edge-index
                             active-length character strs num str-lens)
  "Extend TREE by CHARACTER in NUM-th string of STRS.
The return value is
\(tree
 last-added    remain            active-node
 active-number active-edge-index active-length)

Other parameters: LAST-ADDED, POSITION, REMAIN, ACTIVE-NODE,
ACTIVE-NUMBER, ACTIVE-EDGE-INDEX, and ACTIVE-LENGTH have special
meanings in the algorithm.

STR-LENS can be supplied so that we don't have to calculate the
lengths repeatedly."
  (let* ((need-sl 0)
         (remain (1+ remain))
         continue-p breakp terminating-no-split)
    (while (and (not breakp) (> remain 0))
      (setq continue-p nil breakp nil)
      (setq terminating-no-split nil)
      (cond
       ((= active-length 0)
        (setq active-edge-index position)
        (setq active-number num)))
      (let* ((actual-node (gethash active-node tree))
             (nxt (cond
                   (actual-node
                    (gethash (gst-aref strs active-number active-edge-index)
                             (car (nthcdr 4 actual-node)))))))
        (cond
         ((null nxt)
          ;; We don't want many terminating characters branches.
          (cond
           ((and (eq character -1)
                 (eq (caddr actual-node) 'infty)
                 (eq (gst-aref strs (car actual-node)
                               (cadr actual-node))
                     -1)))
           (t
            (let ((leaf (gst-new-node tree last-added num position nil
                                      (- position remain -1))))
              (setq last-added leaf)
              (puthash (gst-aref strs active-number active-edge-index)
                       leaf
                       (car (nthcdr 4 (gethash active-node tree))))
              (setq need-sl (gst-add-suffix-link tree need-sl active-node)))))
          ;; rule 2
          )
         (t
          (let* ((result (gst-canonize
                          tree nxt position active-number
                          active-edge-index active-length active-node
                          str-lens)))
            (cond
             ((car result)
              ;; observation 2
              (setq active-number (car (cdr result)))
              (setq active-edge-index (car (cdr (cdr result))))
              (setq active-length (car (cdr (cdr (cdr result)))))
              (setq active-node (car (cdr (cdr (cdr (cdr result))))))
              (setq continue-p t))
             (t
              (cond
               ((eq (gst-aref strs
                              (car (gethash nxt tree))
                              (+ active-length
                                 (cadr (gethash nxt tree))))
                    character)
                ;; observation 1
                (setq active-length (1+ active-length))
                (setq need-sl (gst-add-suffix-link tree need-sl active-node))
                ;; (setq breakp t)
                ;; terminating symbol special handling
                (cond
                 ((eq character -1)     ; terminating symbol
                  (gst-add-leaf-label tree nxt num (- position remain -1))
                  (setq active-length (1- active-length))
                  ;; We don't want to split since this is a match. But
                  ;; we don't want to break since this is a
                  ;; terminating symbol. So we invent a new variable
                  ;; to do this.
                  (setq terminating-no-split t)
                  ;; But if remain = 1 then we do want to break.
                  (cond
                   ((eq remain 1)
                    (setq breakp t)))
                  )
                 (t
                  (setq breakp t)))))
              (cond
               ((or breakp terminating-no-split))
               (t                       ; splitting
                (let ((split (gst-new-node
                              tree last-added (car (gethash nxt tree)) (cadr (gethash nxt tree))
                              (+ (cadr (gethash nxt tree)) active-length))))
                  (setq last-added split)
                  (puthash
                   (gst-aref strs active-number active-edge-index)
                   split (cadr (cdr (cdr (cdr (gethash active-node tree))))))
                  (let ((leaf (gst-new-node tree last-added num position
                                            nil (- position remain -1))))
                    (setq last-added leaf)
                    (puthash character
                             leaf
                             (cadddr (cdr (gethash split tree))))
                    (setcar (cdr (gethash nxt tree))
                            (min
                             (+ (cadr (gethash nxt tree))
                                active-length)
                             (1- (nth (car (gethash nxt tree)) str-lens))))
                    (puthash (gst-aref strs
                                       (car (gethash nxt tree))
                                       (cadr (gethash nxt tree)))
                             nxt
                             (cadddr (cdr (gethash split tree))))
                    ;; rule 2
                    (setq need-sl
                          (gst-add-suffix-link tree need-sl split)))))))))))
        (cond
         ((or continue-p breakp))
         (t
          (setq remain (1- remain))
          (cond
           ((and (eq active-node 1) ; root
                 (> active-length 0))
            (setq active-length (1- active-length))
            (setq active-edge-index
                  (1+ (- position remain))))
           (t
            (setq active-node
                  (let ((slink (cadddr (gethash active-node tree))))
                    (cond       ; follow the suffix link or go to root
                     ((> slink 1) slink)
                     (t 1)))))))))
      ;; (cond ((eq character -1)
      ;;        (insert (format "Extending terminating character\n"))
      ;;        (insert "strs: " (format "%S" strs) "\n")
      ;;        (lcs-debug remain active-node active-edge-index
      ;;                   active-number active-length num
      ;;                   continue-p breakp terminating-no-split)
      ;;        (gst-print-tree tree strs)))
      )
    (list tree last-added remain active-node
          active-number active-edge-index active-length)))

;;;###autoload
(defun gst-build-for-strs (strs &optional str-lens)
  "Build the generalized suffix tree for STRS.
One can optionally provide STR-LENS to avoid calculating the
lengths again."
  (let* ((len (length strs))
         (index 0)
         (tree (make-hash-table))
         (last-added 0)
         (active-node (gst-new-node tree last-added index -1 -1))
         (str-lens (or str-lens (mapcar #'length strs))))
    (setq last-added active-node)
    (while (< index len)
      (let* ((position 0)
             (character (gst-aref strs index position))
             (remain 0)
             (active-node 1)            ; start from the root
             (active-number index)      ; root comes
             (active-edge-index 0)
             (active-length 0)
             result)                    ; temporary holder
        (while (and (< position (nth index str-lens)) character)
          ;; (setq old-character character)
          (setq result (gst-extend-tree tree last-added position remain
                                        active-node active-number active-edge-index
                                        active-length character strs index
                                        str-lens))
          (setq tree (pop result))
          (setq last-added (pop result))
          (setq remain (pop result))
          (setq active-node (pop result))
          (setq active-number (pop result))
          (setq active-edge-index (pop result))
          (setq active-length (pop result))
          (setq position (1+ position))
          (setq character (ignore-errors (gst-aref strs index position)))
          ;; (cond
          ;;  ((characterp old-character)
          ;;   (insert (format "After adding character %c:\n" old-character))
          ;;   (gst-print-tree tree
          ;;                   (append (seq-take strs index)
          ;;                           (list (concat
          ;;                                  (seq-take (nth index strs) position)
          ;;                                  "$"))))
          ;;   (insert "\n\n"))
          ;;  ((= old-character -1)
          ;;   (insert (format "After adding character -1:\n"))
          ;;   (gst-print-tree tree
          ;;                   (seq-take strs (1+ index)))
          ;;   (insert "\n\n")))
          ))
      (setq index (1+ index))
      ;; (insert "\n\n")
      )
    tree))

;;; Printing

;; This section is not necessary. It is here just to print the trees
;; to make sure we don't make some strange errors.

(require 'hierarchy)
(require 'seq)

;;;###autoload
(defun gst-print-strs (strs)
  "Print strings STRS nicely."
  (mapc (lambda (str)
          (mapc (lambda (char)
                  (cond ((characterp char) (insert char))
                        (t (insert ", "))))
                str))
        strs))

;;;###autoload
(defun gst-pretty-hash-table (table)
  "Only return the useful parts from TABLE.

To be precise, this returns the keys and values of the
hash-table, in the form of an alist."
  (let (keys-values-alist)
    (maphash
     (lambda (key value)
       (setq keys-values-alist
             (cons
              (cons key value)
              keys-values-alist)))
     table)
    keys-values-alist))

;;;###autoload
(defun gst-pretty-node (node)
  "Outputs a prettified NODE."
  (format "(%d %d %s %d %s %S)"
          (car node) (cadr node)
          (caddr node) (cadddr node)
          (gst-pretty-hash-table (car (nthcdr 4 node)))
          (car (nthcdr 5 node))))

;;;###autoload
(defun gst-print-tree-for-strs (strs)
  "Print the generalized suffix tree for STRS."
  (let* ((strs (mapcar (lambda (str)
                         (vconcat str (list -1)))
                       strs))
         (symbol (make-symbol "gtree")))
    (set symbol (gst-build-for-strs strs))
    (insert "Generalized suffix tree for: ")
    (gst-print-strs strs)
    (delete-region (- (point) 2) (point))
    (insert ":\n")
    (gst-print-tree (symbol-value symbol) strs)))

;;;###autoload
(defvar gst-full-error-p nil
  "Whether to print full nodes in the output.")

;;;###autoload
(defun gst-print-tree (tree strs)
  "Print TREE with the aid of STRS."
  (let* ((symbol-tree (make-symbol "new-hierarchy"))
         (strs (mapcar (lambda (str)
                         (mapcar (lambda (c)
                                   (cond ((eq c -1) ?$)
                                         (c)))
                                 str))
                       strs)))
    (set symbol-tree (hierarchy-new))
    (maphash
     (lambda (key value)
       (hierarchy-add-tree
        (symbol-value symbol-tree) key nil
        (lambda (item)
          (hash-table-values (car (nthcdr 4 (gethash item tree)))))))
     tree)
    (hierarchy-print
     (symbol-value symbol-tree)
     (lambda (item)
       (cond ((= item 1) "root")
             (t (let ((actual-str
                       (let* ((node (gethash item tree))
                              (leafp (eq (caddr node) 'infty))
                              (leaf-labels
                               (cond (leafp (car (nthcdr 5 node))))))
                         (concat
                          (seq-subseq
                           (nth (car node) strs)
                           (min (cadr node) (1- (length (nth (car node) strs))))
                           (cond (leafp -1)
                                 (t (caddr node))))
                          (cond
                           (leafp
                            (apply
                             #'concat
                             (mapcar
                              (lambda (label)
                                (let ((first (car label))
                                      (second (cadr label)))
                                  (format "$ (%d : %d)"
                                          first second)))
                              leaf-labels))))))))
                  (cond (gst-full-error-p
                         (concat
                          (format "key: %d, %s, %s" item actual-str
                                  (gst-pretty-node (gethash item tree)))))
                        (t actual-str)))))))))

(provide 'generalized-suffix-tree)

;;; generalized-suffix-tree.el ends here