blob: a6f6e7a8dd7dd5524b6e4a3ffbb505e4058cf047 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
|
;; Load the compiled `rep' module.
(load-file "rep.so")

;; Ask the VC Git backend for the repository root of `default-directory'.
(setq rep-dir (vc-call-backend 'Git 'root default-directory))

;; Create a parser from the contents of the test ABNF grammar, which
;; lives at grammar/abnf grammars/test.abnf below `rep-dir'.
(setq test-parser
      (let* ((grammar-dir (expand-file-name "grammar" rep-dir))
             (abnf-dir (expand-file-name "abnf grammars" grammar-dir))
             (grammar-file (expand-file-name "test.abnf" abnf-dir))
             (grammar-text (with-temp-buffer
                             (insert-file-contents grammar-file)
                             (buffer-string))))
        (rep-new-parser grammar-text)))

(defvar input nil "A vector that represents a testing input.")

;; A hand-written sequence of token codes used as a first test input.
(setq input (vector 3 0 2 2 2 1 1 1 0 1))

;; Run the recognizer and then the parser on the hand-written input.
(rep-recognize test-parser input)
(rep-parse test-parser input)
;; (rep-parse-string
;; test-parser
;; "* title\nprice: 512\nnote: this is a test\n")
;; (rep-parse-string
;; test-parser
;; "183t3ru")
;; Below is an experimental tokenizer implemented in Emacs Lisp.
;; The tokens for the test grammar are as follows:
;;
;; 0. SP
;; 1. %xA
;; 2. TEXT
;; 3. %x2A
;; 4. note:
;; 5. price:
;; 6. DIGIT
(defun example-tokenize (str)
  "Tokenize the string STR.
Return a cons cell (TOKENS . SPANS) of two vectors of equal
length.  TOKENS holds non-negative integer token codes, and SPANS
holds the corresponding spans in STR.  A span is a cons-cell
whose `car' is the zero-based starting position in STR, and whose
`cdr' is the length of the span.

The tokens are as follows:

0. SP      a single space character
1. %xA     a single newline character
2. TEXT    any other run of characters, up to but excluding the
           next newline or the end of STR
3. %x2A    an asterisk at the start of STR or right after a newline
4. note:   the literal \"note:\"
5. price:  the literal \"price:\"
6. DIGIT   a maximal run of the digits 0-9

An asterisk elsewhere, and an \"n\" or \"p\" that does not start
one of the two keywords, begin a TEXT token."
  (let ((i 0) (len (length str)) result result-spans)
    (while (< i len)
      (let ((element (aref str i)))
        (cond
         ;; SP
         ((= element 32)
          (push 0 result)
          (push (cons i 1) result-spans)
          (setq i (1+ i)))
         ;; %xA
         ((= element #xa)
          (push 1 result)
          (push (cons i 1) result-spans)
          (setq i (1+ i)))
         ;; %x2A, recognized only at the beginning of a line.
         ((and (= element #x2a)
               (or (= i 0) (= (aref str (1- i)) #xa)))
          (push 3 result)
          (push (cons i 1) result-spans)
          (setq i (1+ i)))
         ;; DIGIT: ELEMENT is already known to be a digit, so scan
         ;; from the next position to the end of the run.
         ((and (<= ?0 element) (<= element ?9))
          (let ((j (1+ i)))
            (while (and (< j len)
                        (<= ?0 (aref str j))
                        (<= (aref str j) ?9))
              (setq j (1+ j)))
            (push 6 result)
            (push (cons i (- j i)) result-spans)
            (setq i j)))
         ;; note: -- the bound check ensures that index I+4 exists.
         ((and (= element ?n)
               (< (+ i 4) len)
               (= (aref str (+ i 1)) ?o)
               (= (aref str (+ i 2)) ?t)
               (= (aref str (+ i 3)) ?e)
               (= (aref str (+ i 4)) ?:))
          (push 4 result)
          (push (cons i 5) result-spans)
          (setq i (+ i 5)))
         ;; price: -- the bound check ensures that index I+5 exists.
         ((and (= element ?p)
               (< (+ i 5) len)
               (= (aref str (+ i 1)) ?r)
               (= (aref str (+ i 2)) ?i)
               (= (aref str (+ i 3)) ?c)
               (= (aref str (+ i 4)) ?e)
               (= (aref str (+ i 5)) ?:))
          (push 5 result)
          (push (cons i 6) result-spans)
          (setq i (+ i 6)))
         ;; TEXT: everything up to the next newline.  This must be a
         ;; plain default clause: recording a span in the condition
         ;; itself would add an extra span per TEXT token and leave
         ;; the two result vectors misaligned.  ELEMENT is not a
         ;; newline here, so the token spans at least one character
         ;; and the outer loop always advances.
         (t
          (let ((j (1+ i)))
            (while (and (< j len) (/= (aref str j) #xa))
              (setq j (1+ j)))
            (push 2 result)
            (push (cons i (- j i)) result-spans)
            (setq i j))))))
    (cons (apply #'vector (nreverse result))
          (apply #'vector (nreverse result-spans)))))
(defvar test-document
  (let ((data-dir (expand-file-name "test-data" rep-dir)))
    (expand-file-name "test.document" data-dir))
  "A document for testing purposes.")

(defvar input-spans nil
  "A vector that represents spans of tokens in the input.")

;; Read the test document, tokenize its text, and store the token
;; vector in `input' and the span vector in `input-spans'.
(let* ((document-text
        (with-temp-buffer
          (insert-file-contents test-document)
          (buffer-substring-no-properties (point-min) (point-max))))
       (tokenized (example-tokenize document-text)))
  (setq input (car tokenized)
        input-spans (cdr tokenized)))

;; Run the recognizer and then the parser on the tokenized document.
(rep-recognize test-parser input)
(rep-parse test-parser input)
|