From 7033187abaf42772097377c0a1ffc2cd4cefdada Mon Sep 17 00:00:00 2001
From: JSDurand
Date: Fri, 4 Aug 2023 10:12:04 +0800
Subject: minor adjustments

Not bug deals but adjustments of details.
---
 src/test.el | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)

(limited to 'src/test.el')

diff --git a/src/test.el b/src/test.el
index d106a03..a6f6e7a 100644
--- a/src/test.el
+++ b/src/test.el
@@ -26,3 +26,116 @@
 ;; (rep-parse-string
 ;; test-parser
 ;; "183t3ru")
+
+;; Below is an experimental tokenizer implemented in Emacs Lisp.
+
+;; The tokens for the test grammar are as follows:
+;;
+;; 0. SP
+;; 1. %xA
+;; 2. TEXT
+;; 3. %x2A
+;; 4. note:
+;; 5. price:
+;; 6. DIGIT
+
+(defun example-tokenize (str)
+  "Tokenize the string STR.
+Return a cons cell (TOKENS . SPANS) of two vectors of equal
+length.  TOKENS holds the token numbers, which are non-negative
+integers, and SPANS holds the corresponding spans in the original
+input.  A span is represented as a cons-cell, whose `car' is the
+starting position in the input, and whose `cdr' is the length of
+the span.
+
+The tokens are as follows:
+
+0. SP
+1. %xA
+2. TEXT
+3. %x2A
+4. note:
+5. price:
+6. DIGIT
+
+A %x2A token is only produced at the beginning of a line; an
+asterisk anywhere else is treated as TEXT.  A DIGIT token covers
+a maximal run of decimal digits.  A TEXT token extends up to, but
+does not include, the next newline or the end of STR."
+  (let ((i 0) (len (length str)) result result-spans)
+    (while (< i len)
+      (let ((element (aref str i)))
+        (cond
+         ((= element 32)
+          (push 0 result)
+          (push (cons i 1) result-spans)
+          (setq i (1+ i)))
+         ((= element #xa)
+          (push 1 result)
+          (push (cons i 1) result-spans)
+          (setq i (1+ i)))
+         ;; An asterisk only counts as a token at the start of a line.
+         ((and (= element #x2a)
+               (or (= i 0) (= (aref str (1- i)) #xa)))
+          (push 3 result)
+          (push (cons i 1) result-spans)
+          (setq i (1+ i)))
+         ((<= ?0 element ?9)
+          ;; ELEMENT is a digit, so the run has at least one character.
+          (let ((j (1+ i)))
+            (while (and (< j len) (<= ?0 (aref str j) ?9))
+              (setq j (1+ j)))
+            (push 6 result)
+            (push (cons i (- j i)) result-spans)
+            (setq i j)))
+         ;; The length checks keep the `aref' calls below in bounds.
+         ((and (= element ?n)
+               (< (+ i 4) len)
+               (= (aref str (+ i 1)) ?o)
+               (= (aref str (+ i 2)) ?t)
+               (= (aref str (+ i 3)) ?e)
+               (= (aref str (+ i 4)) ?:))
+          (push 4 result)
+          (push (cons i 5) result-spans)
+          (setq i (+ i 5)))
+         ((and (= element ?p)
+               (< (+ i 5) len)
+               (= (aref str (+ i 1)) ?r)
+               (= (aref str (+ i 2)) ?i)
+               (= (aref str (+ i 3)) ?c)
+               (= (aref str (+ i 4)) ?e)
+               (= (aref str (+ i 5)) ?:))
+          (push 5 result)
+          (push (cons i 6) result-spans)
+          (setq i (+ i 6)))
+         (t
+          ;; Anything else starts a TEXT token.  Exactly one span is
+          ;; recorded per token, so that both vectors stay aligned.
+          (let ((j (1+ i)))
+            (while (and (< j len) (/= (aref str j) #xa))
+              (setq j (1+ j)))
+            (push 2 result)
+            (push (cons i (- j i)) result-spans)
+            (setq i j))))))
+    (cons (vconcat (nreverse result))
+          (vconcat (nreverse result-spans)))))
+
+(defvar test-document (expand-file-name
+                       "test.document"
+                       (expand-file-name
+                        "test-data"
+                        rep-dir))
+  "A document for testing purposes.")
+
+(defvar input-spans nil
+  "A vector that represents spans of tokens in the input.")
+
+(let ((result (example-tokenize (with-temp-buffer
+                                  (insert-file-contents test-document)
+                                  (buffer-substring-no-properties
+                                   (point-min) (point-max))))))
+  (setq input (car result))
+  (setq input-spans (cdr result)))
+
+(rep-recognize test-parser input)
+(rep-parse test-parser input)
-- 
cgit v1.2.3-18-g5258