summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--durand-dict.el796
1 files changed, 676 insertions, 120 deletions
diff --git a/durand-dict.el b/durand-dict.el
index 09b6e06..0e1f8e7 100644
--- a/durand-dict.el
+++ b/durand-dict.el
@@ -32,130 +32,686 @@
;;; Code:
+;; structure library
+(require 'cl-lib)
+
+;; parsing library
+(require 'dom)
+
+;; regular expression building library
+(require 'rx)
+
+;; URL fetching library
+(require 'url)
+
+;;;; Derived mode
+
(define-derived-mode dudict-mode org-mode "DUDICT"
- "Major mode for viewing my custom dictionaries.")
-
-(defun dudict-parse-agarathi (word)
- "Request for the definition of Tamil WORD from the dictionary
-website agarathi.
-
-WORD should be acceptable by agarathi: no processing is done to
-convert from some transliteration to the Tamil script."
- (let ((buffer
- (url-retrieve-synchronously
- (concat "https://agarathi.com/word/" word)
- t t 20))
- result titles count)
- (with-current-buffer buffer
- (let* ((dom (libxml-parse-html-region
- (point-min) (point-max)))
- (timeline-ul (delq
- nil
- (mapcar
- (lambda (node)
- (cond ((string= (dom-attr node 'class)
- "timeline")
- node)))
- (dom-by-tag dom 'ul))))
- (timelines (delq
- nil
- (mapcar
- (lambda (node)
- (let ((class (dom-attr node 'class)))
- (cond
- ((or
- (string= class "timeline-normal")
- (string= class "timeline-inverted"))
- node))))
- (dom-by-tag timeline-ul 'li))))
- (timeline-panels
- (mapcar
- (lambda (node) (dom-by-class node "timeline-panel"))
- timelines))
- (timeline-panels
- (delq
- nil
- (mapcar (lambda (node)
- (let ((result
- (dudict-parse-agarathi-panel node)))
- (cond
- ((string=
- (cadr result)
- "Sponsored Links")
- nil)
- (result))))
- timeline-panels))))
- (setq result timeline-panels)
- (kill-buffer buffer)))
- (setq buffer (get-buffer-create (concat "dudict - " word)))
- (with-current-buffer buffer
- (delete-region (point-min) (point-max))
- (goto-char (point-min))
- (mapc
- (lambda (panel)
- (setq titles (car panel))
- (setq panel (cdr panel))
- ;; insert panel source
- (insert "* " (car panel))
- (setq panel (cdr panel))
- (setq count 0)
- (mapc
- (lambda (def)
- (newline)
- (insert
- (format
- "** %s\n\n"
- (dom-text (nth count titles))))
- (setq count (1+ count))
- (mapc
- (lambda (def)
- (insert
- (mapconcat
- #'identity
- (split-string def "\n" t "\\s-+")
- " ")
- " ")
- ;; (newline)
- )
- def)
- (newline))
- panel)
- (newline 2))
- result))
- (switch-to-buffer buffer)
- (dudict-mode)
- (setq-local truncate-lines nil)
- (goto-char (point-min))))
-
-(defun dudict-parse-agarathi-panel (panel)
- "A helper function to find the source of the panel and parse the
-contents of PANEL into a list of definitions.
-
-This is designed to be used by the function `dudict-parse-agarathi'."
- (let* ((panel (car panel))
- (panel-title
- (dom-texts (car (dom-by-class panel "timeline-title"))))
- (titles (delq
- nil
- (mapcar
- (lambda (node)
- (cond ((dom-attr node 'class) nil) (node)))
- (dom-by-tag panel 'h3))))
- (description
+ "Major mode for viewing my custom dictionaries."
+ (setq truncate-lines nil)
+ (view-mode t))
+
+;;;; Framework
+
+;;;;; Struct
+
+(cl-defstruct dudict-search
+ "Provides methods to (optionally) pre-process, to search, to
+parse, and to present the search results to the user."
+ (pre
+ nil
+ :documentation "Preprocessing function, optional"
+ :type function)
+ (search
+ nil
+ :documentation "Searching function, required"
+ :type function)
+ (parse
+ nil
+ :documentation "Parsing function, required"
+ :type function)
+ (present
+ nil
+ :documentation "Function to print the search result to the user."
+ :type function)
+ (name
+ "dudict"
+ :documentation
+ "Search name that is used to make the presentation buffer."
+ :type string))
+
+;;;;; Main framework function
+;; Main framework function
+(defun dudict-search (search-method word)
+ "Search for the definition of WORD by means of SEARCH-METHOD.
+SEARCH-METHOD should be a struct `dudict-search' that provides
+methods to pre-process, to search, and to post-process the search
+results. All this does is to integrate various methods provided."
+ (cond
+ ((dudict-search-p search-method))
+ ((user-error "Requires a dudict-search struct, but found %S"
+ search-method)))
+ (let* ((name (dudict-search-name search-method))
+ (preprocess (dudict-search-pre search-method))
+ (search (dudict-search-search search-method))
+ (parse (dudict-search-parse search-method))
+ (present (dudict-search-present search-method))
+ (present-name (concat name " - " word))
+ temp-buffer parse-result)
+ (cond
+ ((setq temp-buffer (get-buffer present-name))
+ ;; already searched
+ (switch-to-buffer temp-buffer))
+ (t
+ (cond
+ ((functionp preprocess)
+ (setq word (funcall preprocess word))))
+ (cond
+ ((functionp search))
+ (user-error "Requires a function to search, but found %S"
+ search))
+ (setq temp-buffer (funcall search word))
+ (cond
+ ((functionp parse))
+ (user-error "Requires a function to parse, but found %S"
+ parse))
+ (with-current-buffer temp-buffer
+ (setq parse-result (funcall parse)))
+ (kill-buffer temp-buffer)
+ (cond
+ ((functionp present))
+ ((user-error "Requires a function to present, but found %S"
+ present)))
+ (setq temp-buffer (get-buffer-create present-name))
+ (with-current-buffer temp-buffer
+ (delete-region (point-min) (point-max))
+ (funcall present parse-result))
+ (switch-to-buffer temp-buffer)
+ (dudict-mode)
+ (goto-char (point-min))))))
+
+;;;; agarathi dictionary
+;; For searching the agarathi dictionary, we for now do not need to
+;; preprocess the search term, but in the future it might be useful to
+;; add some preprocessing so that we do not need to enter in the Tamil
+;; script (some sources do not always use the Tamil scripts).
+;;
+;; The search function just queries the website and returns the
+;; buffer.
+;;
+;; The parsing function converts from the Document Object Model to our
+;; custom structs: This is for the ease and the convenience of
+;; presentation.
+;;
+;; Then the present function can iterate through the parse result
+;; easily and present the search result to the user in a
+;; well-structured format that is easy to read and understand.
+;;
+;; The name is surely the string "அகராதி" (अकरादि).
+
+;;;;; Search Struct
+
+(defvar dudict-search-agarathi nil
+ "The search struct that is used to search the agarathi dictionary.")
+
+(setq dudict-search-agarathi
+ (make-dudict-search
+ :name "அகராதி"
+ :pre nil
+ :search (function
+ (lambda (word)
+ (url-retrieve-synchronously
+ (concat "https://agarathi.com/word/" word)
+ t t 20)))
+ :parse #'dudict-parse-agarathi
+ :present #'dudict-present-agarathi))
+
+;;;;; Short-cut
+
+(defvar dudict-agarathi-history nil
+ "History variable for the agarathi search.")
+
+(defun dudict-search-agarathi (word)
+ "A short-cut that passes `dudict-search-agarathi' and WORD to
+`dudict-search'."
+ (interactive
+ (read-string "Enter search term: " nil 'dudict-agarathi-history))
+ (dudict-search dudict-search-agarathi word))
+
+;;;;; Structures of the dictionary website
+
+;;;;;; The entire time line
+(cl-defstruct agarathi-timeline
+ "An entire agarathi search result is called a timeline.
+It contains a list of panels."
+ (panels
+ nil
+ :documentation "The list of panels contained in the timeline."
+ :type list))
+
+;; verifying function
+(defun agarathi-check-valid-timeline (timeline)
+ "Check that an object is a valid timeline structure."
+ (declare (side-effect-free t) (pure t))
+ (and
+ (agarathi-timeline-p timeline)
+ (let ((ls (agarathi-timeline-panels timeline))
+ (result t)
+ temp)
+ (and
+ (listp ls)
+ (progn
+ (while (consp ls)
+ (setq temp (car-safe ls))
+ (setq ls (cdr-safe ls))
+ (cond
+ ((agarathi-check-valid-panel temp))
+ (t
+ (setq result nil)
+ (setq ls nil))))
+ result)))))
+
+;;;;;; A panel
+(cl-defstruct agarathi-panel
+ "A panel contains the search results from some single source.
+It contains a title and a list of descriptions, where each
+description corresponds to a set of definitions, roughly
+speaking."
+ (title
+ ""
+ :documentation "The title of the panel.
+Usually this is the source of the definition. When this is a
+pure advertisement panel, its title would be \"Sponsored Links\"."
+ :type string)
+ (descriptions
+ nil
+ :documentation "The list of descriptions contained in the panel."
+ :type list))
+
+;; verifying function
+(defun agarathi-check-valid-panel (panel)
+ "Check that an object is a valid panel structure."
+ (declare (side-effect-free t) (pure t))
+ (and
+ (agarathi-panel-p panel)
+ (let ((title (agarathi-panel-title panel))
+ (ls (agarathi-panel-descriptions panel))
+ (result t)
+ temp)
+ (and
+ (stringp title)
+ (listp ls)
+ (progn
+ (while (consp ls)
+ (setq temp (car-safe ls))
+ (setq ls (cdr-safe ls))
+ (cond
+ ((agarathi-check-valid-description temp))
+ (t
+ (setq result nil)
+ (setq ls nil))))
+ result)))))
+
+;;;;;; A Description
+(cl-defstruct agarathi-description
+ "A description contains some definitions along with some optional
+examples.
+
+It contains a title along with a list of definitions and a list
+of examples.
+
+Definitions and examples are basically strings."
+ (title
+ ""
+ :documentation "The title of a description.
+This in the convention of the dictionary website means the word
+that this description is describing. Since the dictionary can
+return descriptions that are not exactly equal to the searched
+term, it is necessary to keep this information for every
+description."
+ :type string)
+ (definitions
+ nil
+ :documentation "The list of definitions contained in the description."
+ :type list)
+ (examples
+ nil
+ :documentation "The list of examples contained in the description."
+ :type list))
+
+;; verifying function
+(defun agarathi-check-valid-description (description)
+ "Check that an object is a valid description structure."
+ (declare (side-effect-free t) (pure t))
+ (and
+ (agarathi-description-p description)
+ ;; check definitions
+ (let ((title (agarathi-description-title description))
+ (ls (agarathi-description-definitions description))
+ (result t)
+ temp)
+ (and
+ (stringp title)
+ (listp ls)
+ (progn
+ (while (consp ls)
+ (setq temp (car-safe ls))
+ (setq ls (cdr-safe ls))
+ (cond ((stringp temp)) (t (setq result nil) (setq ls nil))))
+ result)))
+ ;; check examples
+ (let ((ls (agarathi-description-examples description))
+ (result t)
+ temp)
+ (and
+ (listp ls)
+ (progn
+ (while (consp ls)
+ (setq temp (car-safe ls))
+ (setq ls (cdr-safe ls))
+ (cond ((stringp temp)) (t (setq result nil) (setq ls nil))))
+ result)))))
+
+;;;;; main function
+
+(defun dudict-parse-agarathi ()
+ "Parse the search result from the agarathi website.
+The current buffer is assumed to contain the search results already."
+ (let* ((dom (libxml-parse-html-region (point-min) (point-max)))
+ (timeline-ul
+ (car-safe
+ (delq
+ nil
+ (mapcar
+ (lambda (node)
+ (cond ((string= (dom-attr node 'class)
+ "timeline")
+ node)))
+ (dom-by-tag dom 'ul))))))
+ (cond
+ ((null timeline-ul)
+ (message "No timeline is found.")
+ nil)
+ ((dudict-parse-agarathi-timeline timeline-ul)))))
+
+;;;;; Archived code
+;; (defun dudict-search-agarathi (word)
+;; "Request for the definition of Tamil WORD from the dictionary
+;; website agarathi.
+
+;; WORD should be acceptable by agarathi: no processing is done to
+;; convert from some transliteration to the Tamil script."
+;; (let* ((name (concat "dudict - " word))
+;; (buffer-found (get-buffer name))
+;; (buffer
+;; (cond
+;; (buffer-found)
+;; ((url-retrieve-synchronously
+;; (concat "https://agarathi.com/word/" word)
+;; t t 20))))
+;; result titles count)
+;; (cond
+;; ((not buffer-found)
+;; (with-current-buffer buffer
+;; (let* ((dom (libxml-parse-html-region
+;; (point-min) (point-max)))
+;; (timeline-ul (delq
+;; nil
+;; (mapcar
+;; (lambda (node)
+;; (cond ((string= (dom-attr node 'class)
+;; "timeline")
+;; node)))
+;; (dom-by-tag dom 'ul))))
+;; (timelines (delq
+;; nil
+;; (mapcar
+;; (lambda (node)
+;; (let ((class (dom-attr node 'class)))
+;; (cond
+;; ((or
+;; (string= class "timeline-normal")
+;; (string=
+;; class "timeline-inverted"))
+;; node))))
+;; (dom-by-tag timeline-ul 'li))))
+;; (timeline-panels
+;; (mapcar
+;; (lambda (node) (dom-by-class node "timeline-panel"))
+;; timelines))
+;; (timeline-panels
+;; (delq
+;; nil
+;; (mapcar (lambda (node)
+;; (let ((result
+;; (dudict-parse-agarathi-panel node)))
+;; (cond
+;; ((string=
+;; (cadr result)
+;; "Sponsored Links")
+;; nil)
+;; (result))))
+;; timeline-panels))))
+;; (setq result timeline-panels)
+;; (kill-buffer buffer)))
+;; (setq buffer (get-buffer-create name))
+;; (with-current-buffer buffer
+;; (delete-region (point-min) (point-max))
+;; (goto-char (point-min))
+;; (mapc
+;; (lambda (panel)
+;; (setq titles (car panel))
+;; (setq panel (cdr panel))
+;; ;; insert panel source
+;; (insert "* " (car panel))
+;; (setq panel (cdr panel))
+;; (setq count 0)
+;; (mapc
+;; (lambda (def)
+;; (newline)
+;; (insert
+;; (format
+;; "** %s\n\n"
+;; (dom-text (nth count titles))))
+;; (setq count (1+ count))
+;; (mapc
+;; (lambda (def)
+;; (insert
+;; (mapconcat
+;; #'identity
+;; (split-string def "\n" t "\\s-+")
+;; " ")
+;; " ")
+;; ;; (newline)
+;; )
+;; def)
+;; (newline))
+;; panel)
+;; (newline 2))
+;; result))))
+;; (switch-to-buffer buffer)
+;; (dudict-mode)
+;; (goto-char (point-min))))
+
+;;;;; Parse a timeline
+(defun dudict-parse-agarathi-timeline (dom)
+ "Convert a DOM object into a timeline structure.
+See the documentation for the cl-type 'agarathi-timeline' to know more
+about the timeline structure."
+ (let* ((panels (dom-by-class dom "timeline-panel"))
+ (parsed-panels (mapcar #'dudict-parse-agarathi-panel panels))
+ (filtered-panels
(delq
nil
(mapcar
- (lambda (desc)
- (mapcar
- (lambda (node)
- (cond
- ((listp node) (dom-texts node))
- ((stringp node) node)))
- (dom-children desc)))
- (dom-by-class panel "description")))))
- (cons
- titles
- (cons panel-title (mapcar #'flatten-list description)))))
+ (lambda (panel)
+ (cond
+ ((dudict-agarathi-is-ad-panel panel) nil)
+ (panel)))
+ parsed-panels))))
+ (make-agarathi-timeline :panels filtered-panels)))
+
+;;;;; Filter out ads
+(defun dudict-agarathi-is-ad-panel (panel)
+ "If PANEL is a pure advertisement panel, return t, else return
+nil."
+ (declare (side-effect-free t) (pure t))
+ (and
+ ;; just in case
+ (agarathi-panel-p panel)
+ (string= (agarathi-panel-title panel) "Sponsored Links")))
+
+;;;;; Parse a panel
+
+(defun dudict-parse-agarathi-panel (dom)
+ "Convert a DOM object into a panel structure.
+See the documentation for the cl-type 'agarathi-panel' to know more
+about the panel structure."
+ (declare (side-effect-free t) (pure t))
+ (let* ((title
+ (cond
+ ((car-safe (dom-by-class dom "timeline-title")))
+ ("No title found")))
+ (title
+ (cond
+ ((listp title) (dom-texts title))
+ (title)))
+ (timeline-body (car-safe (dom-by-class dom "timeline-body")))
+ (style (cond ((string=
+ (dom-attr timeline-body 'id)
+ "winslow")
+ 'quoted)
+ ('direct)))
+ (descriptions
+ (dudict-parse-agarathi-descriptions timeline-body style)))
+ (make-agarathi-panel
+ :title title
+ :descriptions descriptions)))
+
+;; (defun dudict-parse-agarathi-panel (dom)
+;; "Convert a DOM object into a panel structure.
+;; See the documentation for the cl-type 'agarathi-panel' to know more
+;; about the panel structure."
+;; (let* ((panel (car dom))
+;; (panel-title
+;; (dom-texts (car (dom-by-class panel "timeline-title"))))
+;; (titles (delq
+;; nil
+;; (mapcar
+;; (lambda (node)
+;; (cond ((dom-attr node 'class) nil) (node)))
+;; (dom-by-tag panel 'h3))))
+;; (description
+;; (delq
+;; nil
+;; (mapcar
+;; (lambda (desc)
+;; (mapcar
+;; (lambda (node)
+;; (cond
+;; ((listp node) (dom-texts node))
+;; ((stringp node) node)))
+;; (dom-children desc)))
+;; (dom-by-class panel "description")))))
+;; (cons
+;; titles
+;; (cons panel-title (mapcar #'flatten-list description)))))
+
+;;;;; Parse descriptions
+(defun dudict-parse-agarathi-descriptions (dom style)
+ "Convert a DOM object into a list of description structures in the
+STYLE style.
+
+See the documentation of the function
+`dudict-parse-agarathi-description' for how STYLE is needed.
+
+See the documentation for the cl-type 'agarathi-description' to know
+more about the description structure."
+ (declare (side-effect-free t) (pure t))
+ ;; We walk the children of the document object model.
+ ;;
+ ;; The basic procedure is that, when we encounter a 'h3' tag, the
+ ;; text of that tag is the title of the following 'div' tag, which
+ ;; must have the class 'description'.
+ (let* ((children (dom-children dom))
+ (title "No title found")
+ child description result)
+ (while (consp children)
+ (setq child (car-safe children))
+ (setq children (cdr-safe children))
+ (cond
+ ((not (listp child)))
+ ((eq (dom-tag child) 'h3) (setq title (dom-text child)))
+ ((string= (dom-attr child 'class) "description")
+ (setq description (dudict-parse-agarathi-description
+ child style))
+ ;; Actually the parse result contains only the definitions and
+ ;; examples.
+ (setq description
+ (make-agarathi-description
+ :title title
+ :definitions (car-safe description)
+ :examples (cdr-safe description)))
+ (setq result (cons description result)))))
+ (reverse result)))
+
+;;;;; Parse one description
+(defun dudict-parse-agarathi-description (dom style)
+ "Convert a list of DOM objects into a description structure in the
+STYLE style.
+
+STYLE is either `direct' or `quoted'. Since some dictoinaries
+present the definitions as direct children of the 'description'
+tag, while some present them as the 'blockquote' children, we
+must have this information STYLE to correctly parse the
+descriptions.
+
+Note that the title is not parsed here, but in the function
+`dudict-parse-agarathi-descriptions', so we only return the cons
+of the definitions and examples contained in the description.
+
+See the documentation for the cl-type 'agarathi-description' to
+know more about the description structure."
+ (declare (side-effect-free t) (pure t))
+ (cond
+ ((listp dom))
+ ((error "DOM should be a list, but got %S: %S"
+ (type-of dom) dom)))
+ (cond
+ ((memq style (list 'direct 'quoted)))
+ ((error
+ "STYLE should be either 'direct' or 'quoted', but got %S"
+ style)))
+ (cond
+ ((eq style 'quoted)
+ (setq dom (car-safe (dom-by-tag dom 'blockquote)))))
+ ;; We walk the children of the document object model.
+ ;;
+ ;; Basically, if we encounter an element that is not a 'blockquote'
+ ;; tag, we store all of these texts into a string and later parse
+ ;; them into definitions. When we encounter 'blockquote' tags, we
+ ;; store them into examples instead.
+ (let* ((children (dom-children dom))
+ (temp-string (string))
+ definitions examples child)
+ (while (consp children)
+ (setq child (car-safe children))
+ (setq children (cdr-safe children))
+ (cond
+ ((not
+ (and
+ (listp child)
+ (eq (dom-tag child) 'blockquote)))
+ (setq temp-string
+ (concat temp-string
+ (cond
+ ((listp child) (dom-texts child))
+ ((format "%s" child))))))
+ ((cond
+ ((null definitions)
+ (setq definitions
+ (dudict-parse-agarathi-definitions temp-string))
+ (setq temp-string (string)))
+ ((setq examples (cons (dom-texts child) examples)))))))
+ (cond
+ ((and
+ (not (string-empty-p temp-string))
+ (null definitions))
+ (setq definitions
+ (dudict-parse-agarathi-definitions temp-string))))
+ (setq examples (reverse examples))
+ (cons definitions examples)))
+
+;;;;; Parse into definitions
+(defun dudict-parse-agarathi-definitions (str)
+ "Parse a string STR into a list of definitions.
+For now this just looks for the following format.
+
+SPACE NUMBER DOT SPACE
+
+More precise parsing might added in the future, if such needs
+arise."
+ (declare (side-effect-free t) (pure t))
+ (save-match-data
+ (let* ((len (length str))
+ (old-index 0)
+ (index 0)
+ result)
+ (while (< index len)
+ (cond
+ ((setq index
+ (string-match
+ (rx-to-string
+ (list 'seq
+ 'space
+ (list 'one-or-more 'digit)
+ ?. 'space)
+ t)
+ str index))
+ (setq result
+ (cons
+ (substring-no-properties
+ str old-index index)
+ result))
+ (setq old-index index)
+ (setq index (+ index 4)))
+ ((setq index len)
+ (setq result
+ (cons
+ (substring-no-properties str old-index)
+ result)))))
+ (reverse result))))
+
+;;;;; Insert the parse result into buffer
+(defun dudict-present-agarathi (timeline)
+ "Insert the TIMELINE structure into the current buffer."
+ (cond
+ ((agarathi-timeline-p timeline))
+ ((error "Requires a timeline, but found %S" timeline)))
+ (let* ((panels (agarathi-timeline-panels timeline))
+ panel
+ descriptions description
+ definitions examples)
+ (while (consp panels)
+ (setq panel (car panels))
+ (setq panels (cdr panels))
+ (insert "* " (agarathi-panel-title panel))
+ (newline)
+ (setq descriptions (agarathi-panel-descriptions panel))
+ (while (consp descriptions)
+ (setq description (car descriptions))
+ (setq descriptions (cdr descriptions))
+ (insert
+ "** "
+ (dudict-trim (agarathi-description-title description)))
+ (newline)
+ (setq definitions
+ (agarathi-description-definitions description))
+ (mapc
+ (lambda (def) (insert "- " (dudict-trim def)) (newline))
+ definitions)
+ (newline)
+ (setq examples
+ (agarathi-description-examples description))
+ (cond
+ (examples
+ (insert "*** Examples")
+ (newline)
+ (mapc
+ (lambda (ex) (insert "+ " (dudict-trim ex)) (newline))
+ examples)))))))
+
+(defun dudict-trim (str)
+ "Replace any block of spaces from STR by one space."
+ (declare (side-effect-free t) (pure t))
+ (replace-regexp-in-string
+ (rx-to-string
+ (list 'seq (list 'one-or-more 'space))
+ t)
+ (string 32)
+ str))
+
+;;;;; Test
(dudict-parse-agarathi "கறுப்பு")