diff options
-rw-r--r-- | durand-dict.el | 796 |
1 files changed, 676 insertions, 120 deletions
diff --git a/durand-dict.el b/durand-dict.el index 09b6e06..0e1f8e7 100644 --- a/durand-dict.el +++ b/durand-dict.el @@ -32,130 +32,686 @@ ;;; Code: +;; structure library +(require 'cl-lib) + +;; parsing library +(require 'dom) + +;; regular expression building library +(require 'rx) + +;; URL fetching library +(require 'url) + +;;;; Derived mode + (define-derived-mode dudict-mode org-mode "DUDICT" - "Major mode for viewing my custom dictionaries.") - -(defun dudict-parse-agarathi (word) - "Request for the definition of Tamil WORD from the dictionary -website agarathi. - -WORD should be acceptable by agarathi: no processing is done to -convert from some transliteration to the Tamil script." - (let ((buffer - (url-retrieve-synchronously - (concat "https://agarathi.com/word/" word) - t t 20)) - result titles count) - (with-current-buffer buffer - (let* ((dom (libxml-parse-html-region - (point-min) (point-max))) - (timeline-ul (delq - nil - (mapcar - (lambda (node) - (cond ((string= (dom-attr node 'class) - "timeline") - node))) - (dom-by-tag dom 'ul)))) - (timelines (delq - nil - (mapcar - (lambda (node) - (let ((class (dom-attr node 'class))) - (cond - ((or - (string= class "timeline-normal") - (string= class "timeline-inverted")) - node)))) - (dom-by-tag timeline-ul 'li)))) - (timeline-panels - (mapcar - (lambda (node) (dom-by-class node "timeline-panel")) - timelines)) - (timeline-panels - (delq - nil - (mapcar (lambda (node) - (let ((result - (dudict-parse-agarathi-panel node))) - (cond - ((string= - (cadr result) - "Sponsored Links") - nil) - (result)))) - timeline-panels)))) - (setq result timeline-panels) - (kill-buffer buffer))) - (setq buffer (get-buffer-create (concat "dudict - " word))) - (with-current-buffer buffer - (delete-region (point-min) (point-max)) - (goto-char (point-min)) - (mapc - (lambda (panel) - (setq titles (car panel)) - (setq panel (cdr panel)) - ;; insert panel source - (insert "* " (car panel)) - (setq panel (cdr panel)) - (setq count 0) - (mapc - (lambda (def) - (newline) - (insert - (format - "** %s\n\n" - (dom-text (nth count titles)))) - (setq count (1+ count)) - (mapc - (lambda (def) - (insert - (mapconcat - #'identity - (split-string def "\n" t "\\s-+") - " ") - " ") - ;; (newline) - ) - def) - (newline)) - panel) - (newline 2)) - result)) - (switch-to-buffer buffer) - (dudict-mode) - (setq-local truncate-lines nil) - (goto-char (point-min)))) - -(defun dudict-parse-agarathi-panel (panel) - "A helper function to find the source of the panel and parse the -contents of PANEL into a list of definitions. - -This is designed to be used by the function `dudict-parse-agarathi'." - (let* ((panel (car panel)) - (panel-title - (dom-texts (car (dom-by-class panel "timeline-title")))) - (titles (delq - nil - (mapcar - (lambda (node) - (cond ((dom-attr node 'class) nil) (node))) - (dom-by-tag panel 'h3)))) - (description + "Major mode for viewing my custom dictionaries." + (setq truncate-lines nil) + (view-mode t)) + +;;;; Framework + +;;;;; Struct + +(cl-defstruct dudict-search + "Provides methods to (optionally) pre-process, to search, to +parse, and to present the search results to the user." + (pre + nil + :documentation "Preprocessing function, optional" + :type function) + (search + nil + :documentation "Searching function, required" + :type function) + (parse + nil + :documentation "Parsing function, required" + :type function) + (present + nil + :documentation "Function to print the search result to the user." + :type function) + (name + "dudict" + :documentation + "Search name that is used to make the presentation buffer." + :type string)) + +;;;;; Main framework function +;; Main framework function +(defun dudict-search (search-method word) + "Search for the definition of WORD by means of SEARCH-METHOD. +SEARCH-METHOD should be a struct `dudict-search' that provides +methods to pre-process, to search, and to post-process the search +results. All this does is to integrate various methods provided." + (cond + ((dudict-search-p search-method)) + ((user-error "Requires a dudict-search struct, but found %S" + search-method))) + (let* ((name (dudict-search-name search-method)) + (preprocess (dudict-search-pre search-method)) + (search (dudict-search-search search-method)) + (parse (dudict-search-parse search-method)) + (present (dudict-search-present search-method)) + (present-name (concat name " - " word)) + temp-buffer parse-result) + (cond + ((setq temp-buffer (get-buffer present-name)) + ;; already searched + (switch-to-buffer temp-buffer)) + (t + (cond + ((functionp preprocess) + (setq word (funcall preprocess word)))) + (cond + ((functionp search)) + (user-error "Requires a function to search, but found %S" + search)) + (setq temp-buffer (funcall search word)) + (cond + ((functionp parse)) + (user-error "Requires a function to parse, but found %S" + parse)) + (with-current-buffer temp-buffer + (setq parse-result (funcall parse))) + (kill-buffer temp-buffer) + (cond + ((functionp present)) + ((user-error "Requires a function to present, but found %S" + present))) + (setq temp-buffer (get-buffer-create present-name)) + (with-current-buffer temp-buffer + (delete-region (point-min) (point-max)) + (funcall present parse-result)) + (switch-to-buffer temp-buffer) + (dudict-mode) + (goto-char (point-min)))))) + +;;;; agarathi dictionary +;; For searching the agarathi dictionary, we for now do not need to +;; preprocess the search term, but in the future it might be useful to +;; add some preprocessing so that we do not need to enter in the Tamil +;; script (some sources do not always use the Tamil scripts). +;; +;; The search function just queries the website and returns the +;; buffer. +;; +;; The parsing function converts from the Document Object Model to our +;; custom structs: This is for the ease and the convenience of +;; presentation. +;; +;; Then the present function can iterate through the parse result +;; easily and present the search result to the user in a +;; well-structured format that is easy to read and understand. +;; +;; The name is surely the string "அகராதி" (अकरादि). + +;;;;; Search Struct + +(defvar dudict-search-agarathi nil + "The search struct that is used to search the agarathi dictionary.") + +(setq dudict-search-agarathi + (make-dudict-search + :name "அகராதி" + :pre nil + :search (function + (lambda (word) + (url-retrieve-synchronously + (concat "https://agarathi.com/word/" word) + t t 20))) + :parse #'dudict-parse-agarathi + :present #'dudict-present-agarathi)) + +;;;;; Short-cut + +(defvar dudict-agarathi-history nil + "History variable for the agarathi search.") + +(defun dudict-search-agarathi (word) + "A short-cut that passes `dudict-search-agarathi' and WORD to +`dudict-search'." + (interactive + (read-string "Enter search term: " nil 'dudict-agarathi-history)) + (dudict-search dudict-search-agarathi word)) + +;;;;; Structures of the dictionary website + +;;;;;; The entire time line +(cl-defstruct agarathi-timeline + "An entire agarathi search result is called a timeline. +It contains a list of panels." + (panels + nil + :documentation "The list of panels contained in the timeline." + :type list)) + +;; verifying function +(defun agarathi-check-valid-timeline (timeline) + "Check that an object is a valid timeline structure." + (declare (side-effect-free t) (pure t)) + (and + (agarathi-timeline-p timeline) + (let ((ls (agarathi-timeline-panels timeline)) + (result t) + temp) + (and + (listp ls) + (progn + (while (consp ls) + (setq temp (car-safe ls)) + (setq ls (cdr-safe ls)) + (cond + ((agarathi-check-valid-panel temp)) + (t + (setq result nil) + (setq ls nil)))) + result))))) + +;;;;;; A panel +(cl-defstruct agarathi-panel + "A panel contains the search results from some single source. +It contains a title and a list of descriptions, where each +description corresponds to a set of definitions, roughly +speaking." + (title + "" + :documentation "The title of the panel. +Usually this is the source of the definition. When this is a +pure advertisement panel, its title would be \"Sponsored Links\"." + :type string) + (descriptions + nil + :documentation "The list of descriptions contained in the panel." + :type list)) + +;; verifying function +(defun agarathi-check-valid-panel (panel) + "Check that an object is a valid panel structure." + (declare (side-effect-free t) (pure t)) + (and + (agarathi-panel-p panel) + (let ((title (agarathi-panel-title panel)) + (ls (agarathi-panel-descriptions panel)) + (result t) + temp) + (and + (stringp title) + (listp ls) + (progn + (while (consp ls) + (setq temp (car-safe ls)) + (setq ls (cdr-safe ls)) + (cond + ((agarathi-check-valid-description temp)) + (t + (setq result nil) + (setq ls nil)))) + result))))) + +;;;;;; A Description +(cl-defstruct agarathi-description + "A description contains some definitions along with some optional +examples. + +It contains a title along with a list of definitions and a list +of examples. + +Definitions and examples are basically strings." + (title + "" + :documentation "The title of a description. +This in the convention of the dictionary website means the word +that this description is describing. Since the dictionary can +return descriptions that are not exactly equal to the searched +term, it is necessary to keep this information for every +description." + :type string) + (definitions + nil + :documentation "The list of definitions contained in the description." + :type list) + (examples + nil + :documentation "The list of examples contained in the description." + :type list)) + +;; verifying function +(defun agarathi-check-valid-description (description) + "Check that an object is a valid description structure." + (declare (side-effect-free t) (pure t)) + (and + (agarathi-description-p description) + ;; check definitions + (let ((title (agarathi-description-title description)) + (ls (agarathi-description-definitions description)) + (result t) + temp) + (and + (stringp title) + (listp ls) + (progn + (while (consp ls) + (setq temp (car-safe ls)) + (setq ls (cdr-safe ls)) + (cond ((stringp temp)) (t (setq result nil) (setq ls nil)))) + result))) + ;; check examples + (let ((ls (agarathi-description-examples description)) + (result t) + temp) + (and + (listp ls) + (progn + (while (consp ls) + (setq temp (car-safe ls)) + (setq ls (cdr-safe ls)) + (cond ((stringp temp)) (t (setq result nil) (setq ls nil)))) + result))))) + +;;;;; main function + +(defun dudict-parse-agarathi () + "Parse the search result from the agarathi website. +The current buffer is assumed to contain the search results already." + (let* ((dom (libxml-parse-html-region (point-min) (point-max))) + (timeline-ul + (car-safe + (delq + nil + (mapcar + (lambda (node) + (cond ((string= (dom-attr node 'class) + "timeline") + node))) + (dom-by-tag dom 'ul)))))) + (cond + ((null timeline-ul) + (message "No timeline is found.") + nil) + ((dudict-parse-agarathi-timeline timeline-ul))))) + +;;;;; Archived code +;; (defun dudict-search-agarathi (word) +;; "Request for the definition of Tamil WORD from the dictionary +;; website agarathi. + +;; WORD should be acceptable by agarathi: no processing is done to +;; convert from some transliteration to the Tamil script." +;; (let* ((name (concat "dudict - " word)) +;; (buffer-found (get-buffer name)) +;; (buffer +;; (cond +;; (buffer-found) +;; ((url-retrieve-synchronously +;; (concat "https://agarathi.com/word/" word) +;; t t 20)))) +;; result titles count) +;; (cond +;; ((not buffer-found) +;; (with-current-buffer buffer +;; (let* ((dom (libxml-parse-html-region +;; (point-min) (point-max))) +;; (timeline-ul (delq +;; nil +;; (mapcar +;; (lambda (node) +;; (cond ((string= (dom-attr node 'class) +;; "timeline") +;; node))) +;; (dom-by-tag dom 'ul)))) +;; (timelines (delq +;; nil +;; (mapcar +;; (lambda (node) +;; (let ((class (dom-attr node 'class))) +;; (cond +;; ((or +;; (string= class "timeline-normal") +;; (string= +;; class "timeline-inverted")) +;; node)))) +;; (dom-by-tag timeline-ul 'li)))) +;; (timeline-panels +;; (mapcar +;; (lambda (node) (dom-by-class node "timeline-panel")) +;; timelines)) +;; (timeline-panels +;; (delq +;; nil +;; (mapcar (lambda (node) +;; (let ((result +;; (dudict-parse-agarathi-panel node))) +;; (cond +;; ((string= +;; (cadr result) +;; "Sponsored Links") +;; nil) +;; (result)))) +;; timeline-panels)))) +;; (setq result timeline-panels) +;; (kill-buffer buffer))) +;; (setq buffer (get-buffer-create name)) +;; (with-current-buffer buffer +;; (delete-region (point-min) (point-max)) +;; (goto-char (point-min)) +;; (mapc +;; (lambda (panel) +;; (setq titles (car panel)) +;; (setq panel (cdr panel)) +;; ;; insert panel source +;; (insert "* " (car panel)) +;; (setq panel (cdr panel)) +;; (setq count 0) +;; (mapc +;; (lambda (def) +;; (newline) +;; (insert +;; (format +;; "** %s\n\n" +;; (dom-text (nth count titles)))) +;; (setq count (1+ count)) +;; (mapc +;; (lambda (def) +;; (insert +;; (mapconcat +;; #'identity +;; (split-string def "\n" t "\\s-+") +;; " ") +;; " ") +;; ;; (newline) +;; ) +;; def) +;; (newline)) +;; panel) +;; (newline 2)) +;; result)))) +;; (switch-to-buffer buffer) +;; (dudict-mode) +;; (goto-char (point-min)))) + +;;;;; Parse a timeline +(defun dudict-parse-agarathi-timeline (dom) + "Convert a DOM object into a timeline structure. +See the documentation for the cl-type 'agarathi-timeline' to know more +about the timeline structure." + (let* ((panels (dom-by-class dom "timeline-panel")) + (parsed-panels (mapcar #'dudict-parse-agarathi-panel panels)) + (filtered-panels (delq nil (mapcar - (lambda (desc) - (mapcar - (lambda (node) - (cond - ((listp node) (dom-texts node)) - ((stringp node) node))) - (dom-children desc))) - (dom-by-class panel "description"))))) - (cons - titles - (cons panel-title (mapcar #'flatten-list description))))) + (lambda (panel) + (cond + ((dudict-agarathi-is-ad-panel panel) nil) + (panel))) + parsed-panels)))) + (make-agarathi-timeline :panels filtered-panels))) + +;;;;; Filter out ads +(defun dudict-agarathi-is-ad-panel (panel) + "If PANEL is a pure advertisement panel, return t, else return +nil." + (declare (side-effect-free t) (pure t)) + (and + ;; just in case + (agarathi-panel-p panel) + (string= (agarathi-panel-title panel) "Sponsored Links"))) + +;;;;; Parse a panel + +(defun dudict-parse-agarathi-panel (dom) + "Convert a DOM object into a panel structure. +See the documentation for the cl-type 'agarathi-panel' to know more +about the panel structure." + (declare (side-effect-free t) (pure t)) + (let* ((title + (cond + ((car-safe (dom-by-class dom "timeline-title"))) + ("No title found"))) + (title + (cond + ((listp title) (dom-texts title)) + (title))) + (timeline-body (car-safe (dom-by-class dom "timeline-body"))) + (style (cond ((string= + (dom-attr timeline-body 'id) + "winslow") + 'quoted) + ('direct))) + (descriptions + (dudict-parse-agarathi-descriptions timeline-body style))) + (make-agarathi-panel + :title title + :descriptions descriptions))) + +;; (defun dudict-parse-agarathi-panel (dom) +;; "Convert a DOM object into a panel structure. +;; See the documentation for the cl-type 'agarathi-panel' to know more +;; about the panel structure." +;; (let* ((panel (car dom)) +;; (panel-title +;; (dom-texts (car (dom-by-class panel "timeline-title")))) +;; (titles (delq +;; nil +;; (mapcar +;; (lambda (node) +;; (cond ((dom-attr node 'class) nil) (node))) +;; (dom-by-tag panel 'h3)))) +;; (description +;; (delq +;; nil +;; (mapcar +;; (lambda (desc) +;; (mapcar +;; (lambda (node) +;; (cond +;; ((listp node) (dom-texts node)) +;; ((stringp node) node))) +;; (dom-children desc))) +;; (dom-by-class panel "description"))))) +;; (cons +;; titles +;; (cons panel-title (mapcar #'flatten-list description))))) + +;;;;; Parse descriptions +(defun dudict-parse-agarathi-descriptions (dom style) + "Convert a DOM object into a list of description structures in the +STYLE style. + +See the documentation of the function +`dudict-parse-agarathi-description' for how STYLE is needed. + +See the documentation for the cl-type 'agarathi-description' to know +more about the description structure." + (declare (side-effect-free t) (pure t)) + ;; We walk the children of the document object model. + ;; + ;; The basic procedure is that, when we encounter a 'h3' tag, the + ;; text of that tag is the title of the following 'div' tag, which + ;; must have the class 'description'. + (let* ((children (dom-children dom)) + (title "No title found") + child description result) + (while (consp children) + (setq child (car-safe children)) + (setq children (cdr-safe children)) + (cond + ((not (listp child))) + ((eq (dom-tag child) 'h3) (setq title (dom-text child))) + ((string= (dom-attr child 'class) "description") + (setq description (dudict-parse-agarathi-description + child style)) + ;; Actually the parse result contains only the definitions and + ;; examples. + (setq description + (make-agarathi-description + :title title + :definitions (car-safe description) + :examples (cdr-safe description))) + (setq result (cons description result))))) + (reverse result))) + +;;;;; Parse one description +(defun dudict-parse-agarathi-description (dom style) + "Convert a list of DOM objects into a description structure in the +STYLE style. + +STYLE is either `direct' or `quoted'. Since some dictoinaries +present the definitions as direct children of the 'description' +tag, while some present them as the 'blockquote' children, we +must have this information STYLE to correctly parse the +descriptions. + +Note that the title is not parsed here, but in the function +`dudict-parse-agarathi-descriptions', so we only return the cons +of the definitions and examples contained in the description. + +See the documentation for the cl-type 'agarathi-description' to +know more about the description structure." + (declare (side-effect-free t) (pure t)) + (cond + ((listp dom)) + ((error "DOM should be a list, but got %S: %S" + (type-of dom) dom))) + (cond + ((memq style (list 'direct 'quoted))) + ((error + "STYLE should be either 'direct' or 'quoted', but got %S" + style))) + (cond + ((eq style 'quoted) + (setq dom (car-safe (dom-by-tag dom 'blockquote))))) + ;; We walk the children of the document object model. + ;; + ;; Basically, if we encounter an element that is not a 'blockquote' + ;; tag, we store all of these texts into a string and later parse + ;; them into definitions. When we encounter 'blockquote' tags, we + ;; store them into examples instead. + (let* ((children (dom-children dom)) + (temp-string (string)) + definitions examples child) + (while (consp children) + (setq child (car-safe children)) + (setq children (cdr-safe children)) + (cond + ((not + (and + (listp child) + (eq (dom-tag child) 'blockquote))) + (setq temp-string + (concat temp-string + (cond + ((listp child) (dom-texts child)) + ((format "%s" child)))))) + ((cond + ((null definitions) + (setq definitions + (dudict-parse-agarathi-definitions temp-string)) + (setq temp-string (string))) + ((setq examples (cons (dom-texts child) examples))))))) + (cond + ((and + (not (string-empty-p temp-string)) + (null definitions)) + (setq definitions + (dudict-parse-agarathi-definitions temp-string)))) + (setq examples (reverse examples)) + (cons definitions examples))) + +;;;;; Parse into definitions +(defun dudict-parse-agarathi-definitions (str) + "Parse a string STR into a list of definitions. +For now this just looks for the following format. + +SPACE NUMBER DOT SPACE + +More precise parsing might added in the future, if such needs +arise." + (declare (side-effect-free t) (pure t)) + (save-match-data + (let* ((len (length str)) + (old-index 0) + (index 0) + result) + (while (< index len) + (cond + ((setq index + (string-match + (rx-to-string + (list 'seq + 'space + (list 'one-or-more 'digit) + ?. 'space) + t) + str index)) + (setq result + (cons + (substring-no-properties + str old-index index) + result)) + (setq old-index index) + (setq index (+ index 4))) + ((setq index len) + (setq result + (cons + (substring-no-properties str old-index) + result))))) + (reverse result)))) + +;;;;; Insert the parse result into buffer +(defun dudict-present-agarathi (timeline) + "Insert the TIMELINE structure into the current buffer." + (cond + ((agarathi-timeline-p timeline)) + ((error "Requires a timeline, but found %S" timeline))) + (let* ((panels (agarathi-timeline-panels timeline)) + panel + descriptions description + definitions examples) + (while (consp panels) + (setq panel (car panels)) + (setq panels (cdr panels)) + (insert "* " (agarathi-panel-title panel)) + (newline) + (setq descriptions (agarathi-panel-descriptions panel)) + (while (consp descriptions) + (setq description (car descriptions)) + (setq descriptions (cdr descriptions)) + (insert + "** " + (dudict-trim (agarathi-description-title description))) + (newline) + (setq definitions + (agarathi-description-definitions description)) + (mapc + (lambda (def) (insert "- " (dudict-trim def)) (newline)) + definitions) + (newline) + (setq examples + (agarathi-description-examples description)) + (cond + (examples + (insert "*** Examples") + (newline) + (mapc + (lambda (ex) (insert "+ " (dudict-trim ex)) (newline)) + examples))))))) + +(defun dudict-trim (str) + "Replace any block of spaces from STR by one space." + (declare (side-effect-free t) (pure t)) + (replace-regexp-in-string + (rx-to-string + (list 'seq (list 'one-or-more 'space)) + t) + (string 32) + str)) + +;;;;; Test (dudict-parse-agarathi "கறுப்பு") |