PageRenderTime 35ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 0ms

/english-parser/wiktionary.lisp

https://github.com/nixeagle/acumen
Lisp | 288 lines | 239 code | 43 blank | 6 comment | 7 complexity | 509416296ea02440ca7faaa4a1bdd582 MD5 | raw file
  1. (defpackage #:wiktionary
  2. (:use :cl :anaphora :alexandria :iterate :eos)
  3. (:export #:lookup-pos))
  4. ;;; Much of this is generic to any mediawiki wiki.
  5. (in-package :wiktionary)
  6. (in-suite* root)
  7. (defmacro doto (arg1 &rest args)
  8. "Written in 10 minutes while talking to scott.
  9. Gist of this is replace the second item in a list with arg1 before
  10. evaluating the whole thing inside of a progn."
  11. (once-only (arg1)
  12. `(progn ,@(mapcar (lambda (it) (apply #'list (car it)
  13. arg1 (cdr it)))
  14. args)
  15. ,arg1)))
  16. (defparameter +title-name->keyword-mapping+
  17. ;; Specified at:
  18. ;; http://en.wiktionary.org/wiki/Wiktionary:Entry_layout_explained/POS_headers#Standard_POS_headers
  19. (alist-hash-table `(("Noun" . :noun)
  20. ("Symbol" . :symbol)
  21. ("Verb" . :verb)
  22. ("Adjective" . :adjective)
  23. ("Adverb" . :adverb)
  24. ("Particle" . :particle)
  25. ("Pronoun" . :pronoun)
  26. ("Conjunction" . :conjunction)
  27. ("Interjection" . :interjection)
  28. ("Preposition" . :preposition)
  29. ("Proper noun" . :proper-noun)
  30. ("Article" . :article)
  31. ("Prepositional phrase" . :phrase))
  32. :test #'equalp))
  33. (defparameter +template-name->keyword-mapping+
  34. (alist-hash-table `(("en-adj" . :adjective)
  35. ("en-adj-more" . :adjective-comparative)
  36. ("en-adj-most" . :adjective-superlative)
  37. ("en-adv" . :adverb)
  38. ("en-noun" . :noun)
  39. ("en-plural-noun" . :noun-plural)
  40. ("en-verb" . :verb)
  41. ("en-adverb" . :adverb)
  42. ("en-adjective" . :adjective)
  43. ("en-proper-noun" . :proper-noun)
  44. ("en-proper noun" . :proper-noun)
  45. ("en-infl" . :infliction)
  46. ("en-conj" . :conjugate)
  47. ("en-cont" . :contraction)
  48. ("en-det" . :determiner)
  49. ("en-intj" . :interjection)
  50. ("en-interjection" . :interjection)
  51. ("en-prep" . :preposition)
  52. ("en-preposition" . :preposition)
  53. ("en-phrase" . :phrase)
  54. ("infl" . :infliction)
  55. ("inflection of" . :infliction)
  56. ("inflection of " . :infliction)
  57. ("inflected form of" . :infliction)
  58. ("en-pron" . :pronoun)
  59. ("en-pronoun" . :pronoun)
  60. ("en-term" . :term-template)
  61. ("initialism" . :initialism)
  62. ("abbreviation" :abbreviation)
  63. ("abbreviation of" . :abbreviation)
  64. ("acronyms" . :acronymn)
  65. ("en-usage-h-an" . :en-usage-h-an)
  66. ("en-part" . :particle)
  67. ("en-plural noun" . :noun-plural)
  68. ("en-usage-foreignism" . :en-usage-foreignism)
  69. ("en-noun-reg-es" . :en-noun-reg-es)
  70. ("en-noun-irreg" . :en-noun-irreg)
  71. ("en-noun-unc" . :en-noun-unc)
  72. ("en-verb2" . :verb)
  73. ("en-noun2" . :noun)
  74. ("en-infl-noun" . :en-infl-noun)
  75. ("en-usage-verb-particle-solid" :en-usage-verb-particle-solid))
  76. :test #'equalp))
  77. (defparameter +en-wiktionary-unsupported-mapping+
  78. (alist-hash-table `(("." . (make-word :pos (list :symbol)))))
  79. "Some symbols don't map well to webpage names on wiktionary. These are
  80. manually looked up so that the pos tagger can at least have the data.")
  81. (defparameter +interesting-language-headers+ (list "English" "Translingual")
  82. "These are interesting headers that we care about. Change these to
  83. something else if we want to load a non english lexicon.")
  84. (defparameter +always-interesting-headers+ (list "Proper Noun")
  85. "These headers are always interesting in the sense that no matter what
  86. language we are parsing we need to always include these headers.")
  87. (defvar *dictionary* (make-hash-table :test #'equal)
  88. "Dictionary of words!")
  89. (defun load-wiktionary-database (full-file-path)
  90. "Load the wiktionary dump if it has already been parsed/saved.
  91. This may not be safe in sbcl."
  92. (bt:make-thread (lambda ()
  93. (with-open-file (s full-file-path)
  94. (let ((*print-pretty* nil)
  95. (*print-circle* nil)
  96. (*print-readably* t)
  97. (*package* (find-package :wiktionary)))
  98. (setq *dictionary* (read s))))
  99. (print "DONE LOADING WIKTIONARY DB"))
  100. :name "wiktionary dict read"))
  101. (defun save-wiktionary-database (full-file-path)
  102. (bt:make-thread (lambda ()
  103. (with-open-file (s full-file-path
  104. :direction :output
  105. :if-exists :supersede)
  106. (let ((*print-pretty* nil)
  107. (*print-circle* nil)
  108. (*print-readably* t)
  109. (*package* (find-package :wiktionary)))
  110. (print *dictionary* s))))))
  111. (deftype english-parts-of-speech ()
  112. '(member :verb :noun :pronoun :adjective :adverb
  113. :preposition :conjunction :interjection))
  114. (defstruct (word (:type vector))
  115. (pos nil :type list))
  116. (defun run-enwiktionary-filter (source &optional (count 1))
  117. (let ((namespaces (mediawiki-dump-parser::namespace-names source)))
  118. (iter (for x from 1 to count)
  119. (for title = (parse-mediawiki-page-title source))
  120. (when (mediawiki-dump-parser::mainspacep title namespaces)
  121. (let* ((text (parse-mediawiki-page-text source))
  122. (sections (parse-mediawiki-sections text)))
  123. (multiple-value-bind (interesting-text
  124. interesting-titles)
  125. (list-interesting-text sections)
  126. (if (zerop (length interesting-text))
  127. (collect title)
  128. (setf (gethash title *dictionary*)
  129. (make-word
  130. :pos (remove-duplicates
  131. (remove nil
  132. (append (mapcar #'POS-template-to-type
  133. (list-wiktionary-templates-{{en interesting-text))
  134. (mapcar #'POS-title-to-type
  135. interesting-titles)) :test #'equal)))))))))))
  136. (defun list-wiktionary-templates-{{en (text)
  137. (ppcre:all-matches-as-strings "{{(en-|infl|abbreviation|acronyms)[^}]+}}" text))
  138. (defun POS-title-to-type (title-string)
  139. (gethash title-string
  140. +title-name->keyword-mapping+ nil))
  141. (defun POS-template-to-type (template-string)
  142. (aif (position #\| template-string)
  143. (cons (template-name->keyword (subseq template-string 2 it))
  144. (subseq template-string (1+ it) (- (length template-string) 2)))
  145. (template-name->keyword (subseq template-string 2 (- (length template-string) 2)))))
  146. (defun template-name->keyword (name)
  147. (gethash name +TEMPLATE-NAME->KEYWORD-MAPPING+ nil))
  148. (defun parse-mediawiki-page-title (source)
  149. "Grab the next page title in SOURCE."
  150. (klacks:find-element source "title")
  151. (nth-value 1 (klacks:peek-next source)))
  152. (defun parse-mediawiki-page-text (source)
  153. (klacks:find-element source "text")
  154. (if (eql :characters (klacks:peek-next source))
  155. (with-output-to-string (*standard-output*)
  156. (iter
  157. (princ (klacks:current-characters source))
  158. (while (eql :characters (klacks:peek-next source)))))
  159. ""))
  160. (defun parse-mediawiki-sections (text)
  161. (iter (for section-text in (cl-ppcre:split "(==+[^=]+)==+\\\n" text :WITH-REGISTERS-P t))
  162. (for title previous section-text)
  163. (for n from 1)
  164. (when (oddp n)
  165. (collect (list (cons (and title (subseq title (position #\= title :test-not #'eql)))
  166. (or (position #\= title :test-not #'eql) 0))
  167. section-text)))))
  168. (defun list-interesting-text (sections)
  169. (let ((english-level nil)
  170. (titles nil))
  171. (values
  172. (apply #'concatenate 'string
  173. (nreverse
  174. (reduce (lambda (last cur)
  175. (destructuring-bind ((title . level) text) cur
  176. (let ((englishp (member title +interesting-language-headers+ :test #'equalp)))
  177. (when englishp
  178. (setq english-level level))
  179. (if english-level
  180. (if (or (< english-level level) englishp)
  181. (progn
  182. (push title titles)
  183. (push text last))
  184. (progn (setq english-level nil)
  185. last))
  186. (if (member title +always-interesting-headers+ :test #'equalp)
  187. (progn
  188. (push title titles)
  189. (push text last))
  190. last)))))
  191. sections
  192. :initial-value nil)))
  193. titles)))
  194. (defun string-upcase-first-letter (word)
  195. "Upcase the first letter of WORD.
  196. We don't want to use `string-capitalize' here because we do not want to
  197. change the case of the remaining letters."
  198. (declare (type string word))
  199. (string-upcase word :start 0 :end 1))
  200. (defun string-downcase-first-letter (word)
  201. (declare (type string word))
  202. (string-downcase word :start 0 :end 1))
  203. (defun ensure-word-POS-keyword (list-or-keyword)
  204. "Return a part of speech symbol from LIST-OR-KEYWORD
  205. This assumes LIST-OR-KEYWORD actually has the right keyword as the input
  206. or in the car of the given list."
  207. (declare (type (or keyword list) list-or-keyword))
  208. (if (consp list-or-keyword)
  209. (car list-or-keyword)
  210. list-or-keyword))
  211. (defun lookup-pos (word)
  212. "Look up WORD's the parts of speech.
  213. When WORD is in the dictionary the second value will be t, otherwise we
  214. return nil for the second value."
  215. (let ((pos-list (gethash word *dictionary* :unknown)))
  216. (if (eql pos-list :unknown)
  217. (if (string= word (string-downcase-first-letter word))
  218. (values nil nil)
  219. (lookup-pos (string-downcase-first-letter word)))
  220. (values (remove-duplicates (mapcar #'ensure-word-POS-keyword (word-pos pos-list))) t))))
  221. (defun unknownp (arg)
  222. "True if ARG is an unknown word token."
  223. (let ((arg (if (stringp arg) (gethash arg *dictionary*) arg)))
  224. (or (not (word-p arg))
  225. (member :unknown (word-pos arg) :key #'cdr))))
  226. ;;;;;;;;;;;;;
  227. ;;; Tests
  228. (test (english-determiner :suite root)
  229. "Verify against ground truth that we find and tag properly all determiners."
  230. (flet ((generate-determiners ()
  231. (mapcar (lambda (x)
  232. (is (member :determiner (lookup-pos x))
  233. "Grammar missing english determiner for word '~A'." x))
  234. (acumen.english-parser.determiner:list-english-determiners))))
  235. (generate-determiners)))
  236. (test (string-upcase-first-letter :suite root)
  237. "The first letter should always get upcased, but the case of what
  238. follows must stay the same."
  239. (is (string= "Hi" (string-upcase-first-letter "hi")))
  240. (is (string= "HI" (string-upcase-first-letter "hI"))))
  241. (test (list-wiktionary-templates-{{en :suite root)
  242. "Test some examples of what this regex should be matching."
  243. (is (string= "{{en-foo|hi}}" (first (list-wiktionary-templates-{{en "{{en-foo|hi}}"))))
  244. (is (equal nil (list-wiktionary-templates-{{en "{{foo|blah}}"))))
  245. ;;; END