PageRenderTime 45ms CodeModel.GetById 15ms RepoModel.GetById 1ms app.codeStats 0ms

/emacspeak-29.0/lisp/dtk-unicode.el

#
Emacs Lisp | 330 lines | 208 code | 55 blank | 67 comment | 23 complexity | a7407f9563d34efc407f653e0778175c MD5 | raw file
Possible License(s): MIT
  1. ;;; dtk-unicode.el --- Pronounce more characters correctly
  2. ;;{{{ Header: Lukas
  3. ;; Copyright 2007 Lukas Loehrer
  4. ;;; TVR: Integrated into Emacspeak July 6, 2008
  5. ;;; Using patch from Lukas.
  6. ;;
  7. ;; Author: Lukas Loehrer <loehrerl |at| gmx.net>
  8. ;; Version: $Id: dtk-unicode.el 6000 2008-10-14 21:14:19Z tv.raman.tv $
  9. ;; Keywords: TTS, Unicode
  10. ;;}}}
  11. ;;{{{ LCD Archive entry:
  12. ;;; LCD Archive Entry:
  13. ;;; emacspeak| T. V. Raman |raman@cs.cornell.edu
  14. ;;; A speech interface to Emacs |
  15. ;;; $Date: 2008-07-06 10:18:30 -0700 (Sun, 06 Jul 2008) $ |
  16. ;;; $Revision: 4670 $ |
  17. ;;; Location undetermined
  18. ;;;
  19. ;;}}}
  20. ;;{{{ Copyright:
  21. ;;;Copyright (C) 1995 -- 2007, T. V. Raman
  22. ;;; Copyright (c) 1994, 1995 by Digital Equipment Corporation.
  23. ;;; All Rights Reserved.
  24. ;;;
  25. ;;; This file is not part of GNU Emacs, but the same permissions apply.
  26. ;;;
  27. ;;; GNU Emacs is free software; you can redistribute it and/or modify
  28. ;;; it under the terms of the GNU General Public License as published by
  29. ;;; the Free Software Foundation; either version 2, or (at your option)
  30. ;;; any later version.
  31. ;;;
  32. ;;; GNU Emacs is distributed in the hope that it will be useful,
  33. ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  34. ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  35. ;;; GNU General Public License for more details.
  36. ;;;
  37. ;;; You should have received a copy of the GNU General Public License
  38. ;;; along with GNU Emacs; see the file COPYING. If not, write to
  39. ;;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  40. ;;}}}
  41. ;;{{{ Introduction
  42. ;;; Commentary:
  43. ;;
  44. ;;; This Provides Unicode support to the speech layer.
  45. ;;; Code:
  46. ;;}}}
  47. ;;{{{ Preamble
  48. (require 'cl)
  49. (declaim (optimize (safety 0) (speed 3)))
  50. (require 'descr-text)
  51. ;;}}}
  52. ;;{{{ Customizations
  53. (defgroup dtk-unicode
  54. nil
  55. "Customization group for dtk-unicode."
  56. :group 'emacspeak
  57. :prefix "dtk-unicode-")
  58. (defcustom dtk-unicode-character-replacement-alist
  59. '(
  60. (?? . "-") ; START OF GUARDED AREA
  61. (?° . " degrees ") ; degree sign
  62. (?“ . "\"") ;LEFT DOUBLE QUOTATION MARK
  63. (?” . "\"") ; RIGHT DOUBLE QUOTATION MARK
  64. (?? . "*") ; STAR OPERATOR
  65. (?‘ . " backquote ") ; LEFT SINGLE QUOTATION MARK
  66. (?’ . "'") ; right SINGLE QUOTATION MARK
  67. (?? . "-") ; hyphen
  68. (?– . "--") ; n-dash
  69. (?— . "---") ; m-dash
  70. (?? . "----") ; horizontal bar
  71. (?? . "||") ; vertical bar
  72. (?… . "...") ; ellipses
  73. (?• . " bullet ") ; bullet
  74. (?? . " ... ") ; message-waiting
  75. (?™ . "TM") ; trademark
  76. (?? . "ff") ; latin small ligature ff
  77. (?? . "fi") ; latin small ligature fi
  78. (?? . "fl") ; latin small ligature fl
  79. (?? . "ffi") ; latin small ligature ffi
  80. (?? . "Ffl") ; latin small ligature ffl
  81. )
  82. "Explicit replacements for some characters."
  83. :group 'dtk-unicode
  84. :type '(alist
  85. :key-type (character :tag "character")
  86. :value-type (string :tag "replacement"))
  87. )
  88. (defcustom dtk-unicode-name-transformation-rules-alist
  89. '(
  90. ("^greek\\( small\\| capital\\)? letter \\(.*\\)$" . (lambda (s) (match-string 2 s)))
  91. ("\\(.*\\) sign$" . (lambda (s) (match-string 1 s)))
  92. )
  93. "Alist of character name transformation rules."
  94. :group 'dtk-unicode
  95. :type '(repeat (cons :value ("." . identity)
  96. (regexp :tag "pattern")
  97. (function :tag "transformation")))
  98. )
  99. ;;}}}
  100. ;;{{{ Variables
  101. (defvar dtk-unicode-untouched-charsets
  102. '(ascii latin-iso8859-1)
  103. "*Characters of these charsets are completely ignored by dtk-unicode-replace-chars.")
  104. (defvar dtk-unicode-handlers
  105. '(dtk-unicode-user-table-handler dtk-unicode-full-table-handler)
  106. "List of functions which are called in in this order for replacing an unspeakable character.
  107. A handler returns a non-nil value if the replacement was successful, nil otherwise.")
  108. ;;}}}
  109. ;;{{{ Helper functions
  110. (defun dtk-unicode-charset-limits (charset)
  111. "Return rough lower and upper limits for character codes in CHARSET."
  112. (cond
  113. ((eq charset 'ascii)
  114. (list 0 127))
  115. ((eq charset 'eight-bit-control)
  116. (list 128 159))
  117. ((eq charset 'eight-bit-graphic)
  118. (list 160 255))
  119. (t
  120. (let* ((dim (charset-dimension charset))
  121. (chars (charset-chars charset))
  122. min max)
  123. (if (eq chars 96)
  124. (setq min 32 max 127)
  125. (setq min 33 max 126))
  126. (list (make-char charset min min) (make-char charset max max))))))
  127. (defun dtk-unicode-build-skip-regexp (charsets)
  128. "Construct regexp to match all but the characters in dtk-unicode-untouched-charsets."
  129. (format "[^%s]"
  130. (loop for charset in charsets
  131. when (charsetp charset)
  132. concat (apply 'format "%c-%c" (dtk-unicode-charset-limits charset)))))
  133. (defvar dtk-unicode-charset-filter-regexp
  134. (dtk-unicode-build-skip-regexp dtk-unicode-untouched-charsets)
  135. "Regular exppression that matches characters not in dtk-unicode-untouched-charsets.")
  136. (defun dtk-unicode-update-untouched-charsets (charsets)
  137. "Update list of charsets we will not touch."
  138. (setq dtk-unicode-untouched-charsets charsets)
  139. (setq dtk-unicode-charset-filter-regexp (dtk-unicode-build-skip-regexp dtk-unicode-untouched-charsets)))
  140. (eval-and-compile
  141. (if (> emacs-major-version 22)
  142. (progn
  143. (defmacro with-charset-priority (charsets &rest body)
  144. "Execute BODY like `progn' with CHARSETS at the front of priority list.
  145. CHARSETS is a list of charsets. See
  146. `set-charset-priority'. This affects the implicit sorting of lists of
  147. charsets returned by operations such as `find-charset-region'."
  148. (let ((current (make-symbol "current")))
  149. `(let ((,current (charset-priority-list)))
  150. (apply #'set-charset-priority ,charsets)
  151. (unwind-protect
  152. (progn ,@body)
  153. (apply #'set-charset-priority ,current)))))
  154. (defun dtk-unicode-char-in-charsets-p (char charsets)
  155. "Return t if CHAR is a member of one in the charsets in CHARSETS."
  156. (with-charset-priority charsets
  157. (memq (char-charset char) charsets))))
  158. ;; emacs-major-version <= 22
  159. (defun dtk-unicode-char-in-charsets-p (char charsets)
  160. "Return t if CHAR is a member of one in the charsets in CHARSETS."
  161. (memq (char-charset char) charsets))))
  162. (defsubst dtk-unicode-char-untouched-p (char)
  163. "Return t if char is a member of one of the charsets in dtk-unicode-untouched-charsets."
  164. (dtk-unicode-char-in-charsets-p char dtk-unicode-untouched-charsets))
  165. (defvar dtk-unicode-cache (make-hash-table)
  166. "Cache for unicode data lookups.")
  167. (defadvice describe-char-unicode-data (around dtk-unicode pre act)
  168. "Cache result."
  169. (let* ((char (ad-get-arg 0))
  170. (result (gethash char dtk-unicode-cache 'not-found)))
  171. (if (eq result 'not-found)
  172. (progn
  173. ad-do-it
  174. (puthash char ad-return-value dtk-unicode-cache))
  175. (setq ad-return-value result))))
  176. (defsubst dtk-unicode-char-properties (char)
  177. "Return unicode properties for CHAR.
  178. Converts char to unicode if necessary (for emacs 22)."
  179. (let ((unicode (encode-char char 'ucs)))
  180. (and unicode (condition-case nil
  181. (let ((emacspeak-speak-cue-errors nil)
  182. (emacspeak-speak-messages nil))
  183. (describe-char-unicode-data unicode))
  184. (error nil)))))
  185. (defsubst dtk-unicode-char-property (char prop-name)
  186. "Get character property by name."
  187. (second (assoc prop-name (dtk-unicode-char-properties char))))
  188. (defun dtk-unicode-name-for-char (char)
  189. "Return unicode name for character CHAR.
  190. nil if CHAR is not in Unicode."
  191. (let ((name (dtk-unicode-char-property char "Name")))
  192. (when (and (stringp name) (string-equal name "<control>"))
  193. (setq name (dtk-unicode-char-property char "Old name")))
  194. (and (stringp name) (downcase name))))
  195. (defsubst dtk-unicode-char-punctuation-p (char)
  196. "Use unicode properties to determine whether CHAR is a ppunctuation character."
  197. (let ((category (dtk-unicode-char-property char "Category"))
  198. (case-fold-search t))
  199. (when (stringp category)
  200. (string-match "punctuation" category))))
  201. (defsubst dtk-unicode-apply-name-transformation-rules (name)
  202. "Apply transformation rules in dtk-unicode-name-transformation-rules-alist to NAME."
  203. (funcall
  204. (or (assoc-default name dtk-unicode-name-transformation-rules-alist 'string-match)
  205. 'identity)
  206. name))
  207. (defun dtk-unicode-uncustomize-char (char)
  208. "Delete custom replacement for CHAR.
  209. When called interactively, CHAR defaults to the character after point."
  210. (interactive (list (following-char)))
  211. (setq dtk-unicode-character-replacement-alist
  212. (loop for elem in dtk-unicode-character-replacement-alist
  213. unless (eq (car elem) char) collect elem)))
  214. (defun dtk-unicode-customize-char (char replacement)
  215. "Add a custom replacement string for CHAR.
  216. When called interactively, CHAR defaults to the character after point."
  217. (interactive
  218. (let ((char (following-char)))
  219. (list char
  220. (read-string
  221. (format "Replacement for %c (0x%x) from charset %s: " char char (char-charset char))))))
  222. (push (cons char replacement) dtk-unicode-character-replacement-alist))
  223. ;;}}}
  224. ;;{{{ Character replacement handlers
  225. (defsubst dtk-unicode-user-table-handler (char)
  226. "Return user defined replacement character if it exists."
  227. (cdr (assq char dtk-unicode-character-replacement-alist)))
  228. (defsubst dtk-unicode-full-table-handler (char)
  229. "Uses the unicode data file to find the name of CHAR."
  230. (let ((char-desc (dtk-unicode-name-for-char char)))
  231. (when char-desc
  232. (format " %s " (dtk-unicode-apply-name-transformation-rules char-desc)))))
  233. ;;}}}
  234. ;;{{{ External interface
  235. (defun dtk-unicode-full-name-for-char (char)
  236. "Return full name of CHAR.
  237. This is meant to be used in places where the user asks for a detailed description of CHAR."
  238. (dtk-unicode-name-for-char char))
  239. (defun dtk-unicode-short-name-for-char (char)
  240. "Return name of CHAR.
  241. This is meant to be used in places where the user asks for a short description of CHAR."
  242. (if (memq char dtk-unicode-untouched-charsets)
  243. (char-to-string char)
  244. (dtk-unicode-name-for-char char)))
  245. (defun dtk-unicode-replace-chars (mode)
  246. "Replace unicode characters in current buffer with something more TTS friendly.
  247. This is the main entry point for this module.
  248. The argument MODE specifies the current punctuation mode.
  249. Does nothing for unibyte buffers."
  250. (when enable-multibyte-characters
  251. (let ((inhibit-read-only t))
  252. (goto-char (point-min))
  253. (while (re-search-forward dtk-unicode-charset-filter-regexp nil t)
  254. (let* ((pos (match-beginning 0))
  255. (char (char-after pos))
  256. (replacement
  257. (save-match-data
  258. (if (and (eq mode 'none) (dtk-unicode-char-punctuation-p char))
  259. " "
  260. (run-hook-with-args-until-success 'dtk-unicode-handlers char)))))
  261. (when replacement
  262. (let ((props (text-properties-at pos)))
  263. (replace-match replacement t t nil)
  264. (when props
  265. (set-text-properties pos (point) props)))))))))
  266. ;;}}}
  267. (provide 'dtk-unicode)
  268. ;;{{{ emacs local variables
  269. ;;; local variables:
  270. ;;; coding: utf-8
  271. ;;; folded-file: t
  272. ;;; byte-compile-dynamic: t
  273. ;;; end:
  274. ;;}}}
  275. ;;; dtk-unicode.el ends here