/contrib/groff/src/preproc/refer/token.cpp

https://bitbucket.org/freebsd/freebsd-head/ · C++ · 378 lines · 320 code · 35 blank · 23 comment · 72 complexity · 7a88558e2d2b6f8d266e8639d1a9fd61 MD5 · raw file

  1. // -*- C++ -*-
  2. /* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc.
  3. Written by James Clark (jjc@jclark.com)
  4. This file is part of groff.
  5. groff is free software; you can redistribute it and/or modify it under
  6. the terms of the GNU General Public License as published by the Free
  7. Software Foundation; either version 2, or (at your option) any later
  8. version.
  9. groff is distributed in the hope that it will be useful, but WITHOUT ANY
  10. WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11. FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  12. for more details.
  13. You should have received a copy of the GNU General Public License along
  14. with groff; see the file COPYING. If not, write to the Free Software
  15. Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
  16. #include "refer.h"
  17. #include "token.h"
  18. #define TOKEN_TABLE_SIZE 1009
// In Icelandic, thorn is believed to sort after "z"; "{" is used as its
// sort key because it is the character immediately after "z" in ASCII.
  20. #define THORN_SORT_KEY "{"
  21. struct token_table_entry {
  22. const char *tok;
  23. token_info ti;
  24. token_table_entry();
  25. };
  26. token_table_entry token_table[TOKEN_TABLE_SIZE];
  27. int ntokens = 0;
  28. static void skip_name(const char **ptr, const char *end)
  29. {
  30. if (*ptr < end) {
  31. switch (*(*ptr)++) {
  32. case '(':
  33. if (*ptr < end) {
  34. *ptr += 1;
  35. if (*ptr < end)
  36. *ptr += 1;
  37. }
  38. break;
  39. case '[':
  40. while (*ptr < end)
  41. if (*(*ptr)++ == ']')
  42. break;
  43. break;
  44. }
  45. }
  46. }
  47. int get_token(const char **ptr, const char *end)
  48. {
  49. if (*ptr >= end)
  50. return 0;
  51. char c = *(*ptr)++;
  52. if (c == '\\' && *ptr < end) {
  53. switch (**ptr) {
  54. default:
  55. *ptr += 1;
  56. break;
  57. case '(':
  58. case '[':
  59. skip_name(ptr, end);
  60. break;
  61. case '*':
  62. case 'f':
  63. *ptr += 1;
  64. skip_name(ptr, end);
  65. break;
  66. }
  67. }
  68. return 1;
  69. }
  70. token_info::token_info()
  71. : type(TOKEN_OTHER), sort_key(0), other_case(0)
  72. {
  73. }
  74. void token_info::set(token_type t, const char *sk, const char *oc)
  75. {
  76. assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER);
  77. type = t;
  78. sort_key = sk;
  79. other_case = oc;
  80. }
  81. void token_info::sortify(const char *start, const char *end, string &result)
  82. const
  83. {
  84. if (sort_key)
  85. result += sort_key;
  86. else if (type == TOKEN_UPPER || type == TOKEN_LOWER) {
  87. for (; start < end; start++)
  88. if (csalpha(*start))
  89. result += cmlower(*start);
  90. }
  91. }
  92. int token_info::sortify_non_empty(const char *start, const char *end) const
  93. {
  94. if (sort_key)
  95. return *sort_key != '\0';
  96. if (type != TOKEN_UPPER && type != TOKEN_LOWER)
  97. return 0;
  98. for (; start < end; start++)
  99. if (csalpha(*start))
  100. return 1;
  101. return 0;
  102. }
  103. void token_info::lower_case(const char *start, const char *end,
  104. string &result) const
  105. {
  106. if (type != TOKEN_UPPER) {
  107. while (start < end)
  108. result += *start++;
  109. }
  110. else if (other_case)
  111. result += other_case;
  112. else {
  113. while (start < end)
  114. result += cmlower(*start++);
  115. }
  116. }
  117. void token_info::upper_case(const char *start, const char *end,
  118. string &result) const
  119. {
  120. if (type != TOKEN_LOWER) {
  121. while (start < end)
  122. result += *start++;
  123. }
  124. else if (other_case)
  125. result += other_case;
  126. else {
  127. while (start < end)
  128. result += cmupper(*start++);
  129. }
  130. }
  131. token_table_entry::token_table_entry()
  132. : tok(0)
  133. {
  134. }
  135. static void store_token(const char *tok, token_type typ,
  136. const char *sk = 0, const char *oc = 0)
  137. {
  138. unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE;
  139. for (;;) {
  140. if (token_table[n].tok == 0) {
  141. if (++ntokens == TOKEN_TABLE_SIZE)
  142. assert(0);
  143. token_table[n].tok = tok;
  144. break;
  145. }
  146. if (strcmp(tok, token_table[n].tok) == 0)
  147. break;
  148. if (n == 0)
  149. n = TOKEN_TABLE_SIZE - 1;
  150. else
  151. --n;
  152. }
  153. token_table[n].ti.set(typ, sk, oc);
  154. }
  155. token_info default_token_info;
  156. const token_info *lookup_token(const char *start, const char *end)
  157. {
  158. unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE;
  159. for (;;) {
  160. if (token_table[n].tok == 0)
  161. break;
  162. if (strlen(token_table[n].tok) == size_t(end - start)
  163. && memcmp(token_table[n].tok, start, end - start) == 0)
  164. return &(token_table[n].ti);
  165. if (n == 0)
  166. n = TOKEN_TABLE_SIZE - 1;
  167. else
  168. --n;
  169. }
  170. return &default_token_info;
  171. }
  172. static void init_ascii()
  173. {
  174. const char *p;
  175. for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) {
  176. char buf[2];
  177. buf[0] = *p;
  178. buf[1] = '\0';
  179. store_token(strsave(buf), TOKEN_LOWER);
  180. buf[0] = cmupper(buf[0]);
  181. store_token(strsave(buf), TOKEN_UPPER);
  182. }
  183. for (p = "0123456789"; *p; p++) {
  184. char buf[2];
  185. buf[0] = *p;
  186. buf[1] = '\0';
  187. const char *s = strsave(buf);
  188. store_token(s, TOKEN_OTHER, s);
  189. }
  190. for (p = ".,:;?!"; *p; p++) {
  191. char buf[2];
  192. buf[0] = *p;
  193. buf[1] = '\0';
  194. store_token(strsave(buf), TOKEN_PUNCT);
  195. }
  196. store_token("-", TOKEN_HYPHEN);
  197. }
  198. static void store_letter(const char *lower, const char *upper,
  199. const char *sort_key = 0)
  200. {
  201. store_token(lower, TOKEN_LOWER, sort_key, upper);
  202. store_token(upper, TOKEN_UPPER, sort_key, lower);
  203. }
  204. static void init_letter(unsigned char uc_code, unsigned char lc_code,
  205. const char *sort_key)
  206. {
  207. char lbuf[2];
  208. lbuf[0] = lc_code;
  209. lbuf[1] = 0;
  210. char ubuf[2];
  211. ubuf[0] = uc_code;
  212. ubuf[1] = 0;
  213. store_letter(strsave(lbuf), strsave(ubuf), sort_key);
  214. }
  215. static void init_latin1()
  216. {
  217. init_letter(0xc0, 0xe0, "a");
  218. init_letter(0xc1, 0xe1, "a");
  219. init_letter(0xc2, 0xe2, "a");
  220. init_letter(0xc3, 0xe3, "a");
  221. init_letter(0xc4, 0xe4, "a");
  222. init_letter(0xc5, 0xe5, "a");
  223. init_letter(0xc6, 0xe6, "ae");
  224. init_letter(0xc7, 0xe7, "c");
  225. init_letter(0xc8, 0xe8, "e");
  226. init_letter(0xc9, 0xe9, "e");
  227. init_letter(0xca, 0xea, "e");
  228. init_letter(0xcb, 0xeb, "e");
  229. init_letter(0xcc, 0xec, "i");
  230. init_letter(0xcd, 0xed, "i");
  231. init_letter(0xce, 0xee, "i");
  232. init_letter(0xcf, 0xef, "i");
  233. init_letter(0xd0, 0xf0, "d");
  234. init_letter(0xd1, 0xf1, "n");
  235. init_letter(0xd2, 0xf2, "o");
  236. init_letter(0xd3, 0xf3, "o");
  237. init_letter(0xd4, 0xf4, "o");
  238. init_letter(0xd5, 0xf5, "o");
  239. init_letter(0xd6, 0xf6, "o");
  240. init_letter(0xd8, 0xf8, "o");
  241. init_letter(0xd9, 0xf9, "u");
  242. init_letter(0xda, 0xfa, "u");
  243. init_letter(0xdb, 0xfb, "u");
  244. init_letter(0xdc, 0xfc, "u");
  245. init_letter(0xdd, 0xfd, "y");
  246. init_letter(0xde, 0xfe, THORN_SORT_KEY);
  247. store_token("\337", TOKEN_LOWER, "ss", "SS");
  248. store_token("\377", TOKEN_LOWER, "y", "Y");
  249. }
  250. static void init_two_char_letter(char l1, char l2, char u1, char u2,
  251. const char *sk = 0)
  252. {
  253. char buf[6];
  254. buf[0] = '\\';
  255. buf[1] = '(';
  256. buf[2] = l1;
  257. buf[3] = l2;
  258. buf[4] = '\0';
  259. const char *p = strsave(buf);
  260. buf[2] = u1;
  261. buf[3] = u2;
  262. store_letter(p, strsave(buf), sk);
  263. buf[1] = '[';
  264. buf[4] = ']';
  265. buf[5] = '\0';
  266. p = strsave(buf);
  267. buf[2] = l1;
  268. buf[3] = l2;
  269. store_letter(strsave(buf), p, sk);
  270. }
  271. static void init_special_chars()
  272. {
  273. const char *p;
  274. for (p = "':^`~"; *p; p++)
  275. for (const char *q = "aeiouy"; *q; q++) {
  276. // Use a variable to work around bug in gcc 2.0
  277. char c = cmupper(*q);
  278. init_two_char_letter(*p, *q, *p, c);
  279. }
  280. for (p = "/l/o~n,coeaeij"; *p; p += 2) {
  281. // Use variables to work around bug in gcc 2.0
  282. char c0 = cmupper(p[0]);
  283. char c1 = cmupper(p[1]);
  284. init_two_char_letter(p[0], p[1], c0, c1);
  285. }
  286. init_two_char_letter('v', 's', 'v', 'S', "s");
  287. init_two_char_letter('v', 'z', 'v', 'Z', "z");
  288. init_two_char_letter('o', 'a', 'o', 'A', "a");
  289. init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY);
  290. init_two_char_letter('-', 'd', '-', 'D');
  291. store_token("\\(ss", TOKEN_LOWER, 0, "SS");
  292. store_token("\\[ss]", TOKEN_LOWER, 0, "SS");
  293. store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D");
  294. store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]");
  295. store_token("\\(hy", TOKEN_HYPHEN);
  296. store_token("\\[hy]", TOKEN_HYPHEN);
  297. store_token("\\(en", TOKEN_RANGE_SEP);
  298. store_token("\\[en]", TOKEN_RANGE_SEP);
  299. }
  300. static void init_strings()
  301. {
  302. char buf[6];
  303. buf[0] = '\\';
  304. buf[1] = '*';
  305. for (const char *p = "'`^^,:~v_o./;"; *p; p++) {
  306. buf[2] = *p;
  307. buf[3] = '\0';
  308. store_token(strsave(buf), TOKEN_ACCENT);
  309. buf[2] = '[';
  310. buf[3] = *p;
  311. buf[4] = ']';
  312. buf[5] = '\0';
  313. store_token(strsave(buf), TOKEN_ACCENT);
  314. }
  315. // -ms special letters
  316. store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY);
  317. store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY);
  318. store_letter("\\*(d-", "\\*(D-");
  319. store_letter("\\*[d-]", "\\*[D-]");
  320. store_letter("\\*(ae", "\\*(Ae", "ae");
  321. store_letter("\\*[ae]", "\\*[Ae]", "ae");
  322. store_letter("\\*(oe", "\\*(Oe", "oe");
  323. store_letter("\\*[oe]", "\\*[Oe]", "oe");
  324. store_token("\\*3", TOKEN_LOWER, "y", "Y");
  325. store_token("\\*8", TOKEN_LOWER, "ss", "SS");
  326. store_token("\\*q", TOKEN_LOWER, "o", "O");
  327. }
  328. struct token_initer {
  329. token_initer();
  330. };
  331. static token_initer the_token_initer;
  332. token_initer::token_initer()
  333. {
  334. init_ascii();
  335. init_latin1();
  336. init_special_chars();
  337. init_strings();
  338. default_token_info.set(TOKEN_OTHER);
  339. }