PageRenderTime 56ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/intl/unicharutil/tools/ucgendat.c

https://bitbucket.org/soko/mozilla-central
C | 1457 lines | 805 code | 190 blank | 462 comment | 287 complexity | 67b5814f74cf1f0a8cfced61f36b1ebf MD5 | raw file
Possible License(s): GPL-2.0, JSON, 0BSD, LGPL-3.0, AGPL-1.0, MIT, MPL-2.0-no-copyleft-exception, BSD-3-Clause, LGPL-2.1, Apache-2.0
  1. /*
  2. * Copyright 1996, 1997, 1998 Computing Research Labs,
  3. * New Mexico State University
  4. *
  5. * Permission is hereby granted, free of charge, to any person obtaining a
  6. * copy of this software and associated documentation files (the "Software"),
  7. * to deal in the Software without restriction, including without limitation
  8. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9. * and/or sell copies of the Software, and to permit persons to whom the
  10. * Software is furnished to do so, subject to the following conditions:
  11. *
  12. * The above copyright notice and this permission notice shall be included in
  13. * all copies or substantial portions of the Software.
  14. *
  15. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18. * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
  19. * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
  20. * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
  21. * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  22. */
  /*
   * RCS identification string.  It is compiled out when lint is defined, and
   * under GCC it is tagged as unused so that it does not draw a warning.
   */
  #ifndef lint
  static char rcsid[]
  #ifdef __GNUC__
      __attribute__ ((unused))
  #endif
      = "$Id: ucgendat.c,v 1.1 1999/01/08 00:19:21 ftang%netscape.com Exp $";
  #endif
  30. #include <stdio.h>
  31. #include <stdlib.h>
  32. #include <string.h>
  33. #ifndef WIN32
  34. #include <unistd.h>
  35. #endif
  /*
   * Nonzero when cc is one of the ASCII hexadecimal digits 0-9, A-F or a-f.
   * The argument is evaluated more than once, so it must not have side
   * effects.
   */
  #define ishdigit(cc) \
      (('0' <= (cc) && (cc) <= '9') || \
       ('A' <= (cc) && (cc) <= 'F') || \
       ('a' <= (cc) && (cc) <= 'f'))
  /*
   * Two-word header placed at the start of each output file: the
   * byte-order-mark followed by a count that is filled in just before the
   * header is written.
   */
  static unsigned short hdr[2] = {
      0xfeff,     /* byte-order-mark */
      0           /* count, assigned before each write */
  };
  /*
   * Number of entries in props[], and that number advanced to the next
   * multiple of four (a full four is added when it is already a multiple).
   */
  #define NUMPROPS 49
  #define NEEDPROPS ((NUMPROPS & ~3) + 4)

  /*
   * A property name together with its length in characters.
   */
  typedef struct {
      char *name;
      int len;
  } _prop_t;

  /*
   * Properties expected to be found in the Unicode Character Database, plus
   * a few that are specific to this implementation.  The position of a name
   * in this table is the index of its range list.
   *
   * The implementation specific properties are:
   *
   *   Cm = Composed (can be decomposed)
   *   Nb = Non-breaking
   *   Sy = Symmetric (has left and right forms)
   *   Hd = Hex digit
   *   Qm = Quote marks
   *   Mr = Mirroring
   *   Ss = Space, other
   *   Cp = Defined character
   */
  static _prop_t props[NUMPROPS] = {
      {"Mn", 2}, {"Mc", 2}, {"Me", 2},
      {"Nd", 2}, {"Nl", 2}, {"No", 2},
      {"Zs", 2}, {"Zl", 2}, {"Zp", 2},
      {"Cc", 2}, {"Cf", 2}, {"Cs", 2}, {"Co", 2}, {"Cn", 2},
      {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2}, {"Lo", 2},
      {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
      {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2},
      {"L", 1}, {"R", 1},
      {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2},
      {"B", 1}, {"S", 1},
      {"WS", 2}, {"ON", 2},
      /* Implementation specific properties. */
      {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2},
      {"Qm", 2}, {"Mr", 2}, {"Ss", 2}, {"Cp", 2},
      {"Pi", 2}, {"Pf", 2}
  };
  75. typedef struct {
  76. unsigned long *ranges;
  77. unsigned short used;
  78. unsigned short size;
  79. } _ranges_t;
  80. static _ranges_t proptbl[NUMPROPS];
  81. /*
  82. * Make sure this array is sized to be on a 4-byte boundary at compile time.
  83. */
  84. static unsigned short propcnt[NEEDPROPS];
  /*
   * Scratch array in which one decomposition is collected before it is
   * copied into the decomposition table, and the number of codes currently
   * held in it.
   */
  static unsigned long dectmp[64];
  static unsigned long dectmp_size;

  /*
   * A character and the codes it decomposes into.
   */
  typedef struct {
      unsigned long code;     /* the character being decomposed */
      unsigned short size;    /* elements allocated for decomp[] */
      unsigned short used;    /* elements of decomp[] in use */
      unsigned long *decomp;  /* the codes of the decomposition */
  } _decomp_t;

  /*
   * The decomposition table.  It is created and grown in order as
   * characters are encountered.
   */
  static _decomp_t *decomps;
  static unsigned long decomps_used;
  static unsigned long decomps_size;
  /*
   * One case-mapping entry: the character that keys the entry and its two
   * counterparts in the other cases.  Which case each member holds depends
   * on the list the entry is stored in.
   */
  typedef struct {
      unsigned long key;
      unsigned long other1;
      unsigned long other2;
  } _case_t;

  /*
   * The upper, lower and title case lists, each with the number of entries
   * in use and the number of entries allocated.
   */
  static _case_t *upper;
  static _case_t *lower;
  static _case_t *title;

  static unsigned long upper_used;
  static unsigned long upper_size;

  static unsigned long lower_used;
  static unsigned long lower_size;

  static unsigned long title_used;
  static unsigned long title_size;

  /*
   * Scratch array holding the case mappings of one character before they
   * are added to a list.
   */
  static unsigned long cases[3];

  /*
   * Array holding the ranges for combining classes, with the number of
   * elements in use and the number allocated.
   */
  static unsigned long *ccl;
  static unsigned long ccl_used;
  static unsigned long ccl_size;
  /*
   * Structures for handling numbers.
   */
  typedef struct {
      unsigned long code;     /* character code */
      unsigned long idx;      /* value returned by make_number() for it */
  } _codeidx_t;

  typedef struct {
      short numerator;
      short denominator;
  } _num_t;

  /*
   * Array holding the mapping of codes to numbers, kept in code order.
   */
  static _codeidx_t *ncodes;
  static unsigned long ncodes_used;
  static unsigned long ncodes_size;

  /*
   * Array holding the numbers themselves.  Each of these objects is
   * declared exactly once.
   */
  static _num_t *nums;
  static unsigned long nums_used;
  static unsigned long nums_size;
  157. static void
  158. #ifdef __STDC__
  159. add_range(unsigned long start, unsigned long end, char *p1, char *p2)
  160. #else
  161. add_range(start, end, p1, p2)
  162. unsigned long start, end;
  163. char *p1, *p2;
  164. #endif
  165. {
  166. int i, j, k, len;
  167. _ranges_t *rlp;
  168. char *name;
  169. for (k = 0; k < 2; k++) {
  170. if (k == 0) {
  171. name = p1;
  172. len = 2;
  173. } else {
  174. if (p2 == 0)
  175. break;
  176. name = p2;
  177. len = 1;
  178. }
  179. for (i = 0; i < NUMPROPS; i++) {
  180. if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
  181. break;
  182. }
  183. if (i == NUMPROPS)
  184. continue;
  185. rlp = &proptbl[i];
  186. /*
  187. * Resize the range list if necessary.
  188. */
  189. if (rlp->used == rlp->size) {
  190. if (rlp->size == 0)
  191. rlp->ranges = (unsigned long *)
  192. malloc(sizeof(unsigned long) << 3);
  193. else
  194. rlp->ranges = (unsigned long *)
  195. realloc((char *) rlp->ranges,
  196. sizeof(unsigned long) * (rlp->size + 8));
  197. rlp->size += 8;
  198. }
  199. /*
  200. * If this is the first code for this property list, just add it
  201. * and return.
  202. */
  203. if (rlp->used == 0) {
  204. rlp->ranges[0] = start;
  205. rlp->ranges[1] = end;
  206. rlp->used += 2;
  207. continue;
  208. }
  209. /*
  210. * Optimize the case of adding the range to the end.
  211. */
  212. j = rlp->used - 1;
  213. if (start > rlp->ranges[j]) {
  214. j = rlp->used;
  215. rlp->ranges[j++] = start;
  216. rlp->ranges[j++] = end;
  217. rlp->used = j;
  218. continue;
  219. }
  220. /*
  221. * Need to locate the insertion point.
  222. */
  223. for (i = 0;
  224. i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
  225. /*
  226. * If the start value lies in the current range, then simply set the
  227. * new end point of the range to the end value passed as a parameter.
  228. */
  229. if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
  230. rlp->ranges[i + 1] = end;
  231. return;
  232. }
  233. /*
  234. * Shift following values up by two.
  235. */
  236. for (j = rlp->used; j > i; j -= 2) {
  237. rlp->ranges[j] = rlp->ranges[j - 2];
  238. rlp->ranges[j + 1] = rlp->ranges[j - 1];
  239. }
  240. /*
  241. * Add the new range at the insertion point.
  242. */
  243. rlp->ranges[i] = start;
  244. rlp->ranges[i + 1] = end;
  245. rlp->used += 2;
  246. }
  247. }
  248. static void
  249. #ifdef __STDC__
  250. ordered_range_insert(unsigned long c, char *name, int len)
  251. #else
  252. ordered_range_insert(c, name, len)
  253. unsigned long c;
  254. char *name;
  255. int len;
  256. #endif
  257. {
  258. int i, j;
  259. unsigned long s, e;
  260. _ranges_t *rlp;
  261. if (len == 0)
  262. return;
  263. for (i = 0; i < NUMPROPS; i++) {
  264. if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
  265. break;
  266. }
  267. if (i == NUMPROPS)
  268. return;
  269. /*
  270. * Have a match, so insert the code in order.
  271. */
  272. rlp = &proptbl[i];
  273. /*
  274. * Resize the range list if necessary.
  275. */
  276. if (rlp->used == rlp->size) {
  277. if (rlp->size == 0)
  278. rlp->ranges = (unsigned long *)
  279. malloc(sizeof(unsigned long) << 3);
  280. else
  281. rlp->ranges = (unsigned long *)
  282. realloc((char *) rlp->ranges,
  283. sizeof(unsigned long) * (rlp->size + 8));
  284. rlp->size += 8;
  285. }
  286. /*
  287. * If this is the first code for this property list, just add it
  288. * and return.
  289. */
  290. if (rlp->used == 0) {
  291. rlp->ranges[0] = rlp->ranges[1] = c;
  292. rlp->used += 2;
  293. return;
  294. }
  295. /*
  296. * Optimize the cases of extending the last range and adding new ranges to
  297. * the end.
  298. */
  299. j = rlp->used - 1;
  300. e = rlp->ranges[j];
  301. s = rlp->ranges[j - 1];
  302. if (c == e + 1) {
  303. /*
  304. * Extend the last range.
  305. */
  306. rlp->ranges[j] = c;
  307. return;
  308. }
  309. if (c > e + 1) {
  310. /*
  311. * Start another range on the end.
  312. */
  313. j = rlp->used;
  314. rlp->ranges[j] = rlp->ranges[j + 1] = c;
  315. rlp->used += 2;
  316. return;
  317. }
  318. if (c >= s)
  319. /*
  320. * The code is a duplicate of a code in the last range, so just return.
  321. */
  322. return;
  323. /*
  324. * The code should be inserted somewhere before the last range in the
  325. * list. Locate the insertion point.
  326. */
  327. for (i = 0;
  328. i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
  329. s = rlp->ranges[i];
  330. e = rlp->ranges[i + 1];
  331. if (c == e + 1)
  332. /*
  333. * Simply extend the current range.
  334. */
  335. rlp->ranges[i + 1] = c;
  336. else if (c < s) {
  337. /*
  338. * Add a new entry before the current location. Shift all entries
  339. * before the current one up by one to make room.
  340. */
  341. for (j = rlp->used; j > i; j -= 2) {
  342. rlp->ranges[j] = rlp->ranges[j - 2];
  343. rlp->ranges[j + 1] = rlp->ranges[j - 1];
  344. }
  345. rlp->ranges[i] = rlp->ranges[i + 1] = c;
  346. rlp->used += 2;
  347. }
  348. }
  349. static void
  350. #ifdef __STDC__
  351. add_decomp(unsigned long code)
  352. #else
  353. add_decomp(code)
  354. unsigned long code;
  355. #endif
  356. {
  357. unsigned long i, j, size;
  358. /*
  359. * Add the code to the composite property.
  360. */
  361. ordered_range_insert(code, "Cm", 2);
  362. /*
  363. * Locate the insertion point for the code.
  364. */
  365. for (i = 0; i < decomps_used && code > decomps[i].code; i++) ;
  366. /*
  367. * Allocate space for a new decomposition.
  368. */
  369. if (decomps_used == decomps_size) {
  370. if (decomps_size == 0)
  371. decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
  372. else
  373. decomps = (_decomp_t *)
  374. realloc((char *) decomps,
  375. sizeof(_decomp_t) * (decomps_size + 8));
  376. (void) memset((char *) (decomps + decomps_size), 0,
  377. sizeof(_decomp_t) << 3);
  378. decomps_size += 8;
  379. }
  380. if (i < decomps_used && code != decomps[i].code) {
  381. /*
  382. * Shift the decomps up by one if the codes don't match.
  383. */
  384. for (j = decomps_used; j > i; j--)
  385. (void) memcpy((char *) &decomps[j], (char *) &decomps[j - 1],
  386. sizeof(_decomp_t));
  387. }
  388. /*
  389. * Insert or replace a decomposition.
  390. */
  391. size = dectmp_size + (4 - (dectmp_size & 3));
  392. if (decomps[i].size < size) {
  393. if (decomps[i].size == 0)
  394. decomps[i].decomp = (unsigned long *)
  395. malloc(sizeof(unsigned long) * size);
  396. else
  397. decomps[i].decomp = (unsigned long *)
  398. realloc((char *) decomps[i].decomp,
  399. sizeof(unsigned long) * size);
  400. decomps[i].size = size;
  401. }
  402. if (decomps[i].code != code)
  403. decomps_used++;
  404. decomps[i].code = code;
  405. decomps[i].used = dectmp_size;
  406. (void) memcpy((char *) decomps[i].decomp, (char *) dectmp,
  407. sizeof(unsigned long) * dectmp_size);
  408. }
  409. static void
  410. #ifdef __STDC__
  411. add_title(unsigned long code)
  412. #else
  413. add_title(code)
  414. unsigned long code;
  415. #endif
  416. {
  417. unsigned long i, j;
  418. /*
  419. * Always map the code to itself.
  420. */
  421. cases[2] = code;
  422. if (title_used == title_size) {
  423. if (title_size == 0)
  424. title = (_case_t *) malloc(sizeof(_case_t) << 3);
  425. else
  426. title = (_case_t *) realloc((char *) title,
  427. sizeof(_case_t) * (title_size + 8));
  428. title_size += 8;
  429. }
  430. /*
  431. * Locate the insertion point.
  432. */
  433. for (i = 0; i < title_used && code > title[i].key; i++) ;
  434. if (i < title_used) {
  435. /*
  436. * Shift the array up by one.
  437. */
  438. for (j = title_used; j > i; j--)
  439. (void) memcpy((char *) &title[j], (char *) &title[j - 1],
  440. sizeof(_case_t));
  441. }
  442. title[i].key = cases[2]; /* Title */
  443. title[i].other1 = cases[0]; /* Upper */
  444. title[i].other2 = cases[1]; /* Lower */
  445. title_used++;
  446. }
  447. static void
  448. #ifdef __STDC__
  449. add_upper(unsigned long code)
  450. #else
  451. add_upper(code)
  452. unsigned long code;
  453. #endif
  454. {
  455. unsigned long i, j;
  456. /*
  457. * Always map the code to itself.
  458. */
  459. cases[0] = code;
  460. /*
  461. * If the title case character is not present, then make it the same as
  462. * the upper case.
  463. */
  464. if (cases[2] == 0)
  465. cases[2] = code;
  466. if (upper_used == upper_size) {
  467. if (upper_size == 0)
  468. upper = (_case_t *) malloc(sizeof(_case_t) << 3);
  469. else
  470. upper = (_case_t *) realloc((char *) upper,
  471. sizeof(_case_t) * (upper_size + 8));
  472. upper_size += 8;
  473. }
  474. /*
  475. * Locate the insertion point.
  476. */
  477. for (i = 0; i < upper_used && code > upper[i].key; i++) ;
  478. if (i < upper_used) {
  479. /*
  480. * Shift the array up by one.
  481. */
  482. for (j = upper_used; j > i; j--)
  483. (void) memcpy((char *) &upper[j], (char *) &upper[j - 1],
  484. sizeof(_case_t));
  485. }
  486. upper[i].key = cases[0]; /* Upper */
  487. upper[i].other1 = cases[1]; /* Lower */
  488. upper[i].other2 = cases[2]; /* Title */
  489. upper_used++;
  490. }
  491. static void
  492. #ifdef __STDC__
  493. add_lower(unsigned long code)
  494. #else
  495. add_lower(code)
  496. unsigned long code;
  497. #endif
  498. {
  499. unsigned long i, j;
  500. /*
  501. * Always map the code to itself.
  502. */
  503. cases[1] = code;
  504. /*
  505. * If the title case character is empty, then make it the same as the
  506. * upper case.
  507. */
  508. if (cases[2] == 0)
  509. cases[2] = cases[0];
  510. if (lower_used == lower_size) {
  511. if (lower_size == 0)
  512. lower = (_case_t *) malloc(sizeof(_case_t) << 3);
  513. else
  514. lower = (_case_t *) realloc((char *) lower,
  515. sizeof(_case_t) * (lower_size + 8));
  516. lower_size += 8;
  517. }
  518. /*
  519. * Locate the insertion point.
  520. */
  521. for (i = 0; i < lower_used && code > lower[i].key; i++) ;
  522. if (i < lower_used) {
  523. /*
  524. * Shift the array up by one.
  525. */
  526. for (j = lower_used; j > i; j--)
  527. (void) memcpy((char *) &lower[j], (char *) &lower[j - 1],
  528. sizeof(_case_t));
  529. }
  530. lower[i].key = cases[1]; /* Lower */
  531. lower[i].other1 = cases[0]; /* Upper */
  532. lower[i].other2 = cases[2]; /* Title */
  533. lower_used++;
  534. }
  535. static void
  536. #ifdef __STDC__
  537. ordered_ccl_insert(unsigned long c, unsigned long ccl_code)
  538. #else
  539. ordered_ccl_insert(c, ccl_code)
  540. unsigned long c, ccl_code;
  541. #endif
  542. {
  543. unsigned long i, j;
  544. if (ccl_used == ccl_size) {
  545. if (ccl_size == 0)
  546. ccl = (unsigned long *) malloc(sizeof(unsigned long) * 24);
  547. else
  548. ccl = (unsigned long *)
  549. realloc((char *) ccl, sizeof(unsigned long) * (ccl_size + 24));
  550. ccl_size += 24;
  551. }
  552. /*
  553. * Optimize adding the first item.
  554. */
  555. if (ccl_used == 0) {
  556. ccl[0] = ccl[1] = c;
  557. ccl[2] = ccl_code;
  558. ccl_used += 3;
  559. return;
  560. }
  561. /*
  562. * Handle the special case of extending the range on the end. This
  563. * requires that the combining class codes are the same.
  564. */
  565. if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
  566. ccl[ccl_used - 2] = c;
  567. return;
  568. }
  569. /*
  570. * Handle the special case of adding another range on the end.
  571. */
  572. if (c > ccl[ccl_used - 2] + 1 ||
  573. (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
  574. ccl[ccl_used++] = c;
  575. ccl[ccl_used++] = c;
  576. ccl[ccl_used++] = ccl_code;
  577. return;
  578. }
  579. /*
  580. * Locate either the insertion point or range for the code.
  581. */
  582. for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
  583. if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
  584. /*
  585. * Extend an existing range.
  586. */
  587. ccl[i + 1] = c;
  588. return;
  589. } else if (c < ccl[i]) {
  590. /*
  591. * Start a new range before the current location.
  592. */
  593. for (j = ccl_used; j > i; j -= 3) {
  594. ccl[j] = ccl[j - 3];
  595. ccl[j - 1] = ccl[j - 4];
  596. ccl[j - 2] = ccl[j - 5];
  597. }
  598. ccl[i] = ccl[i + 1] = c;
  599. ccl[i + 2] = ccl_code;
  600. }
  601. }
  602. /*
  603. * Adds a number if it does not already exist and returns an index value
  604. * multiplied by 2.
  605. */
  606. static unsigned long
  607. #ifdef __STDC__
  608. make_number(short num, short denom)
  609. #else
  610. make_number(num, denom)
  611. short num, denom;
  612. #endif
  613. {
  614. unsigned long n;
  615. /*
  616. * Determine if the number already exists.
  617. */
  618. for (n = 0; n < nums_used; n++) {
  619. if (nums[n].numerator == num && nums[n].denominator == denom)
  620. return n << 1;
  621. }
  622. if (nums_used == nums_size) {
  623. if (nums_size == 0)
  624. nums = (_num_t *) malloc(sizeof(_num_t) << 3);
  625. else
  626. nums = (_num_t *) realloc((char *) nums,
  627. sizeof(_num_t) * (nums_size + 8));
  628. nums_size += 8;
  629. }
  630. n = nums_used++;
  631. nums[n].numerator = num;
  632. nums[n].denominator = denom;
  633. return n << 1;
  634. }
  635. static void
  636. #ifdef __STDC__
  637. add_number(unsigned long code, short num, short denom)
  638. #else
  639. add_number(code, num, denom)
  640. unsigned long code;
  641. short num, denom;
  642. #endif
  643. {
  644. unsigned long i, j;
  645. /*
  646. * Insert the code in order.
  647. */
  648. for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
  649. /*
  650. * Handle the case of the codes matching and simply replace the number
  651. * that was there before.
  652. */
  653. if (ncodes_used > 0 && code == ncodes[i].code) {
  654. ncodes[i].idx = make_number(num, denom);
  655. return;
  656. }
  657. /*
  658. * Resize the array if necessary.
  659. */
  660. if (ncodes_used == ncodes_size) {
  661. if (ncodes_size == 0)
  662. ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
  663. else
  664. ncodes = (_codeidx_t *)
  665. realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
  666. ncodes_size += 8;
  667. }
  668. /*
  669. * Shift things around to insert the code if necessary.
  670. */
  671. if (i < ncodes_used) {
  672. for (j = ncodes_used; j > i; j--) {
  673. ncodes[j].code = ncodes[j - 1].code;
  674. ncodes[j].idx = ncodes[j - 1].idx;
  675. }
  676. }
  677. ncodes[i].code = code;
  678. ncodes[i].idx = make_number(num, denom);
  679. ncodes_used++;
  680. }
  681. /*
  682. * This routine assumes that the line is a valid Unicode Character Database
  683. * entry.
  684. */
  685. static void
  686. #ifdef __STDC__
  687. read_cdata(FILE *in)
  688. #else
  689. read_cdata(in)
  690. FILE *in;
  691. #endif
  692. {
  693. unsigned long i, lineno, skip, code, ccl_code;
  694. short wnum, neg, number[2];
  695. char line[512], *s, *e;
  696. lineno = skip = 0;
  697. while (fscanf(in, "%[^\n]\n", line) != EOF) {
  698. lineno++;
  699. /*
  700. * Skip blank lines and lines that start with a '#'.
  701. */
  702. if (line[0] == 0 || line[0] == '#')
  703. continue;
  704. /*
  705. * If lines need to be skipped, do it here.
  706. */
  707. if (skip) {
  708. skip--;
  709. continue;
  710. }
  711. /*
  712. * Collect the code. The code can be up to 6 hex digits in length to
  713. * allow surrogates to be specified.
  714. */
  715. for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
  716. code <<= 4;
  717. if (*s >= '0' && *s <= '9')
  718. code += *s - '0';
  719. else if (*s >= 'A' && *s <= 'F')
  720. code += (*s - 'A') + 10;
  721. else if (*s >= 'a' && *s <= 'f')
  722. code += (*s - 'a') + 10;
  723. }
  724. /*
  725. * Handle the following special cases:
  726. * 1. 4E00-9FA5 CJK Ideographs.
  727. * 2. AC00-D7A3 Hangul Syllables.
  728. * 3. D800-DFFF Surrogates.
  729. * 4. E000-F8FF Private Use Area.
  730. * 5. F900-FA2D Han compatibility.
  731. */
  732. switch (code) {
  733. case 0x4e00:
  734. /*
  735. * The Han ideographs.
  736. */
  737. add_range(0x4e00, 0x9fff, "Lo", "L");
  738. /*
  739. * Add the characters to the defined category.
  740. */
  741. add_range(0x4e00, 0x9fa5, "Cp", 0);
  742. skip = 1;
  743. break;
  744. case 0xac00:
  745. /*
  746. * The Hangul syllables.
  747. */
  748. add_range(0xac00, 0xd7a3, "Lo", "L");
  749. /*
  750. * Add the characters to the defined category.
  751. */
  752. add_range(0xac00, 0xd7a3, "Cp", 0);
  753. skip = 1;
  754. break;
  755. case 0xd800:
  756. /*
  757. * Make a range of all surrogates and assume some default
  758. * properties.
  759. */
  760. add_range(0x010000, 0x10ffff, "Cs", "L");
  761. skip = 5;
  762. break;
  763. case 0xe000:
  764. /*
  765. * The Private Use area. Add with a default set of properties.
  766. */
  767. add_range(0xe000, 0xf8ff, "Co", "L");
  768. skip = 1;
  769. break;
  770. case 0xf900:
  771. /*
  772. * The CJK compatibility area.
  773. */
  774. add_range(0xf900, 0xfaff, "Lo", "L");
  775. /*
  776. * Add the characters to the defined category.
  777. */
  778. add_range(0xf900, 0xfaff, "Cp", 0);
  779. skip = 1;
  780. }
  781. if (skip)
  782. continue;
  783. /*
  784. * Add the code to the defined category.
  785. */
  786. ordered_range_insert(code, "Cp", 2);
  787. /*
  788. * Locate the first character property field.
  789. */
  790. for (i = 0; *s != 0 && i < 2; s++) {
  791. if (*s == ';')
  792. i++;
  793. }
  794. for (e = s; *e && *e != ';'; e++) ;
  795. ordered_range_insert(code, s, e - s);
  796. /*
  797. * Locate the combining class code.
  798. */
  799. for (s = e; *s != 0 && i < 3; s++) {
  800. if (*s == ';')
  801. i++;
  802. }
  803. /*
  804. * Convert the combining class code from decimal.
  805. */
  806. for (ccl_code = 0, e = s; *e && *e != ';'; e++)
  807. ccl_code = (ccl_code * 10) + (*e - '0');
  808. /*
  809. * Add the code if it not 0.
  810. */
  811. if (ccl_code != 0)
  812. ordered_ccl_insert(code, ccl_code);
  813. /*
  814. * Locate the second character property field.
  815. */
  816. for (s = e; *s != 0 && i < 4; s++) {
  817. if (*s == ';')
  818. i++;
  819. }
  820. for (e = s; *e && *e != ';'; e++) ;
  821. ordered_range_insert(code, s, e - s);
  822. /*
  823. * Check for a decomposition.
  824. */
  825. s = ++e;
  826. if (*s != ';' && *s != '<') {
  827. /*
  828. * Collect the codes of the decomposition.
  829. */
  830. for (dectmp_size = 0; *s != ';'; ) {
  831. /*
  832. * Skip all leading non-hex digits.
  833. */
  834. while (!ishdigit(*s))
  835. s++;
  836. for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
  837. dectmp[dectmp_size] <<= 4;
  838. if (*s >= '0' && *s <= '9')
  839. dectmp[dectmp_size] += *s - '0';
  840. else if (*s >= 'A' && *s <= 'F')
  841. dectmp[dectmp_size] += (*s - 'A') + 10;
  842. else if (*s >= 'a' && *s <= 'f')
  843. dectmp[dectmp_size] += (*s - 'a') + 10;
  844. }
  845. dectmp_size++;
  846. }
  847. /*
  848. * If there is more than one code in the temporary decomposition
  849. * array, then add the character with its decomposition.
  850. */
  851. if (dectmp_size > 1)
  852. add_decomp(code);
  853. }
  854. /*
  855. * Skip to the number field.
  856. */
  857. for (i = 0; i < 3 && *s; s++) {
  858. if (*s == ';')
  859. i++;
  860. }
  861. /*
  862. * Scan the number in.
  863. */
  864. number[0] = number[1] = 0;
  865. for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
  866. if (*e == '-') {
  867. neg = 1;
  868. continue;
  869. }
  870. if (*e == '/') {
  871. /*
  872. * Move the the denominator of the fraction.
  873. */
  874. if (neg)
  875. number[wnum] *= -1;
  876. neg = 0;
  877. e++;
  878. wnum++;
  879. }
  880. number[wnum] = (number[wnum] * 10) + (*e - '0');
  881. }
  882. if (e > s) {
  883. /*
  884. * Adjust the denominator in case of integers and add the number.
  885. */
  886. if (wnum == 0)
  887. number[1] = number[0];
  888. add_number(code, number[0], number[1]);
  889. }
  890. /*
  891. * Skip to the start of the possible case mappings.
  892. */
  893. for (s = e, i = 0; i < 4 && *s; s++) {
  894. if (*s == ';')
  895. i++;
  896. }
  897. /*
  898. * Collect the case mappings.
  899. */
  900. cases[0] = cases[1] = cases[2] = 0;
  901. for (i = 0; i < 3; i++) {
  902. while (ishdigit(*s)) {
  903. cases[i] <<= 4;
  904. if (*s >= '0' && *s <= '9')
  905. cases[i] += *s - '0';
  906. else if (*s >= 'A' && *s <= 'F')
  907. cases[i] += (*s - 'A') + 10;
  908. else if (*s >= 'a' && *s <= 'f')
  909. cases[i] += (*s - 'a') + 10;
  910. s++;
  911. }
  912. if (*s == ';')
  913. s++;
  914. }
  915. if (cases[0] && cases[1])
  916. /*
  917. * Add the upper and lower mappings for a title case character.
  918. */
  919. add_title(code);
  920. else if (cases[1])
  921. /*
  922. * Add the lower and title case mappings for the upper case
  923. * character.
  924. */
  925. add_upper(code);
  926. else if (cases[0])
  927. /*
  928. * Add the upper and title case mappings for the lower case
  929. * character.
  930. */
  931. add_lower(code);
  932. }
  933. }
  934. static _decomp_t *
  935. #ifdef __STDC__
  936. find_decomp(unsigned long code)
  937. #else
  938. find_decomp(code)
  939. unsigned long code;
  940. #endif
  941. {
  942. long l, r, m;
  943. l = 0;
  944. r = decomps_used - 1;
  945. while (l <= r) {
  946. m = (l + r) >> 1;
  947. if (code > decomps[m].code)
  948. l = m + 1;
  949. else if (code < decomps[m].code)
  950. r = m - 1;
  951. else
  952. return &decomps[m];
  953. }
  954. return 0;
  955. }
  956. static void
  957. #ifdef __STDC__
  958. decomp_it(_decomp_t *d)
  959. #else
  960. decomp_it(d)
  961. _decomp_t *d;
  962. #endif
  963. {
  964. unsigned long i;
  965. _decomp_t *dp;
  966. for (i = 0; i < d->used; i++) {
  967. if ((dp = find_decomp(d->decomp[i])) != 0)
  968. decomp_it(dp);
  969. else
  970. dectmp[dectmp_size++] = d->decomp[i];
  971. }
  972. }
  973. /*
  974. * Expand all decompositions by recursively decomposing each character
  975. * in the decomposition.
  976. */
  977. static void
  978. #ifdef __STDC__
  979. expand_decomp(void)
  980. #else
  981. expand_decomp()
  982. #endif
  983. {
  984. unsigned long i;
  985. for (i = 0; i < decomps_used; i++) {
  986. dectmp_size = 0;
  987. decomp_it(&decomps[i]);
  988. if (dectmp_size > 0)
  989. add_decomp(decomps[i].code);
  990. }
  991. }
  992. static void
  993. #ifdef __STDC__
  994. write_cdata(char *opath)
  995. #else
  996. write_cdata(opath)
  997. char *opath;
  998. #endif
  999. {
  1000. FILE *out;
  1001. unsigned long i, idx, bytes, nprops;
  1002. unsigned short casecnt[2];
  1003. char path[BUFSIZ];
  1004. /*****************************************************************
  1005. *
  1006. * Generate the ctype data.
  1007. *
  1008. *****************************************************************/
  1009. /*
  1010. * Open the ctype.dat file.
  1011. */
  1012. sprintf(path, "%s/ctype.dat", opath);
  1013. if ((out = fopen(path, "wb")) == 0)
  1014. return;
  1015. /*
  1016. * Collect the offsets for the properties. The offsets array is
  1017. * on a 4-byte boundary to keep things efficient for architectures
  1018. * that need such a thing.
  1019. */
  1020. for (i = idx = 0; i < NUMPROPS; i++) {
  1021. propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
  1022. idx += proptbl[i].used;
  1023. }
  1024. /*
  1025. * Add the sentinel index which is used by the binary search as the upper
  1026. * bound for a search.
  1027. */
  1028. propcnt[i] = idx;
  1029. /*
  1030. * Record the actual number of property lists. This may be different than
  1031. * the number of offsets actually written because of aligning on a 4-byte
  1032. * boundary.
  1033. */
  1034. hdr[1] = NUMPROPS;
  1035. /*
  1036. * Calculate the byte count needed and pad the property counts array to a
  1037. * 4-byte boundary.
  1038. */
  1039. if ((bytes = sizeof(unsigned short) * (NUMPROPS + 1)) & 3)
  1040. bytes += 4 - (bytes & 3);
  1041. nprops = bytes / sizeof(unsigned short);
  1042. bytes += sizeof(unsigned long) * idx;
  1043. /*
  1044. * Write the header.
  1045. */
  1046. fwrite((char *) hdr, sizeof(unsigned short), 2, out);
  1047. /*
  1048. * Write the byte count.
  1049. */
  1050. fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
  1051. /*
  1052. * Write the property list counts.
  1053. */
  1054. fwrite((char *) propcnt, sizeof(unsigned short), nprops, out);
  1055. /*
  1056. * Write the property lists.
  1057. */
  1058. for (i = 0; i < NUMPROPS; i++) {
  1059. if (proptbl[i].used > 0)
  1060. fwrite((char *) proptbl[i].ranges, sizeof(unsigned long),
  1061. proptbl[i].used, out);
  1062. }
  1063. fclose(out);
  1064. /*****************************************************************
  1065. *
  1066. * Generate the case mapping data.
  1067. *
  1068. *****************************************************************/
  1069. /*
  1070. * Open the case.dat file.
  1071. */
  1072. sprintf(path, "%s/case.dat", opath);
  1073. if ((out = fopen(path, "wb")) == 0)
  1074. return;
  1075. /*
  1076. * Write the case mapping tables.
  1077. */
  1078. hdr[1] = upper_used + lower_used + title_used;
  1079. casecnt[0] = upper_used;
  1080. casecnt[1] = lower_used;
  1081. /*
  1082. * Write the header.
  1083. */
  1084. fwrite((char *) hdr, sizeof(unsigned short), 2, out);
  1085. /*
  1086. * Write the upper and lower case table sizes.
  1087. */
  1088. fwrite((char *) casecnt, sizeof(unsigned short), 2, out);
  1089. if (upper_used > 0)
  1090. /*
  1091. * Write the upper case table.
  1092. */
  1093. fwrite((char *) upper, sizeof(_case_t), upper_used, out);
  1094. if (lower_used > 0)
  1095. /*
  1096. * Write the lower case table.
  1097. */
  1098. fwrite((char *) lower, sizeof(_case_t), lower_used, out);
  1099. if (title_used > 0)
  1100. /*
  1101. * Write the title case table.
  1102. */
  1103. fwrite((char *) title, sizeof(_case_t), title_used, out);
  1104. fclose(out);
  1105. /*****************************************************************
  1106. *
  1107. * Generate the decomposition data.
  1108. *
  1109. *****************************************************************/
  1110. /*
  1111. * Fully expand all decompositions before generating the output file.
  1112. */
  1113. expand_decomp();
  1114. /*
  1115. * Open the decomp.dat file.
  1116. */
  1117. sprintf(path, "%s/decomp.dat", opath);
  1118. if ((out = fopen(path, "wb")) == 0)
  1119. return;
  1120. hdr[1] = decomps_used;
  1121. /*
  1122. * Write the header.
  1123. */
  1124. fwrite((char *) hdr, sizeof(unsigned short), 2, out);
  1125. /*
  1126. * Write a temporary byte count which will be calculated as the
  1127. * decompositions are written out.
  1128. */
  1129. bytes = 0;
  1130. fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
  1131. if (decomps_used) {
  1132. /*
  1133. * Write the list of decomp nodes.
  1134. */
  1135. for (i = idx = 0; i < decomps_used; i++) {
  1136. fwrite((char *) &decomps[i].code, sizeof(unsigned long), 1, out);
  1137. fwrite((char *) &idx, sizeof(unsigned long), 1, out);
  1138. idx += decomps[i].used;
  1139. }
  1140. /*
  1141. * Write the sentinel index as the last decomp node.
  1142. */
  1143. fwrite((char *) &idx, sizeof(unsigned long), 1, out);
  1144. /*
  1145. * Write the decompositions themselves.
  1146. */
  1147. for (i = 0; i < decomps_used; i++)
  1148. fwrite((char *) decomps[i].decomp, sizeof(unsigned long),
  1149. decomps[i].used, out);
  1150. /*
  1151. * Seek back to the beginning and write the byte count.
  1152. */
  1153. bytes = (sizeof(unsigned long) * idx) +
  1154. (sizeof(unsigned long) * ((hdr[1] << 1) + 1));
  1155. fseek(out, sizeof(unsigned short) << 1, 0L);
  1156. fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
  1157. fclose(out);
  1158. }
  1159. /*****************************************************************
  1160. *
  1161. * Generate the combining class data.
  1162. *
  1163. *****************************************************************/
  1164. /*
  1165. * Open the cmbcl.dat file.
  1166. */
  1167. sprintf(path, "%s/cmbcl.dat", opath);
  1168. if ((out = fopen(path, "wb")) == 0)
  1169. return;
  1170. /*
  1171. * Set the number of ranges used. Each range has a combining class which
  1172. * means each entry is a 3-tuple.
  1173. */
  1174. hdr[1] = ccl_used / 3;
  1175. /*
  1176. * Write the header.
  1177. */
  1178. fwrite((char *) hdr, sizeof(unsigned short), 2, out);
  1179. /*
  1180. * Write out the byte count to maintain header size.
  1181. */
  1182. bytes = ccl_used * sizeof(unsigned long);
  1183. fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
  1184. if (ccl_used > 0)
  1185. /*
  1186. * Write the combining class ranges out.
  1187. */
  1188. fwrite((char *) ccl, sizeof(unsigned long), ccl_used, out);
  1189. fclose(out);
  1190. /*****************************************************************
  1191. *
  1192. * Generate the number data.
  1193. *
  1194. *****************************************************************/
  1195. /*
  1196. * Open the num.dat file.
  1197. */
  1198. sprintf(path, "%s/num.dat", opath);
  1199. if ((out = fopen(path, "wb")) == 0)
  1200. return;
  1201. /*
  1202. * The count part of the header will be the total number of codes that
  1203. * have numbers.
  1204. */
  1205. hdr[1] = (unsigned short) (ncodes_used << 1);
  1206. bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
  1207. /*
  1208. * Write the header.
  1209. */
  1210. fwrite((char *) hdr, sizeof(unsigned short), 2, out);
  1211. /*
  1212. * Write out the byte count to maintain header size.
  1213. */
  1214. fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
  1215. /*
  1216. * Now, if number mappings exist, write them out.
  1217. */
  1218. if (ncodes_used > 0) {
  1219. fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
  1220. fwrite((char *) nums, sizeof(_num_t), nums_used, out);
  1221. }
  1222. fclose(out);
  1223. }
  1224. void
  1225. #ifdef __STDC__
  1226. main(int argc, char *argv[])
  1227. #else
  1228. main(argc, argv)
  1229. int argc;
  1230. char *argv[];
  1231. #endif
  1232. {
  1233. FILE *in;
  1234. char *prog, *opath;
  1235. if ((prog = strrchr(argv[0], '/')) != 0)
  1236. prog++;
  1237. else
  1238. prog = argv[0];
  1239. opath = 0;
  1240. in = stdin;
  1241. argc--;
  1242. argv++;
  1243. while (argc > 0) {
  1244. if (argv[0][0] == '-' && argv[0][1] == 'o') {
  1245. argc--;
  1246. argv++;
  1247. opath = argv[0];
  1248. } else {
  1249. if (in != stdin)
  1250. fclose(in);
  1251. if ((in = fopen(argv[0], "rb")) == 0)
  1252. fprintf(stderr, "%s: unable to open ctype file %s\n",
  1253. prog, argv[0]);
  1254. else {
  1255. read_cdata(in);
  1256. fclose(in);
  1257. in = 0;
  1258. }
  1259. }
  1260. argc--;
  1261. argv++;
  1262. }
  1263. if (opath == 0)
  1264. opath = ".";
  1265. write_cdata(opath);
  1266. exit(0);
  1267. }