PageRenderTime 41ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 0ms

/sombok-2.2.1/src/sombok.c

#
C | 587 lines | 552 code | 35 blank | 0 comment | 270 complexity | d683ac1408c5510ab3c62322bfa915ab MD5 | raw file
Possible License(s): AGPL-1.0
  #include <sys/types.h>
  #include <sys/wait.h>
  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <assert.h>
  #include "sombok.h"
  7. #define BUFLEN (8192)
  8. static char buf[BUFLEN];
  9. static char *encbuf = NULL;
  10. static unistr_t unistr = { NULL, 0 };
  11. static unichar_t newline_str[] = { 0x000A };
  12. static
  13. unichar_t hextou(unichar_t * str, int len)
  14. {
  15. size_t i;
  16. unichar_t c, u = 0;
  17. for (i = 0; i < len; i++) {
  18. c = str[i];
  19. if ((unichar_t) '0' <= c && c <= (unichar_t) '9')
  20. u = u * 16 + c - (unichar_t) '0';
  21. else if ((unichar_t) 'a' <= c && c <= (unichar_t) 'f')
  22. u = u * 16 + c - (unichar_t) 'a' + 10;
  23. else if ((unichar_t) 'A' <= c && c <= (unichar_t) 'F')
  24. u = u * 16 + c - (unichar_t) 'A' + 10;
  25. else
  26. return (unichar_t)-1;
  27. }
  28. return u;
  29. }
  30. static
  31. unistr_t *parse_string(char *utf8str, size_t len)
  32. {
  33. unichar_t *buf;
  34. size_t i, j;
  35. if (sombok_decode_utf8(&unistr, 0, utf8str, len, 3) == NULL)
  36. return NULL;
  37. buf = unistr.str;
  38. for (i = 0, j = 0; i < unistr.len; i++) {
  39. if (buf[i] == (unichar_t) '\\') {
  40. if (i + 1 < unistr.len) {
  41. i++;
  42. switch (buf[i]) {
  43. case (unichar_t) '0':
  44. buf[j] = 0x0000; /* null */
  45. break;
  46. case (unichar_t) 'a':
  47. buf[j] = 0x0007; /* bell */
  48. break;
  49. case (unichar_t) 'b':
  50. buf[j] = 0x0008; /* back space */
  51. break;
  52. case (unichar_t) 't':
  53. buf[j] = 0x0009; /* horizontal tab */
  54. break;
  55. case (unichar_t) 'n':
  56. buf[j] = 0x000A; /* line feed */
  57. break;
  58. case (unichar_t) 'v':
  59. buf[j] = 0x000B; /* vertical tab */
  60. break;
  61. case (unichar_t) 'f':
  62. buf[j] = 0x000C; /* form feed */
  63. break;
  64. case (unichar_t) 'r':
  65. buf[j] = 0x000D; /* carriage return */
  66. break;
  67. case (unichar_t) 'e':
  68. buf[j] = 0x001B; /* escape */
  69. break;
  70. case (unichar_t) 'x': /* \xhh */
  71. if ((buf[j] = hextou(buf + i + 1, 2)) == -1)
  72. buf[j] = buf[i];
  73. else
  74. i += 2;
  75. break;
  76. case (unichar_t) 'u': /* \uhhhh */
  77. if ((buf[j] = hextou(buf + i + 1, 4)) == -1)
  78. buf[j] = buf[i];
  79. else
  80. i += 4;
  81. break;
  82. case (unichar_t) 'U': /* \Uhhhhhhhh */
  83. if ((buf[j] = hextou(buf + i + 1, 8)) == -1)
  84. buf[j] = buf[i];
  85. else
  86. i += 8;
  87. break;
  88. default:
  89. buf[j] = buf[i];
  90. }
  91. } else if (j < i)
  92. buf[j] = buf[i];
  93. } else if (j < i)
  94. buf[j] = buf[i];
  95. j++;
  96. }
  97. unistr.len = j;
  98. return &unistr;
  99. }
  100. static
  101. pid_t popen2(const char *cmd, const char *arg, int *ifd, int *ofd)
  102. {
  103. int ipipe[2], opipe[2], errnum;
  104. pid_t pid;
  105. if (pipe(ipipe) != 0 || pipe(opipe) != 0)
  106. return -1;
  107. if ((pid = fork()) < 0)
  108. return -1;
  109. if (pid == 0) {
  110. close(ipipe[1]);
  111. dup2(ipipe[0], 0);
  112. close(opipe[0]);
  113. dup2(opipe[1], 1);
  114. execl(SHELL_PROGRAM, SHELL_NAME, "-c", cmd, SHELL_NAME, arg, NULL);
  115. errnum = errno;
  116. perror("execl");
  117. exit(errnum);
  118. }
  119. *ifd = ipipe[1];
  120. *ofd = opipe[0];
  121. return pid;
  122. }
  123. static
  124. gcstring_t *format_SHELL(linebreak_t * lbobj, linebreak_state_t state,
  125. gcstring_t * gcstr)
  126. {
  127. size_t len;
  128. int ifd = 0, ofd = 1;
  129. char *statestr;
  130. gcstring_t *ret;
  131. switch (state) {
  132. case LINEBREAK_STATE_SOT:
  133. statestr = "sot";
  134. break;
  135. case LINEBREAK_STATE_SOP:
  136. statestr = "sop";
  137. break;
  138. case LINEBREAK_STATE_SOL:
  139. statestr = "sol";
  140. break;
  141. case LINEBREAK_STATE_LINE:
  142. statestr = "";
  143. break;
  144. case LINEBREAK_STATE_EOL:
  145. statestr = "eol";
  146. break;
  147. case LINEBREAK_STATE_EOP:
  148. statestr = "eop";
  149. break;
  150. case LINEBREAK_STATE_EOT:
  151. statestr = "eot";
  152. break;
  153. default:
  154. lbobj->errnum = EINVAL;
  155. return NULL;
  156. }
  157. if ((encbuf = sombok_encode_utf8(encbuf, &len, 0, (unistr_t *)gcstr))
  158. == NULL) {
  159. lbobj->errnum = errno;
  160. return NULL;
  161. }
  162. popen2(lbobj->format_data, statestr, &ifd, &ofd);
  163. write(ifd, encbuf, len);
  164. close(ifd);
  165. if ((len = read(ofd, buf, BUFSIZ)) == -1) {
  166. lbobj->errnum = errno;
  167. close(ofd);
  168. return NULL;
  169. }
  170. if (close(ofd) == -1) {
  171. lbobj->errnum = errno;
  172. return NULL;
  173. }
  174. if (len == 0)
  175. unistr.len = 0;
  176. else if (sombok_decode_utf8(&unistr, 0, buf, len, 3) == NULL) {
  177. lbobj->errnum = errno;
  178. return NULL;
  179. }
  180. if ((ret = gcstring_newcopy(&unistr, lbobj)) == NULL) {
  181. lbobj->errnum = errno ? errno : ENOMEM;
  182. return NULL;
  183. }
  184. return ret;
  185. }
  186. int main(int argc, char **argv)
  187. {
  188. linebreak_t *lbobj;
  189. size_t i, j, len;
  190. gcstring_t **lines;
  191. char *outfile = NULL;
  192. FILE *ifp, *ofp;
  193. int errnum;
  194. unistr_t newline;
  195. lbobj = linebreak_new(NULL);
  196. lbobj->colmax = 76.0;
  197. lbobj->charmax = 998;
  198. newline.str = newline_str;
  199. newline.len = 1;
  200. linebreak_set_newline(lbobj, &newline);
  201. linebreak_set_format(lbobj, linebreak_format_SIMPLE, NULL);
  202. linebreak_set_sizing(lbobj, linebreak_sizing_UAX11, NULL);
  203. for (i = 1; i < argc; i++) {
  204. if (argv[i][0] == '-' && argv[i][1] == '-') {
  205. if (argv[i][2] == '\0') {
  206. i++;
  207. break;
  208. } else if (strcmp(argv[i] + 2, "colmax") == 0)
  209. lbobj->colmax = atof(argv[++i]);
  210. else if (strcmp(argv[i] + 2, "colmin") == 0)
  211. lbobj->colmin = atof(argv[++i]);
  212. else if (strcmp(argv[i] + 2, "charmax") == 0)
  213. lbobj->charmax = atol(argv[++i]);
  214. else if (strcmp(argv[i] + 2, "newline") == 0 && i + 1 < argc) {
  215. i++;
  216. if (parse_string(argv[i], strlen(argv[i])) == NULL) {
  217. errnum = errno;
  218. perror("parse_string");
  219. linebreak_destroy(lbobj);
  220. exit(errnum);
  221. }
  222. linebreak_set_newline(lbobj, &unistr);
  223. } else if (strcmp(argv[i] + 2, "complex-breaking") == 0)
  224. lbobj->options |= LINEBREAK_OPTION_COMPLEX_BREAKING;
  225. else if (strcmp(argv[i] + 2, "no-complex-breaking") == 0)
  226. lbobj->options &= ~LINEBREAK_OPTION_COMPLEX_BREAKING;
  227. else if (strcmp(argv[i] + 2, "context") == 0 && i + 1 < argc) {
  228. i++;
  229. if (strcasecmp(argv[i], "EASTASIAN") == 0)
  230. lbobj->options |= LINEBREAK_OPTION_EASTASIAN_CONTEXT;
  231. else
  232. lbobj->options &= ~LINEBREAK_OPTION_EASTASIAN_CONTEXT;
  233. } else if (strcmp(argv[i] + 2, "hangul-as-al") == 0)
  234. lbobj->options |= LINEBREAK_OPTION_HANGUL_AS_AL;
  235. else if (strcmp(argv[i] + 2, "no-hangul-as-al") == 0)
  236. lbobj->options &= ~LINEBREAK_OPTION_HANGUL_AS_AL;
  237. else if (strcmp(argv[i] + 2, "legacy-cm") == 0)
  238. lbobj->options |= LINEBREAK_OPTION_LEGACY_CM;
  239. else if (strcmp(argv[i] + 2, "no-legacy-cm") == 0)
  240. lbobj->options &= ~LINEBREAK_OPTION_LEGACY_CM;
  241. else if (strcmp(argv[i] + 2, "nonstarter") == 0 && i + 1 < argc) {
  242. i++;
  243. if (strcasecmp(argv[i], "LOOSE") == 0)
  244. lbobj->options |= LINEBREAK_OPTION_NONSTARTER_LOOSE;
  245. else
  246. lbobj->options &= ~LINEBREAK_OPTION_NONSTARTER_LOOSE;
  247. } else if (strcmp(argv[i] + 2, "virama-as-joiner") == 0)
  248. lbobj->options |= LINEBREAK_OPTION_VIRAMA_AS_JOINER;
  249. else if (strcmp(argv[i] + 2, "no-virama-as-joiner") == 0)
  250. lbobj->options &= ~LINEBREAK_OPTION_VIRAMA_AS_JOINER;
  251. else if (strcmp(argv[i] + 2, "format-func") == 0 &&
  252. i + 1 < argc) {
  253. i++;
  254. if (strcasecmp(argv[i], "NONE") == 0)
  255. linebreak_set_format(lbobj, NULL, NULL);
  256. else if (strcasecmp(argv[i], "SIMPLE") == 0)
  257. linebreak_set_format(lbobj, linebreak_format_SIMPLE,
  258. NULL);
  259. else if (strcasecmp(argv[i], "NEWLINE") == 0)
  260. linebreak_set_format(lbobj, linebreak_format_NEWLINE,
  261. NULL);
  262. else if (strcasecmp(argv[i], "TRIM") == 0)
  263. linebreak_set_format(lbobj, linebreak_format_TRIM,
  264. NULL);
  265. else
  266. linebreak_set_format(lbobj, format_SHELL, argv[i]);
  267. } else if (strcmp(argv[i] + 2, "prep-func") == 0 &&
  268. i + 1 < argc) {
  269. i++;
  270. if (strcasecmp(argv[i], "NONE") == 0)
  271. linebreak_add_prep(lbobj, NULL, NULL);
  272. else if (strcasecmp(argv[i], "BREAKURI") == 0)
  273. linebreak_add_prep(lbobj, linebreak_prep_URIBREAK, "");
  274. else if (strcasecmp(argv[i], "NONBREAKURI") == 0)
  275. linebreak_add_prep(lbobj, linebreak_prep_URIBREAK,
  276. NULL);
  277. else {
  278. fprintf(stderr, "unknown option value: %s\n", argv[i]);
  279. linebreak_destroy(lbobj);
  280. exit(1);
  281. }
  282. } else if (strcmp(argv[i] + 2, "sizing-func") == 0 &&
  283. i + 1 < argc) {
  284. i++;
  285. if (strcasecmp(argv[i], "NONE") == 0)
  286. linebreak_set_sizing(lbobj, NULL, NULL);
  287. else if (strcasecmp(argv[i], "UAX11") == 0)
  288. linebreak_set_sizing(lbobj, linebreak_sizing_UAX11,
  289. NULL);
  290. else {
  291. fprintf(stderr, "unknown option value: %s\n", argv[i]);
  292. linebreak_destroy(lbobj);
  293. exit(1);
  294. }
  295. } else if (strcmp(argv[i] + 2, "urgent-func") == 0 &&
  296. i + 1 < argc) {
  297. i++;
  298. if (strcasecmp(argv[i], "NONE") == 0)
  299. linebreak_set_urgent(lbobj, NULL, NULL);
  300. else if (strcasecmp(argv[i], "ABORT") == 0)
  301. linebreak_set_urgent(lbobj, linebreak_urgent_ABORT,
  302. NULL);
  303. else if (strcasecmp(argv[i], "FORCE") == 0)
  304. linebreak_set_urgent(lbobj, linebreak_urgent_FORCE,
  305. NULL);
  306. else {
  307. fprintf(stderr, "unknown option value: %s\n", argv[i]);
  308. linebreak_destroy(lbobj);
  309. exit(1);
  310. }
  311. } else if (strcmp(argv[i] + 2, "eawidth") == 0 && i + 1 < argc) {
  312. char *p, *q, *codes, *propname = "";
  313. size_t j;
  314. propval_t propval = PROP_UNKNOWN;
  315. unichar_t beg, end, c;
  316. i++;
  317. p = argv[i];
  318. while ((q = strchr(p, '=')) != NULL) {
  319. *q = '\0';
  320. codes = p;
  321. propname = q + 1;
  322. if ((q = strchr(propname, ',')) != NULL) {
  323. *q = '\0';
  324. p = q + 1;
  325. while (*p == ' ' || *p == '\t') p++;
  326. } else while (*p) p++;
  327. for (j = 0; linebreak_propvals_EA[j] != NULL; j++)
  328. if (strcasecmp(linebreak_propvals_EA[j], propname)
  329. == 0) {
  330. propval = j;
  331. break;
  332. }
  333. if ((q = strchr(codes, '-')) != NULL) {
  334. *q = '\0';
  335. beg = (unichar_t)strtoul(codes, NULL, 16);
  336. end = (unichar_t)strtoul(q + 1, NULL, 16);
  337. if (end < beg) {
  338. c = beg;
  339. beg = end;
  340. end = c;
  341. }
  342. } else
  343. beg = end = (unichar_t)strtoul(codes, NULL, 16);
  344. for (c = beg; c <= end; c++)
  345. linebreak_update_eawidth(lbobj, c, propval);
  346. }
  347. } else if (strcmp(argv[i] + 2, "lbclass") == 0 && i + 1 < argc) {
  348. char *p, *q, *codes, *propname = "";
  349. size_t j;
  350. propval_t propval = PROP_UNKNOWN;
  351. unichar_t beg, end, c;
  352. i++;
  353. p = argv[i];
  354. while ((q = strchr(p, '=')) != NULL) {
  355. *q = '\0';
  356. codes = p;
  357. propname = q + 1;
  358. if ((q = strchr(propname, ',')) != NULL) {
  359. *q = '\0';
  360. p = q + 1;
  361. while (*p == ' ' || *p == '\t') p++;
  362. } else while (*p) p++;
  363. for (j = 0; linebreak_propvals_LB[j] != NULL; j++)
  364. if (strcasecmp(linebreak_propvals_LB[j], propname)
  365. == 0) {
  366. propval = j;
  367. break;
  368. }
  369. if ((q = strchr(codes, '-')) != NULL) {
  370. *q = '\0';
  371. beg = (unichar_t)strtoul(codes, NULL, 16);
  372. end = (unichar_t)strtoul(q + 1, NULL, 16);
  373. if (end < beg) {
  374. c = beg;
  375. beg = end;
  376. end = c;
  377. }
  378. } else
  379. beg = end = (unichar_t)strtoul(codes, NULL, 16);
  380. for (c = beg; c <= end; c++)
  381. linebreak_update_lbclass(lbobj, c, propval);
  382. }
  383. } else if (strcmp(argv[i] + 2, "version") == 0) {
  384. printf(PACKAGE_NAME " " PACKAGE_VERSION "\n");
  385. linebreak_destroy(lbobj);
  386. exit(0);
  387. } else if (strcmp(argv[i] + 2, "sea-support") == 0) {
  388. printf("%s\n", linebreak_southeastasian_supported ?
  389. linebreak_southeastasian_supported : "none");
  390. linebreak_destroy(lbobj);
  391. exit(0);
  392. } else {
  393. fprintf(stderr, "unknown option: %s\n", argv[i]);
  394. linebreak_destroy(lbobj);
  395. exit(1);
  396. }
  397. } else if (argv[i][0] == '-' && argv[i][1] != '\0' &&
  398. argv[i][2] == '\0') {
  399. switch (argv[i][1]) {
  400. case 'o':
  401. if (i + 1 < argc) {
  402. i++;
  403. outfile = argv[i];
  404. break;
  405. }
  406. default:
  407. fprintf(stderr, "Unknown optoion %s\n", argv[i]);
  408. linebreak_destroy(lbobj);
  409. exit(1);
  410. }
  411. } else
  412. break;
  413. }
  414. if (outfile == NULL)
  415. ofp = stdout;
  416. else if ((ofp = fopen(outfile, "wb")) == NULL) {
  417. errnum = errno;
  418. perror(outfile);
  419. linebreak_destroy(lbobj);
  420. exit(errnum);
  421. }
  422. if (argc <= i) {
  423. len = fread(buf, sizeof(char), BUFLEN - 1, stdin);
  424. if (len <= 0 && errno) {
  425. errnum = errno;
  426. perror("(stdin)");
  427. linebreak_destroy(lbobj);
  428. exit(errnum);
  429. }
  430. if (len == 0)
  431. unistr.len = 0;
  432. else if (sombok_decode_utf8(&unistr, 0, buf, len, 3) == NULL) {
  433. errnum = errno;
  434. perror("decode_utf8");
  435. linebreak_destroy(lbobj);
  436. exit(errnum);
  437. }
  438. lines = linebreak_break(lbobj, &unistr);
  439. if (lbobj->errnum == LINEBREAK_ELONG) {
  440. fprintf(stderr, "Excessive line was found\n");
  441. free(unistr.str);
  442. linebreak_destroy(lbobj);
  443. exit(LINEBREAK_ELONG);
  444. } else if (lbobj->errnum) {
  445. errno = errnum = lbobj->errnum;
  446. perror("linebreak_break");
  447. linebreak_destroy(lbobj);
  448. exit(errnum);
  449. }
  450. for (j = 0; lines[j] != NULL; j++) {
  451. if (lines[j]->str != NULL) {
  452. if ((encbuf = sombok_encode_utf8(encbuf, &len, 0,
  453. (unistr_t *)(lines[j])))
  454. == NULL) {
  455. errnum = errno;
  456. perror("encode_utf8");
  457. linebreak_destroy(lbobj);
  458. exit(errnum);
  459. }
  460. fwrite(encbuf, sizeof(char), len, ofp);
  461. }
  462. gcstring_destroy(lines[j]);
  463. }
  464. free(lines);
  465. } else {
  466. for (; i < argc; i++) {
  467. if (argv[i][0] == '-' && argv[i][1] == '\0')
  468. ifp = stdin;
  469. else if ((ifp = fopen(argv[i], "rb")) == NULL) {
  470. errnum = errno;
  471. perror(argv[i]);
  472. linebreak_destroy(lbobj);
  473. exit(errnum);
  474. }
  475. len = fread(buf, sizeof(char), BUFLEN - 1, ifp);
  476. if (len <= 0 && errno) {
  477. errnum = errno;
  478. perror("fread");
  479. linebreak_destroy(lbobj);
  480. exit(errnum);
  481. }
  482. if (len == 0)
  483. unistr.len = 0;
  484. else if (sombok_decode_utf8(&unistr, 0, buf, len, 3) == NULL) {
  485. errnum = errno;
  486. perror("decode_utf8");
  487. linebreak_destroy(lbobj);
  488. exit(errnum);
  489. }
  490. fclose(ifp);
  491. lines = linebreak_break_partial(lbobj, &unistr);
  492. if (lbobj->errnum == LINEBREAK_ELONG) {
  493. fprintf(stderr, "Excessive line was found\n");
  494. free(unistr.str);
  495. linebreak_destroy(lbobj);
  496. exit(LINEBREAK_ELONG);
  497. } else if (lbobj->errnum) {
  498. errno = errnum = lbobj->errnum;
  499. perror("linebreak_break_partial");
  500. linebreak_destroy(lbobj);
  501. exit(errnum);
  502. }
  503. for (j = 0; lines[j] != NULL; j++) {
  504. if (lines[j]->str != NULL) {
  505. if ((encbuf = sombok_encode_utf8(encbuf, &len, 0,
  506. (unistr_t *)(lines[j])))
  507. == NULL) {
  508. errnum = errno;
  509. perror("encode_utf8");
  510. linebreak_destroy(lbobj);
  511. exit(errnum);
  512. }
  513. fwrite(encbuf, sizeof(char), len, ofp);
  514. }
  515. gcstring_destroy(lines[j]);
  516. }
  517. free(lines);
  518. }
  519. lines = linebreak_break_partial(lbobj, NULL);
  520. if (lbobj->errnum == LINEBREAK_ELONG) {
  521. fprintf(stderr, "Excessive line was found\n");
  522. free(unistr.str);
  523. linebreak_destroy(lbobj);
  524. exit(LINEBREAK_ELONG);
  525. } else if (lbobj->errnum) {
  526. errno = errnum = lbobj->errnum;
  527. perror("linebreak_break_partial");
  528. linebreak_destroy(lbobj);
  529. exit(errnum);
  530. }
  531. for (j = 0; lines[j] != NULL; j++) {
  532. if (lines[j]->str != NULL) {
  533. if ((encbuf = sombok_encode_utf8(encbuf, &len, 0,
  534. (unistr_t *)(lines[j])))
  535. == NULL) {
  536. errnum = errno;
  537. perror("encode_utf8");
  538. linebreak_destroy(lbobj);
  539. exit(errnum);
  540. }
  541. fwrite(encbuf, sizeof(char), len, ofp);
  542. }
  543. gcstring_destroy(lines[j]);
  544. }
  545. free(lines);
  546. }
  547. fclose(ofp);
  548. free(encbuf);
  549. free(unistr.str);
  550. linebreak_destroy(lbobj);
  551. exit(0);
  552. }