/pdf/pdf_cmap_parse.c

https://github.com/Lafriks/mupdf · C · 490 lines · 401 code · 83 blank · 6 comment · 137 complexity · 931d9c4ff9729362d0900ea1cd339af6 MD5 · raw file

  1. #include "fitz.h"
  2. #include "mupdf.h"
  3. /*
  4. * CMap parser
  5. */
  6. enum
  7. {
  8. TOK_USECMAP = PDF_NUM_TOKENS,
  9. TOK_BEGIN_CODESPACE_RANGE,
  10. TOK_END_CODESPACE_RANGE,
  11. TOK_BEGIN_BF_CHAR,
  12. TOK_END_BF_CHAR,
  13. TOK_BEGIN_BF_RANGE,
  14. TOK_END_BF_RANGE,
  15. TOK_BEGIN_CID_CHAR,
  16. TOK_END_CID_CHAR,
  17. TOK_BEGIN_CID_RANGE,
  18. TOK_END_CID_RANGE,
  19. TOK_END_CMAP
  20. };
  21. static int
  22. pdf_cmap_token_from_keyword(char *key)
  23. {
  24. if (!strcmp(key, "usecmap")) return TOK_USECMAP;
  25. if (!strcmp(key, "begincodespacerange")) return TOK_BEGIN_CODESPACE_RANGE;
  26. if (!strcmp(key, "endcodespacerange")) return TOK_END_CODESPACE_RANGE;
  27. if (!strcmp(key, "beginbfchar")) return TOK_BEGIN_BF_CHAR;
  28. if (!strcmp(key, "endbfchar")) return TOK_END_BF_CHAR;
  29. if (!strcmp(key, "beginbfrange")) return TOK_BEGIN_BF_RANGE;
  30. if (!strcmp(key, "endbfrange")) return TOK_END_BF_RANGE;
  31. if (!strcmp(key, "begincidchar")) return TOK_BEGIN_CID_CHAR;
  32. if (!strcmp(key, "endcidchar")) return TOK_END_CID_CHAR;
  33. if (!strcmp(key, "begincidrange")) return TOK_BEGIN_CID_RANGE;
  34. if (!strcmp(key, "endcidrange")) return TOK_END_CID_RANGE;
  35. if (!strcmp(key, "endcmap")) return TOK_END_CMAP;
  36. return PDF_TOK_KEYWORD;
  37. }
  /*
   * Combine the first len bytes of buf into one integer, most significant
   * byte first (e.g. the bytes 0x01 0x02 give 0x0102).
   *
   * buf: bytes to combine; need not be NUL-terminated.
   * len: number of bytes to read; zero or negative yields 0.
   *
   * The value is accumulated in an unsigned int so that shifting a byte
   * into the top bits is well defined; signed left-shift overflow is
   * undefined behaviour. When more bytes are supplied than fit in an
   * unsigned int, the oldest (most significant) bytes are shifted out.
   */
  static int
  pdf_code_from_string(char *buf, int len)
  {
      const unsigned char *p = (const unsigned char *)buf;
      unsigned int code = 0;

      /* "> 0" rather than "!= 0": a negative length must not read at all. */
      while (len-- > 0)
          code = (code << 8) | *p++;

      return (int)code;
  }
  46. static fz_error
  47. pdf_lex_cmap(int *tok, fz_stream *file, char *buf, int n, int *sl)
  48. {
  49. fz_error error;
  50. error = pdf_lex(tok, file, buf, n, sl);
  51. if (error)
  52. return fz_error_note(file->ctx, error, "cannot parse cmap token");
  53. if (*tok == PDF_TOK_KEYWORD)
  54. *tok = pdf_cmap_token_from_keyword(buf);
  55. return fz_okay;
  56. }
  57. static fz_error
  58. pdf_parse_cmap_name(pdf_cmap *cmap, fz_stream *file)
  59. {
  60. fz_error error;
  61. char buf[256];
  62. int tok;
  63. int len;
  64. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  65. if (error)
  66. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  67. if (tok == PDF_TOK_NAME)
  68. fz_strlcpy(cmap->cmap_name, buf, sizeof(cmap->cmap_name));
  69. else
  70. fz_warn(file->ctx, "expected name after CMapName in cmap");
  71. return fz_okay;
  72. }
  73. static fz_error
  74. pdf_parse_wmode(pdf_cmap *cmap, fz_stream *file)
  75. {
  76. fz_error error;
  77. char buf[256];
  78. int tok;
  79. int len;
  80. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  81. if (error)
  82. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  83. if (tok == PDF_TOK_INT)
  84. pdf_set_wmode(cmap, atoi(buf));
  85. else
  86. fz_warn(file->ctx, "expected integer after WMode in cmap");
  87. return fz_okay;
  88. }
  89. static fz_error
  90. pdf_parse_codespace_range(pdf_cmap *cmap, fz_stream *file)
  91. {
  92. fz_error error;
  93. char buf[256];
  94. int tok;
  95. int len;
  96. int lo, hi;
  97. while (1)
  98. {
  99. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  100. if (error)
  101. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  102. if (tok == TOK_END_CODESPACE_RANGE)
  103. return fz_okay;
  104. else if (tok == PDF_TOK_STRING)
  105. {
  106. lo = pdf_code_from_string(buf, len);
  107. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  108. if (error)
  109. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  110. if (tok == PDF_TOK_STRING)
  111. {
  112. hi = pdf_code_from_string(buf, len);
  113. pdf_add_codespace(file->ctx, cmap, lo, hi, len);
  114. }
  115. else break;
  116. }
  117. else break;
  118. }
  119. return fz_error_make(file->ctx, "expected string or endcodespacerange");
  120. }
  121. static fz_error
  122. pdf_parse_cid_range(pdf_cmap *cmap, fz_stream *file)
  123. {
  124. fz_error error;
  125. char buf[256];
  126. int tok;
  127. int len;
  128. int lo, hi, dst;
  129. while (1)
  130. {
  131. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  132. if (error)
  133. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  134. if (tok == TOK_END_CID_RANGE)
  135. return fz_okay;
  136. else if (tok != PDF_TOK_STRING)
  137. return fz_error_make(file->ctx, "expected string or endcidrange");
  138. lo = pdf_code_from_string(buf, len);
  139. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  140. if (error)
  141. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  142. if (tok != PDF_TOK_STRING)
  143. return fz_error_make(file->ctx, "expected string");
  144. hi = pdf_code_from_string(buf, len);
  145. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  146. if (error)
  147. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  148. if (tok != PDF_TOK_INT)
  149. return fz_error_make(file->ctx, "expected integer");
  150. dst = atoi(buf);
  151. pdf_map_range_to_range(file->ctx, cmap, lo, hi, dst);
  152. }
  153. }
  154. static fz_error
  155. pdf_parse_cid_char(pdf_cmap *cmap, fz_stream *file)
  156. {
  157. fz_error error;
  158. char buf[256];
  159. int tok;
  160. int len;
  161. int src, dst;
  162. while (1)
  163. {
  164. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  165. if (error)
  166. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  167. if (tok == TOK_END_CID_CHAR)
  168. return fz_okay;
  169. else if (tok != PDF_TOK_STRING)
  170. return fz_error_make(file->ctx, "expected string or endcidchar");
  171. src = pdf_code_from_string(buf, len);
  172. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  173. if (error)
  174. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  175. if (tok != PDF_TOK_INT)
  176. return fz_error_make(file->ctx, "expected integer");
  177. dst = atoi(buf);
  178. pdf_map_range_to_range(file->ctx, cmap, src, src, dst);
  179. }
  180. }
  181. static fz_error
  182. pdf_parse_bf_range_array(pdf_cmap *cmap, fz_stream *file, int lo, int hi)
  183. {
  184. fz_error error;
  185. char buf[256];
  186. int tok;
  187. int len;
  188. int dst[256];
  189. int i;
  190. while (1)
  191. {
  192. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  193. if (error)
  194. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  195. if (tok == PDF_TOK_CLOSE_ARRAY)
  196. return fz_okay;
  197. /* Note: does not handle [ /Name /Name ... ] */
  198. else if (tok != PDF_TOK_STRING)
  199. return fz_error_make(file->ctx, "expected string or ]");
  200. if (len / 2)
  201. {
  202. for (i = 0; i < len / 2; i++)
  203. dst[i] = pdf_code_from_string(buf + i * 2, 2);
  204. pdf_map_one_to_many(file->ctx, cmap, lo, dst, len / 2);
  205. }
  206. lo ++;
  207. }
  208. }
  209. static fz_error
  210. pdf_parse_bf_range(pdf_cmap *cmap, fz_stream *file)
  211. {
  212. fz_error error;
  213. char buf[256];
  214. int tok;
  215. int len;
  216. int lo, hi, dst;
  217. while (1)
  218. {
  219. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  220. if (error)
  221. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  222. if (tok == TOK_END_BF_RANGE)
  223. return fz_okay;
  224. else if (tok != PDF_TOK_STRING)
  225. return fz_error_make(file->ctx, "expected string or endbfrange");
  226. lo = pdf_code_from_string(buf, len);
  227. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  228. if (error)
  229. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  230. if (tok != PDF_TOK_STRING)
  231. return fz_error_make(file->ctx, "expected string");
  232. hi = pdf_code_from_string(buf, len);
  233. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  234. if (error)
  235. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  236. if (tok == PDF_TOK_STRING)
  237. {
  238. if (len == 2)
  239. {
  240. dst = pdf_code_from_string(buf, len);
  241. pdf_map_range_to_range(file->ctx, cmap, lo, hi, dst);
  242. }
  243. else
  244. {
  245. int dststr[256];
  246. int i;
  247. if (len / 2)
  248. {
  249. for (i = 0; i < len / 2; i++)
  250. dststr[i] = pdf_code_from_string(buf + i * 2, 2);
  251. while (lo <= hi)
  252. {
  253. dststr[i-1] ++;
  254. pdf_map_one_to_many(file->ctx, cmap, lo, dststr, i);
  255. lo ++;
  256. }
  257. }
  258. }
  259. }
  260. else if (tok == PDF_TOK_OPEN_ARRAY)
  261. {
  262. error = pdf_parse_bf_range_array(cmap, file, lo, hi);
  263. if (error)
  264. return fz_error_note(file->ctx, error, "cannot map bfrange");
  265. }
  266. else
  267. {
  268. return fz_error_make(file->ctx, "expected string or array or endbfrange");
  269. }
  270. }
  271. }
  272. static fz_error
  273. pdf_parse_bf_char(pdf_cmap *cmap, fz_stream *file)
  274. {
  275. fz_error error;
  276. char buf[256];
  277. int tok;
  278. int len;
  279. int dst[256];
  280. int src;
  281. int i;
  282. while (1)
  283. {
  284. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  285. if (error)
  286. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  287. if (tok == TOK_END_BF_CHAR)
  288. return fz_okay;
  289. else if (tok != PDF_TOK_STRING)
  290. return fz_error_make(file->ctx, "expected string or endbfchar");
  291. src = pdf_code_from_string(buf, len);
  292. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  293. if (error)
  294. return fz_error_note(file->ctx, error, "syntaxerror in cmap");
  295. /* Note: does not handle /dstName */
  296. if (tok != PDF_TOK_STRING)
  297. return fz_error_make(file->ctx, "expected string");
  298. if (len / 2)
  299. {
  300. for (i = 0; i < len / 2; i++)
  301. dst[i] = pdf_code_from_string(buf + i * 2, 2);
  302. pdf_map_one_to_many(file->ctx, cmap, src, dst, i);
  303. }
  304. }
  305. }
  306. fz_error
  307. pdf_parse_cmap(pdf_cmap **cmapp, fz_stream *file)
  308. {
  309. fz_error error;
  310. pdf_cmap *cmap;
  311. char key[64];
  312. char buf[256];
  313. int tok;
  314. int len;
  315. cmap = pdf_new_cmap(file->ctx);
  316. strcpy(key, ".notdef");
  317. while (1)
  318. {
  319. error = pdf_lex_cmap(&tok, file, buf, sizeof buf, &len);
  320. if (error)
  321. {
  322. error = fz_error_note(file->ctx, error, "syntaxerror in cmap");
  323. goto cleanup;
  324. }
  325. if (tok == PDF_TOK_EOF || tok == TOK_END_CMAP)
  326. break;
  327. else if (tok == PDF_TOK_NAME)
  328. {
  329. if (!strcmp(buf, "CMapName"))
  330. {
  331. error = pdf_parse_cmap_name(cmap, file);
  332. if (error)
  333. {
  334. error = fz_error_note(file->ctx, error, "syntaxerror in cmap after CMapName");
  335. goto cleanup;
  336. }
  337. }
  338. else if (!strcmp(buf, "WMode"))
  339. {
  340. error = pdf_parse_wmode(cmap, file);
  341. if (error)
  342. {
  343. error = fz_error_note(file->ctx, error, "syntaxerror in cmap after WMode");
  344. goto cleanup;
  345. }
  346. }
  347. else
  348. fz_strlcpy(key, buf, sizeof key);
  349. }
  350. else if (tok == TOK_USECMAP)
  351. {
  352. fz_strlcpy(cmap->usecmap_name, key, sizeof(cmap->usecmap_name));
  353. }
  354. else if (tok == TOK_BEGIN_CODESPACE_RANGE)
  355. {
  356. error = pdf_parse_codespace_range(cmap, file);
  357. if (error)
  358. {
  359. error = fz_error_note(file->ctx, error, "syntaxerror in cmap codespacerange");
  360. goto cleanup;
  361. }
  362. }
  363. else if (tok == TOK_BEGIN_BF_CHAR)
  364. {
  365. error = pdf_parse_bf_char(cmap, file);
  366. if (error)
  367. {
  368. error = fz_error_note(file->ctx, error, "syntaxerror in cmap bfchar");
  369. goto cleanup;
  370. }
  371. }
  372. else if (tok == TOK_BEGIN_CID_CHAR)
  373. {
  374. error = pdf_parse_cid_char(cmap, file);
  375. if (error)
  376. {
  377. error = fz_error_note(file->ctx, error, "syntaxerror in cmap cidchar");
  378. goto cleanup;
  379. }
  380. }
  381. else if (tok == TOK_BEGIN_BF_RANGE)
  382. {
  383. error = pdf_parse_bf_range(cmap, file);
  384. if (error)
  385. {
  386. error = fz_error_note(file->ctx, error, "syntaxerror in cmap bfrange");
  387. goto cleanup;
  388. }
  389. }
  390. else if (tok == TOK_BEGIN_CID_RANGE)
  391. {
  392. error = pdf_parse_cid_range(cmap, file);
  393. if (error)
  394. {
  395. error = fz_error_note(file->ctx, error, "syntaxerror in cmap cidrange");
  396. goto cleanup;
  397. }
  398. }
  399. /* ignore everything else */
  400. }
  401. pdf_sort_cmap(file->ctx, cmap);
  402. *cmapp = cmap;
  403. return fz_okay;
  404. cleanup:
  405. pdf_drop_cmap(file->ctx, cmap);
  406. return error; /* already rethrown */
  407. }