PageRenderTime 58ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/lang/c99/parser/lex.c

http://github.com/rflynn/assume
C | 582 lines | 539 code | 10 blank | 33 comment | 2 complexity | b2e260413adfe28a6a292c2af4a2b176 MD5 | raw file
  1. /* ex: set ts=2 et: */
  2. /*
  3. * lexer for C99
  4. */
  5. #define _POSIX_SOURCE /* get stdio.h/fileno() */
  6. #include <assert.h>
  7. #include <stdio.h>
  8. #include <string.h>
  9. #include <stdlib.h>
  10. #include <sys/types.h>
  11. #include <regex.h>
  12. #include <unistd.h>
  13. #include "lex.h"
  14. /* initializer for regex_t */
  15. #ifdef __GNUC__
  16. # define R {0,0,0,0,0,0,0,0,0,0,0,0,0,0}
  17. #else
  18. # define R {0}
  19. #endif
  20. /**
  21. * forall enum tok
  22. * list all pertinent info
  23. */
  24. static struct {
  25. enum tok id;
  26. const char *descr;
  27. regex_t rgx;
  28. const char *pattern;
  29. } Lexeme[T_CNT] = {
  30. /* token descr rgx regex-pattern */
  31. { T, "", R, "" },
  32. { T_SPACE, "ws", R, "^[ \f\r\t\v]+"},
  33. { T_NEWLINE, "nl", R, "^\n" },
  34. { T_COMMENT, "comment", R,
  35. /* TODO: multi-line weird preprocessor comment crap,
  36. * uncomment the tests from test-lex.sh!
  37. */
  38. "^("
  39. /* C-style comment, like this */
  40. "/\\*"
  41. "("
  42. "\\*[^/]" /* * then anything except / */
  43. "|[^*]" /* anything but \ and * */
  44. ")*"
  45. "\\*/"
  46. /* C++ style "//" */
  47. "|//[^\n]*\n|//[^\n]*"
  48. ")"
  49. },
  50. /* Preprocessor (Ref #1 S6.10 */
  51. { T_CPP, "cpp", R, "^#[ \f\r\t\v]*" },
  52. { T_CPP_IFDEF, "", R, "^#[ \f\r\t\v]*ifdef" },
  53. { T_CPP_IFNDEF, "", R, "^#[ \f\r\t\v]*ifndef" },
  54. { T_CPP_IF, "", R, "^#[ \f\r\t\v]*if" },
  55. { T_CPP_ELIF, "", R, "^#[ \f\r\t\v]*elif" },
  56. { T_CPP_ELSE, "", R, "^#[ \f\r\t\v]*else" },
  57. { T_CPP_ENDIF, "", R, "^#[ \f\r\t\v]*endif" },
  58. { T_CPP_INCLUDE, "", R, "^#[ \f\r\t\v]*include" },
  59. { T_CPP_DEFINE, "", R, "^#[ \f\r\t\v]*define" },
  60. { T_CPP_UNDEF, "", R, "^#[ \f\r\t\v]*undef" },
  61. { T_CPP_LINE, "", R, "^#[ \f\r\t\v]*line" },
  62. { T_CPP_ERROR, "", R, "^#[ \f\r\t\v]*error" },
  63. { T_CPP_PRAGMA, "", R, "^#[ \f\r\t\v]*pragma" },
  64. { T_CPP_LINECONT, "", R, "^\\\\\n" },
  65. /* constant values */
  66. { T_CONST_FLOAT,"float_lit", R,
  67. "^("
  68. /* decimal float */
  69. "("
  70. "[0-9](\\.[0-9]*)?" /* 0(.1)? */
  71. "|\\.[0-9]+" /* .0 */
  72. ")"
  73. "([eE][+-]?[0-9]+)?" /* exponent */
  74. /* hexadecimal float */
  75. "|0[xX][[:xdigit:]]+"
  76. "(\\.[[:xdigit:]]+)?"
  77. "[pP][+-]?[0-9]+" /* exponent */
  78. ")"
  79. "([fF]?[lL]?|[lL][fF])?" /* suffix */
  80. },
  81. { T_CONST_INT, "int_lit", R,
  82. "^("
  83. "0[xX][[:xdigit:]]+" /* hexadecimal */
  84. "|0[0-7]*" /* octal */
  85. "|[1-9][0-9]*" /* decimal */
  86. ")"
  87. "[uU]?[lL]?[lL]?" /* suffix */
  88. },
  89. { T_CONST_STR, "str_lit", R,
  90. "^"
  91. "L?" /* optional wide string */
  92. "\""
  93. "("
  94. "[^\\\"\n]+" /* normal char */
  95. "|\\\\[\\\\\"abfnrtv]" /* escaped special */
  96. "|\\\\x[[:xdigit:]]{1,2}" /* hexadecimal escape */
  97. "|\\\\0[0-7]{0,2}" /* octal escape */
  98. ")*"
  99. "\""
  100. },
  101. { T_CONST_CHAR, "char_lit", R,
  102. "^"
  103. "L?" /* optional wide char */
  104. "'"
  105. "("
  106. "[^\\']" /* any char except ' or \ */
  107. "|\\\\[\\\\'abfnrtv]" /* escaped special */
  108. "|\\\\x[[:xdigit:]]{1,2}" /* hexadecimal escape */
  109. "|\\\\0[0-7]{0,2}" /* octal escape */
  110. ")*"
  111. "'"
  112. },
  113. { T_IDENT, "ident", R,
  114. "^[_a-zA-Z]" /* must start with non-digit */
  115. "("
  116. "[_a-zA-Z0-9]+" /* continue with alphanum */
  117. "|\\\\u[[:xdigit:]]{4}" /* "small" universal names */
  118. "|\\\\U[[:xdigit:]]{8}" /* "big" ones. */
  119. ")*"
  120. },
  121. /* Keywords (Ref #1 S6.4.1.1) */
  122. { T_AUTO, "keyword", R, "^auto" },
  123. { T_BREAK, "keyword", R, "^break" },
  124. { T_CASE, "keyword", R, "^case" },
  125. { T_CHAR, "keyword", R, "^char" },
  126. { T_CONST, "keyword", R, "^const" },
  127. { T_CONTINUE, "keyword", R, "^continue" },
  128. { T_DEFAULT, "keyword", R, "^default" },
  129. { T_DO, "keyword", R, "^do" },
  130. { T_DOUBLE, "keyword", R, "^double" },
  131. { T_ELSE, "keyword", R, "^else" },
  132. { T_ENUM, "keyword", R, "^enum" },
  133. { T_EXTERN, "keyword", R, "^extern" },
  134. { T_FLOAT, "keyword", R, "^float" },
  135. { T_FOR, "keyword", R, "^for" },
  136. { T_GOTO, "keyword", R, "^goto" },
  137. { T_IF, "keyword", R, "^if" },
  138. { T_INLINE, "keyword", R, "^inline" },
  139. { T_INT, "keyword", R, "^int" },
  140. { T_LONG, "keyword", R, "^long" },
  141. { T_REGISTER, "keyword", R, "^register" },
  142. { T_RESTRICT, "keyword", R, "^restrict" },
  143. { T_RETURN, "keyword", R, "^return" },
  144. { T_SHORT, "keyword", R, "^short" },
  145. { T_SIGNED, "keyword", R, "^signed" },
  146. { T_SIZEOF, "keyword", R, "^sizeof" },
  147. { T_STATIC, "keyword", R, "^static" },
  148. { T_STRUCT, "keyword", R, "^struct" },
  149. { T_SWITCH, "keyword", R, "^switch" },
  150. { T_TYPEDEF, "keyword", R, "^typedef" },
  151. { T_UNION, "keyword", R, "^union" },
  152. { T_UNSIGNED, "keyword", R, "^unsigned" },
  153. { T_VOID, "keyword", R, "^void" },
  154. { T_VOLATILE, "keyword", R, "^volatile" },
  155. { T_WHILE, "keyword", R, "^while" },
  156. { T__BOOL, "keyword", R, "^_Bool" },
  157. { T__COMPLEX, "keyword", R, "^_Complex" },
  158. { T__IMAGINARY, "keyword", R, "^_Imaginary" },
  159. /* Punctuators (Ref #1 S6.4.6.1) */
  160. { T_OBRACE, "", R, "^\\[" },
  161. { T_CBRACE, "", R, "^\\]" },
  162. { T_OPAREN, "", R, "^\\(" },
  163. { T_CPAREN, "", R, "^\\)" },
  164. { T_OBRACK, "", R, "^\\{" },
  165. { T_CBRACK, "", R, "^\\}" },
  166. { T_DOT, "", R, "^\\." },
  167. { T_RARROW, "", R, "^->" },
  168. { T_PLUSPLUS, "", R, "^\\+\\+" },
  169. { T_DASHDASH, "", R, "^--" },
  170. { T_AMP, "", R, "^&" },
  171. { T_STAR, "", R, "^\\*" },
  172. { T_PLUS, "", R, "^\\+" },
  173. { T_DASH, "", R, "^-" },
  174. { T_SQUIG, "", R, "^~" },
  175. { T_BANG, "", R, "^!" },
  176. { T_SLASH, "", R, "^/" },
  177. { T_PCT, "", R, "^%" },
  178. { T_LTLT, "", R, "^<<" },
  179. { T_GTGT, "", R, "^>>" },
  180. { T_LT, "", R, "^<" },
  181. { T_GT, "", R, "^>" },
  182. { T_LTEQ, "", R, "^<=" },
  183. { T_GTEQ, "", R, "^>=" },
  184. { T_EQEQ, "", R, "^==" },
  185. { T_BANGEQ, "", R, "^!=" },
  186. { T_CARET, "", R, "^\\^" },
  187. { T_PIPE, "", R, "^\\|" },
  188. { T_AMPAMP, "", R, "^&&" },
  189. { T_PIPEPIPE, "", R, "^\\|\\|" },
  190. { T_QMARK, "", R, "^\\?" },
  191. { T_COLON, "", R, "^:" },
  192. { T_SEMIC, "", R, "^;" },
  193. { T_ELLIPSIS, "", R, "^\\.\\.\\." },
  194. { T_EQ, "", R, "^=" },
  195. { T_STAREQ, "", R, "^\\*=" },
  196. { T_SLASHEQ, "", R, "^/=" },
  197. { T_PCTEQ, "", R, "^%=" },
  198. { T_PLUSEQ, "", R, "^\\+=" },
  199. { T_DASHEQ, "", R, "^-=" },
  200. { T_LTLTEQ, "", R, "^<<=" },
  201. { T_GTGTEQ, "", R, "^>>=" },
  202. { T_AMPEQ, "", R, "^\\&=" },
  203. { T_CARETEQ, "", R, "^\\^=" },
  204. { T_PIPEEQ, "", R, "^\\|=" },
  205. { T_COMMA, "", R, "^," },
  206. { T_HASH, "", R, "^#" },
  207. { T_HASHISH, "", R, "^##" },
  208. { T_LTCOLON, "", R, "^<:" },
  209. { T_COLONGT, "", R, "^:>" },
  210. { T_LTPCT, "", R, "^<%" },
  211. { T_PCTGT, "", R, "^%>" },
  212. { T_PCTCOLON, "", R, "^#%" },
  213. { T_PCTCOLON2, "", R, "^#%#%" }
  214. };
  215. /**
  216. * store list of each possible Lexeme, given a starting character.
  217. * we need to find the longest match for each token, and thus try all
  218. * possibilities. this lets us avoid trying everything every time.
  219. */
  220. static struct {
  221. unsigned cnt;
  222. enum tok lexeme[17]; /* '#' for cpp, digraphs */
  223. } Match[256]; /* one for each u8 */
  224. static void match_add(const char c, enum tok t)
  225. {
  226. int i = c;
  227. unsigned cnt = Match[i].cnt;
  228. assert(cnt < sizeof Match[0].lexeme / sizeof Match[0].lexeme[0]);
  229. Match[i].lexeme[cnt] = t;
  230. Match[i].cnt++;
  231. }
  232. /**
  233. * for each complex regular expression (T_SPACE through T_INDENT),
  234. * add the token to each character that may begin that token's match.
  235. */
  236. static void match_build_regexes(void)
  237. {
  238. char c;
  239. match_add(' ', T_SPACE);
  240. match_add('\t', T_SPACE);
  241. match_add('\v', T_SPACE);
  242. match_add('\f', T_SPACE);
  243. match_add('\r', T_SPACE);
  244. match_add('\n', T_NEWLINE);
  245. match_add('/', T_COMMENT);
  246. /* CPP */
  247. match_add('#', T_CPP);
  248. match_add('#', T_CPP_IFDEF);
  249. match_add('#', T_CPP_IFNDEF);
  250. match_add('#', T_CPP_IF);
  251. match_add('#', T_CPP_ELIF);
  252. match_add('#', T_CPP_ELSE);
  253. match_add('#', T_CPP_ENDIF);
  254. match_add('#', T_CPP_INCLUDE);
  255. match_add('#', T_CPP_DEFINE);
  256. match_add('#', T_CPP_UNDEF);
  257. match_add('#', T_CPP_LINE);
  258. match_add('#', T_CPP_PRAGMA);
  259. match_add('#', T_CPP_IFDEF);
  260. match_add('\\', T_CPP_LINECONT);
  261. /* octal, decimal, hexadecimal integer constant */
  262. for (c = '0'; c <= '9'; c++)
  263. match_add(c, T_CONST_INT);
  264. /* floating constant */
  265. match_add('.', T_CONST_FLOAT);
  266. for (c = '0'; c <= '9'; c++)
  267. match_add(c, T_CONST_FLOAT);
  268. match_add('L', T_CONST_STR);
  269. match_add('"', T_CONST_STR);
  270. match_add('L', T_CONST_CHAR);
  271. match_add('\'', T_CONST_CHAR);
  272. /* identifiers */
  273. match_add('_', T_IDENT);
  274. for (c = 'a'; c <= 'z'; c++)
  275. match_add(c, T_IDENT);
  276. for (c = 'A'; c <= 'Z'; c++)
  277. match_add(c, T_IDENT);
  278. }
  279. /**
  280. * for all patterns that constitute a simple, non-branching token,
  281. * programatically add the first char
  282. */
  283. static void match_build_simple(void)
  284. {
  285. enum tok t;
  286. for (t = T_AUTO; t < T_CNT; t++) {
  287. const char *c = Lexeme[t].pattern;
  288. c += ('^' == *c); /* skip regex "start of line" anchor, all patterns have this */
  289. c += ('\\' == *c); /* get to the first REAL char */
  290. c += ('\\' == *c);
  291. assert('\\' != *c);
  292. match_add(*c, t);
  293. }
  294. }
  /**
   * report a regex failure on stderr and terminate the process.
   * @param descr   name of the operation that failed
   * @param pattern the regex source text involved
   * @param r       regex object handed to regerror()
   * @param errcode error code returned by the failing regex call
   * does not return: exits with status 1.
   */
  static void rgxdie(const char *descr, const char *pattern, const regex_t *r, int errcode)
  {
    char reason[64]; /* regerror() truncates longer messages to fit */
    (void)regerror(errcode, r, reason, sizeof reason);
    (void)fprintf(stderr, "%s: \"%s\" -> %s\n", descr, pattern, reason);
    exit(1);
  }
  305. static void match_compile(void)
  306. {
  307. size_t i;
  308. for (i = 0; i < sizeof Lexeme / sizeof Lexeme[0]; i++) {
  309. int r;
  310. r = regcomp(&Lexeme[i].rgx, Lexeme[i].pattern, REG_EXTENDED);
  311. if (r != 0)
  312. rgxdie("regcomp", Lexeme[i].pattern, &Lexeme[i].rgx, r);
  313. }
  314. }
  315. /**
  316. * match the single, first longest token (as defined in Lemexe) found in 'buf'
  317. * of not more than 'buflen' chars
  318. * @return 0=no match, 1=match recorded in 't'
  319. */
  320. static int match_one(const char *buf, size_t buflen, struct lexeme *t)
  321. {
  322. t->tok = T;
  323. if (buflen > 0) {
  324. unsigned i = 0;
  325. unsigned longl;
  326. int c = *buf;
  327. regoff_t longest = 0;
  328. while (i < Match[c].cnt) {
  329. regmatch_t m[4];
  330. int l, r;
  331. l = Match[c].lexeme[i];
  332. r = regexec(&Lexeme[l].rgx, buf, sizeof m / sizeof m[0], m, 0);
  333. if (r != REG_NOMATCH) {
  334. int len = m[0].rm_eo;
  335. if (0 != m[0].rm_so || 0 == len) {
  336. fprintf(stderr, "expect offset=0 len>0. instead, offset=%d len=%d",
  337. m[0].rm_so, len);
  338. exit(1);
  339. }
  340. assert((size_t)len <= buflen && "match ran off the end of buf(!)");
  341. if (len > longest) {
  342. longest = len;
  343. longl = l;
  344. }
  345. }
  346. i++;
  347. }
  348. if (longest > 0) {
  349. t->tok = longl;
  350. t->len = longest;
  351. t->str = buf;
  352. }
  353. }
  354. return t->tok != T;
  355. }
  356. void lexeme_show(const struct lexeme *t)
  357. {
  358. if (T_NEWLINE == t->tok) {
  359. fputs("\\n\n", stdout);
  360. } else {
  361. printf("%s(%.*s)",
  362. Lexeme[t->tok].descr, (unsigned)t->len, t->str);
  363. }
  364. }
  365. int lexeme_cmp(const struct lexeme *a, const struct lexeme *b)
  366. {
  367. return
  368. a->len == b->len &&
  369. a->str[0] == b->str[0] &&
  370. 0 == memcmp(a->str, b->str, a->len);
  371. }
  372. void lexemelist_show(const struct lexeme *t)
  373. {
  374. while (t) {
  375. lexeme_show(t);
  376. t = t->next;
  377. }
  378. }
  379. static void lexeme_init(struct lexeme *t)
  380. {
  381. t->tok = T;
  382. t->len = 0U;
  383. t->str = NULL;
  384. t->loc.file = NULL;
  385. t->loc.line = 1UL;
  386. t->loc.off.total = 0UL;
  387. t->loc.off.line = 0UL;
  388. }
  389. static unsigned lexeme_newline_cnt(const struct lexeme *t)
  390. {
  391. unsigned cnt = 0;
  392. switch (t->tok) {
  393. case T_CPP_LINECONT:
  394. case T_NEWLINE:
  395. /* always exactly one */
  396. cnt = 1;
  397. break;
  398. case T_COMMENT:
  399. case T_CPP:
  400. {
  401. size_t i = 0;
  402. while (i < t->len) {
  403. if ('\n' == t->str[i])
  404. cnt++;
  405. i++;
  406. }
  407. }
  408. break;
  409. default:
  410. break;
  411. }
  412. return cnt;
  413. }
  414. /**
  415. * token contains at least one newline.
  416. * we want to calculate the current line offset...
  417. * return the number of character at the end that are not newline.
  418. */
  419. static unsigned lexeme_lastlinelen(const struct lexeme *t)
  420. {
  421. unsigned cnt = 0;
  422. switch (t->tok) {
  423. /* NOTE: T_NEWLINE does not count towards line offset */
  424. case T_COMMENT:
  425. case T_CPP:
  426. {
  427. size_t i = t->len;
  428. while (i--)
  429. if ('\n' == t->str[i])
  430. break;
  431. cnt = t->len - i;
  432. }
  433. break;
  434. default:
  435. break;
  436. }
  437. return cnt;
  438. }
  439. /**
  440. * calculate the contents of curr->loc based on prev->loc
  441. */
  442. static void lexeme_calc_loc(const struct lexeme *prev, struct lexeme *curr)
  443. {
  444. unsigned nlcnt = lexeme_newline_cnt(prev);
  445. curr->loc = prev->loc; /* copy whole thing */
  446. curr->loc.line += nlcnt;
  447. curr->loc.off.total += prev->len;
  448. if (nlcnt) {
  449. /* prev token contained at least one newline;
  450. * calculate our current offset on current line */
  451. curr->loc.off.line = lexeme_lastlinelen(prev);
  452. } else {
  453. /* we're still on the same line as previous token */
  454. curr->loc.off.line += prev->len;
  455. }
  456. assert(curr->loc.line >= prev->loc.line);
  457. }
  458. /**
  459. * @return number of bytes of buf consumed; not more than buflen
  460. */
  461. size_t lex(const char *buf, size_t buflen, struct lexeme **head)
  462. {
  463. struct lexeme scratch; /* always passed to match_one */
  464. struct lexeme *tail = NULL; /* previous match, used to connect list */
  465. const char *curr = buf;
  466. size_t left = buflen;
  467. *head = NULL;
  468. lexeme_init(&scratch);
  469. /* initial match */
  470. if (match_one(curr, left, &scratch)) {
  471. struct lexeme *t = malloc(sizeof *t);
  472. if (t) {
  473. *t = scratch;
  474. *head = t;
  475. tail = t;
  476. left -= scratch.len;
  477. curr += scratch.len;
  478. /* subsequent matches */
  479. while (match_one(curr, left, &scratch)) {
  480. t = malloc(sizeof *t);
  481. if (t) {
  482. *t = scratch;
  483. lexeme_calc_loc(tail, t);
  484. tail->next = t;
  485. tail = t;
  486. }
  487. left -= scratch.len;
  488. curr += scratch.len;
  489. }
  490. }
  491. tail->next = NULL;
  492. }
  493. return buflen - left;
  494. }
  /**
   * utility function
   * read the remaining contents of FILE into a freshly malloc()ed buffer.
   *
   * the buffer starts at 32 KiB and doubles whenever it is full. reading
   * stops at end of file, on a read error, or when the buffer cannot grow;
   * in the last two cases the data read so far is returned.
   *
   * @param f   stream to read from
   * @param len out: number of bytes stored, not counting the terminator
   * @return NUL-terminated buffer owned by the caller (release with free()),
   *         or NULL if the initial allocation failed
   */
  static char * file2buf(FILE *f, size_t *len)
  {
    size_t buflen = 32 * 1024;
    size_t off = 0;
    char *buf = malloc(buflen);
    *len = 0;
    if (!buf)
      return NULL;
    for (;;) {
      size_t rd;
      /* always keep one byte free for the terminating NUL */
      if (buflen - off <= 1) {
        char *tmp;
        if (buflen > (size_t)-1 / 2)
          break; /* doubling would overflow size_t */
        tmp = realloc(buf, buflen * 2);
        if (!tmp)
          break; /* 'buf' is still valid: return what we have */
        buf = tmp; /* realloc may have moved the block */
        buflen *= 2;
      }
      /* fread, not read(fileno(f)): honours data already buffered by stdio */
      rd = fread(buf + off, 1, buflen - off - 1, f);
      if (0 == rd)
        break;
      off += rd;
    }
    buf[off] = '\0'; /* string-ize, ugh */
    *len = off;
    return buf;
  }
  /**
   * read all of 'f' and lex it.
   *
   * the lexemes returned through 'head' point into the buffer that holds the
   * file contents, so that buffer is deliberately kept allocated while a list
   * exists. it is released here only when no lexeme was produced.
   *
   * @param f    stream to read and lex
   * @param head out: first lexeme of the resulting list, NULL if there is none
   * @return number of bytes of the file contents that were lexed
   */
  size_t lex_file(FILE *f, struct lexeme **head)
  {
    size_t buflen = 0;
    char *buf = file2buf(f, &buflen);
    size_t r = 0;
    *head = NULL; /* defined result even for empty or unreadable input */
    if (buf && buflen)
      r = lex(buf, buflen, head);
    if (NULL == *head)
      free(buf); /* nothing references the buffer */
    return r;
  }
  /**
   * one-time setup of the lexer: fill the per-byte candidate lists and
   * compile every token pattern. must be called before lex()/lex_file().
   *
   * calling it again is a no-op; repeating the setup would register every
   * candidate a second time and overflow the fixed-size candidate lists.
   *
   * @return always 1
   */
  int lex_init(void)
  {
    static int initialized = 0;
    if (!initialized) {
      match_build_simple();
      match_build_regexes();
      match_compile();
      initialized = 1;
    }
    return 1;
  }
  #ifdef TEST
  /**
   * test driver, built only when TEST is defined:
   * lex standard input and print the resulting lexemes to standard output.
   */
  int main(void)
  {
    struct lexeme *tokens = NULL;
    lex_init();
    (void)lex_file(stdin, &tokens);
    lexemelist_show(tokens);
    return 0;
  }
  #endif