/Parser/pgen.c

http://unladen-swallow.googlecode.com/ · C · 708 lines · 565 code · 86 blank · 57 comment · 97 complexity · 6d3d4f50b524e71cd7ad318e6b387db3 MD5 · raw file

  1. /* Parser generator */
  2. /* For a description, see the comments at end of this file */
  3. #include "Python.h"
  4. #include "pgenheaders.h"
  5. #include "token.h"
  6. #include "node.h"
  7. #include "grammar.h"
  8. #include "metagrammar.h"
  9. #include "pgen.h"
  10. extern int Py_DebugFlag;
  11. extern int Py_IgnoreEnvironmentFlag; /* needed by Py_GETENV */
  12. /* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
  13. typedef struct _nfaarc {
  14. int ar_label;
  15. int ar_arrow;
  16. } nfaarc;
  17. typedef struct _nfastate {
  18. int st_narcs;
  19. nfaarc *st_arc;
  20. } nfastate;
  21. typedef struct _nfa {
  22. int nf_type;
  23. char *nf_name;
  24. int nf_nstates;
  25. nfastate *nf_state;
  26. int nf_start, nf_finish;
  27. } nfa;
  28. /* Forward */
  29. static void compile_rhs(labellist *ll,
  30. nfa *nf, node *n, int *pa, int *pb);
  31. static void compile_alt(labellist *ll,
  32. nfa *nf, node *n, int *pa, int *pb);
  33. static void compile_item(labellist *ll,
  34. nfa *nf, node *n, int *pa, int *pb);
  35. static void compile_atom(labellist *ll,
  36. nfa *nf, node *n, int *pa, int *pb);
  37. static int
  38. addnfastate(nfa *nf)
  39. {
  40. nfastate *st;
  41. nf->nf_state = (nfastate *)PyObject_REALLOC(nf->nf_state,
  42. sizeof(nfastate) * (nf->nf_nstates + 1));
  43. if (nf->nf_state == NULL)
  44. Py_FatalError("out of mem");
  45. st = &nf->nf_state[nf->nf_nstates++];
  46. st->st_narcs = 0;
  47. st->st_arc = NULL;
  48. return st - nf->nf_state;
  49. }
  50. static void
  51. addnfaarc(nfa *nf, int from, int to, int lbl)
  52. {
  53. nfastate *st;
  54. nfaarc *ar;
  55. st = &nf->nf_state[from];
  56. st->st_arc = (nfaarc *)PyObject_REALLOC(st->st_arc,
  57. sizeof(nfaarc) * (st->st_narcs + 1));
  58. if (st->st_arc == NULL)
  59. Py_FatalError("out of mem");
  60. ar = &st->st_arc[st->st_narcs++];
  61. ar->ar_label = lbl;
  62. ar->ar_arrow = to;
  63. }
  64. static nfa *
  65. newnfa(char *name)
  66. {
  67. nfa *nf;
  68. static int type = NT_OFFSET; /* All types will be disjunct */
  69. nf = (nfa *)PyObject_MALLOC(sizeof(nfa));
  70. if (nf == NULL)
  71. Py_FatalError("no mem for new nfa");
  72. nf->nf_type = type++;
  73. nf->nf_name = name; /* XXX strdup(name) ??? */
  74. nf->nf_nstates = 0;
  75. nf->nf_state = NULL;
  76. nf->nf_start = nf->nf_finish = -1;
  77. return nf;
  78. }
  79. typedef struct _nfagrammar {
  80. int gr_nnfas;
  81. nfa **gr_nfa;
  82. labellist gr_ll;
  83. } nfagrammar;
  84. /* Forward */
  85. static void compile_rule(nfagrammar *gr, node *n);
  86. static nfagrammar *
  87. newnfagrammar(void)
  88. {
  89. nfagrammar *gr;
  90. gr = (nfagrammar *)PyObject_MALLOC(sizeof(nfagrammar));
  91. if (gr == NULL)
  92. Py_FatalError("no mem for new nfa grammar");
  93. gr->gr_nnfas = 0;
  94. gr->gr_nfa = NULL;
  95. gr->gr_ll.ll_nlabels = 0;
  96. gr->gr_ll.ll_label = NULL;
  97. addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
  98. return gr;
  99. }
  100. static nfa *
  101. addnfa(nfagrammar *gr, char *name)
  102. {
  103. nfa *nf;
  104. nf = newnfa(name);
  105. gr->gr_nfa = (nfa **)PyObject_REALLOC(gr->gr_nfa,
  106. sizeof(nfa*) * (gr->gr_nnfas + 1));
  107. if (gr->gr_nfa == NULL)
  108. Py_FatalError("out of mem");
  109. gr->gr_nfa[gr->gr_nnfas++] = nf;
  110. addlabel(&gr->gr_ll, NAME, nf->nf_name);
  111. return nf;
  112. }
  113. #ifdef Py_DEBUG
  114. static char REQNFMT[] = "metacompile: less than %d children\n";
  115. #define REQN(i, count) \
  116. if (i < count) { \
  117. fprintf(stderr, REQNFMT, count); \
  118. Py_FatalError("REQN"); \
  119. } else
  120. #else
  121. #define REQN(i, count) /* empty */
  122. #endif
  123. static nfagrammar *
  124. metacompile(node *n)
  125. {
  126. nfagrammar *gr;
  127. int i;
  128. if (Py_DebugFlag)
  129. printf("Compiling (meta-) parse tree into NFA grammar\n");
  130. gr = newnfagrammar();
  131. REQ(n, MSTART);
  132. i = n->n_nchildren - 1; /* Last child is ENDMARKER */
  133. n = n->n_child;
  134. for (; --i >= 0; n++) {
  135. if (n->n_type != NEWLINE)
  136. compile_rule(gr, n);
  137. }
  138. return gr;
  139. }
  140. static void
  141. compile_rule(nfagrammar *gr, node *n)
  142. {
  143. nfa *nf;
  144. REQ(n, RULE);
  145. REQN(n->n_nchildren, 4);
  146. n = n->n_child;
  147. REQ(n, NAME);
  148. nf = addnfa(gr, n->n_str);
  149. n++;
  150. REQ(n, COLON);
  151. n++;
  152. REQ(n, RHS);
  153. compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
  154. n++;
  155. REQ(n, NEWLINE);
  156. }
  157. static void
  158. compile_rhs(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
  159. {
  160. int i;
  161. int a, b;
  162. REQ(n, RHS);
  163. i = n->n_nchildren;
  164. REQN(i, 1);
  165. n = n->n_child;
  166. REQ(n, ALT);
  167. compile_alt(ll, nf, n, pa, pb);
  168. if (--i <= 0)
  169. return;
  170. n++;
  171. a = *pa;
  172. b = *pb;
  173. *pa = addnfastate(nf);
  174. *pb = addnfastate(nf);
  175. addnfaarc(nf, *pa, a, EMPTY);
  176. addnfaarc(nf, b, *pb, EMPTY);
  177. for (; --i >= 0; n++) {
  178. REQ(n, VBAR);
  179. REQN(i, 1);
  180. --i;
  181. n++;
  182. REQ(n, ALT);
  183. compile_alt(ll, nf, n, &a, &b);
  184. addnfaarc(nf, *pa, a, EMPTY);
  185. addnfaarc(nf, b, *pb, EMPTY);
  186. }
  187. }
  188. static void
  189. compile_alt(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
  190. {
  191. int i;
  192. int a, b;
  193. REQ(n, ALT);
  194. i = n->n_nchildren;
  195. REQN(i, 1);
  196. n = n->n_child;
  197. REQ(n, ITEM);
  198. compile_item(ll, nf, n, pa, pb);
  199. --i;
  200. n++;
  201. for (; --i >= 0; n++) {
  202. REQ(n, ITEM);
  203. compile_item(ll, nf, n, &a, &b);
  204. addnfaarc(nf, *pb, a, EMPTY);
  205. *pb = b;
  206. }
  207. }
  208. static void
  209. compile_item(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
  210. {
  211. int i;
  212. int a, b;
  213. REQ(n, ITEM);
  214. i = n->n_nchildren;
  215. REQN(i, 1);
  216. n = n->n_child;
  217. if (n->n_type == LSQB) {
  218. REQN(i, 3);
  219. n++;
  220. REQ(n, RHS);
  221. *pa = addnfastate(nf);
  222. *pb = addnfastate(nf);
  223. addnfaarc(nf, *pa, *pb, EMPTY);
  224. compile_rhs(ll, nf, n, &a, &b);
  225. addnfaarc(nf, *pa, a, EMPTY);
  226. addnfaarc(nf, b, *pb, EMPTY);
  227. REQN(i, 1);
  228. n++;
  229. REQ(n, RSQB);
  230. }
  231. else {
  232. compile_atom(ll, nf, n, pa, pb);
  233. if (--i <= 0)
  234. return;
  235. n++;
  236. addnfaarc(nf, *pb, *pa, EMPTY);
  237. if (n->n_type == STAR)
  238. *pb = *pa;
  239. else
  240. REQ(n, PLUS);
  241. }
  242. }
  243. static void
  244. compile_atom(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
  245. {
  246. int i;
  247. REQ(n, ATOM);
  248. i = n->n_nchildren;
  249. REQN(i, 1);
  250. n = n->n_child;
  251. if (n->n_type == LPAR) {
  252. REQN(i, 3);
  253. n++;
  254. REQ(n, RHS);
  255. compile_rhs(ll, nf, n, pa, pb);
  256. n++;
  257. REQ(n, RPAR);
  258. }
  259. else if (n->n_type == NAME || n->n_type == STRING) {
  260. *pa = addnfastate(nf);
  261. *pb = addnfastate(nf);
  262. addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
  263. }
  264. else
  265. REQ(n, NAME);
  266. }
  267. static void
  268. dumpstate(labellist *ll, nfa *nf, int istate)
  269. {
  270. nfastate *st;
  271. int i;
  272. nfaarc *ar;
  273. printf("%c%2d%c",
  274. istate == nf->nf_start ? '*' : ' ',
  275. istate,
  276. istate == nf->nf_finish ? '.' : ' ');
  277. st = &nf->nf_state[istate];
  278. ar = st->st_arc;
  279. for (i = 0; i < st->st_narcs; i++) {
  280. if (i > 0)
  281. printf("\n ");
  282. printf("-> %2d %s", ar->ar_arrow,
  283. PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label]));
  284. ar++;
  285. }
  286. printf("\n");
  287. }
  288. static void
  289. dumpnfa(labellist *ll, nfa *nf)
  290. {
  291. int i;
  292. printf("NFA '%s' has %d states; start %d, finish %d\n",
  293. nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
  294. for (i = 0; i < nf->nf_nstates; i++)
  295. dumpstate(ll, nf, i);
  296. }
  297. /* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
  298. static void
  299. addclosure(bitset ss, nfa *nf, int istate)
  300. {
  301. if (addbit(ss, istate)) {
  302. nfastate *st = &nf->nf_state[istate];
  303. nfaarc *ar = st->st_arc;
  304. int i;
  305. for (i = st->st_narcs; --i >= 0; ) {
  306. if (ar->ar_label == EMPTY)
  307. addclosure(ss, nf, ar->ar_arrow);
  308. ar++;
  309. }
  310. }
  311. }
  312. typedef struct _ss_arc {
  313. bitset sa_bitset;
  314. int sa_arrow;
  315. int sa_label;
  316. } ss_arc;
  317. typedef struct _ss_state {
  318. bitset ss_ss;
  319. int ss_narcs;
  320. struct _ss_arc *ss_arc;
  321. int ss_deleted;
  322. int ss_finish;
  323. int ss_rename;
  324. } ss_state;
  325. typedef struct _ss_dfa {
  326. int sd_nstates;
  327. ss_state *sd_state;
  328. } ss_dfa;
  329. /* Forward */
  330. static void printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
  331. labellist *ll, char *msg);
  332. static void simplify(int xx_nstates, ss_state *xx_state);
  333. static void convert(dfa *d, int xx_nstates, ss_state *xx_state);
  334. static void
  335. makedfa(nfagrammar *gr, nfa *nf, dfa *d)
  336. {
  337. int nbits = nf->nf_nstates;
  338. bitset ss;
  339. int xx_nstates;
  340. ss_state *xx_state, *yy;
  341. ss_arc *zz;
  342. int istate, jstate, iarc, jarc, ibit;
  343. nfastate *st;
  344. nfaarc *ar;
  345. ss = newbitset(nbits);
  346. addclosure(ss, nf, nf->nf_start);
  347. xx_state = (ss_state *)PyObject_MALLOC(sizeof(ss_state));
  348. if (xx_state == NULL)
  349. Py_FatalError("no mem for xx_state in makedfa");
  350. xx_nstates = 1;
  351. yy = &xx_state[0];
  352. yy->ss_ss = ss;
  353. yy->ss_narcs = 0;
  354. yy->ss_arc = NULL;
  355. yy->ss_deleted = 0;
  356. yy->ss_finish = testbit(ss, nf->nf_finish);
  357. if (yy->ss_finish)
  358. printf("Error: nonterminal '%s' may produce empty.\n",
  359. nf->nf_name);
  360. /* This algorithm is from a book written before
  361. the invention of structured programming... */
  362. /* For each unmarked state... */
  363. for (istate = 0; istate < xx_nstates; ++istate) {
  364. size_t size;
  365. yy = &xx_state[istate];
  366. ss = yy->ss_ss;
  367. /* For all its states... */
  368. for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
  369. if (!testbit(ss, ibit))
  370. continue;
  371. st = &nf->nf_state[ibit];
  372. /* For all non-empty arcs from this state... */
  373. for (iarc = 0; iarc < st->st_narcs; iarc++) {
  374. ar = &st->st_arc[iarc];
  375. if (ar->ar_label == EMPTY)
  376. continue;
  377. /* Look up in list of arcs from this state */
  378. for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
  379. zz = &yy->ss_arc[jarc];
  380. if (ar->ar_label == zz->sa_label)
  381. goto found;
  382. }
  383. /* Add new arc for this state */
  384. size = sizeof(ss_arc) * (yy->ss_narcs + 1);
  385. yy->ss_arc = (ss_arc *)PyObject_REALLOC(
  386. yy->ss_arc, size);
  387. if (yy->ss_arc == NULL)
  388. Py_FatalError("out of mem");
  389. zz = &yy->ss_arc[yy->ss_narcs++];
  390. zz->sa_label = ar->ar_label;
  391. zz->sa_bitset = newbitset(nbits);
  392. zz->sa_arrow = -1;
  393. found: ;
  394. /* Add destination */
  395. addclosure(zz->sa_bitset, nf, ar->ar_arrow);
  396. }
  397. }
  398. /* Now look up all the arrow states */
  399. for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
  400. zz = &xx_state[istate].ss_arc[jarc];
  401. for (jstate = 0; jstate < xx_nstates; jstate++) {
  402. if (samebitset(zz->sa_bitset,
  403. xx_state[jstate].ss_ss, nbits)) {
  404. zz->sa_arrow = jstate;
  405. goto done;
  406. }
  407. }
  408. size = sizeof(ss_state) * (xx_nstates + 1);
  409. xx_state = (ss_state *)PyObject_REALLOC(xx_state,
  410. size);
  411. if (xx_state == NULL)
  412. Py_FatalError("out of mem");
  413. zz->sa_arrow = xx_nstates;
  414. yy = &xx_state[xx_nstates++];
  415. yy->ss_ss = zz->sa_bitset;
  416. yy->ss_narcs = 0;
  417. yy->ss_arc = NULL;
  418. yy->ss_deleted = 0;
  419. yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
  420. done: ;
  421. }
  422. }
  423. if (Py_DebugFlag)
  424. printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
  425. "before minimizing");
  426. simplify(xx_nstates, xx_state);
  427. if (Py_DebugFlag)
  428. printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
  429. "after minimizing");
  430. convert(d, xx_nstates, xx_state);
  431. /* XXX cleanup */
  432. PyObject_FREE(xx_state);
  433. }
  434. static void
  435. printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
  436. labellist *ll, char *msg)
  437. {
  438. int i, ibit, iarc;
  439. ss_state *yy;
  440. ss_arc *zz;
  441. printf("Subset DFA %s\n", msg);
  442. for (i = 0; i < xx_nstates; i++) {
  443. yy = &xx_state[i];
  444. if (yy->ss_deleted)
  445. continue;
  446. printf(" Subset %d", i);
  447. if (yy->ss_finish)
  448. printf(" (finish)");
  449. printf(" { ");
  450. for (ibit = 0; ibit < nbits; ibit++) {
  451. if (testbit(yy->ss_ss, ibit))
  452. printf("%d ", ibit);
  453. }
  454. printf("}\n");
  455. for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
  456. zz = &yy->ss_arc[iarc];
  457. printf(" Arc to state %d, label %s\n",
  458. zz->sa_arrow,
  459. PyGrammar_LabelRepr(
  460. &ll->ll_label[zz->sa_label]));
  461. }
  462. }
  463. }
  464. /* PART THREE -- SIMPLIFY DFA */
/* Simplify the DFA by repeatedly eliminating states that are
   equivalent to another one.  This is NOT Algorithm 3.3 from
   [Aho&Ullman 77].  It does not always find the minimal DFA,
   but it does usually make a much smaller one...  (For an example
   of sub-optimal behavior, try S: x a b+ | y a b+.)
*/
  471. static int
  472. samestate(ss_state *s1, ss_state *s2)
  473. {
  474. int i;
  475. if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
  476. return 0;
  477. for (i = 0; i < s1->ss_narcs; i++) {
  478. if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
  479. s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
  480. return 0;
  481. }
  482. return 1;
  483. }
  484. static void
  485. renamestates(int xx_nstates, ss_state *xx_state, int from, int to)
  486. {
  487. int i, j;
  488. if (Py_DebugFlag)
  489. printf("Rename state %d to %d.\n", from, to);
  490. for (i = 0; i < xx_nstates; i++) {
  491. if (xx_state[i].ss_deleted)
  492. continue;
  493. for (j = 0; j < xx_state[i].ss_narcs; j++) {
  494. if (xx_state[i].ss_arc[j].sa_arrow == from)
  495. xx_state[i].ss_arc[j].sa_arrow = to;
  496. }
  497. }
  498. }
  499. static void
  500. simplify(int xx_nstates, ss_state *xx_state)
  501. {
  502. int changes;
  503. int i, j;
  504. do {
  505. changes = 0;
  506. for (i = 1; i < xx_nstates; i++) {
  507. if (xx_state[i].ss_deleted)
  508. continue;
  509. for (j = 0; j < i; j++) {
  510. if (xx_state[j].ss_deleted)
  511. continue;
  512. if (samestate(&xx_state[i], &xx_state[j])) {
  513. xx_state[i].ss_deleted++;
  514. renamestates(xx_nstates, xx_state,
  515. i, j);
  516. changes++;
  517. break;
  518. }
  519. }
  520. }
  521. } while (changes);
  522. }
  523. /* PART FOUR -- GENERATE PARSING TABLES */
  524. /* Convert the DFA into a grammar that can be used by our parser */
  525. static void
  526. convert(dfa *d, int xx_nstates, ss_state *xx_state)
  527. {
  528. int i, j;
  529. ss_state *yy;
  530. ss_arc *zz;
  531. for (i = 0; i < xx_nstates; i++) {
  532. yy = &xx_state[i];
  533. if (yy->ss_deleted)
  534. continue;
  535. yy->ss_rename = addstate(d);
  536. }
  537. for (i = 0; i < xx_nstates; i++) {
  538. yy = &xx_state[i];
  539. if (yy->ss_deleted)
  540. continue;
  541. for (j = 0; j < yy->ss_narcs; j++) {
  542. zz = &yy->ss_arc[j];
  543. addarc(d, yy->ss_rename,
  544. xx_state[zz->sa_arrow].ss_rename,
  545. zz->sa_label);
  546. }
  547. if (yy->ss_finish)
  548. addarc(d, yy->ss_rename, yy->ss_rename, 0);
  549. }
  550. d->d_initial = 0;
  551. }
  552. /* PART FIVE -- GLUE IT ALL TOGETHER */
  553. static grammar *
  554. maketables(nfagrammar *gr)
  555. {
  556. int i;
  557. nfa *nf;
  558. dfa *d;
  559. grammar *g;
  560. if (gr->gr_nnfas == 0)
  561. return NULL;
  562. g = newgrammar(gr->gr_nfa[0]->nf_type);
  563. /* XXX first rule must be start rule */
  564. g->g_ll = gr->gr_ll;
  565. for (i = 0; i < gr->gr_nnfas; i++) {
  566. nf = gr->gr_nfa[i];
  567. if (Py_DebugFlag) {
  568. printf("Dump of NFA for '%s' ...\n", nf->nf_name);
  569. dumpnfa(&gr->gr_ll, nf);
  570. printf("Making DFA for '%s' ...\n", nf->nf_name);
  571. }
  572. d = adddfa(g, nf->nf_type, nf->nf_name);
  573. makedfa(gr, gr->gr_nfa[i], d);
  574. }
  575. return g;
  576. }
  577. grammar *
  578. pgen(node *n)
  579. {
  580. nfagrammar *gr;
  581. grammar *g;
  582. gr = metacompile(n);
  583. g = maketables(gr);
  584. translatelabels(g);
  585. addfirstsets(g);
  586. PyObject_FREE(gr);
  587. return g;
  588. }
  589. grammar *
  590. Py_pgen(node *n)
  591. {
  592. return pgen(n);
  593. }
  594. /*
  595. Description
  596. -----------
  597. Input is a grammar in extended BNF (using * for repetition, + for
  598. at-least-once repetition, [] for optional parts, | for alternatives and
  599. () for grouping). This has already been parsed and turned into a parse
  600. tree.
  601. Each rule is considered as a regular expression in its own right.
  602. It is turned into a Non-deterministic Finite Automaton (NFA), which
  603. is then turned into a Deterministic Finite Automaton (DFA), which is then
  604. optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
  605. or similar compiler books (this technique is more often used for lexical
  606. analyzers).
  607. The DFA's are used by the parser as parsing tables in a special way
  608. that's probably unique. Before they are usable, the FIRST sets of all
  609. non-terminals are computed.
  610. Reference
  611. ---------
  612. [Aho&Ullman 77]
  613. Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
  614. (first edition)
  615. */