/contrib/groff/src/preproc/refer/label.y

https://bitbucket.org/freebsd/freebsd-head/ · Happy · 1193 lines · 1082 code · 111 blank · 0 comment · 0 complexity · 507fb1ef8a694bdf863cb14343765d48 MD5 · raw file

  1. /* -*- C++ -*-
  2. Copyright (C) 1989, 1990, 1991, 1992, 2000, 2004
  3. Free Software Foundation, Inc.
  4. Written by James Clark (jjc@jclark.com)
  5. This file is part of groff.
  6. groff is free software; you can redistribute it and/or modify it under
  7. the terms of the GNU General Public License as published by the Free
  8. Software Foundation; either version 2, or (at your option) any later
  9. version.
  10. groff is distributed in the hope that it will be useful, but WITHOUT ANY
  11. WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12. FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  13. for more details.
  14. You should have received a copy of the GNU General Public License along
  15. with groff; see the file COPYING. If not, write to the Free Software
  16. Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
  17. %{
  18. #include "refer.h"
  19. #include "refid.h"
  20. #include "ref.h"
  21. #include "token.h"
// Scanner and error reporter; both are defined later in this file.
int yylex();
void yyerror(const char *);
// Parser entry point; called by the set_*_label_spec() functions below.
int yyparse();
// Formats serial number n in the style selected by c; defined later in
// this file.
static const char *format_serial(char c, int n);
// Bookkeeping record for one distinct tentative label.
struct label_info {
  int start;   // offset of the label text within label_pool
  int length;  // length of the label text
  int count;   // starts at 0; incremented by reference::compute_label()
  int total;   // starts at 1; incremented when lookup_label() finds it again
  label_info(const string &);
};
// Returns the label_info for label, creating it if necessary (defined
// later in this file).
label_info *lookup_label(const string &label);
// Abstract base class for the nodes of a parsed label specification.
struct expression {
  // Bits returned by analyze().
  enum {
    // Does the tentative label depend on the reference?
    CONTAINS_VARIABLE = 01,
    CONTAINS_STAR = 02,    // reported by star_expr
    CONTAINS_FORMAT = 04,  // reported by format_expr
    CONTAINS_AT = 010      // reported by at_expr
  };
  virtual ~expression() { }
  // Appends the text of this node for the given reference to the string
  // argument.  The int argument is the `tentative' flag passed down from
  // reference::pre_compute_label() (1) or reference::compute_label() (0).
  virtual void evaluate(int, const reference &, string &,
                        substring_position &) = 0;
  // Returns the bitwise OR of the CONTAINS_* flags that apply to this
  // subtree; the default is none.
  virtual unsigned analyze() { return 0; }
};
// Node for `@': the authors of the reference.
class at_expr : public expression {
public:
  at_expr() { }
  void evaluate(int, const reference &, string &, substring_position &);
  unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
};
// Node for a `%' serial number format.
class format_expr : public expression {
  char type;         // 'a', 'A', 'i', 'I', or '0' for a decimal number
  int width;         // for type '0': pad with zeros to at least this width
  int first_number;  // for type '0': added to the reference's number
public:
  format_expr(char c, int w = 0, int f = 1)
    : type(c), width(w), first_number(f) { }
  void evaluate(int, const reference &, string &, substring_position &);
  unsigned analyze() { return CONTAINS_FORMAT; }
};
// Node for a field letter, optionally followed by a number.
class field_expr : public expression {
  int number;  // index handed to nth_field(); the grammar passes N - 1 or 0
  char name;   // field letter handed to reference::get_field()
public:
  field_expr(char nm, int num) : number(num), name(nm) { }
  void evaluate(int, const reference &, string &, substring_position &);
  unsigned analyze() { return CONTAINS_VARIABLE; }
};
// Node for quoted literal text; keeps its own copy of the characters.
class literal_expr : public expression {
  string s;
public:
  literal_expr(const char *ptr, int len) : s(ptr, len) { }
  void evaluate(int, const reference &, string &, substring_position &);
};
  77. class unary_expr : public expression {
  78. protected:
  79. expression *expr;
  80. public:
  81. unary_expr(expression *e) : expr(e) { }
  82. ~unary_expr() { delete expr; }
  83. void evaluate(int, const reference &, string &, substring_position &) = 0;
  84. unsigned analyze() { return expr ? expr->analyze() : 0; }
  85. };
// This caches the analysis of an expression.
// The flags are computed once in the constructor; analyze() then returns
// the stored value without walking the tree again.
class analyzed_expr : public unary_expr {
  unsigned flags;
public:
  analyzed_expr(expression *);
  void evaluate(int, const reference &, string &, substring_position &);
  unsigned analyze() { return flags; }
};
  94. class star_expr : public unary_expr {
  95. public:
  96. star_expr(expression *e) : unary_expr(e) { }
  97. void evaluate(int, const reference &, string &, substring_position &);
  98. unsigned analyze() {
  99. return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
  100. | CONTAINS_STAR);
  101. }
  102. };
// A map_func is called with the start and end of the operand's text and
// the string to append its output to.
typedef void map_func(const char *, const char *, string &);
// Node that passes the text of its operand through a map_func.
class map_expr : public unary_expr {
  map_func *func;
public:
  map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
  void evaluate(int, const reference &, string &, substring_position &);
};
// An extractor_func is called with the start and end of a text; it returns
// the start of a match (or null) and stores the end of the match through
// the third argument.
typedef const char *extractor_func(const char *, const char *, const char **);
// Node that appends the part of its operand's text lying before, inside or
// after the match found by an extractor_func.
class extractor_expr : public unary_expr {
  int part;  // BEFORE, MATCH or AFTER
  extractor_func *func;
public:
  enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
  extractor_expr(expression *e, extractor_func *f, int pt)
    : unary_expr(e), part(pt), func(f) { }
  void evaluate(int, const reference &, string &, substring_position &);
};
// Node for `expr+N' and `expr-N'.  A positive n keeps the first part of
// the operand's text (first_part()), a negative n keeps the last part
// (last_part()); with n == 0 nothing is appended.
class truncate_expr : public unary_expr {
  int n;
public:
  truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
  void evaluate(int, const reference &, string &, substring_position &);
};
// Node for `<expr>': evaluates the operand and, if no position has been
// recorded yet (pos.start < 0), records where its text lies in the result.
class separator_expr : public unary_expr {
public:
  separator_expr(expression *e) : unary_expr(e) { }
  void evaluate(int, const reference &, string &, substring_position &);
};
  131. class binary_expr : public expression {
  132. protected:
  133. expression *expr1;
  134. expression *expr2;
  135. public:
  136. binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
  137. ~binary_expr() { delete expr1; delete expr2; }
  138. void evaluate(int, const reference &, string &, substring_position &) = 0;
  139. unsigned analyze() {
  140. return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
  141. }
  142. };
// Node for `e1|e2': e2 is evaluated only if e1 appended nothing.
class alternative_expr : public binary_expr {
public:
  alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
  void evaluate(int, const reference &, string &, substring_position &);
};
// Node for the concatenation `e1 e2': both operands are evaluated in order.
class list_expr : public binary_expr {
public:
  list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
  void evaluate(int, const reference &, string &, substring_position &);
};
// Node for `e1~e2': if the text appended by e1 ends in `-', that character
// is removed and e2 is evaluated in its place.
class substitute_expr : public binary_expr {
public:
  substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
  void evaluate(int, const reference &, string &, substring_position &);
};
  158. class ternary_expr : public expression {
  159. protected:
  160. expression *expr1;
  161. expression *expr2;
  162. expression *expr3;
  163. public:
  164. ternary_expr(expression *e1, expression *e2, expression *e3)
  165. : expr1(e1), expr2(e2), expr3(e3) { }
  166. ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
  167. void evaluate(int, const reference &, string &, substring_position &) = 0;
  168. unsigned analyze() {
  169. return ((expr1 ? expr1->analyze() : 0)
  170. | (expr2 ? expr2->analyze() : 0)
  171. | (expr3 ? expr3->analyze() : 0));
  172. }
  173. };
// Node for `e1?e2:e3' (and, with a null e3, for `e1&e2').  e1 is evaluated
// into a scratch string; e2 is used if that text is non-empty, else e3.
class conditional_expr : public ternary_expr {
public:
  conditional_expr(expression *e1, expression *e2, expression *e3)
    : ternary_expr(e1, e2, e3) { }
  void evaluate(int, const reference &, string &, substring_position &);
};
// Expression trees installed by set_label_spec(), set_date_label_spec()
// and set_short_label_spec() respectively; 0 until a spec has been parsed
// (and again 0 after an empty spec).
static expression *parsed_label = 0;
static expression *parsed_date_label = 0;
static expression *parsed_short_label = 0;
// Written by the top-level grammar rule on a successful parse.
static expression *parse_result;
// Text of the literals collected by yylex(); TOKEN_LITERAL values are
// offsets into this string.
string literals;
  185. %}
%union {
  /* letter code, digit value, number or flag */
  int num;
  /* expression subtree built by a rule */
  expression *expr;
  /* digit string: number of digits and decimal value */
  struct { int ndigits; int val; } dig;
  /* literal: offset and length within the literals string */
  struct { int start; int len; } str;
}
/* uppercase or lowercase letter */
%token <num> TOKEN_LETTER
/* literal characters */
%token <str> TOKEN_LITERAL
/* digit */
%token <num> TOKEN_DIGIT
/* nonterminals that build expression nodes */
%type <expr> conditional
%type <expr> alternative
%type <expr> list
%type <expr> string
%type <expr> substitute
%type <expr> optional_conditional
/* numeric nonterminals */
%type <num> number
%type <dig> digits
%type <num> optional_number
%type <num> flag
  208. %%
/* Start symbol.  Stores the tree, wrapped in an analyzed_expr, in
   parse_result; an empty specification stores 0. */
expr:
        optional_conditional
                { parse_result = ($1 ? new analyzed_expr($1) : 0); }
        ;
/* An alternative, optionally followed by a question mark, a possibly
   empty then-part, a colon and an else-part that is again a conditional. */
conditional:
        alternative
                { $$ = $1; }
        | alternative '?' optional_conditional ':' conditional
                { $$ = new conditional_expr($1, $3, $5); }
        ;
/* A conditional or nothing; the empty case yields a null pointer. */
optional_conditional:
        /* empty */
                { $$ = 0; }
        | conditional
                { $$ = $1; }
        ;
/* Left-associative chain of lists joined by the bar or ampersand
   operators.  The ampersand form is a conditional_expr with a null
   else-part. */
alternative:
        list
                { $$ = $1; }
        | alternative '|' list
                { $$ = new alternative_expr($1, $3); }
        | alternative '&' list
                { $$ = new conditional_expr($1, $3, 0); }
        ;
/* One or more substitute expressions written next to each other. */
list:
        substitute
                { $$ = $1; }
        | list substitute
                { $$ = new list_expr($1, $2); }
        ;
/* Left-associative chain of strings joined by the tilde operator. */
substitute:
        string
                { $$ = $1; }
        | substitute '~' string
                { $$ = new substitute_expr($1, $3); }
        ;
/* Primary expressions and the postfix operators that apply to them. */
string:
        /* the authors */
        '@'
                { $$ = new at_expr; }
        /* quoted literal; its text lives in the literals string */
        | TOKEN_LITERAL
                {
                  $$ = new literal_expr(literals.contents() + $1.start,
                                        $1.len);
                }
        /* field letter alone: index 0 */
        | TOKEN_LETTER
                { $$ = new field_expr($1, 0); }
        /* field letter followed by a one-based number */
        | TOKEN_LETTER number
                { $$ = new field_expr($1, $2 - 1); }
        /* percent sign and a format letter; an unknown letter is reported
           and replaced by the a format */
        | '%' TOKEN_LETTER
                {
                  switch ($2) {
                  case 'I':
                  case 'i':
                  case 'A':
                  case 'a':
                    $$ = new format_expr($2);
                    break;
                  default:
                    command_error("unrecognized format `%1'", char($2));
                    $$ = new format_expr('a');
                    break;
                  }
                }
        /* percent sign and digits: the digit count becomes the width and
           the value becomes the first number */
        | '%' digits
                {
                  $$ = new format_expr('0', $2.ndigits, $2.val);
                }
        /* dot function.  The flag is used only by the y and n functions,
           where it selects the part of the text relative to the match;
           the optional number is parsed but not used by any branch.  An
           unknown function is reported and the operand passed through. */
        | string '.' flag TOKEN_LETTER optional_number
                {
                  switch ($4) {
                  case 'l':
                    $$ = new map_expr($1, lowercase);
                    break;
                  case 'u':
                    $$ = new map_expr($1, uppercase);
                    break;
                  case 'c':
                    $$ = new map_expr($1, capitalize);
                    break;
                  case 'r':
                    $$ = new map_expr($1, reverse_name);
                    break;
                  case 'a':
                    $$ = new map_expr($1, abbreviate_name);
                    break;
                  case 'y':
                    $$ = new extractor_expr($1, find_year, $3);
                    break;
                  case 'n':
                    $$ = new extractor_expr($1, find_last_name, $3);
                    break;
                  default:
                    $$ = $1;
                    command_error("unknown function `%1'", char($4));
                    break;
                  }
                }
        /* keep the first N parts */
        | string '+' number
                { $$ = new truncate_expr($1, $3); }
        /* keep the last N parts */
        | string '-' number
                { $$ = new truncate_expr($1, -$3); }
        | string '*'
                { $$ = new star_expr($1); }
        /* grouping; an empty group yields a null pointer */
        | '(' optional_conditional ')'
                { $$ = $2; }
        /* separator marker */
        | '<' optional_conditional '>'
                { $$ = new separator_expr($2); }
        ;
/* A number or nothing; the empty case yields -1. */
optional_number:
        /* empty */
                { $$ = -1; }
        | number
                { $$ = $1; }
        ;
/* Decimal number accumulated digit by digit.  There is no overflow
   check on the accumulated value. */
number:
        TOKEN_DIGIT
                { $$ = $1; }
        | number TOKEN_DIGIT
                { $$ = $1*10 + $2; }
        ;
/* Like number, but also counts the digits so that leading zeros can be
   used to give a field width. */
digits:
        TOKEN_DIGIT
                { $$.ndigits = 1; $$.val = $1; }
        | digits TOKEN_DIGIT
                { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
        ;
/* Optional sign after the dot of a dot function: 0 when absent, 1 for
   plus, -1 for minus.  These match MATCH, BEFORE and AFTER in
   extractor_expr. */
flag:
        /* empty */
                { $$ = 0; }
        | '+'
                { $$ = 1; }
        | '-'
                { $$ = -1; }
        ;
  343. %%
/* bison defines const to be empty unless __STDC__ is defined, which it
   isn't under cfront */
#ifdef const
#undef const
#endif
// Scanner state, set up by the set_*_label_spec() functions.
const char *spec_ptr;  // next character to be scanned by yylex()
const char *spec_end;  // the terminating '\0' of the specification
const char *spec_cur;  // start of the token most recently scanned;
                       // yyerror() uses it to show where the error is
  352. static char uppercase_array[] = {
  353. 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
  354. 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
  355. 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
  356. 'Y', 'Z',
  357. };
  358. static char lowercase_array[] = {
  359. 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
  360. 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
  361. 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
  362. 'y', 'z',
  363. };
  364. int yylex()
  365. {
  366. while (spec_ptr < spec_end && csspace(*spec_ptr))
  367. spec_ptr++;
  368. spec_cur = spec_ptr;
  369. if (spec_ptr >= spec_end)
  370. return 0;
  371. unsigned char c = *spec_ptr++;
  372. if (csalpha(c)) {
  373. yylval.num = c;
  374. return TOKEN_LETTER;
  375. }
  376. if (csdigit(c)) {
  377. yylval.num = c - '0';
  378. return TOKEN_DIGIT;
  379. }
  380. if (c == '\'') {
  381. yylval.str.start = literals.length();
  382. for (; spec_ptr < spec_end; spec_ptr++) {
  383. if (*spec_ptr == '\'') {
  384. if (++spec_ptr < spec_end && *spec_ptr == '\'')
  385. literals += '\'';
  386. else {
  387. yylval.str.len = literals.length() - yylval.str.start;
  388. return TOKEN_LITERAL;
  389. }
  390. }
  391. else
  392. literals += *spec_ptr;
  393. }
  394. yylval.str.len = literals.length() - yylval.str.start;
  395. return TOKEN_LITERAL;
  396. }
  397. return c;
  398. }
  399. int set_label_spec(const char *label_spec)
  400. {
  401. spec_cur = spec_ptr = label_spec;
  402. spec_end = strchr(label_spec, '\0');
  403. literals.clear();
  404. if (yyparse())
  405. return 0;
  406. delete parsed_label;
  407. parsed_label = parse_result;
  408. return 1;
  409. }
  410. int set_date_label_spec(const char *label_spec)
  411. {
  412. spec_cur = spec_ptr = label_spec;
  413. spec_end = strchr(label_spec, '\0');
  414. literals.clear();
  415. if (yyparse())
  416. return 0;
  417. delete parsed_date_label;
  418. parsed_date_label = parse_result;
  419. return 1;
  420. }
  421. int set_short_label_spec(const char *label_spec)
  422. {
  423. spec_cur = spec_ptr = label_spec;
  424. spec_end = strchr(label_spec, '\0');
  425. literals.clear();
  426. if (yyparse())
  427. return 0;
  428. delete parsed_short_label;
  429. parsed_short_label = parse_result;
  430. return 1;
  431. }
  432. void yyerror(const char *message)
  433. {
  434. if (spec_cur < spec_end)
  435. command_error("label specification %1 before `%2'", message, spec_cur);
  436. else
  437. command_error("label specification %1 at end of string",
  438. message, spec_cur);
  439. }
  440. void at_expr::evaluate(int tentative, const reference &ref,
  441. string &result, substring_position &)
  442. {
  443. if (tentative)
  444. ref.canonicalize_authors(result);
  445. else {
  446. const char *end, *start = ref.get_authors(&end);
  447. if (start)
  448. result.append(start, end - start);
  449. }
  450. }
  451. void format_expr::evaluate(int tentative, const reference &ref,
  452. string &result, substring_position &)
  453. {
  454. if (tentative)
  455. return;
  456. const label_info *lp = ref.get_label_ptr();
  457. int num = lp == 0 ? ref.get_number() : lp->count;
  458. if (type != '0')
  459. result += format_serial(type, num + 1);
  460. else {
  461. const char *ptr = i_to_a(num + first_number);
  462. int pad = width - strlen(ptr);
  463. while (--pad >= 0)
  464. result += '0';
  465. result += ptr;
  466. }
  467. }
  468. static const char *format_serial(char c, int n)
  469. {
  470. assert(n > 0);
  471. static char buf[128]; // more than enough.
  472. switch (c) {
  473. case 'i':
  474. case 'I':
  475. {
  476. char *p = buf;
  477. // troff uses z and w to represent 10000 and 5000 in Roman
  478. // numerals; I can find no historical basis for this usage
  479. const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
  480. if (n >= 40000)
  481. return i_to_a(n);
  482. while (n >= 10000) {
  483. *p++ = s[0];
  484. n -= 10000;
  485. }
  486. for (int i = 1000; i > 0; i /= 10, s += 2) {
  487. int m = n/i;
  488. n -= m*i;
  489. switch (m) {
  490. case 3:
  491. *p++ = s[2];
  492. /* falls through */
  493. case 2:
  494. *p++ = s[2];
  495. /* falls through */
  496. case 1:
  497. *p++ = s[2];
  498. break;
  499. case 4:
  500. *p++ = s[2];
  501. *p++ = s[1];
  502. break;
  503. case 8:
  504. *p++ = s[1];
  505. *p++ = s[2];
  506. *p++ = s[2];
  507. *p++ = s[2];
  508. break;
  509. case 7:
  510. *p++ = s[1];
  511. *p++ = s[2];
  512. *p++ = s[2];
  513. break;
  514. case 6:
  515. *p++ = s[1];
  516. *p++ = s[2];
  517. break;
  518. case 5:
  519. *p++ = s[1];
  520. break;
  521. case 9:
  522. *p++ = s[2];
  523. *p++ = s[0];
  524. }
  525. }
  526. *p = 0;
  527. break;
  528. }
  529. case 'a':
  530. case 'A':
  531. {
  532. char *p = buf;
  533. // this is derived from troff/reg.c
  534. while (n > 0) {
  535. int d = n % 26;
  536. if (d == 0)
  537. d = 26;
  538. n -= d;
  539. n /= 26;
  540. *p++ = c == 'a' ? lowercase_array[d - 1] :
  541. uppercase_array[d - 1];
  542. }
  543. *p-- = 0;
  544. // Reverse it.
  545. char *q = buf;
  546. while (q < p) {
  547. char temp = *q;
  548. *q = *p;
  549. *p = temp;
  550. --p;
  551. ++q;
  552. }
  553. break;
  554. }
  555. default:
  556. assert(0);
  557. }
  558. return buf;
  559. }
  560. void field_expr::evaluate(int, const reference &ref,
  561. string &result, substring_position &)
  562. {
  563. const char *end;
  564. const char *start = ref.get_field(name, &end);
  565. if (start) {
  566. start = nth_field(number, start, &end);
  567. if (start)
  568. result.append(start, end - start);
  569. }
  570. }
// Appends the literal text; the reference and the flags are not used.
void literal_expr::evaluate(int, const reference &,
                            string &result, substring_position &)
{
  result += s;
}
// Takes ownership of e (through unary_expr) and analyzes it once; a null
// e gives flags of 0.
analyzed_expr::analyzed_expr(expression *e)
: unary_expr(e), flags(e ? e->analyze() : 0)
{
}
  580. void analyzed_expr::evaluate(int tentative, const reference &ref,
  581. string &result, substring_position &pos)
  582. {
  583. if (expr)
  584. expr->evaluate(tentative, ref, result, pos);
  585. }
  586. void star_expr::evaluate(int tentative, const reference &ref,
  587. string &result, substring_position &pos)
  588. {
  589. const label_info *lp = ref.get_label_ptr();
  590. if (!tentative
  591. && (lp == 0 || lp->total > 1)
  592. && expr)
  593. expr->evaluate(tentative, ref, result, pos);
  594. }
  595. void separator_expr::evaluate(int tentative, const reference &ref,
  596. string &result, substring_position &pos)
  597. {
  598. int start_length = result.length();
  599. int is_first = pos.start < 0;
  600. if (expr)
  601. expr->evaluate(tentative, ref, result, pos);
  602. if (is_first) {
  603. pos.start = start_length;
  604. pos.length = result.length() - start_length;
  605. }
  606. }
  607. void map_expr::evaluate(int tentative, const reference &ref,
  608. string &result, substring_position &)
  609. {
  610. if (expr) {
  611. string temp;
  612. substring_position temp_pos;
  613. expr->evaluate(tentative, ref, temp, temp_pos);
  614. (*func)(temp.contents(), temp.contents() + temp.length(), result);
  615. }
  616. }
  617. void extractor_expr::evaluate(int tentative, const reference &ref,
  618. string &result, substring_position &)
  619. {
  620. if (expr) {
  621. string temp;
  622. substring_position temp_pos;
  623. expr->evaluate(tentative, ref, temp, temp_pos);
  624. const char *end, *start = (*func)(temp.contents(),
  625. temp.contents() + temp.length(),
  626. &end);
  627. switch (part) {
  628. case BEFORE:
  629. if (start)
  630. result.append(temp.contents(), start - temp.contents());
  631. else
  632. result += temp;
  633. break;
  634. case MATCH:
  635. if (start)
  636. result.append(start, end - start);
  637. break;
  638. case AFTER:
  639. if (start)
  640. result.append(end, temp.contents() + temp.length() - end);
  641. break;
  642. default:
  643. assert(0);
  644. }
  645. }
  646. }
  647. static void first_part(int len, const char *ptr, const char *end,
  648. string &result)
  649. {
  650. for (;;) {
  651. const char *token_start = ptr;
  652. if (!get_token(&ptr, end))
  653. break;
  654. const token_info *ti = lookup_token(token_start, ptr);
  655. int counts = ti->sortify_non_empty(token_start, ptr);
  656. if (counts && --len < 0)
  657. break;
  658. if (counts || ti->is_accent())
  659. result.append(token_start, ptr - token_start);
  660. }
  661. }
  662. static void last_part(int len, const char *ptr, const char *end,
  663. string &result)
  664. {
  665. const char *start = ptr;
  666. int count = 0;
  667. for (;;) {
  668. const char *token_start = ptr;
  669. if (!get_token(&ptr, end))
  670. break;
  671. const token_info *ti = lookup_token(token_start, ptr);
  672. if (ti->sortify_non_empty(token_start, ptr))
  673. count++;
  674. }
  675. ptr = start;
  676. int skip = count - len;
  677. if (skip > 0) {
  678. for (;;) {
  679. const char *token_start = ptr;
  680. if (!get_token(&ptr, end))
  681. assert(0);
  682. const token_info *ti = lookup_token(token_start, ptr);
  683. if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
  684. ptr = token_start;
  685. break;
  686. }
  687. }
  688. }
  689. first_part(len, ptr, end, result);
  690. }
  691. void truncate_expr::evaluate(int tentative, const reference &ref,
  692. string &result, substring_position &)
  693. {
  694. if (expr) {
  695. string temp;
  696. substring_position temp_pos;
  697. expr->evaluate(tentative, ref, temp, temp_pos);
  698. const char *start = temp.contents();
  699. const char *end = start + temp.length();
  700. if (n > 0)
  701. first_part(n, start, end, result);
  702. else if (n < 0)
  703. last_part(-n, start, end, result);
  704. }
  705. }
  706. void alternative_expr::evaluate(int tentative, const reference &ref,
  707. string &result, substring_position &pos)
  708. {
  709. int start_length = result.length();
  710. if (expr1)
  711. expr1->evaluate(tentative, ref, result, pos);
  712. if (result.length() == start_length && expr2)
  713. expr2->evaluate(tentative, ref, result, pos);
  714. }
  715. void list_expr::evaluate(int tentative, const reference &ref,
  716. string &result, substring_position &pos)
  717. {
  718. if (expr1)
  719. expr1->evaluate(tentative, ref, result, pos);
  720. if (expr2)
  721. expr2->evaluate(tentative, ref, result, pos);
  722. }
  723. void substitute_expr::evaluate(int tentative, const reference &ref,
  724. string &result, substring_position &pos)
  725. {
  726. int start_length = result.length();
  727. if (expr1)
  728. expr1->evaluate(tentative, ref, result, pos);
  729. if (result.length() > start_length && result[result.length() - 1] == '-') {
  730. // ought to see if pos covers the -
  731. result.set_length(result.length() - 1);
  732. if (expr2)
  733. expr2->evaluate(tentative, ref, result, pos);
  734. }
  735. }
  736. void conditional_expr::evaluate(int tentative, const reference &ref,
  737. string &result, substring_position &pos)
  738. {
  739. string temp;
  740. substring_position temp_pos;
  741. if (expr1)
  742. expr1->evaluate(tentative, ref, temp, temp_pos);
  743. if (temp.length() > 0) {
  744. if (expr2)
  745. expr2->evaluate(tentative, ref, result, pos);
  746. }
  747. else {
  748. if (expr3)
  749. expr3->evaluate(tentative, ref, result, pos);
  750. }
  751. }
  752. void reference::pre_compute_label()
  753. {
  754. if (parsed_label != 0
  755. && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
  756. label.clear();
  757. substring_position temp_pos;
  758. parsed_label->evaluate(1, *this, label, temp_pos);
  759. label_ptr = lookup_label(label);
  760. }
  761. }
  762. void reference::compute_label()
  763. {
  764. label.clear();
  765. if (parsed_label)
  766. parsed_label->evaluate(0, *this, label, separator_pos);
  767. if (short_label_flag && parsed_short_label)
  768. parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
  769. if (date_as_label) {
  770. string new_date;
  771. if (parsed_date_label) {
  772. substring_position temp_pos;
  773. parsed_date_label->evaluate(0, *this, new_date, temp_pos);
  774. }
  775. set_date(new_date);
  776. }
  777. if (label_ptr)
  778. label_ptr->count += 1;
  779. }
// Computes the label right away.  Setting total to 2 makes
// star_expr::evaluate() treat the label as ambiguous, so any `*'
// expression in the specification is evaluated.
void reference::immediate_compute_label()
{
  if (label_ptr)
    label_ptr->total = 2;	// force use of disambiguator
  compute_label();
}
  786. int reference::merge_labels(reference **v, int n, label_type type,
  787. string &result)
  788. {
  789. if (abbreviate_label_ranges)
  790. return merge_labels_by_number(v, n, type, result);
  791. else
  792. return merge_labels_by_parts(v, n, type, result);
  793. }
  794. int reference::merge_labels_by_number(reference **v, int n, label_type type,
  795. string &result)
  796. {
  797. if (n <= 1)
  798. return 0;
  799. int num = get_number();
  800. // Only merge three or more labels.
  801. if (v[0]->get_number() != num + 1
  802. || v[1]->get_number() != num + 2)
  803. return 0;
  804. int i;
  805. for (i = 2; i < n; i++)
  806. if (v[i]->get_number() != num + i + 1)
  807. break;
  808. result = get_label(type);
  809. result += label_range_indicator;
  810. result += v[i - 1]->get_label(type);
  811. return i;
  812. }
  813. const substring_position &reference::get_separator_pos(label_type type) const
  814. {
  815. if (type == SHORT_LABEL && short_label_flag)
  816. return short_separator_pos;
  817. else
  818. return separator_pos;
  819. }
  820. const string &reference::get_label(label_type type) const
  821. {
  822. if (type == SHORT_LABEL && short_label_flag)
  823. return short_label;
  824. else
  825. return label;
  826. }
  827. int reference::merge_labels_by_parts(reference **v, int n, label_type type,
  828. string &result)
  829. {
  830. if (n <= 0)
  831. return 0;
  832. const string &lb = get_label(type);
  833. const substring_position &sp = get_separator_pos(type);
  834. if (sp.start < 0
  835. || sp.start != v[0]->get_separator_pos(type).start
  836. || memcmp(lb.contents(), v[0]->get_label(type).contents(),
  837. sp.start) != 0)
  838. return 0;
  839. result = lb;
  840. int i = 0;
  841. do {
  842. result += separate_label_second_parts;
  843. const substring_position &s = v[i]->get_separator_pos(type);
  844. int sep_end_pos = s.start + s.length;
  845. result.append(v[i]->get_label(type).contents() + sep_end_pos,
  846. v[i]->get_label(type).length() - sep_end_pos);
  847. } while (++i < n
  848. && sp.start == v[i]->get_separator_pos(type).start
  849. && memcmp(lb.contents(), v[i]->get_label(type).contents(),
  850. sp.start) == 0);
  851. return i;
  852. }
// The text of every registered label, stored back to back; label_info
// records refer to it by offset and length.
string label_pool;
// Records s at the end of label_pool.  A new label starts with a count of
// 0 and a total of 1.
label_info::label_info(const string &s)
: start(label_pool.length()), length(s.length()), count(0), total(1)
{
  label_pool += s;
}
  859. static label_info **label_table = 0;
  860. static int label_table_size = 0;
  861. static int label_table_used = 0;
  862. label_info *lookup_label(const string &label)
  863. {
  864. if (label_table == 0) {
  865. label_table = new label_info *[17];
  866. label_table_size = 17;
  867. for (int i = 0; i < 17; i++)
  868. label_table[i] = 0;
  869. }
  870. unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
  871. label_info **ptr;
  872. for (ptr = label_table + h;
  873. *ptr != 0;
  874. (ptr == label_table)
  875. ? (ptr = label_table + label_table_size - 1)
  876. : ptr--)
  877. if ((*ptr)->length == label.length()
  878. && memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
  879. label.length()) == 0) {
  880. (*ptr)->total += 1;
  881. return *ptr;
  882. }
  883. label_info *result = *ptr = new label_info(label);
  884. if (++label_table_used * 2 > label_table_size) {
  885. // Rehash the table.
  886. label_info **old_table = label_table;
  887. int old_size = label_table_size;
  888. label_table_size = next_size(label_table_size);
  889. label_table = new label_info *[label_table_size];
  890. int i;
  891. for (i = 0; i < label_table_size; i++)
  892. label_table[i] = 0;
  893. for (i = 0; i < old_size; i++)
  894. if (old_table[i]) {
  895. h = hash_string(label_pool.contents() + old_table[i]->start,
  896. old_table[i]->length);
  897. label_info **p;
  898. for (p = label_table + (h % label_table_size);
  899. *p != 0;
  900. (p == label_table)
  901. ? (p = label_table + label_table_size - 1)
  902. : --p)
  903. ;
  904. *p = old_table[i];
  905. }
  906. a_delete old_table;
  907. }
  908. return result;
  909. }
  910. void clear_labels()
  911. {
  912. for (int i = 0; i < label_table_size; i++) {
  913. delete label_table[i];
  914. label_table[i] = 0;
  915. }
  916. label_table_used = 0;
  917. label_pool.clear();
  918. }
  919. static void consider_authors(reference **start, reference **end, int i);
  920. void compute_labels(reference **v, int n)
  921. {
  922. if (parsed_label
  923. && (parsed_label->analyze() & expression::CONTAINS_AT)
  924. && sort_fields.length() >= 2
  925. && sort_fields[0] == 'A'
  926. && sort_fields[1] == '+')
  927. consider_authors(v, v + n, 0);
  928. for (int i = 0; i < n; i++)
  929. v[i]->compute_label();
  930. }
/* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
   where 0 <= i <= N if there exists a reference with a list of authors
   <B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
   and Aj = Bj for 0 <= j < i.  In this case we can't say ``A0,
   A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
   <B0,B1,...,BM>.  If a reference needs author i we only have to call
   need_author(j) for some j >= i such that the reference also needs
   author j. */
  939. /* This function handles 2 tasks:
  940. determine which authors are needed (cannot be elided with et al.);
  941. determine which authors can have only last names in the labels.
  942. References >= start and < end have the same first i author names.
  943. Also they're sorted by A+. */
// Examines author i of the references in [start, end) and recurses on
// author i + 1 for each run of references with the same full name for
// author i.  Calls need_author() and set_last_name_unambiguous() on the
// references as it goes.
static void consider_authors(reference **start, reference **end, int i)
{
  if (start >= end)
    return;
  reference **p = start;
  // Skip the leading references that have no author i.
  if (i >= (*p)->get_nauthors()) {
    for (++p; p < end && i >= (*p)->get_nauthors(); p++)
      ;
    if (p < end && i > 0) {
      // If we have an author list <A B C> and an author list <A B C D>,
      // then both lists need C.
      for (reference **q = start; q < end; q++)
        (*q)->need_author(i - 1);
    }
    start = p;
  }
  // Each pass of this loop handles one run of references whose author i
  // has the same last name.
  while (p < end) {
    reference **last_name_start = p;
    reference **name_start = p;
    for (++p;
         p < end && i < (*p)->get_nauthors()
         && same_author_last_name(**last_name_start, **p, i);
         p++) {
      // Within the run, recurse on each sub-run with the same full name.
      if (!same_author_name(**name_start, **p, i)) {
        consider_authors(name_start, p, i + 1);
        name_start = p;
      }
    }
    consider_authors(name_start, p, i + 1);
    // Only one full name was seen for this last name, so the last name
    // alone identifies the author.
    if (last_name_start == name_start) {
      for (reference **q = last_name_start; q < p; q++)
        (*q)->set_last_name_unambiguous(i);
    }
    // If we have an author list <A B C D> and <A B C E>, then the lists
    // need author D and E respectively.
    if (name_start > start || p < end) {
      for (reference **q = last_name_start; q < p; q++)
        (*q)->need_author(i);
    }
  }
}
  985. int same_author_last_name(const reference &r1, const reference &r2, int n)
  986. {
  987. const char *ae1;
  988. const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
  989. const char *ae2;
  990. const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
  991. if (!as1 && !as2) return 1; // they are the same
  992. if (!as1 || !as2) return 0;
  993. return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
  994. }
  995. int same_author_name(const reference &r1, const reference &r2, int n)
  996. {
  997. const char *ae1;
  998. const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
  999. const char *ae2;
  1000. const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
  1001. if (!as1 && !as2) return 1; // they are the same
  1002. if (!as1 || !as2) return 0;
  1003. return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
  1004. }
  1005. void int_set::set(int i)
  1006. {
  1007. assert(i >= 0);
  1008. int bytei = i >> 3;
  1009. if (bytei >= v.length()) {
  1010. int old_length = v.length();
  1011. v.set_length(bytei + 1);
  1012. for (int j = old_length; j <= bytei; j++)
  1013. v[j] = 0;
  1014. }
  1015. v[bytei] |= 1 << (i & 7);
  1016. }
  1017. int int_set::get(int i) const
  1018. {
  1019. assert(i >= 0);
  1020. int bytei = i >> 3;
  1021. return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
  1022. }
// Records that the last name alone is enough for author i;
// get_authors() then uses get_author_last_name() for that author.
void reference::set_last_name_unambiguous(int i)
{
  last_name_unambiguous.set(i);
}
// Raises last_needed_author to n if n is larger; never lowers it.
void reference::need_author(int n)
{
  if (n > last_needed_author)
    last_needed_author = n;
}
  1032. const char *reference::get_authors(const char **end) const
  1033. {
  1034. if (!computed_authors) {
  1035. ((reference *)this)->computed_authors = 1;
  1036. string &result = ((reference *)this)->authors;
  1037. int na = get_nauthors();
  1038. result.clear();
  1039. for (int i = 0; i < na; i++) {
  1040. if (last_name_unambiguous.get(i)) {
  1041. const char *e, *start = get_author_last_name(i, &e);
  1042. assert(start != 0);
  1043. result.append(start, e - start);
  1044. }
  1045. else {
  1046. const char *e, *start = get_author(i, &e);
  1047. assert(start != 0);
  1048. result.append(start, e - start);
  1049. }
  1050. if (i == last_needed_author
  1051. && et_al.length() > 0
  1052. && et_al_min_elide > 0
  1053. && last_needed_author + et_al_min_elide < na
  1054. && na >= et_al_min_total) {
  1055. result += et_al;
  1056. break;
  1057. }
  1058. if (i < na - 1) {
  1059. if (na == 2)
  1060. result += join_authors_exactly_two;
  1061. else if (i < na - 2)
  1062. result += join_authors_default;
  1063. else
  1064. result += join_authors_last_two;
  1065. }
  1066. }
  1067. }
  1068. const char *start = authors.contents();
  1069. *end = start + authors.length();
  1070. return start;
  1071. }
  1072. int reference::get_nauthors() const
  1073. {
  1074. if (nauthors < 0) {
  1075. const char *dummy;
  1076. int na;
  1077. for (na = 0; get_author(na, &dummy) != 0; na++)
  1078. ;
  1079. ((reference *)this)->nauthors = na;
  1080. }
  1081. return nauthors;
  1082. }