/contrib/groff/src/preproc/refer/ref.cpp

https://bitbucket.org/freebsd/freebsd-head/ · C++ · 1160 lines · 1066 code · 63 blank · 31 comment · 494 complexity · da905471f9a602cda2847984410a02ce MD5 · raw file

  1. // -*- C++ -*-
  2. /* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2003
  3. Free Software Foundation, Inc.
  4. Written by James Clark (jjc@jclark.com)
  5. This file is part of groff.
  6. groff is free software; you can redistribute it and/or modify it under
  7. the terms of the GNU General Public License as published by the Free
  8. Software Foundation; either version 2, or (at your option) any later
  9. version.
  10. groff is distributed in the hope that it will be useful, but WITHOUT ANY
  11. WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12. FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  13. for more details.
  14. You should have received a copy of the GNU General Public License along
  15. with groff; see the file COPYING. If not, write to the Free Software
  16. Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
  17. #include "refer.h"
  18. #include "refid.h"
  19. #include "ref.h"
  20. #include "token.h"
  // Forward declarations for helpers that are defined later in this file.
  static const char *find_day(const char *, const char *, const char **);
  static int find_month(const char *start, const char *end);
  static void abbreviate_names(string &);
  // Default article list: three words separated by NUL characters.
  #define DEFAULT_ARTICLES "the\000a\000an"
  // The length passed is sizeof the literal, so it also counts the
  // literal's terminating NUL; sortify_title() walks this buffer as a
  // sequence of NUL-terminated words.
  string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES));
  // Multiple occurrences of fields are separated by FIELD_SEPARATOR.
  const char FIELD_SEPARATOR = '\0';
  // Names of the fields whose repeated occurrences are accumulated
  // (joined with FIELD_SEPARATOR) instead of replacing one another.
  const char MULTI_FIELD_NAMES[] = "AE";
  // Fields consulted, in this order, when the authors of a reference
  // are wanted.
  const char *AUTHOR_FIELDS = "AQ";
  // Reference categories as returned by reference::classify(); each
  // value is also the index of its name in reference_types.
  enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM };
  const char *reference_types[] = {
    "other",
    "journal-article",
    "book",
    "article-in-book",
    "tech-report",
    "bell-tm",
  };
  // Scratch storage, indexed by field name, used while the fields of a
  // reference are being collected (constructor) or combined (merge).
  static string temp_fields[256];
  // Build a reference from the LEN bytes of raw record text at START.
  // Every field of the record starts on a line beginning with `%'
  // followed by a one-character field name.  If RIDP is non-null, the
  // identifier it points to is copied into rid.  A null START or a
  // non-positive LEN produces a reference without any fields.
  reference::reference(const char *start, int len, reference_id *ridp)
  : h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
    computed_authors(0), last_needed_author(-1), nauthors(-1)
  {
    int i;
    // Initially no field name maps to a slot in the field array.
    for (i = 0; i < 256; i++)
      field_index[i] = NULL_FIELD_INDEX;
    if (ridp)
      rid = *ridp;
    if (start == 0)
      return;
    if (len <= 0)
      return;
    const char *end = start + len;
    const char *ptr = start;
    assert(*ptr == '%');
    // At the top of each iteration ptr is on the `%' that opens a field.
    while (ptr < end) {
      // Case 1: text kept line by line.  This is either the annotation
      // field (`%X' with X == annotation_field) or a `%%X' field whose
      // name X is not listed in discard_fields.
      if (ptr + 1 < end && ptr[1] != '\0'
          && ((ptr[1] != '%' && ptr[1] == annotation_field)
              || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
                  && discard_fields.search(ptr[2]) < 0))) {
        // For the `%%X' form step over the first `%' so that ptr[1] is
        // the field name in both forms.
        if (ptr[1] == '%')
          ptr++;
        string &f = temp_fields[(unsigned char)ptr[1]];
        ptr += 2;
        // Skip the white space between the field name and its text.
        while (ptr < end && csspace(*ptr))
          ptr++;
        // Append whole lines, newlines included, until the input ends or
        // a line starting with `%' opens the next field.
        for (;;) {
          for (;;) {
            if (ptr >= end) {
              // The input ended in mid-line: terminate the line ourselves.
              f += '\n';
              break;
            }
            f += *ptr;
            if (*ptr++ == '\n')
              break;
          }
          if (ptr >= end || *ptr == '%')
            break;
        }
      }
      // Case 2: an ordinary `%X' field whose name is not in
      // discard_fields.
      else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
               && discard_fields.search(ptr[1]) < 0) {
        string &f = temp_fields[(unsigned char)ptr[1]];
        // The field has been seen before: either start another
        // occurrence after a FIELD_SEPARATOR or throw away the earlier
        // value, depending on whether the name is in MULTI_FIELD_NAMES.
        if (f.length() > 0) {
          if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
            f += FIELD_SEPARATOR;
          else
            f.clear();
        }
        ptr += 2;
        if (ptr < end) {
          // At most one space after the field name is dropped.
          if (*ptr == ' ')
            ptr++;
          // Copy the field one line per iteration; continuation lines
          // are joined with a single space.
          for (;;) {
            const char *p = ptr;
            while (ptr < end && *ptr != '\n')
              ptr++;
            // strip trailing white space
            const char *q = ptr;
            while (q > p && q[-1] != '\n' && csspace(q[-1]))
              q--;
            while (p < q)
              f += *p++;
            if (ptr >= end)
              break;
            // Step over the newline; stop if that was the end of the
            // input or if the next line opens a new field.
            ptr++;
            if (ptr >= end)
              break;
            if (*ptr == '%')
              break;
            f += ' ';
          }
        }
      }
      // Case 3: anything else is ignored.
      else {
        // skip this field
        for (;;) {
          while (ptr < end && *ptr++ != '\n')
            ;
          if (ptr >= end || *ptr == '%')
            break;
        }
      }
    }
    // Count the fields that received text, then move them out of the
    // scratch array into a freshly allocated array, in order of field
    // name.
    for (i = 0; i < 256; i++)
      if (temp_fields[i].length() > 0)
        nfields++;
    field = new string[nfields];
    int j = 0;
    for (i = 0; i < 256; i++)
      if (temp_fields[i].length() > 0) {
        field[j].move(temp_fields[i]);
        // Fields named in abbreviate_fields have their names abbreviated.
        if (abbreviate_fields.search(i) >= 0)
          abbreviate_names(field[j]);
        field_index[i] = j;
        j++;
      }
  }
  139. reference::~reference()
  140. {
  141. if (nfields > 0)
  142. ad_delete(nfields) field;
  143. }
  144. // ref is the inline, this is the database ref
  145. void reference::merge(reference &ref)
  146. {
  147. int i;
  148. for (i = 0; i < 256; i++)
  149. if (field_index[i] != NULL_FIELD_INDEX)
  150. temp_fields[i].move(field[field_index[i]]);
  151. for (i = 0; i < 256; i++)
  152. if (ref.field_index[i] != NULL_FIELD_INDEX)
  153. temp_fields[i].move(ref.field[ref.field_index[i]]);
  154. for (i = 0; i < 256; i++)
  155. field_index[i] = NULL_FIELD_INDEX;
  156. int old_nfields = nfields;
  157. nfields = 0;
  158. for (i = 0; i < 256; i++)
  159. if (temp_fields[i].length() > 0)
  160. nfields++;
  161. if (nfields != old_nfields) {
  162. if (old_nfields > 0)
  163. ad_delete(old_nfields) field;
  164. field = new string[nfields];
  165. }
  166. int j = 0;
  167. for (i = 0; i < 256; i++)
  168. if (temp_fields[i].length() > 0) {
  169. field[j].move(temp_fields[i]);
  170. field_index[i] = j;
  171. j++;
  172. }
  173. merged = 1;
  174. }
  175. void reference::insert_field(unsigned char c, string &s)
  176. {
  177. assert(s.length() > 0);
  178. if (field_index[c] != NULL_FIELD_INDEX) {
  179. field[field_index[c]].move(s);
  180. return;
  181. }
  182. assert(field_index[c] == NULL_FIELD_INDEX);
  183. string *old_field = field;
  184. field = new string[nfields + 1];
  185. int pos = 0;
  186. int i;
  187. for (i = 0; i < int(c); i++)
  188. if (field_index[i] != NULL_FIELD_INDEX)
  189. pos++;
  190. for (i = 0; i < pos; i++)
  191. field[i].move(old_field[i]);
  192. field[pos].move(s);
  193. for (i = pos; i < nfields; i++)
  194. field[i + 1].move(old_field[i]);
  195. if (nfields > 0)
  196. ad_delete(nfields) old_field;
  197. nfields++;
  198. field_index[c] = pos;
  199. for (i = c + 1; i < 256; i++)
  200. if (field_index[i] != NULL_FIELD_INDEX)
  201. field_index[i] += 1;
  202. }
  203. void reference::delete_field(unsigned char c)
  204. {
  205. if (field_index[c] == NULL_FIELD_INDEX)
  206. return;
  207. string *old_field = field;
  208. field = new string[nfields - 1];
  209. int i;
  210. for (i = 0; i < int(field_index[c]); i++)
  211. field[i].move(old_field[i]);
  212. for (i = field_index[c]; i < nfields - 1; i++)
  213. field[i].move(old_field[i + 1]);
  214. if (nfields > 0)
  215. ad_delete(nfields) old_field;
  216. nfields--;
  217. field_index[c] = NULL_FIELD_INDEX;
  218. for (i = c + 1; i < 256; i++)
  219. if (field_index[i] != NULL_FIELD_INDEX)
  220. field_index[i] -= 1;
  221. }
  222. void reference::compute_hash_code()
  223. {
  224. if (!rid.is_null())
  225. h = rid.hash();
  226. else {
  227. h = 0;
  228. for (int i = 0; i < nfields; i++)
  229. if (field[i].length() > 0) {
  230. h <<= 4;
  231. h ^= hash_string(field[i].contents(), field[i].length());
  232. }
  233. }
  234. }
  235. void reference::set_number(int n)
  236. {
  237. no = n;
  238. }
  // Separators used inside sort keys.  compute_sort_key() puts SORT_SEP
  // in front of the key of each sort field, sortify_field() puts
  // SORT_SUB_SEP between the occurrences of a multiple field, and
  // sortify_name() puts SORT_SUB_SUB_SEP between the parts of a name.
  const char SORT_SEP = '\001';
  const char SORT_SUB_SEP = '\002';
  const char SORT_SUB_SUB_SEP = '\003';
  242. // sep specifies additional word separators
  243. void sortify_words(const char *s, const char *end, const char *sep,
  244. string &result)
  245. {
  246. int non_empty = 0;
  247. int need_separator = 0;
  248. for (;;) {
  249. const char *token_start = s;
  250. if (!get_token(&s, end))
  251. break;
  252. if ((s - token_start == 1
  253. && (*token_start == ' '
  254. || *token_start == '\n'
  255. || (sep && *token_start != '\0'
  256. && strchr(sep, *token_start) != 0)))
  257. || (s - token_start == 2
  258. && token_start[0] == '\\' && token_start[1] == ' ')) {
  259. if (non_empty)
  260. need_separator = 1;
  261. }
  262. else {
  263. const token_info *ti = lookup_token(token_start, s);
  264. if (ti->sortify_non_empty(token_start, s)) {
  265. if (need_separator) {
  266. result += ' ';
  267. need_separator = 0;
  268. }
  269. ti->sortify(token_start, s, result);
  270. non_empty = 1;
  271. }
  272. }
  273. }
  274. }
  275. void sortify_word(const char *s, const char *end, string &result)
  276. {
  277. for (;;) {
  278. const char *token_start = s;
  279. if (!get_token(&s, end))
  280. break;
  281. const token_info *ti = lookup_token(token_start, s);
  282. ti->sortify(token_start, s, result);
  283. }
  284. }
  285. void sortify_other(const char *s, int len, string &key)
  286. {
  287. sortify_words(s, s + len, 0, key);
  288. }
  289. void sortify_title(const char *s, int len, string &key)
  290. {
  291. const char *end = s + len;
  292. for (; s < end && (*s == ' ' || *s == '\n'); s++)
  293. ;
  294. const char *ptr = s;
  295. for (;;) {
  296. const char *token_start = ptr;
  297. if (!get_token(&ptr, end))
  298. break;
  299. if (ptr - token_start == 1
  300. && (*token_start == ' ' || *token_start == '\n'))
  301. break;
  302. }
  303. if (ptr < end) {
  304. unsigned int first_word_len = ptr - s - 1;
  305. const char *ae = articles.contents() + articles.length();
  306. for (const char *a = articles.contents();
  307. a < ae;
  308. a = strchr(a, '\0') + 1)
  309. if (first_word_len == strlen(a)) {
  310. unsigned int j;
  311. for (j = 0; j < first_word_len; j++)
  312. if (a[j] != cmlower(s[j]))
  313. break;
  314. if (j >= first_word_len) {
  315. s = ptr;
  316. for (; s < end && (*s == ' ' || *s == '\n'); s++)
  317. ;
  318. break;
  319. }
  320. }
  321. }
  322. sortify_words(s, end, 0, key);
  323. }
  324. void sortify_name(const char *s, int len, string &key)
  325. {
  326. const char *last_name_end;
  327. const char *last_name = find_last_name(s, s + len, &last_name_end);
  328. sortify_word(last_name, last_name_end, key);
  329. key += SORT_SUB_SUB_SEP;
  330. if (last_name > s)
  331. sortify_words(s, last_name, ".", key);
  332. key += SORT_SUB_SUB_SEP;
  333. if (last_name_end < s + len)
  334. sortify_words(last_name_end, s + len, ".,", key);
  335. }
  336. void sortify_date(const char *s, int len, string &key)
  337. {
  338. const char *year_end;
  339. const char *year_start = find_year(s, s + len, &year_end);
  340. if (!year_start) {
  341. // Things without years are often `forthcoming', so it makes sense
  342. // that they sort after things with explicit years.
  343. key += 'A';
  344. sortify_words(s, s + len, 0, key);
  345. return;
  346. }
  347. int n = year_end - year_start;
  348. while (n < 4) {
  349. key += '0';
  350. n++;
  351. }
  352. while (year_start < year_end)
  353. key += *year_start++;
  354. int m = find_month(s, s + len);
  355. if (m < 0)
  356. return;
  357. key += 'A' + m;
  358. const char *day_end;
  359. const char *day_start = find_day(s, s + len, &day_end);
  360. if (!day_start)
  361. return;
  362. if (day_end - day_start == 1)
  363. key += '0';
  364. while (day_start < day_end)
  365. key += *day_start++;
  366. }
  367. // SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification.
  368. void sortify_label(const char *s, int len, string &key)
  369. {
  370. const char *end = s + len;
  371. for (;;) {
  372. const char *ptr;
  373. for (ptr = s;
  374. ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP;
  375. ptr++)
  376. ;
  377. if (ptr > s)
  378. sortify_words(s, ptr, 0, key);
  379. s = ptr;
  380. if (s >= end)
  381. break;
  382. key += *s++;
  383. }
  384. }
  385. void reference::compute_sort_key()
  386. {
  387. if (sort_fields.length() == 0)
  388. return;
  389. sort_fields += '\0';
  390. const char *sf = sort_fields.contents();
  391. while (*sf != '\0') {
  392. sort_key += SORT_SEP;
  393. char f = *sf++;
  394. int n = 1;
  395. if (*sf == '+') {
  396. n = INT_MAX;
  397. sf++;
  398. }
  399. else if (csdigit(*sf)) {
  400. char *ptr;
  401. long l = strtol(sf, &ptr, 10);
  402. if (l == 0 && ptr == sf)
  403. ;
  404. else {
  405. sf = ptr;
  406. if (l < 0) {
  407. n = 1;
  408. }
  409. else {
  410. n = int(l);
  411. }
  412. }
  413. }
  414. if (f == '.')
  415. sortify_label(label.contents(), label.length(), sort_key);
  416. else if (f == AUTHOR_FIELDS[0])
  417. sortify_authors(n, sort_key);
  418. else
  419. sortify_field(f, n, sort_key);
  420. }
  421. sort_fields.set_length(sort_fields.length() - 1);
  422. }
  423. void reference::sortify_authors(int n, string &result) const
  424. {
  425. for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++)
  426. if (contains_field(*p)) {
  427. sortify_field(*p, n, result);
  428. return;
  429. }
  430. sortify_field(AUTHOR_FIELDS[0], n, result);
  431. }
  432. void reference::canonicalize_authors(string &result) const
  433. {
  434. int len = result.length();
  435. sortify_authors(INT_MAX, result);
  436. if (result.length() > len)
  437. result += SORT_SUB_SEP;
  438. }
  439. void reference::sortify_field(unsigned char f, int n, string &result) const
  440. {
  441. typedef void (*sortify_t)(const char *, int, string &);
  442. sortify_t sortifier = sortify_other;
  443. switch (f) {
  444. case 'A':
  445. case 'E':
  446. sortifier = sortify_name;
  447. break;
  448. case 'D':
  449. sortifier = sortify_date;
  450. break;
  451. case 'B':
  452. case 'J':
  453. case 'T':
  454. sortifier = sortify_title;
  455. break;
  456. }
  457. int fi = field_index[(unsigned char)f];
  458. if (fi != NULL_FIELD_INDEX) {
  459. string &str = field[fi];
  460. const char *start = str.contents();
  461. const char *end = start + str.length();
  462. for (int i = 0; i < n && start < end; i++) {
  463. const char *p = start;
  464. while (start < end && *start != FIELD_SEPARATOR)
  465. start++;
  466. if (i > 0)
  467. result += SORT_SUB_SEP;
  468. (*sortifier)(p, start - p, result);
  469. if (start < end)
  470. start++;
  471. }
  472. }
  473. }
  474. int compare_reference(const reference &r1, const reference &r2)
  475. {
  476. assert(r1.no >= 0);
  477. assert(r2.no >= 0);
  478. const char *s1 = r1.sort_key.contents();
  479. int n1 = r1.sort_key.length();
  480. const char *s2 = r2.sort_key.contents();
  481. int n2 = r2.sort_key.length();
  482. for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2)
  483. if (*s1 != *s2)
  484. return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
  485. if (n2 > 0)
  486. return -1;
  487. if (n1 > 0)
  488. return 1;
  489. return r1.no - r2.no;
  490. }
  491. int same_reference(const reference &r1, const reference &r2)
  492. {
  493. if (!r1.rid.is_null() && r1.rid == r2.rid)
  494. return 1;
  495. if (r1.h != r2.h)
  496. return 0;
  497. if (r1.nfields != r2.nfields)
  498. return 0;
  499. int i = 0;
  500. for (i = 0; i < 256; i++)
  501. if (r1.field_index != r2.field_index)
  502. return 0;
  503. for (i = 0; i < r1.nfields; i++)
  504. if (r1.field[i] != r2.field[i])
  505. return 0;
  506. return 1;
  507. }
  508. const char *find_last_name(const char *start, const char *end,
  509. const char **endp)
  510. {
  511. const char *ptr = start;
  512. const char *last_word = start;
  513. for (;;) {
  514. const char *token_start = ptr;
  515. if (!get_token(&ptr, end))
  516. break;
  517. if (ptr - token_start == 1) {
  518. if (*token_start == ',') {
  519. *endp = token_start;
  520. return last_word;
  521. }
  522. else if (*token_start == ' ' || *token_start == '\n') {
  523. if (ptr < end && *ptr != ' ' && *ptr != '\n')
  524. last_word = ptr;
  525. }
  526. }
  527. }
  528. *endp = end;
  529. return last_word;
  530. }
  // Append an abbreviated form of the name [ptr, end) to RESULT.
  // Only the part in front of the last name (as located by
  // find_last_name) is abbreviated; everything from the start of the
  // last name up to END is appended unchanged.  In the abbreviated part,
  // a word that begins with an upper-case token is reduced to that token
  // (see the inner loop for the exceptions) and is followed by one of
  // the period_before_* strings, chosen by what comes next.
  void abbreviate_name(const char *ptr, const char *end, string &result)
  {
    const char *last_name_end;
    const char *last_name_start = find_last_name(ptr, end, &last_name_end);
    // Set once a word has been shortened and the string that follows the
    // initial has not been emitted yet.
    int need_period = 0;
    for (;;) {
      const char *token_start = ptr;
      if (!get_token(&ptr, last_name_start))
        break;
      const token_info *ti = lookup_token(token_start, ptr);
      if (need_period) {
        // Spaces (plain or backslash-space) after a shortened word are
        // dropped.
        if ((ptr - token_start == 1 && *token_start == ' ')
            || (ptr - token_start == 2 && token_start[0] == '\\'
                && token_start[1] == ' '))
          continue;
        // The string emitted after the initial depends on whether the
        // next token is upper case.
        if (ti->is_upper())
          result += period_before_initial;
        else
          result += period_before_other;
        need_period = 0;
      }
      result.append(token_start, ptr - token_start);
      if (ti->is_upper()) {
        // The token just copied starts a word to be shortened: consume
        // the rest of the word, copying only selected pieces of it.
        // lower_ptr marks the start of the text that has been skipped
        // since the last piece was copied.
        const char *lower_ptr = ptr;
        int first_token = 1;
        for (;;) {
          token_start = ptr;
          if (!get_token(&ptr, last_name_start))
            break;
          // A space (plain or backslash-space) ends the word.
          if ((ptr - token_start == 1 && *token_start == ' ')
              || (ptr - token_start == 2 && token_start[0] == '\\'
                  && token_start[1] == ' '))
            break;
          ti = lookup_token(token_start, ptr);
          if (ti->is_hyphen()) {
            // A hyphen directly followed by an upper-case token: emit
            // period_before_hyphen, then the hyphen and that token.
            const char *ptr1 = ptr;
            if (get_token(&ptr1, last_name_start)) {
              ti = lookup_token(ptr, ptr1);
              if (ti->is_upper()) {
                result += period_before_hyphen;
                result.append(token_start, ptr1 - token_start);
                ptr = ptr1;
              }
            }
          }
          else if (ti->is_upper()) {
            // MacDougal -> MacD.
            result.append(lower_ptr, ptr - lower_ptr);
            lower_ptr = ptr;
            first_token = 1;
          }
          else if (first_token && ti->is_accent()) {
            // An accent token directly after the upper-case token is
            // kept with it.
            result.append(token_start, ptr - token_start);
            lower_ptr = ptr;
          }
          first_token = 0;
        }
        need_period = 1;
      }
    }
    // A shortened word directly in front of the last name.
    if (need_period)
      result += period_before_last_name;
    result.append(last_name_start, end - last_name_start);
  }
  595. static void abbreviate_names(string &result)
  596. {
  597. string str;
  598. str.move(result);
  599. const char *ptr = str.contents();
  600. const char *end = ptr + str.length();
  601. while (ptr < end) {
  602. const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
  603. if (name_end == 0)
  604. name_end = end;
  605. abbreviate_name(ptr, name_end, result);
  606. if (name_end >= end)
  607. break;
  608. ptr = name_end + 1;
  609. result += FIELD_SEPARATOR;
  610. }
  611. }
  612. void reverse_name(const char *ptr, const char *name_end, string &result)
  613. {
  614. const char *last_name_end;
  615. const char *last_name_start = find_last_name(ptr, name_end, &last_name_end);
  616. result.append(last_name_start, last_name_end - last_name_start);
  617. while (last_name_start > ptr
  618. && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n'))
  619. last_name_start--;
  620. if (last_name_start > ptr) {
  621. result += ", ";
  622. result.append(ptr, last_name_start - ptr);
  623. }
  624. if (last_name_end < name_end)
  625. result.append(last_name_end, name_end - last_name_end);
  626. }
  627. void reverse_names(string &result, int n)
  628. {
  629. if (n <= 0)
  630. return;
  631. string str;
  632. str.move(result);
  633. const char *ptr = str.contents();
  634. const char *end = ptr + str.length();
  635. while (ptr < end) {
  636. if (--n < 0) {
  637. result.append(ptr, end - ptr);
  638. break;
  639. }
  640. const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
  641. if (name_end == 0)
  642. name_end = end;
  643. reverse_name(ptr, name_end, result);
  644. if (name_end >= end)
  645. break;
  646. ptr = name_end + 1;
  647. result += FIELD_SEPARATOR;
  648. }
  649. }
  650. // Return number of field separators.
  651. int join_fields(string &f)
  652. {
  653. const char *ptr = f.contents();
  654. int len = f.length();
  655. int nfield_seps = 0;
  656. int j;
  657. for (j = 0; j < len; j++)
  658. if (ptr[j] == FIELD_SEPARATOR)
  659. nfield_seps++;
  660. if (nfield_seps == 0)
  661. return 0;
  662. string temp;
  663. int field_seps_left = nfield_seps;
  664. for (j = 0; j < len; j++) {
  665. if (ptr[j] == FIELD_SEPARATOR) {
  666. if (nfield_seps == 1)
  667. temp += join_authors_exactly_two;
  668. else if (--field_seps_left == 0)
  669. temp += join_authors_last_two;
  670. else
  671. temp += join_authors_default;
  672. }
  673. else
  674. temp += ptr[j];
  675. }
  676. f = temp;
  677. return nfield_seps;
  678. }
  679. void uppercase(const char *start, const char *end, string &result)
  680. {
  681. for (;;) {
  682. const char *token_start = start;
  683. if (!get_token(&start, end))
  684. break;
  685. const token_info *ti = lookup_token(token_start, start);
  686. ti->upper_case(token_start, start, result);
  687. }
  688. }
  689. void lowercase(const char *start, const char *end, string &result)
  690. {
  691. for (;;) {
  692. const char *token_start = start;
  693. if (!get_token(&start, end))
  694. break;
  695. const token_info *ti = lookup_token(token_start, start);
  696. ti->lower_case(token_start, start, result);
  697. }
  698. }
  699. void capitalize(const char *ptr, const char *end, string &result)
  700. {
  701. int in_small_point_size = 0;
  702. for (;;) {
  703. const char *start = ptr;
  704. if (!get_token(&ptr, end))
  705. break;
  706. const token_info *ti = lookup_token(start, ptr);
  707. const char *char_end = ptr;
  708. int is_lower = ti->is_lower();
  709. if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) {
  710. const token_info *ti2 = lookup_token(char_end, ptr);
  711. if (!ti2->is_accent())
  712. ptr = char_end;
  713. }
  714. if (is_lower) {
  715. if (!in_small_point_size) {
  716. result += "\\s-2";
  717. in_small_point_size = 1;
  718. }
  719. ti->upper_case(start, char_end, result);
  720. result.append(char_end, ptr - char_end);
  721. }
  722. else {
  723. if (in_small_point_size) {
  724. result += "\\s+2";
  725. in_small_point_size = 0;
  726. }
  727. result.append(start, ptr - start);
  728. }
  729. }
  730. if (in_small_point_size)
  731. result += "\\s+2";
  732. }
  733. void capitalize_field(string &str)
  734. {
  735. string temp;
  736. capitalize(str.contents(), str.contents() + str.length(), temp);
  737. str.move(temp);
  738. }
  739. int is_terminated(const char *ptr, const char *end)
  740. {
  741. const char *last_token = end;
  742. for (;;) {
  743. const char *p = ptr;
  744. if (!get_token(&ptr, end))
  745. break;
  746. last_token = p;
  747. }
  748. return end - last_token == 1
  749. && (*last_token == '.' || *last_token == '!' || *last_token == '?');
  750. }
  751. void reference::output(FILE *fp)
  752. {
  753. fputs(".]-\n", fp);
  754. for (int i = 0; i < 256; i++)
  755. if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) {
  756. string &f = field[field_index[i]];
  757. if (!csdigit(i)) {
  758. int j = reverse_fields.search(i);
  759. if (j >= 0) {
  760. int n;
  761. int len = reverse_fields.length();
  762. if (++j < len && csdigit(reverse_fields[j])) {
  763. n = reverse_fields[j] - '0';
  764. for (++j; j < len && csdigit(reverse_fields[j]); j++)
  765. // should check for overflow
  766. n = n*10 + reverse_fields[j] - '0';
  767. }
  768. else
  769. n = INT_MAX;
  770. reverse_names(f, n);
  771. }
  772. }
  773. int is_multiple = join_fields(f) > 0;
  774. if (capitalize_fields.search(i) >= 0)
  775. capitalize_field(f);
  776. if (memchr(f.contents(), '\n', f.length()) == 0) {
  777. fprintf(fp, ".ds [%c ", i);
  778. if (f[0] == ' ' || f[0] == '\\' || f[0] == '"')
  779. putc('"', fp);
  780. put_string(f, fp);
  781. putc('\n', fp);
  782. }
  783. else {
  784. fprintf(fp, ".de [%c\n", i);
  785. put_string(f, fp);
  786. fputs("..\n", fp);
  787. }
  788. if (i == 'P') {
  789. int multiple_pages = 0;
  790. const char *s = f.contents();
  791. const char *end = f.contents() + f.length();
  792. for (;;) {
  793. const char *token_start = s;
  794. if (!get_token(&s, end))
  795. break;
  796. const token_info *ti = lookup_token(token_start, s);
  797. if (ti->is_hyphen() || ti->is_range_sep()) {
  798. multiple_pages = 1;
  799. break;
  800. }
  801. }
  802. fprintf(fp, ".nr [P %d\n", multiple_pages);
  803. }
  804. else if (i == 'E')
  805. fprintf(fp, ".nr [E %d\n", is_multiple);
  806. }
  807. for (const char *p = "TAO"; *p; p++) {
  808. int fi = field_index[(unsigned char)*p];
  809. if (fi != NULL_FIELD_INDEX) {
  810. string &f = field[fi];
  811. fprintf(fp, ".nr [%c %d\n", *p,
  812. is_terminated(f.contents(), f.contents() + f.length()));
  813. }
  814. }
  815. int t = classify();
  816. fprintf(fp, ".][ %d %s\n", t, reference_types[t]);
  817. if (annotation_macro.length() > 0 && annotation_field >= 0
  818. && field_index[annotation_field] != NULL_FIELD_INDEX) {
  819. putc('.', fp);
  820. put_string(annotation_macro, fp);
  821. putc('\n', fp);
  822. put_string(field[field_index[annotation_field]], fp);
  823. }
  824. }
  825. void reference::print_sort_key_comment(FILE *fp)
  826. {
  827. fputs(".\\\"", fp);
  828. put_string(sort_key, fp);
  829. putc('\n', fp);
  830. }
  831. const char *find_year(const char *start, const char *end, const char **endp)
  832. {
  833. for (;;) {
  834. while (start < end && !csdigit(*start))
  835. start++;
  836. const char *ptr = start;
  837. if (start == end)
  838. break;
  839. while (ptr < end && csdigit(*ptr))
  840. ptr++;
  841. if (ptr - start == 4 || ptr - start == 3
  842. || (ptr - start == 2
  843. && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) {
  844. *endp = ptr;
  845. return start;
  846. }
  847. start = ptr;
  848. }
  849. return 0;
  850. }
  851. static const char *find_day(const char *start, const char *end,
  852. const char **endp)
  853. {
  854. for (;;) {
  855. while (start < end && !csdigit(*start))
  856. start++;
  857. const char *ptr = start;
  858. if (start == end)
  859. break;
  860. while (ptr < end && csdigit(*ptr))
  861. ptr++;
  862. if ((ptr - start == 1 && start[0] != '0')
  863. || (ptr - start == 2 &&
  864. (start[0] == '1'
  865. || start[0] == '2'
  866. || (start[0] == '3' && start[1] <= '1')
  867. || (start[0] == '0' && start[1] != '0')))) {
  868. *endp = ptr;
  869. return start;
  870. }
  871. start = ptr;
  872. }
  873. return 0;
  874. }
  875. static int find_month(const char *start, const char *end)
  876. {
  877. static const char *months[] = {
  878. "january",
  879. "february",
  880. "march",
  881. "april",
  882. "may",
  883. "june",
  884. "july",
  885. "august",
  886. "september",
  887. "october",
  888. "november",
  889. "december",
  890. };
  891. for (;;) {
  892. while (start < end && !csalpha(*start))
  893. start++;
  894. const char *ptr = start;
  895. if (start == end)
  896. break;
  897. while (ptr < end && csalpha(*ptr))
  898. ptr++;
  899. if (ptr - start >= 3) {
  900. for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) {
  901. const char *q = months[i];
  902. const char *p = start;
  903. for (; p < ptr; p++, q++)
  904. if (cmlower(*p) != *q)
  905. break;
  906. if (p >= ptr)
  907. return i;
  908. }
  909. }
  910. start = ptr;
  911. }
  912. return -1;
  913. }
  914. int reference::contains_field(char c) const
  915. {
  916. return field_index[(unsigned char)c] != NULL_FIELD_INDEX;
  917. }
  918. int reference::classify()
  919. {
  920. if (contains_field('J'))
  921. return JOURNAL_ARTICLE;
  922. if (contains_field('B'))
  923. return ARTICLE_IN_BOOK;
  924. if (contains_field('G'))
  925. return TECH_REPORT;
  926. if (contains_field('R'))
  927. return TECH_REPORT;
  928. if (contains_field('I'))
  929. return BOOK;
  930. if (contains_field('M'))
  931. return BELL_TM;
  932. return OTHER;
  933. }
  934. const char *reference::get_year(const char **endp) const
  935. {
  936. if (field_index['D'] != NULL_FIELD_INDEX) {
  937. string &date = field[field_index['D']];
  938. const char *start = date.contents();
  939. const char *end = start + date.length();
  940. return find_year(start, end, endp);
  941. }
  942. else
  943. return 0;
  944. }
  945. const char *reference::get_field(unsigned char c, const char **endp) const
  946. {
  947. if (field_index[c] != NULL_FIELD_INDEX) {
  948. string &f = field[field_index[c]];
  949. const char *start = f.contents();
  950. *endp = start + f.length();
  951. return start;
  952. }
  953. else
  954. return 0;
  955. }
  956. const char *reference::get_date(const char **endp) const
  957. {
  958. return get_field('D', endp);
  959. }
  960. const char *nth_field(int i, const char *start, const char **endp)
  961. {
  962. while (--i >= 0) {
  963. start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
  964. if (!start)
  965. return 0;
  966. start++;
  967. }
  968. const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
  969. if (e)
  970. *endp = e;
  971. return start;
  972. }
  973. const char *reference::get_author(int i, const char **endp) const
  974. {
  975. for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
  976. const char *start = get_field(*f, endp);
  977. if (start) {
  978. if (strchr(MULTI_FIELD_NAMES, *f) != 0)
  979. return nth_field(i, start, endp);
  980. else if (i == 0)
  981. return start;
  982. else
  983. return 0;
  984. }
  985. }
  986. return 0;
  987. }
  988. const char *reference::get_author_last_name(int i, const char **endp) const
  989. {
  990. for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
  991. const char *start = get_field(*f, endp);
  992. if (start) {
  993. if (strchr(MULTI_FIELD_NAMES, *f) != 0) {
  994. start = nth_field(i, start, endp);
  995. if (!start)
  996. return 0;
  997. }
  998. if (*f == 'A')
  999. return find_last_name(start, *endp, endp);
  1000. else
  1001. return start;
  1002. }
  1003. }
  1004. return 0;
  1005. }
  1006. void reference::set_date(string &d)
  1007. {
  1008. if (d.length() == 0)
  1009. delete_field('D');
  1010. else
  1011. insert_field('D', d);
  1012. }
  1013. int same_year(const reference &r1, const reference &r2)
  1014. {
  1015. const char *ye1;
  1016. const char *ys1 = r1.get_year(&ye1);
  1017. const char *ye2;
  1018. const char *ys2 = r2.get_year(&ye2);
  1019. if (ys1 == 0) {
  1020. if (ys2 == 0)
  1021. return same_date(r1, r2);
  1022. else
  1023. return 0;
  1024. }
  1025. else if (ys2 == 0)
  1026. return 0;
  1027. else if (ye1 - ys1 != ye2 - ys2)
  1028. return 0;
  1029. else
  1030. return memcmp(ys1, ys2, ye1 - ys1) == 0;
  1031. }
  1032. int same_date(const reference &r1, const reference &r2)
  1033. {
  1034. const char *e1;
  1035. const char *s1 = r1.get_date(&e1);
  1036. const char *e2;
  1037. const char *s2 = r2.get_date(&e2);
  1038. if (s1 == 0)
  1039. return s2 == 0;
  1040. else if (s2 == 0)
  1041. return 0;
  1042. else if (e1 - s1 != e2 - s2)
  1043. return 0;
  1044. else
  1045. return memcmp(s1, s2, e1 - s1) == 0;
  1046. }
  1047. const char *reference::get_sort_field(int i, int si, int ssi,
  1048. const char **endp) const
  1049. {
  1050. const char *start = sort_key.contents();
  1051. const char *end = start + sort_key.length();
  1052. if (i < 0) {
  1053. *endp = end;
  1054. return start;
  1055. }
  1056. while (--i >= 0) {
  1057. start = (char *)memchr(start, SORT_SEP, end - start);
  1058. if (!start)
  1059. return 0;
  1060. start++;
  1061. }
  1062. const char *e = (char *)memchr(start, SORT_SEP, end - start);
  1063. if (e)
  1064. end = e;
  1065. if (si < 0) {
  1066. *endp = end;
  1067. return start;
  1068. }
  1069. while (--si >= 0) {
  1070. start = (char *)memchr(start, SORT_SUB_SEP, end - start);
  1071. if (!start)
  1072. return 0;
  1073. start++;
  1074. }
  1075. e = (char *)memchr(start, SORT_SUB_SEP, end - start);
  1076. if (e)
  1077. end = e;
  1078. if (ssi < 0) {
  1079. *endp = end;
  1080. return start;
  1081. }
  1082. while (--ssi >= 0) {
  1083. start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
  1084. if (!start)
  1085. return 0;
  1086. start++;
  1087. }
  1088. e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
  1089. if (e)
  1090. end = e;
  1091. *endp = end;
  1092. return start;
  1093. }