PageRenderTime 164ms CodeModel.GetById 64ms app.highlight 91ms RepoModel.GetById 1ms app.codeStats 0ms

/contrib/groff/src/preproc/refer/ref.cpp

https://bitbucket.org/freebsd/freebsd-head/
C++ | 1160 lines | 1066 code | 63 blank | 31 comment | 494 complexity | da905471f9a602cda2847984410a02ce MD5 | raw file
   1// -*- C++ -*-
   2/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2003
   3   Free Software Foundation, Inc.
   4Written by James Clark (jjc@jclark.com)
   5
   6This file is part of groff.
   7
   8groff is free software; you can redistribute it and/or modify it under
   9the terms of the GNU General Public License as published by the Free
  10Software Foundation; either version 2, or (at your option) any later
  11version.
  12
  13groff is distributed in the hope that it will be useful, but WITHOUT ANY
  14WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16for more details.
  17
  18You should have received a copy of the GNU General Public License along
  19with groff; see the file COPYING.  If not, write to the Free Software
  20Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
  21     
  22#include "refer.h"
  23#include "refid.h"
  24#include "ref.h"
  25#include "token.h"
  26
  27static const char *find_day(const char *, const char *, const char **);
  28static int find_month(const char *start, const char *end);
  29static void abbreviate_names(string &);
  30
  31#define DEFAULT_ARTICLES "the\000a\000an"
  32     
  33string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES));
  34
  35// Multiple occurrences of fields are separated by FIELD_SEPARATOR.
  36const char FIELD_SEPARATOR = '\0';
  37
  38const char MULTI_FIELD_NAMES[] = "AE";
  39const char *AUTHOR_FIELDS = "AQ";
  40
  41enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM };
  42
  43const char *reference_types[] = {
  44  "other",
  45  "journal-article",
  46  "book",
  47  "article-in-book",
  48  "tech-report",
  49  "bell-tm",
  50};
  51
  52static string temp_fields[256];
  53
  54reference::reference(const char *start, int len, reference_id *ridp)
  55: h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
  56  computed_authors(0), last_needed_author(-1), nauthors(-1)
  57{
  58  int i;
  59  for (i = 0; i < 256; i++)
  60    field_index[i] = NULL_FIELD_INDEX;
  61  if (ridp)
  62    rid = *ridp;
  63  if (start == 0)
  64    return;
  65  if (len <= 0)
  66    return;
  67  const char *end = start + len;
  68  const char *ptr = start;
  69  assert(*ptr == '%');
  70  while (ptr < end) {
  71    if (ptr + 1 < end && ptr[1] != '\0'
  72	&& ((ptr[1] != '%' && ptr[1] == annotation_field)
  73	    || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
  74		&& discard_fields.search(ptr[2]) < 0))) {
  75      if (ptr[1] == '%')
  76	ptr++;
  77      string &f = temp_fields[(unsigned char)ptr[1]];
  78      ptr += 2;
  79      while (ptr < end && csspace(*ptr))
  80	ptr++;
  81      for (;;) {
  82	for (;;) {
  83	  if (ptr >= end) {
  84	    f += '\n';
  85	    break;
  86	  }
  87	  f += *ptr;
  88	  if (*ptr++ == '\n')
  89	    break;
  90	}
  91	if (ptr >= end || *ptr == '%')
  92	  break;
  93      }
  94    }
  95    else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
  96	     && discard_fields.search(ptr[1]) < 0) {
  97      string &f = temp_fields[(unsigned char)ptr[1]];
  98      if (f.length() > 0) {
  99	if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
 100	  f += FIELD_SEPARATOR;
 101	else
 102	  f.clear();
 103      }
 104      ptr += 2;
 105      if (ptr < end) {
 106	if (*ptr == ' ')
 107	  ptr++;
 108	for (;;) {
 109	  const char *p = ptr;
 110	  while (ptr < end && *ptr != '\n')
 111	    ptr++;
 112	  // strip trailing white space
 113	  const char *q = ptr;
 114	  while (q > p && q[-1] != '\n' && csspace(q[-1]))
 115	    q--;
 116	  while (p < q)
 117	    f += *p++;
 118	  if (ptr >= end)
 119	    break;
 120	  ptr++;
 121	  if (ptr >= end)
 122	    break;
 123	  if (*ptr == '%')
 124	    break;
 125	  f += ' ';
 126	}
 127      }
 128    }
 129    else {
 130      // skip this field
 131      for (;;) {
 132	while (ptr < end && *ptr++ != '\n')
 133	  ;
 134	if (ptr >= end || *ptr == '%')
 135	  break;
 136      }
 137    }
 138  }
 139  for (i = 0; i < 256; i++)
 140    if (temp_fields[i].length() > 0)
 141      nfields++;
 142  field = new string[nfields];
 143  int j = 0;
 144  for (i = 0; i < 256; i++)
 145    if (temp_fields[i].length() > 0) {
 146      field[j].move(temp_fields[i]);
 147      if (abbreviate_fields.search(i) >= 0)
 148	abbreviate_names(field[j]);
 149      field_index[i] = j;
 150      j++;
 151    }
 152}
 153
 154reference::~reference()
 155{
 156  if (nfields > 0)
 157    ad_delete(nfields) field;
 158}
 159
 160// ref is the inline, this is the database ref
 161
 162void reference::merge(reference &ref)
 163{
 164  int i;
 165  for (i = 0; i < 256; i++)
 166    if (field_index[i] != NULL_FIELD_INDEX)
 167      temp_fields[i].move(field[field_index[i]]);
 168  for (i = 0; i < 256; i++)
 169    if (ref.field_index[i] != NULL_FIELD_INDEX)
 170      temp_fields[i].move(ref.field[ref.field_index[i]]);
 171  for (i = 0; i < 256; i++)
 172    field_index[i] = NULL_FIELD_INDEX;
 173  int old_nfields = nfields;
 174  nfields = 0;
 175  for (i = 0; i < 256; i++)
 176    if (temp_fields[i].length() > 0)
 177      nfields++;
 178  if (nfields != old_nfields) {
 179    if (old_nfields > 0)
 180      ad_delete(old_nfields) field;
 181    field = new string[nfields];
 182  }
 183  int j = 0;
 184  for (i = 0; i < 256; i++)
 185    if (temp_fields[i].length() > 0) {
 186      field[j].move(temp_fields[i]);
 187      field_index[i] = j;
 188      j++;
 189    }
 190  merged = 1;
 191}
 192
 193void reference::insert_field(unsigned char c, string &s)
 194{
 195  assert(s.length() > 0);
 196  if (field_index[c] != NULL_FIELD_INDEX) {
 197    field[field_index[c]].move(s);
 198    return;
 199  }
 200  assert(field_index[c] == NULL_FIELD_INDEX);
 201  string *old_field = field;
 202  field = new string[nfields + 1];
 203  int pos = 0;
 204  int i;
 205  for (i = 0; i < int(c); i++)
 206    if (field_index[i] != NULL_FIELD_INDEX)
 207      pos++;
 208  for (i = 0; i < pos; i++)
 209    field[i].move(old_field[i]);
 210  field[pos].move(s);
 211  for (i = pos; i < nfields; i++)
 212    field[i + 1].move(old_field[i]);
 213  if (nfields > 0)
 214    ad_delete(nfields) old_field;
 215  nfields++;
 216  field_index[c] = pos;
 217  for (i = c + 1; i < 256; i++)
 218    if (field_index[i] != NULL_FIELD_INDEX)
 219      field_index[i] += 1;
 220}
 221
 222void reference::delete_field(unsigned char c)
 223{
 224  if (field_index[c] == NULL_FIELD_INDEX)
 225    return;
 226  string *old_field = field;
 227  field = new string[nfields - 1];
 228  int i;
 229  for (i = 0; i < int(field_index[c]); i++)
 230    field[i].move(old_field[i]);
 231  for (i = field_index[c]; i < nfields - 1; i++)
 232    field[i].move(old_field[i + 1]);
 233  if (nfields > 0)
 234    ad_delete(nfields) old_field;
 235  nfields--;
 236  field_index[c] = NULL_FIELD_INDEX;
 237  for (i = c + 1; i < 256; i++)
 238    if (field_index[i] != NULL_FIELD_INDEX)
 239      field_index[i] -= 1;
 240}
 241    
 242void reference::compute_hash_code()
 243{
 244  if (!rid.is_null())
 245    h = rid.hash();
 246  else {
 247    h = 0;
 248    for (int i = 0; i < nfields; i++)
 249      if (field[i].length() > 0) {
 250	h <<= 4;
 251	h ^= hash_string(field[i].contents(), field[i].length());
 252      }
 253  }
 254}
 255
 256void reference::set_number(int n)
 257{
 258  no = n;
 259}
 260
// Separator characters that are embedded in sort keys.
// SORT_SEP is added in front of the key text of each sort field by
// reference::compute_sort_key().
const char SORT_SEP = '\001';
// SORT_SUB_SEP is placed between repeated occurrences of one field by
// reference::sortify_field().
const char SORT_SUB_SEP = '\002';
// SORT_SUB_SUB_SEP is placed between the parts of a name by
// sortify_name().
const char SORT_SUB_SUB_SEP = '\003';
 264
 265// sep specifies additional word separators
 266
 267void sortify_words(const char *s, const char *end, const char *sep,
 268		   string &result)
 269{
 270  int non_empty = 0;
 271  int need_separator = 0;
 272  for (;;) {
 273    const char *token_start = s;
 274    if (!get_token(&s, end))
 275      break;
 276    if ((s - token_start == 1
 277	 && (*token_start == ' '
 278	     || *token_start == '\n'
 279	     || (sep && *token_start != '\0'
 280		 && strchr(sep, *token_start) != 0)))
 281	|| (s - token_start == 2
 282	    && token_start[0] == '\\' && token_start[1] == ' ')) {
 283      if (non_empty)
 284	need_separator = 1;
 285    }
 286    else {
 287      const token_info *ti = lookup_token(token_start, s);
 288      if (ti->sortify_non_empty(token_start, s)) {
 289	if (need_separator) {
 290	  result += ' ';
 291	  need_separator = 0;
 292	}
 293	ti->sortify(token_start, s, result);
 294	non_empty = 1;
 295      }
 296    }
 297  }
 298}
 299
 300void sortify_word(const char *s, const char *end, string &result)
 301{
 302  for (;;) {
 303    const char *token_start = s;
 304    if (!get_token(&s, end))
 305      break;
 306    const token_info *ti = lookup_token(token_start, s);
 307    ti->sortify(token_start, s, result);
 308  }
 309}
 310
 311void sortify_other(const char *s, int len, string &key)
 312{
 313  sortify_words(s, s + len, 0, key);
 314}
 315
 316void sortify_title(const char *s, int len, string &key)
 317{
 318  const char *end = s + len;
 319  for (; s < end && (*s == ' ' || *s == '\n'); s++) 
 320    ;
 321  const char *ptr = s;
 322  for (;;) {
 323    const char *token_start = ptr;
 324    if (!get_token(&ptr, end))
 325      break;
 326    if (ptr - token_start == 1
 327	&& (*token_start == ' ' || *token_start == '\n'))
 328      break;
 329  }
 330  if (ptr < end) {
 331    unsigned int first_word_len = ptr - s - 1;
 332    const char *ae = articles.contents() + articles.length();
 333    for (const char *a = articles.contents();
 334	 a < ae;
 335	 a = strchr(a, '\0') + 1)
 336      if (first_word_len == strlen(a)) {
 337	unsigned int j;
 338	for (j = 0; j < first_word_len; j++)
 339	  if (a[j] != cmlower(s[j]))
 340	    break;
 341	if (j >= first_word_len) {
 342	  s = ptr;
 343	  for (; s < end && (*s == ' ' || *s == '\n'); s++)
 344	    ;
 345	  break;
 346	}
 347      }
 348  }
 349  sortify_words(s, end, 0, key);
 350}
 351
 352void sortify_name(const char *s, int len, string &key)
 353{
 354  const char *last_name_end;
 355  const char *last_name = find_last_name(s, s + len, &last_name_end);
 356  sortify_word(last_name, last_name_end, key);
 357  key += SORT_SUB_SUB_SEP;
 358  if (last_name > s)
 359    sortify_words(s, last_name, ".", key);
 360  key += SORT_SUB_SUB_SEP;
 361  if (last_name_end < s + len)
 362    sortify_words(last_name_end, s + len, ".,", key);
 363}
 364
 365void sortify_date(const char *s, int len, string &key)
 366{
 367  const char *year_end;
 368  const char *year_start = find_year(s, s + len, &year_end);
 369  if (!year_start) {
 370    // Things without years are often `forthcoming', so it makes sense
 371    // that they sort after things with explicit years.
 372    key += 'A';
 373    sortify_words(s, s + len, 0, key);
 374    return;
 375  }
 376  int n = year_end - year_start;
 377  while (n < 4) {
 378    key += '0';
 379    n++;
 380  }
 381  while (year_start < year_end)
 382    key += *year_start++;
 383  int m = find_month(s, s + len);
 384  if (m < 0)
 385    return;
 386  key += 'A' + m;
 387  const char *day_end;
 388  const char *day_start = find_day(s, s + len, &day_end);
 389  if (!day_start)
 390    return;
 391  if (day_end - day_start == 1)
 392    key += '0';
 393  while (day_start < day_end)
 394    key += *day_start++;
 395}
 396
 397// SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification.
 398
 399void sortify_label(const char *s, int len, string &key)
 400{
 401  const char *end = s + len;
 402  for (;;) {
 403    const char *ptr;
 404    for (ptr = s;
 405	 ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP;
 406	 ptr++)
 407      ;
 408    if (ptr > s)
 409      sortify_words(s, ptr, 0, key);
 410    s = ptr;
 411    if (s >= end)
 412      break;
 413    key += *s++;
 414  }
 415}
 416
 417void reference::compute_sort_key()
 418{
 419  if (sort_fields.length() == 0)
 420    return;
 421  sort_fields += '\0';
 422  const char *sf = sort_fields.contents();
 423  while (*sf != '\0') {
 424    sort_key += SORT_SEP;
 425    char f = *sf++;
 426    int n = 1;
 427    if (*sf == '+') {
 428      n = INT_MAX;
 429      sf++;
 430    }
 431    else if (csdigit(*sf)) {
 432      char *ptr;
 433      long l = strtol(sf, &ptr, 10);
 434      if (l == 0 && ptr == sf)
 435	;
 436      else {
 437	sf = ptr;
 438	if (l < 0) {
 439	  n = 1;
 440	}
 441	else {
 442	  n = int(l);
 443	}
 444      }
 445    }
 446    if (f == '.')
 447      sortify_label(label.contents(), label.length(), sort_key);
 448    else if (f == AUTHOR_FIELDS[0])
 449      sortify_authors(n, sort_key);
 450    else
 451      sortify_field(f, n, sort_key);
 452  }
 453  sort_fields.set_length(sort_fields.length() - 1);
 454}
 455
 456void reference::sortify_authors(int n, string &result) const
 457{
 458  for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++)
 459    if (contains_field(*p)) {
 460      sortify_field(*p, n, result);
 461      return;
 462    }
 463  sortify_field(AUTHOR_FIELDS[0], n, result);
 464}
 465
 466void reference::canonicalize_authors(string &result) const
 467{
 468  int len = result.length();
 469  sortify_authors(INT_MAX, result);
 470  if (result.length() > len)
 471    result += SORT_SUB_SEP;
 472}
 473
 474void reference::sortify_field(unsigned char f, int n, string &result) const
 475{
 476  typedef void (*sortify_t)(const char *, int, string &);
 477  sortify_t sortifier = sortify_other;
 478  switch (f) {
 479  case 'A':
 480  case 'E':
 481    sortifier = sortify_name;
 482    break;
 483  case 'D':
 484    sortifier = sortify_date;
 485    break;
 486  case 'B':
 487  case 'J':
 488  case 'T':
 489    sortifier = sortify_title;
 490    break;
 491  }
 492  int fi = field_index[(unsigned char)f];
 493  if (fi != NULL_FIELD_INDEX) {
 494    string &str = field[fi];
 495    const char *start = str.contents();
 496    const char *end = start + str.length();
 497    for (int i = 0; i < n && start < end; i++) {
 498      const char *p = start;
 499      while (start < end && *start != FIELD_SEPARATOR)
 500	start++;
 501      if (i > 0)
 502	result += SORT_SUB_SEP;
 503      (*sortifier)(p, start - p, result);
 504      if (start < end)
 505	start++;
 506    }
 507  }
 508}
 509
 510int compare_reference(const reference &r1, const reference &r2)
 511{
 512  assert(r1.no >= 0);
 513  assert(r2.no >= 0);
 514  const char *s1 = r1.sort_key.contents();
 515  int n1 = r1.sort_key.length();
 516  const char *s2 = r2.sort_key.contents();
 517  int n2 = r2.sort_key.length();
 518  for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2)
 519    if (*s1 != *s2)
 520      return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
 521  if (n2 > 0)
 522    return -1;
 523  if (n1 > 0)
 524    return 1;
 525  return r1.no - r2.no;
 526}
 527
 528int same_reference(const reference &r1, const reference &r2)
 529{
 530  if (!r1.rid.is_null() && r1.rid == r2.rid)
 531    return 1;
 532  if (r1.h != r2.h)
 533    return 0;
 534  if (r1.nfields != r2.nfields)
 535    return 0;
 536  int i = 0; 
 537  for (i = 0; i < 256; i++)
 538    if (r1.field_index != r2.field_index)
 539      return 0;
 540  for (i = 0; i < r1.nfields; i++)
 541    if (r1.field[i] != r2.field[i])
 542      return 0;
 543  return 1;
 544}
 545
 546const char *find_last_name(const char *start, const char *end,
 547			   const char **endp)
 548{
 549  const char *ptr = start;
 550  const char *last_word = start;
 551  for (;;) {
 552    const char *token_start = ptr;
 553    if (!get_token(&ptr, end))
 554      break;
 555    if (ptr - token_start == 1) {
 556      if (*token_start == ',') {
 557	*endp = token_start;
 558	return last_word;
 559      }
 560      else if (*token_start == ' ' || *token_start == '\n') {
 561	if (ptr < end && *ptr != ' ' && *ptr != '\n')
 562	  last_word = ptr;
 563      }
 564    }
 565  }
 566  *endp = end;
 567  return last_word;
 568}
 569
 570void abbreviate_name(const char *ptr, const char *end, string &result)
 571{
 572  const char *last_name_end;
 573  const char *last_name_start = find_last_name(ptr, end, &last_name_end);
 574  int need_period = 0;
 575  for (;;) {
 576    const char *token_start = ptr;
 577    if (!get_token(&ptr, last_name_start))
 578      break;
 579    const token_info *ti = lookup_token(token_start, ptr);
 580    if (need_period) {
 581      if ((ptr - token_start == 1 && *token_start == ' ')
 582	  || (ptr - token_start == 2 && token_start[0] == '\\'
 583	      && token_start[1] == ' '))
 584	continue;
 585      if (ti->is_upper())
 586	result += period_before_initial;
 587      else
 588	result += period_before_other;
 589      need_period = 0;
 590    }
 591    result.append(token_start, ptr - token_start);
 592    if (ti->is_upper()) {
 593      const char *lower_ptr = ptr;
 594      int first_token = 1;
 595      for (;;) {
 596	token_start = ptr;
 597	if (!get_token(&ptr, last_name_start))
 598	  break;
 599	if ((ptr - token_start == 1 && *token_start == ' ')
 600	    || (ptr - token_start == 2 && token_start[0] == '\\'
 601		&& token_start[1] == ' '))
 602	  break;
 603	ti = lookup_token(token_start, ptr);
 604	if (ti->is_hyphen()) {
 605	  const char *ptr1 = ptr;
 606	  if (get_token(&ptr1, last_name_start)) {
 607	    ti = lookup_token(ptr, ptr1);
 608	    if (ti->is_upper()) {
 609	      result += period_before_hyphen;
 610	      result.append(token_start, ptr1 - token_start);
 611	      ptr = ptr1;
 612	    }
 613	  }
 614	}
 615	else if (ti->is_upper()) {
 616	  // MacDougal -> MacD.
 617	  result.append(lower_ptr, ptr - lower_ptr);
 618	  lower_ptr = ptr;
 619	  first_token = 1;
 620	}
 621	else if (first_token && ti->is_accent()) {
 622	  result.append(token_start, ptr - token_start);
 623	  lower_ptr = ptr;
 624	}
 625	first_token = 0;
 626      }
 627      need_period = 1;
 628    }
 629  }
 630  if (need_period)
 631    result += period_before_last_name;
 632  result.append(last_name_start, end - last_name_start);
 633}
 634
 635static void abbreviate_names(string &result)
 636{
 637  string str;
 638  str.move(result);
 639  const char *ptr = str.contents();
 640  const char *end = ptr + str.length();
 641  while (ptr < end) {
 642    const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
 643    if (name_end == 0)
 644      name_end = end;
 645    abbreviate_name(ptr, name_end, result);
 646    if (name_end >= end)
 647      break;
 648    ptr = name_end + 1;
 649    result += FIELD_SEPARATOR;
 650  }
 651}
 652
 653void reverse_name(const char *ptr, const char *name_end, string &result)
 654{
 655  const char *last_name_end;
 656  const char *last_name_start = find_last_name(ptr, name_end, &last_name_end);
 657  result.append(last_name_start, last_name_end - last_name_start);
 658  while (last_name_start > ptr
 659	 && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n'))
 660    last_name_start--;
 661  if (last_name_start > ptr) {
 662    result += ", ";
 663    result.append(ptr, last_name_start - ptr);
 664  }
 665  if (last_name_end < name_end)
 666    result.append(last_name_end, name_end - last_name_end);
 667}
 668
 669void reverse_names(string &result, int n)
 670{
 671  if (n <= 0)
 672    return;
 673  string str;
 674  str.move(result);
 675  const char *ptr = str.contents();
 676  const char *end = ptr + str.length();
 677  while (ptr < end) {
 678    if (--n < 0) {
 679      result.append(ptr, end - ptr);
 680      break;
 681    }
 682    const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
 683    if (name_end == 0)
 684      name_end = end;
 685    reverse_name(ptr, name_end, result);
 686    if (name_end >= end)
 687      break;
 688    ptr = name_end + 1;
 689    result += FIELD_SEPARATOR;
 690  }
 691}
 692
 693// Return number of field separators.
 694
 695int join_fields(string &f)
 696{
 697  const char *ptr = f.contents();
 698  int len = f.length();
 699  int nfield_seps = 0;
 700  int j;
 701  for (j = 0; j < len; j++)
 702    if (ptr[j] == FIELD_SEPARATOR)
 703      nfield_seps++;
 704  if (nfield_seps == 0)
 705    return 0;
 706  string temp;
 707  int field_seps_left = nfield_seps;
 708  for (j = 0; j < len; j++) {
 709    if (ptr[j] == FIELD_SEPARATOR) {
 710      if (nfield_seps == 1)
 711	temp += join_authors_exactly_two;
 712      else if (--field_seps_left == 0)
 713	temp += join_authors_last_two;
 714      else
 715	temp += join_authors_default;
 716    }
 717    else
 718      temp += ptr[j];
 719  }
 720  f = temp;
 721  return nfield_seps;
 722}
 723
 724void uppercase(const char *start, const char *end, string &result)
 725{
 726  for (;;) {
 727    const char *token_start = start;
 728    if (!get_token(&start, end))
 729      break;
 730    const token_info *ti = lookup_token(token_start, start);
 731    ti->upper_case(token_start, start, result);
 732  }
 733}
 734
 735void lowercase(const char *start, const char *end, string &result)
 736{
 737  for (;;) {
 738    const char *token_start = start;
 739    if (!get_token(&start, end))
 740      break;
 741    const token_info *ti = lookup_token(token_start, start);
 742    ti->lower_case(token_start, start, result);
 743  }
 744}
 745
 746void capitalize(const char *ptr, const char *end, string &result)
 747{
 748  int in_small_point_size = 0;
 749  for (;;) {
 750    const char *start = ptr;
 751    if (!get_token(&ptr, end))
 752      break;
 753    const token_info *ti = lookup_token(start, ptr);
 754    const char *char_end = ptr;
 755    int is_lower = ti->is_lower();
 756    if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) {
 757      const token_info *ti2 = lookup_token(char_end, ptr);
 758      if (!ti2->is_accent())
 759	ptr = char_end;
 760    }
 761    if (is_lower) {
 762      if (!in_small_point_size) {
 763	result += "\\s-2";
 764	in_small_point_size = 1;
 765      }
 766      ti->upper_case(start, char_end, result);
 767      result.append(char_end, ptr - char_end);
 768    }
 769    else {
 770      if (in_small_point_size) {
 771	result += "\\s+2";
 772	in_small_point_size = 0;
 773      }
 774      result.append(start, ptr - start);
 775    }
 776  }
 777  if (in_small_point_size)
 778    result += "\\s+2";
 779}
 780
 781void capitalize_field(string &str)
 782{
 783  string temp;
 784  capitalize(str.contents(), str.contents() + str.length(), temp);
 785  str.move(temp);
 786}
 787
 788int is_terminated(const char *ptr, const char *end)
 789{
 790  const char *last_token = end;
 791  for (;;) {
 792    const char *p = ptr;
 793    if (!get_token(&ptr, end))
 794      break;
 795    last_token = p;
 796  }
 797  return end - last_token == 1
 798    && (*last_token == '.' || *last_token == '!' || *last_token == '?');
 799}
 800
 801void reference::output(FILE *fp)
 802{
 803  fputs(".]-\n", fp);
 804  for (int i = 0; i < 256; i++)
 805    if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) {
 806      string &f = field[field_index[i]];
 807      if (!csdigit(i)) {
 808	int j = reverse_fields.search(i);
 809	if (j >= 0) {
 810	  int n;
 811	  int len = reverse_fields.length();
 812	  if (++j < len && csdigit(reverse_fields[j])) {
 813	    n = reverse_fields[j] - '0';
 814	    for (++j; j < len && csdigit(reverse_fields[j]); j++)
 815	      // should check for overflow
 816	      n = n*10 + reverse_fields[j] - '0';
 817	  }
 818	  else 
 819	    n = INT_MAX;
 820	  reverse_names(f, n);
 821	}
 822      }
 823      int is_multiple = join_fields(f) > 0;
 824      if (capitalize_fields.search(i) >= 0)
 825	capitalize_field(f);
 826      if (memchr(f.contents(), '\n', f.length()) == 0) {
 827	fprintf(fp, ".ds [%c ", i);
 828	if (f[0] == ' ' || f[0] == '\\' || f[0] == '"')
 829	  putc('"', fp);
 830	put_string(f, fp);
 831	putc('\n', fp);
 832      }
 833      else {
 834	fprintf(fp, ".de [%c\n", i);
 835	put_string(f, fp);
 836	fputs("..\n", fp);
 837      }
 838      if (i == 'P') {
 839	int multiple_pages = 0;
 840	const char *s = f.contents();
 841	const char *end = f.contents() + f.length();
 842	for (;;) {
 843	  const char *token_start = s;
 844	  if (!get_token(&s, end))
 845	    break;
 846	  const token_info *ti = lookup_token(token_start, s);
 847	  if (ti->is_hyphen() || ti->is_range_sep()) {
 848	    multiple_pages = 1;
 849	    break;
 850	  }
 851	}
 852	fprintf(fp, ".nr [P %d\n", multiple_pages);
 853      }
 854      else if (i == 'E')
 855	fprintf(fp, ".nr [E %d\n", is_multiple);
 856    }
 857  for (const char *p = "TAO"; *p; p++) {
 858    int fi = field_index[(unsigned char)*p];
 859    if (fi != NULL_FIELD_INDEX) {
 860      string &f = field[fi];
 861      fprintf(fp, ".nr [%c %d\n", *p,
 862	      is_terminated(f.contents(), f.contents() + f.length()));
 863    }
 864  }
 865  int t = classify();
 866  fprintf(fp, ".][ %d %s\n", t, reference_types[t]);
 867  if (annotation_macro.length() > 0 && annotation_field >= 0
 868      && field_index[annotation_field] != NULL_FIELD_INDEX) {
 869    putc('.', fp);
 870    put_string(annotation_macro, fp);
 871    putc('\n', fp);
 872    put_string(field[field_index[annotation_field]], fp);
 873  }
 874}
 875
 876void reference::print_sort_key_comment(FILE *fp)
 877{
 878  fputs(".\\\"", fp);
 879  put_string(sort_key, fp);
 880  putc('\n', fp);
 881}
 882
 883const char *find_year(const char *start, const char *end, const char **endp)
 884{
 885  for (;;) {
 886    while (start < end && !csdigit(*start))
 887      start++;
 888    const char *ptr = start;
 889    if (start == end)
 890      break;
 891    while (ptr < end && csdigit(*ptr))
 892      ptr++;
 893    if (ptr - start == 4 || ptr - start == 3
 894	|| (ptr - start == 2
 895	    && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) {
 896      *endp = ptr;
 897      return start;
 898    }
 899    start = ptr;
 900  }
 901  return 0;
 902}
 903
 904static const char *find_day(const char *start, const char *end,
 905			    const char **endp)
 906{
 907  for (;;) {
 908    while (start < end && !csdigit(*start))
 909      start++;
 910    const char *ptr = start;
 911    if (start == end)
 912      break;
 913    while (ptr < end && csdigit(*ptr))
 914      ptr++;
 915    if ((ptr - start == 1 && start[0] != '0')
 916	|| (ptr - start == 2 &&
 917	    (start[0] == '1'
 918	     || start[0] == '2'
 919	     || (start[0] == '3' && start[1] <= '1')
 920	     || (start[0] == '0' && start[1] != '0')))) {
 921      *endp = ptr;
 922      return start;
 923    }
 924    start = ptr;
 925  }
 926  return 0;
 927}
 928
 929static int find_month(const char *start, const char *end)
 930{
 931  static const char *months[] = {
 932    "january",
 933    "february",
 934    "march",
 935    "april",
 936    "may",
 937    "june",
 938    "july",
 939    "august",
 940    "september",
 941    "october",
 942    "november",
 943    "december",
 944  };
 945  for (;;) {
 946    while (start < end && !csalpha(*start))
 947      start++;
 948    const char *ptr = start;
 949    if (start == end)
 950      break;
 951    while (ptr < end && csalpha(*ptr))
 952      ptr++;
 953    if (ptr - start >= 3) {
 954      for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) {
 955	const char *q = months[i];
 956	const char *p = start;
 957	for (; p < ptr; p++, q++)
 958	  if (cmlower(*p) != *q)
 959	    break;
 960	if (p >= ptr)
 961	  return i;
 962      }
 963    }
 964    start = ptr;
 965  }
 966  return -1;
 967}
 968
 969int reference::contains_field(char c) const
 970{
 971  return field_index[(unsigned char)c] != NULL_FIELD_INDEX;
 972}
 973
 974int reference::classify()
 975{
 976  if (contains_field('J'))
 977    return JOURNAL_ARTICLE;
 978  if (contains_field('B'))
 979    return ARTICLE_IN_BOOK;
 980  if (contains_field('G'))
 981    return TECH_REPORT;
 982  if (contains_field('R'))
 983    return TECH_REPORT;
 984  if (contains_field('I'))
 985    return BOOK;
 986  if (contains_field('M'))
 987    return BELL_TM;
 988  return OTHER;
 989}
 990
 991const char *reference::get_year(const char **endp) const
 992{
 993  if (field_index['D'] != NULL_FIELD_INDEX) {
 994    string &date = field[field_index['D']];
 995    const char *start = date.contents();
 996    const char *end = start + date.length();
 997    return find_year(start, end, endp);
 998  }
 999  else
1000    return 0;
1001}
1002
1003const char *reference::get_field(unsigned char c, const char **endp) const
1004{
1005  if (field_index[c] != NULL_FIELD_INDEX) {
1006    string &f = field[field_index[c]];
1007    const char *start = f.contents();
1008    *endp = start + f.length();
1009    return start;
1010  }
1011  else
1012    return 0;
1013}
1014
1015const char *reference::get_date(const char **endp) const
1016{
1017  return get_field('D', endp);
1018}
1019
1020const char *nth_field(int i, const char *start, const char **endp)
1021{
1022  while (--i >= 0) {
1023    start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1024    if (!start)
1025      return 0;
1026    start++;
1027  }
1028  const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1029  if (e)
1030    *endp = e;
1031  return start;
1032}
1033
1034const char *reference::get_author(int i, const char **endp) const
1035{
1036  for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1037    const char *start = get_field(*f, endp);
1038    if (start) {
1039      if (strchr(MULTI_FIELD_NAMES, *f) != 0)
1040	return nth_field(i, start, endp);
1041      else if (i == 0)
1042	return start;
1043      else
1044	return 0;
1045    }
1046  }
1047  return 0;
1048}
1049
1050const char *reference::get_author_last_name(int i, const char **endp) const
1051{
1052  for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1053    const char *start = get_field(*f, endp);
1054    if (start) {
1055      if (strchr(MULTI_FIELD_NAMES, *f) != 0) {
1056	start = nth_field(i, start, endp);
1057	if (!start)
1058	  return 0;
1059      }
1060      if (*f == 'A')
1061	return find_last_name(start, *endp, endp);
1062      else
1063	return start;
1064    }
1065  }
1066  return 0;
1067}
1068
1069void reference::set_date(string &d)
1070{
1071  if (d.length() == 0)
1072    delete_field('D');
1073  else
1074    insert_field('D', d);
1075}
1076
1077int same_year(const reference &r1, const reference &r2)
1078{
1079  const char *ye1;
1080  const char *ys1 = r1.get_year(&ye1);
1081  const char *ye2;
1082  const char *ys2 = r2.get_year(&ye2);
1083  if (ys1 == 0) {
1084    if (ys2 == 0)
1085      return same_date(r1, r2);
1086    else
1087      return 0;
1088  }
1089  else if (ys2 == 0)
1090    return 0;
1091  else if (ye1 - ys1 != ye2 - ys2)
1092    return 0;
1093  else
1094    return memcmp(ys1, ys2, ye1 - ys1) == 0;
1095}
1096
1097int same_date(const reference &r1, const reference &r2)
1098{
1099  const char *e1;
1100  const char *s1 = r1.get_date(&e1);
1101  const char *e2;
1102  const char *s2 = r2.get_date(&e2);
1103  if (s1 == 0)
1104    return s2 == 0;
1105  else if (s2 == 0)
1106    return 0;
1107  else if (e1 - s1 != e2 - s2)
1108    return 0;
1109  else
1110    return memcmp(s1, s2, e1 - s1) == 0;
1111}
1112
1113const char *reference::get_sort_field(int i, int si, int ssi,
1114				      const char **endp) const
1115{
1116  const char *start = sort_key.contents();
1117  const char *end = start + sort_key.length();
1118  if (i < 0) {
1119    *endp = end;
1120    return start;
1121  }
1122  while (--i >= 0) {
1123    start = (char *)memchr(start, SORT_SEP, end - start);
1124    if (!start)
1125      return 0;
1126    start++;
1127  }
1128  const char *e = (char *)memchr(start, SORT_SEP, end - start);
1129  if (e)
1130    end = e;
1131  if (si < 0) {
1132    *endp = end;
1133    return start;
1134  }
1135  while (--si >= 0) {
1136    start = (char *)memchr(start, SORT_SUB_SEP, end - start);
1137    if (!start)
1138      return 0;
1139    start++;
1140  }
1141  e = (char *)memchr(start, SORT_SUB_SEP, end - start);
1142  if (e)
1143    end = e;
1144  if (ssi < 0) {
1145    *endp = end;
1146    return start;
1147  }
1148  while (--ssi >= 0) {
1149    start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1150    if (!start)
1151      return 0;
1152    start++;
1153  }
1154  e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1155  if (e)
1156    end = e;
1157  *endp = end;
1158  return start;
1159}
1160