/contrib/groff/src/libs/libbib/linear.cpp

https://bitbucket.org/freebsd/freebsd-head/ · C++ · 503 lines · 426 code · 44 blank · 33 comment · 168 complexity · 0061c52a17ee54c74b35743d5af31660 MD5 · raw file

  1. // -*- C++ -*-
  2. /* Copyright (C) 1989, 1990, 1991, 1992, 2000, 2001
  3. Free Software Foundation, Inc.
  4. Written by James Clark (jjc@jclark.com)
  5. This file is part of groff.
  6. groff is free software; you can redistribute it and/or modify it under
  7. the terms of the GNU General Public License as published by the Free
  8. Software Foundation; either version 2, or (at your option) any later
  9. version.
  10. groff is distributed in the hope that it will be useful, but WITHOUT ANY
  11. WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12. FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  13. for more details.
  14. You should have received a copy of the GNU General Public License along
  15. with groff; see the file COPYING. If not, write to the Free Software
  16. Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
  17. #include "lib.h"
  18. #include <stdlib.h>
  19. #include <assert.h>
  20. #include <errno.h>
  21. #include "posix.h"
  22. #include "errarg.h"
  23. #include "error.h"
  24. #include "cset.h"
  25. #include "cmap.h"
  26. #include "nonposix.h"
  27. #include "refid.h"
  28. #include "search.h"
  // Holds the text of one database file in memory.  `buffer' is the start of
  // the allocation (released by the destructor); get_start() reports the text
  // as beginning 4 bytes into it, and `bufend' is what get_end() returns.
  class file_buffer {
    char *buffer;
    char *bufend;
    // The destructor releases `buffer', so a copy of a file_buffer would
    // release the same allocation a second time.  Copy construction and
    // assignment are therefore declared private and deliberately left
    // undefined.
    file_buffer(const file_buffer &);
    file_buffer &operator=(const file_buffer &);
  public:
    file_buffer();
    ~file_buffer();
    // Read the file open on `fd' into memory; `filename' is used only in
    // error messages.  Returns 1 on success and 0 on failure.
    int load(int fd, const char *filename);
    // First character of the loaded text, or 0 if nothing is loaded.
    const char *get_start() const;
    // One past the last character of the loaded text.
    const char *get_end() const;
  };
  typedef unsigned char uchar;

  // map[c] is the lower-case form of c when c is alphanumeric, and '\0' for
  // every other character.
  static uchar map[256];

  // inv_map[c] is a '\0'-terminated list of characters: a lower-case letter
  // lists itself and its upper-case form, a digit lists only itself, and
  // every other character has an empty list.
  static uchar inv_map[256][3];

  // Exists only so that its constructor fills in map[] and inv_map[] when
  // the file-scope instance below is initialised.
  struct map_init {
    map_init();
  };

  static map_init the_map_init;
  46. map_init::map_init()
  47. {
  48. int i;
  49. for (i = 0; i < 256; i++)
  50. map[i] = csalnum(i) ? cmlower(i) : '\0';
  51. for (i = 0; i < 256; i++) {
  52. if (cslower(i)) {
  53. inv_map[i][0] = i;
  54. inv_map[i][1] = cmupper(i);
  55. inv_map[i][2] = '\0';
  56. }
  57. else if (csdigit(i)) {
  58. inv_map[i][0] = i;
  59. inv_map[i][1] = 0;
  60. }
  61. else
  62. inv_map[i][0] = '\0';
  63. }
  64. }
  // A search key prepared for case-insensitive scanning with a skip table.
  //   pat    the key after translation through map[] (len chars, no NUL)
  //   len    number of characters in the key
  //   delta  delta[c] is how far the scan may advance when character c is
  //          seen; it is 0 when c matches the last character of the key
  class bmpattern {
    char *pat;
    int len;
    int delta[256];
    // `pat' is released by the destructor, so a copied bmpattern would
    // release it twice.  Copy construction and assignment are declared
    // private and deliberately left undefined.
    bmpattern(const bmpattern &);
    bmpattern &operator=(const bmpattern &);
  public:
    bmpattern(const char *pattern, int pattern_length);
    ~bmpattern();
    // Return the start of the first occurrence of the key in [p, end), or 0
    // if there is none.
    const char *search(const char *p, const char *end) const;
    int length() const;
  };
  75. bmpattern::bmpattern(const char *pattern, int pattern_length)
  76. : len(pattern_length)
  77. {
  78. pat = new char[len];
  79. int i;
  80. for (i = 0; i < len; i++)
  81. pat[i] = map[uchar(pattern[i])];
  82. for (i = 0; i < 256; i++)
  83. delta[i] = len;
  84. for (i = 0; i < len; i++)
  85. for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++)
  86. delta[*inv] = len - i - 1;
  87. }
  88. const char *bmpattern::search(const char *buf, const char *end) const
  89. {
  90. int buflen = end - buf;
  91. if (len > buflen)
  92. return 0;
  93. const char *strend;
  94. if (buflen > len*4)
  95. strend = end - len*4;
  96. else
  97. strend = buf;
  98. const char *k = buf + len - 1;
  99. const int *del = delta;
  100. const char *pattern = pat;
  101. for (;;) {
  102. while (k < strend) {
  103. int t = del[uchar(*k)];
  104. if (!t)
  105. break;
  106. k += t;
  107. k += del[uchar(*k)];
  108. k += del[uchar(*k)];
  109. }
  110. while (k < end && del[uchar(*k)] != 0)
  111. k++;
  112. if (k == end)
  113. break;
  114. int j = len - 1;
  115. const char *s = k;
  116. for (;;) {
  117. if (j == 0)
  118. return s;
  119. if (map[uchar(*--s)] != uchar(pattern[--j]))
  120. break;
  121. }
  122. k++;
  123. }
  124. return 0;
  125. }
  126. bmpattern::~bmpattern()
  127. {
  128. a_delete pat;
  129. }
  130. inline int bmpattern::length() const
  131. {
  132. return len;
  133. }
  134. static const char *find_end(const char *bufend, const char *p);
  135. const char *linear_searcher::search_and_check(const bmpattern *key,
  136. const char *buf, const char *bufend, const char **start) const
  137. {
  138. assert(buf[-1] == '\n');
  139. assert(bufend[-1] == '\n');
  140. const char *ptr = buf;
  141. for (;;) {
  142. const char *found = key->search(ptr, bufend);
  143. if (!found)
  144. break;
  145. if (check_match(buf, bufend, found, key->length(), &ptr, start))
  146. return found;
  147. }
  148. return 0;
  149. }
  // Starting inside a field at `p', return the start of the first following
  // line that is at `end', begins with '%', or contains only spaces and tabs.
  static const char *skip_field(const char *end, const char *p)
  {
    for (;;) {
      // Step to the character just after the next newline.
      while (*p++ != '\n')
        ;
      if (p == end || *p == '%')
        return p;
      // A line holding nothing but blanks also ends the field.
      const char *scan = p;
      while (*scan == ' ' || *scan == '\t')
        scan++;
      if (*scan == '\n')
        return p;
      // `scan' is on a non-newline character, so resume just after it.
      p = scan + 1;
    }
  }
  // Starting at `p', return the start of the first following line that is at
  // `bufend' or contains only spaces and tabs.
  static const char *find_end(const char *bufend, const char *p)
  {
    for (;;) {
      // Step to the character just after the next newline.
      while (*p++ != '\n')
        ;
      if (p == bufend)
        return p;
      // A line holding nothing but blanks is the boundary being looked for.
      const char *scan = p;
      while (*scan == ' ' || *scan == '\t')
        scan++;
      if (*scan == '\n')
        return p;
      // `scan' is on a non-newline character, so resume just after it.
      p = scan + 1;
    }
  }
  180. int linear_searcher::check_match(const char *buf, const char *bufend,
  181. const char *match, int matchlen,
  182. const char **cont, const char **start) const
  183. {
  184. *cont = match + 1;
  185. // The user is required to supply only the first truncate_len characters
  186. // of the key. If truncate_len <= 0, he must supply all the key.
  187. if ((truncate_len <= 0 || matchlen < truncate_len)
  188. && map[uchar(match[matchlen])] != '\0')
  189. return 0;
  190. // The character before the match must not be an alphanumeric
  191. // character (unless the alphanumeric character follows one or two
  192. // percent characters at the beginning of the line), nor must it be
  193. // a percent character at the beginning of a line, nor a percent
  194. // character following a percent character at the beginning of a
  195. // line.
  196. switch (match - buf) {
  197. case 0:
  198. break;
  199. case 1:
  200. if (match[-1] == '%' || map[uchar(match[-1])] != '\0')
  201. return 0;
  202. break;
  203. case 2:
  204. if (map[uchar(match[-1])] != '\0' && match[-2] != '%')
  205. return 0;
  206. if (match[-1] == '%'
  207. && (match[-2] == '\n' || match[-2] == '%'))
  208. return 0;
  209. break;
  210. default:
  211. if (map[uchar(match[-1])] != '\0'
  212. && !(match[-2] == '%'
  213. && (match[-3] == '\n'
  214. || (match[-3] == '%' && match[-4] == '\n'))))
  215. return 0;
  216. if (match[-1] == '%'
  217. && (match[-2] == '\n'
  218. || (match[-2] == '%' && match[-3] == '\n')))
  219. return 0;
  220. }
  221. const char *p = match;
  222. int had_percent = 0;
  223. for (;;) {
  224. if (*p == '\n') {
  225. if (!had_percent && p[1] == '%') {
  226. if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) {
  227. *cont = skip_field(bufend, match + matchlen);
  228. return 0;
  229. }
  230. if (!start)
  231. break;
  232. had_percent = 1;
  233. }
  234. if (p <= buf) {
  235. if (start)
  236. *start = p + 1;
  237. return 1;
  238. }
  239. const char *q;
  240. for (q = p - 1; *q == ' ' || *q == '\t'; q--)
  241. ;
  242. if (*q == '\n') {
  243. if (start)
  244. *start = p + 1;
  245. break;
  246. }
  247. p = q;
  248. }
  249. p--;
  250. }
  251. return 1;
  252. }
  253. file_buffer::file_buffer()
  254. : buffer(0), bufend(0)
  255. {
  256. }
  257. file_buffer::~file_buffer()
  258. {
  259. a_delete buffer;
  260. }
  261. const char *file_buffer::get_start() const
  262. {
  263. return buffer ? buffer + 4 : 0;
  264. }
  265. const char *file_buffer::get_end() const
  266. {
  267. return bufend;
  268. }
  269. int file_buffer::load(int fd, const char *filename)
  270. {
  271. struct stat sb;
  272. if (fstat(fd, &sb) < 0)
  273. error("can't fstat `%1': %2", filename, strerror(errno));
  274. else if (!S_ISREG(sb.st_mode))
  275. error("`%1' is not a regular file", filename);
  276. else {
  277. // We need one character extra at the beginning for an additional newline
  278. // used as a sentinel. We get 4 instead so that the read buffer will be
  279. // word-aligned. This seems to make the read slightly faster. We also
  280. // need one character at the end also for an additional newline used as a
  281. // sentinel.
  282. int size = int(sb.st_size);
  283. buffer = new char[size + 4 + 1];
  284. int nread = read(fd, buffer + 4, size);
  285. if (nread < 0)
  286. error("error reading `%1': %2", filename, strerror(errno));
  287. else if (nread != size)
  288. error("size of `%1' decreased", filename);
  289. else {
  290. char c;
  291. nread = read(fd, &c, 1);
  292. if (nread != 0)
  293. error("size of `%1' increased", filename);
  294. else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0)
  295. error("database `%1' is a binary file", filename);
  296. else {
  297. close(fd);
  298. buffer[3] = '\n';
  299. int sidx = 4, didx = 4;
  300. for ( ; sidx < size + 4; sidx++, didx++)
  301. {
  302. if (buffer[sidx] == '\r')
  303. {
  304. if (buffer[++sidx] != '\n')
  305. buffer[didx++] = '\r';
  306. else
  307. size--;
  308. }
  309. if (sidx != didx)
  310. buffer[didx] = buffer[sidx];
  311. }
  312. bufend = buffer + 4 + size;
  313. if (bufend[-1] != '\n')
  314. *bufend++ = '\n';
  315. return 1;
  316. }
  317. }
  318. a_delete buffer;
  319. buffer = 0;
  320. }
  321. close(fd);
  322. return 0;
  323. }
  324. linear_searcher::linear_searcher(const char *query, int query_len,
  325. const char *ign, int trunc)
  326. : ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0)
  327. {
  328. const char *query_end = query + query_len;
  329. int nk = 0;
  330. const char *p;
  331. for (p = query; p < query_end; p++)
  332. if (map[uchar(*p)] != '\0'
  333. && (p[1] == '\0' || map[uchar(p[1])] == '\0'))
  334. nk++;
  335. if (nk == 0)
  336. return;
  337. keys = new bmpattern*[nk];
  338. p = query;
  339. for (;;) {
  340. while (p < query_end && map[uchar(*p)] == '\0')
  341. p++;
  342. if (p == query_end)
  343. break;
  344. const char *start = p;
  345. while (p < query_end && map[uchar(*p)] != '\0')
  346. p++;
  347. keys[nkeys++] = new bmpattern(start, p - start);
  348. }
  349. assert(nkeys <= nk);
  350. if (nkeys == 0) {
  351. a_delete keys;
  352. keys = 0;
  353. }
  354. }
  355. linear_searcher::~linear_searcher()
  356. {
  357. for (int i = 0; i < nkeys; i++)
  358. delete keys[i];
  359. a_delete keys;
  360. }
  361. int linear_searcher::search(const char *buffer, const char *bufend,
  362. const char **startp, int *lengthp) const
  363. {
  364. assert(bufend - buffer > 0);
  365. assert(buffer[-1] == '\n');
  366. assert(bufend[-1] == '\n');
  367. if (nkeys == 0)
  368. return 0;
  369. for (;;) {
  370. const char *refstart;
  371. const char *found = search_and_check(keys[0], buffer, bufend, &refstart);
  372. if (!found)
  373. break;
  374. const char *refend = find_end(bufend, found + keys[0]->length());
  375. int i;
  376. for (i = 1; i < nkeys; i++)
  377. if (!search_and_check(keys[i], refstart, refend))
  378. break;
  379. if (i >= nkeys) {
  380. *startp = refstart;
  381. *lengthp = refend - refstart;
  382. return 1;
  383. }
  384. buffer = refend;
  385. }
  386. return 0;
  387. }
  388. class linear_search_item : public search_item {
  389. file_buffer fbuf;
  390. public:
  391. linear_search_item(const char *filename, int fid);
  392. ~linear_search_item();
  393. int load(int fd);
  394. search_item_iterator *make_search_item_iterator(const char *);
  395. friend class linear_search_item_iterator;
  396. };
  397. class linear_search_item_iterator : public search_item_iterator {
  398. linear_search_item *lsi;
  399. int pos;
  400. public:
  401. linear_search_item_iterator(linear_search_item *, const char *query);
  402. ~linear_search_item_iterator();
  403. int next(const linear_searcher &, const char **ptr, int *lenp,
  404. reference_id *ridp);
  405. };
  406. search_item *make_linear_search_item(int fd, const char *filename, int fid)
  407. {
  408. linear_search_item *item = new linear_search_item(filename, fid);
  409. if (!item->load(fd)) {
  410. delete item;
  411. return 0;
  412. }
  413. else
  414. return item;
  415. }
  416. linear_search_item::linear_search_item(const char *filename, int fid)
  417. : search_item(filename, fid)
  418. {
  419. }
  420. linear_search_item::~linear_search_item()
  421. {
  422. }
  423. int linear_search_item::load(int fd)
  424. {
  425. return fbuf.load(fd, name);
  426. }
  427. search_item_iterator *linear_search_item::make_search_item_iterator(
  428. const char *query)
  429. {
  430. return new linear_search_item_iterator(this, query);
  431. }
  432. linear_search_item_iterator::linear_search_item_iterator(
  433. linear_search_item *p, const char *)
  434. : lsi(p), pos(0)
  435. {
  436. }
  437. linear_search_item_iterator::~linear_search_item_iterator()
  438. {
  439. }
  440. int linear_search_item_iterator::next(const linear_searcher &searcher,
  441. const char **startp, int *lengthp,
  442. reference_id *ridp)
  443. {
  444. const char *bufstart = lsi->fbuf.get_start();
  445. const char *bufend = lsi->fbuf.get_end();
  446. const char *ptr = bufstart + pos;
  447. if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) {
  448. pos = *startp + *lengthp - bufstart;
  449. if (ridp)
  450. *ridp = reference_id(lsi->filename_id, *startp - bufstart);
  451. return 1;
  452. }
  453. else
  454. return 0;
  455. }