PageRenderTime 59ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/query.cc

https://github.com/estrai/xapian-omega
C++ | 2358 lines | 2063 code | 124 blank | 171 comment | 532 complexity | d3a6f2a2918915ffd1802e3f733daf09 MD5 | raw file
Possible License(s): GPL-2.0

Large files are truncated, but you can click here to view the full file

  1. /* query.cc: query executor for omega
  2. *
  3. * Copyright 1999,2000,2001 BrightStation PLC
  4. * Copyright 2001 James Aylett
  5. * Copyright 2001,2002 Ananova Ltd
  6. * Copyright 2002 Intercede 1749 Ltd
  7. * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts
  8. * Copyright 2008 Thomas Viehmann
  9. *
  10. * This program is free software; you can redistribute it and/or
  11. * modify it under the terms of the GNU General Public License as
  12. * published by the Free Software Foundation; either version 2 of the
  13. * License, or (at your option) any later version.
  14. *
  15. * This program is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU General Public License
  21. * along with this program; if not, write to the Free Software
  22. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
  23. * USA
  24. */
  25. #include <config.h>
  26. #include <algorithm>
  27. #include <iostream>
  28. #include <map>
  29. #include <set>
  30. #include <vector>
  31. #include <cassert>
  32. #include <cctype>
  33. #include "safeerrno.h"
  34. #include <stdio.h>
  35. #include <cstdlib>
  36. #include <cstring>
  37. #include "strcasecmp.h"
  38. #include <ctime>
  39. #include "safeunistd.h"
  40. #include <sys/types.h>
  41. #include "safesysstat.h"
  42. #include "safefcntl.h"
  43. #include "realtime.h"
  44. #include <cdb.h>
  45. #include "date.h"
  46. #include "datematchdecider.h"
  47. #include "utils.h"
  48. #include "omega.h"
  49. #include "query.h"
  50. #include "cgiparam.h"
  51. #include "loadfile.h"
  52. #include "str.h"
  53. #include "stringutils.h"
  54. #include "transform.h"
  55. #include "urlencode.h"
  56. #include "unixperm.h"
  57. #include "values.h"
  58. #include "weight.h"
  59. #include <xapian.h>
  60. using namespace std;
  61. using Xapian::Utf8Iterator;
  62. using Xapian::Unicode::is_wordchar;
  63. #ifndef SNPRINTF
  64. #include <cstdarg>
  65. static int my_snprintf(char *str, size_t size, const char *format, ...)
  66. {
  67. int res;
  68. va_list ap;
  69. va_start(ap, format);
  70. str[size - 1] = '\0';
  71. res = vsprintf(str, format, ap);
  72. if (str[size - 1] || res < 0 || size_t(res) >= size)
  73. abort(); /* Overflowed! */
  74. va_end(ap);
  75. return res;
  76. }
  77. #else
  78. #define my_snprintf SNPRINTF
  79. #endif
// Flags recording whether the query has been parsed and whether the match has
// been run.  NOTE(review): presumably maintained by ensure_query_parsed() and
// ensure_match(), which are defined outside this view - confirm.
static bool query_parsed = false;
static bool done_query = false;

static Xapian::docid last = 0;

// Result set of the most recent match; assigned in run_query().
static Xapian::MSet mset;

static map<Xapian::docid, bool> ticked;

static void ensure_query_parsed();
static void ensure_match();

// The query to run: assigned by set_probabilistic() and then combined with
// any filters in run_query().
static Xapian::Query query;

//static string url_query_string;

Xapian::Query::op default_op = Xapian::Query::OP_OR; // default matching mode

// Query parser, configured each time set_probabilistic() is called.
static Xapian::QueryParser qp;

// Created on first use by set_probabilistic().
static Xapian::NumberValueRangeProcessor * size_vrp = NULL;

// Created on first use by html_highlight().
static Xapian::Stem *stemmer = NULL;

static string eval_file(const string &fmtfile);

// Distinct terms seen in the parsed query (filled in by set_probabilistic()).
static set<string> termset;

// Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
static map<string, string> termprefix_to_userprefix;

// Tab-separated list of the terms in termset, in the order they were first
// seen (built by set_probabilistic()).
static string queryterms;

// Message from the last query parse error (set by set_probabilistic()); a
// non-empty value stops run_query() from running the match.
static string error_msg;

// NOTE(review): presumably the time the match took in seconds, with -1
// meaning "not measured" - the code using it is outside this view; confirm.
static double secs = -1;

// OmegaScript template for a log line: tab-separated fields giving the remote
// host/address (or "-"), a timestamp, the request type (add / morelike /
// query), the database name, the query, the match size and, if set, the HTTP
// referer.
static const char DEFAULT_LOG_ENTRY[] =
    "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
    "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
    "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
    "$dbname\t"
    "$query\t"
    "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
  107. class MyStopper : public Xapian::Stopper {
  108. public:
  109. bool operator()(const string &t) const {
  110. switch (t[0]) {
  111. case 'a':
  112. return (t == "a" || t == "about" || t == "an" || t == "and" ||
  113. t == "are" || t == "as" || t == "at");
  114. case 'b':
  115. return (t == "be" || t == "by");
  116. case 'e':
  117. return (t == "en");
  118. case 'f':
  119. return (t == "for" || t == "from");
  120. case 'h':
  121. return (t == "how");
  122. case 'i':
  123. return (t == "i" || t == "in" || t == "is" || t == "it");
  124. case 'o':
  125. return (t == "of" || t == "on" || t == "or");
  126. case 't':
  127. return (t == "that" || t == "the" || t == "this" || t == "to");
  128. case 'w':
  129. return (t == "was" || t == "what" || t == "when" ||
  130. t == "where" || t == "which" || t == "who" ||
  131. t == "why" || t == "will" || t == "with");
  132. case 'y':
  133. return (t == "you" || t == "your");
  134. default:
  135. return false;
  136. }
  137. }
  138. };
  139. static size_t
  140. prefix_from_term(string &prefix, const string &term)
  141. {
  142. if (term.empty()) {
  143. prefix.resize(0);
  144. return 0;
  145. }
  146. if (term[0] == 'X') {
  147. const string::const_iterator begin = term.begin();
  148. string::const_iterator i = begin + 1;
  149. while (i != term.end() && isupper(static_cast<unsigned char>(*i))) ++i;
  150. prefix.assign(begin, i);
  151. if (i != term.end() && *i == ':') ++i;
  152. return i - begin;
  153. }
  154. prefix = term[0];
  155. return 1;
  156. }
  157. // Don't allow ".." in format names, log file names, etc as this would allow
  158. // people to open a format "../../etc/passwd" or similar.
  159. // FIXME: make this check more exact ("foo..bar" is safe)
  160. // FIXME: log when this check fails
  161. static bool
  162. vet_filename(const string &filename)
  163. {
  164. string::size_type i = filename.find("..");
  165. return (i == string::npos);
  166. }
// Heuristics:
// * If any terms have been removed, it's a "fresh query" so we discard any
//   relevance judgements
// * If all previous terms are there but more have been added then we keep
//   the relevance judgements, but return the first page of hits
//
// NEW_QUERY      entirely new query
// SAME_QUERY     unchanged query
// EXTENDED_QUERY new query, but based on the old one
// BAD_QUERY      parse error (message in error_msg)
typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;

// Parse the global query_string and classify it relative to the previous
// query.
//
// The query parser qp is configured from the option map (stemmer, stemming
// strategy, prefixes, boolean prefixes and flag_* settings), then
// query_string is parsed into the global query.  The terms of the parsed
// query are added to termset and appended to the tab-separated queryterms.
//
// oldp is the list of terms from the previous query, separated by tabs (or,
// in the legacy format, each followed by a '.').
//
// Returns BAD_QUERY (with error_msg set) if parsing fails; otherwise
// NEW_QUERY, SAME_QUERY or EXTENDED_QUERY according to the heuristics
// described above.
static querytype
set_probabilistic(const string &oldp)
{
    // Parse the query string.
    qp.set_stemmer(Xapian::Stem(option["stemmer"]));
    qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
    // NOTE(review): this MyStopper is allocated with new and nothing in the
    // visible code deletes it.
    qp.set_stopper(new MyStopper());
    qp.set_default_op(default_op);
    qp.set_database(db);
    // FIXME: provide a custom VRP which handles size:10..20K, etc.
    if (!size_vrp)
        size_vrp = new Xapian::NumberValueRangeProcessor(VALUE_SIZE, "size:",
                                                         true);
    qp.add_valuerangeprocessor(size_vrp);
    // std::map::insert() won't overwrite an existing entry, so we'll prefer
    // the first user_prefix for which a particular term prefix is specified.
    //
    // Options named "prefix,<user prefix>" map a user prefix to a term prefix
    // (the 7 passed to substr() is the length of "prefix,").
    map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
    for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
        string user_prefix = pfx->first.substr(7);
        qp.add_prefix(user_prefix, pfx->second);
        termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
    }
    // Likewise for "boolprefix,<user prefix>" (11 is the length of
    // "boolprefix,").
    pfx = option.lower_bound("boolprefix,");
    for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
        string user_prefix = pfx->first.substr(11);
        qp.add_boolean_prefix(user_prefix, pfx->second);
        termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
    }
    try {
        // Build up the parser flags from every "flag_*" option which has a
        // non-empty value.
        unsigned f = 0;
        map<string, string>::const_iterator i = option.lower_bound("flag_");
        for (; i != option.end() && startswith(i->first, "flag_"); ++i) {
            if (i->second.empty()) continue;
            const string & s = i->first;
            // Dispatch on the first character after "flag_" to cut down the
            // number of string comparisons.  Unrecognised names are ignored.
            switch (s[5]) {
                case 'a':
                    if (s == "flag_auto_multiword_synonyms") {
                        f |= Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
                        break;
                    }
                    if (s == "flag_auto_synonyms") {
                        f |= Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
                        break;
                    }
                    break;
                case 'b':
                    if (s == "flag_boolean") {
                        f |= Xapian::QueryParser::FLAG_BOOLEAN;
                        break;
                    }
                    if (s == "flag_boolean_any_case") {
                        f |= Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
                        break;
                    }
                    break;
                case 'd':
                    if (s == "flag_default") {
                        f |= Xapian::QueryParser::FLAG_DEFAULT;
                        break;
                    }
                    break;
                case 'l':
                    if (s == "flag_lovehate") {
                        f |= Xapian::QueryParser::FLAG_LOVEHATE;
                        break;
                    }
                    break;
                case 'p':
                    if (s == "flag_partial") {
                        f |= Xapian::QueryParser::FLAG_PARTIAL;
                        break;
                    }
                    if (s == "flag_phrase") {
                        f |= Xapian::QueryParser::FLAG_PHRASE;
                        break;
                    }
                    if (s == "flag_pure_not") {
                        f |= Xapian::QueryParser::FLAG_PURE_NOT;
                        break;
                    }
                    break;
                case 's':
                    if (s == "flag_spelling_correction") {
                        f |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
                        break;
                    }
                    if (s == "flag_synonym") {
                        f |= Xapian::QueryParser::FLAG_SYNONYM;
                        break;
                    }
                    break;
                case 'w':
                    if (s == "flag_wildcard") {
                        f |= Xapian::QueryParser::FLAG_WILDCARD;
                        break;
                    }
                    break;
            }
        }
        // The "spelling" option is another way to enable spelling
        // correction.
        if (option["spelling"] == "true")
            f |= qp.FLAG_SPELLING_CORRECTION;
        query = qp.parse_query(query_string, f);
    } catch (Xapian::QueryParserError &e) {
        error_msg = e.get_msg();
        return BAD_QUERY;
    }
    // Record each term of the parsed query in termset and queryterms.  Note
    // that, despite its name, n_new_terms counts every term the iterator
    // returns, not just those which weren't already in termset.
    Xapian::termcount n_new_terms = 0;
    for (Xapian::TermIterator i = query.get_terms_begin();
         i != query.get_terms_end(); ++i) {
        if (termset.find(*i) == termset.end()) {
            termset.insert(*i);
            if (!queryterms.empty()) queryterms += '\t';
            queryterms += *i;
        }
        n_new_terms++;
    }
    // Check new query against the previous one
    if (oldp.empty()) return query_string.empty() ? SAME_QUERY : NEW_QUERY;
    // Long, long ago we used "word1#word2#" (with trailing #) but some broken
    // old browsers (versions of MSIE) don't quote # in form GET submissions
    // and everything after the # gets interpreted as an anchor. We now allow
    // terms like `c#' so we want to avoid '#' anyway.
    //
    // So we switched to using "word1.word2." but that doesn't work if
    // the terms contain "." themselves (e.g. Tapplication/vnd.ms-excel)
    // so now we use "word1\tword2" instead (with no trailing separator).
    //
    // However for compatibility with templates which haven't been updated and
    // bookmarked queries from Omega 0.9.6 and earlier we still support ".".
    char separator = '\t';
    unsigned int n_old_terms = count(oldp.begin(), oldp.end(), '\t') + 1;
    // The legacy format is only assumed if there are no tabs at all and the
    // string ends with a '.'.
    if (n_old_terms == 1 && oldp[oldp.size() - 1] == '.') {
        separator = '.';
        n_old_terms = count(oldp.begin(), oldp.end(), '.');
    }
    // short-cut: if the new query has fewer terms, it must be a new one
    if (n_new_terms < n_old_terms) return NEW_QUERY;
    // Every old term must still be present, or else this is a new query.
    const char *term = oldp.c_str();
    const char *pend;
    while ((pend = strchr(term, separator)) != NULL) {
        if (termset.find(string(term, pend - term)) == termset.end())
            return NEW_QUERY;
        term = pend + 1;
    }
    // Handle the final term (the tab-separated format has no trailing
    // separator).
    if (*term) {
        if (termset.find(string(term)) == termset.end())
            return NEW_QUERY;
    }
    // Use termset.size() rather than n_new_terms so we correctly handle
    // the case when the query has repeated terms.
    // This works wrongly in the case when the user extends the query
    // by adding a term already in it, but that's unlikely and the behaviour
    // isn't too bad (we just don't reset page 1). We also mishandle a few
    // other obscure cases e.g. adding quotes to turn a query into a phrase.
    if (termset.size() > n_old_terms) return EXTENDED_QUERY;
    return SAME_QUERY;
}
  335. static multimap<string, string> filter_map;
  336. typedef multimap<string, string>::const_iterator FMCI;
  337. void add_bterm(const string &term) {
  338. string prefix;
  339. if (prefix_from_term(prefix, term) > 0)
  340. filter_map.insert(multimap<string, string>::value_type(prefix, term));
  341. }
// Combine the parsed query with any boolean filters and date restrictions,
// then run the match and store the results in the global mset.
//
// If there's no enquire object, or error_msg is set, the query is still built
// up but no match is run.
static void
run_query()
{
    bool force_boolean = false;
    if (!filter_map.empty()) {
        // OR together filters with the same prefix, then AND together
        vector<Xapian::Query> filter_vec;
        vector<string> or_vec;
        string current;
        // filter_map is ordered by prefix, so entries with the same prefix
        // are adjacent.  The loop deliberately runs one step past the last
        // entry (note the lack of a loop condition) so that the final group
        // gets flushed too.
        for (FMCI i = filter_map.begin(); ; i++) {
            bool over = (i == filter_map.end());
            if (over || i->first != current) {
                // Flush the group of terms collected for the previous prefix.
                switch (or_vec.size()) {
                    case 0:
                        break;
                    case 1:
                        filter_vec.push_back(Xapian::Query(or_vec[0]));
                        break;
                    default:
                        filter_vec.push_back(Xapian::Query(Xapian::Query::OP_OR,
                                                           or_vec.begin(),
                                                           or_vec.end()));
                        break;
                }
                or_vec.clear();
                if (over) break;
                current = i->first;
            }
            or_vec.push_back(i->second);
        }
        Xapian::Query filter(Xapian::Query::OP_AND,
                             filter_vec.begin(), filter_vec.end());
        if (query.empty()) {
            // If no probabilistic query is provided then promote the filters
            // to be THE query - filtering an empty query will give no
            // matches.
            std::swap(query, filter);
            if (enquire) force_boolean = true;
        } else {
            query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
        }
    }
    // Apply any date restriction: via a match decider on a document value if
    // the DATEVALUE CGI parameter is given, otherwise via a filter built by
    // date_range_filter().
    // NOTE(review): the DateMatchDecider is allocated with new and nothing in
    // the visible code deletes it.
    Xapian::MatchDecider * mdecider = NULL;
    if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
        MCI i = cgi_params.find("DATEVALUE");
        if (i != cgi_params.end()) {
            Xapian::valueno datevalue = string_to_int(i->second);
            mdecider = new DateMatchDecider(datevalue, date_start, date_end, date_span);
        } else {
            // The term "Dlatest" is ORed in, so documents indexed by it
            // always pass the date filter.
            Xapian::Query date_filter(Xapian::Query::OP_OR,
                                      date_range_filter(date_start, date_end,
                                                        date_span),
                                      Xapian::Query("Dlatest"));
            // If no probabilistic query is provided then promote the daterange
            // filter to be THE query instead of filtering an empty query.
            if (query.empty()) {
                query = date_filter;
            } else {
                query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
            }
        }
    }
    if (!enquire || !error_msg.empty()) return;
    // Configure the match: weighting, cutoff, sort order, docid order and
    // collapsing.
    set_weighting_scheme(*enquire, option, force_boolean);
    enquire->set_cutoff(threshold);
    if (sort_key != Xapian::BAD_VALUENO) {
        if (sort_after) {
            enquire->set_sort_by_relevance_then_value(sort_key, sort_ascending);
        } else {
            enquire->set_sort_by_value_then_relevance(sort_key, sort_ascending);
        }
    }
    enquire->set_docid_order(docid_order);
    if (collapse) {
        enquire->set_collapse_key(collapse_key);
    }
    if (!query.empty()) {
#if 0
        // FIXME: If we start doing permissions checks based on $REMOTE_USER
        // we're going to break some existing setups if users upgrade. We
        // probably want a way to set this from OmegaScript.
        const char * remote_user = getenv("REMOTE_USER");
        if (remote_user)
            apply_unix_permissions(query, remote_user);
#endif
        enquire->set_query(query);
        // We could use the value of topdoc as first parameter, but we
        // need to know the first few items in the mset to fake a
        // relevance set for topterms.
        //
        // If min_hits isn't set, check at least one extra result so we
        // know if we've reached the end of the matches or not - then we
        // can avoid offering a "next" button which leads to an empty page.
        mset = enquire->get_mset(0, topdoc + hits_per_page,
                                 topdoc + max(hits_per_page + 1, min_hits),
                                 &rset, mdecider);
    }
}
  440. string
  441. html_escape(const string &str)
  442. {
  443. string res;
  444. string::size_type p = 0;
  445. while (p < str.size()) {
  446. char ch = str[p++];
  447. switch (ch) {
  448. case '<':
  449. res += "&lt;";
  450. continue;
  451. case '>':
  452. res += "&gt;";
  453. continue;
  454. case '&':
  455. res += "&amp;";
  456. continue;
  457. case '"':
  458. res += "&quot;";
  459. continue;
  460. default:
  461. res += ch;
  462. }
  463. }
  464. return res;
  465. }
  466. static string
  467. html_strip(const string &str)
  468. {
  469. string res;
  470. string::size_type p = 0;
  471. bool skip = false;
  472. while (p < str.size()) {
  473. char ch = str[p++];
  474. switch (ch) {
  475. case '<':
  476. skip = true;
  477. continue;
  478. case '>':
  479. skip = false;
  480. continue;
  481. default:
  482. if (! skip) res += ch;
  483. }
  484. }
  485. return res;
  486. }
  487. // FIXME split list into hash or map and use that rather than linear lookup?
  488. static int word_in_list(const string& word, const string& list)
  489. {
  490. string::size_type split = 0, split2;
  491. int count = 0;
  492. while ((split2 = list.find('\t', split)) != string::npos) {
  493. if (word.size() == split2 - split) {
  494. if (memcmp(word.data(), list.data() + split, word.size()) == 0)
  495. return count;
  496. }
  497. split = split2 + 1;
  498. ++count;
  499. }
  500. if (word.size() == list.size() - split) {
  501. if (memcmp(word.data(), list.data() + split, word.size()) == 0)
  502. return count;
  503. }
  504. return -1;
  505. }
  506. // Not a character in an identifier
  507. inline static bool
  508. p_notid(unsigned int c)
  509. {
  510. return !isalnum(static_cast<unsigned char>(c)) && c != '_';
  511. }
  512. // Not a character in an HTML tag name
  513. inline static bool
  514. p_nottag(unsigned int c)
  515. {
  516. return !isalnum(static_cast<unsigned char>(c)) && c != '.' && c != '-';
  517. }
  518. inline static bool
  519. p_plusminus(unsigned int c)
  520. {
  521. return c == '+' || c == '-';
  522. }
// HTML escape s, highlighting the words in it which appear in list.
//
// s is split into terms (see below); each term is lower-cased and looked up
// in the tab-separated list, first as-is and then in stemmed form with a 'Z'
// prefix.  Matching words are wrapped in bra and ket; if bra is empty, a
// coloured <b> element is used instead, with the colour chosen according to
// the position of the match in list.  All text copied to the result is
// passed through html_escape().
//
// FIXME: shares algorithm with indextext.cc!
static string
html_highlight(const string &s, const string &list,
               const string &bra, const string &ket)
{
    // Create the stemmer the first time it's needed.
    if (!stemmer) {
        stemmer = new Xapian::Stem(option["stemmer"]);
    }
    string res;
    Utf8Iterator j(s);
    const Utf8Iterator s_end;
    while (true) {
        // Skip forward to the start of the next word.
        Utf8Iterator first = j;
        while (first != s_end && !is_wordchar(*first)) ++first;
        if (first == s_end) break;
        Utf8Iterator term_end;
        string term;
        string word;
        // l marks the start of the text which hasn't been output yet.
        const char *l = j.raw();
        if (*first < 128 && isupper(*first)) {
            // Try to read a run of single ASCII capitals separated by '.'
            // (the dots are dropped from the term).
            j = first;
            Xapian::Unicode::append_utf8(term, *j);
            while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && isupper(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
            }
            // Give up on this form if fewer than two letters were collected,
            // or if a word character follows immediately.
            if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
                term.resize(0);
            }
            term_end = j;
        }
        if (term.empty()) {
            // Ordinary word: a run of word characters, which may contain '&'
            // or '\'' provided that's followed by another word character.
            j = first;
            while (is_wordchar(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
                ++j;
                if (j == s_end) break;
                if (*j == '&' || *j == '\'') {
                    Utf8Iterator next = j;
                    ++next;
                    if (next == s_end || !is_wordchar(*next)) break;
                    term += *j;
                    j = next;
                }
            }
            term_end = j;
            // Allow a suffix of '+' and '-' characters or of '#' characters.
            // A run of '#' adds a single '#' to the term.
            if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
                string::size_type len = term.length();
                if (*j == '#') {
                    term += '#';
                    do { ++j; } while (j != s_end && *j == '#');
                } else {
                    while (j != s_end && (*j == '+' || *j == '-')) {
                        Xapian::Unicode::append_utf8(term, *j);
                        ++j;
                    }
                }
                // Discard the suffix if it's longer than 3 characters or is
                // followed directly by a word character.
                if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
                    term.resize(len);
                } else {
                    term_end = j;
                }
            }
        }
        j = term_end;
        term = Xapian::Unicode::tolower(term);
        // Look for the term itself first, then for its stem (stemmed terms
        // in list are marked with a 'Z' prefix).
        int match = word_in_list(term, list);
        if (match == -1) {
            string stem = "Z";
            stem += (*stemmer)(term);
            match = word_in_list(stem, list);
        }
        if (match >= 0) {
            // Output the text before the word, then the highlighted word.
            res += html_escape(string(l, first.raw() - l));
            if (!bra.empty()) {
                res += bra;
            } else {
                // Cycle through the colours according to the match's index
                // in list.
                static const char * colours[] = {
                    "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
                    "990000", "009900", "996600", "006699", "990099"
                };
                size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
                const char * bg = colours[idx];
                // Backgrounds containing an 'f' digit get black text; the
                // others get white text.
                if (strchr(bg, 'f')) {
                    res += "<b style=\"color:black;background-color:#";
                } else {
                    res += "<b style=\"color:white;background-color:#";
                }
                res += bg;
                res += "\">";
            }
            word = string(first.raw(), j.raw() - first.raw());
            res += html_escape(word);
            // Note that ket is only used if bra is non-empty.
            if (!bra.empty()) {
                res += ket;
            } else {
                res += "</b>";
            }
        } else {
            // No match: output the skipped text and the word unhighlighted.
            res += html_escape(string(l, j.raw() - l));
        }
    }
    // Output anything left after the last word.
    if (j != s_end) res += html_escape(string(j.raw(), j.left()));
    return res;
}
#if 0
// Disabled code (it relies on url_query_string, whose declaration is
// commented out above).
//
// Writes url_query_string to cout.  If after starts with "&B=", then any
// "&B=" parameters whose value starts with the character following "&B=" in
// after are left out of the output.
static void
print_query_string(const char *after)
{
    if (after && strncmp(after, "&B=", 3) == 0) {
        char prefix = after[3];
        string::size_type start = 0, amp = 0;
        while (true) {
            amp = url_query_string.find('&', amp);
            if (amp == string::npos) {
                cout << url_query_string.substr(start);
                return;
            }
            amp++;
            // Skip over each consecutive "B=<prefix>..." parameter, having
            // output the text which precedes it.
            while (url_query_string[amp] == 'B' &&
                   url_query_string[amp + 1] == '=' &&
                   url_query_string[amp + 2] == prefix) {
                cout << url_query_string.substr(start, amp - start - 1);
                start = url_query_string.find('&', amp + 3);
                if (start == string::npos) return;
                amp = start + 1;
            }
        }
    }
    cout << url_query_string;
}
#endif
  654. class Fields {
  655. mutable Xapian::docid did_cached;
  656. mutable map<string, string> fields;
  657. void read_fields(Xapian::docid did) const;
  658. public:
  659. Fields() : did_cached(0) { }
  660. const string & get_field(Xapian::docid did, const string & field) const {
  661. if (did != did_cached) read_fields(did);
  662. return fields[field];
  663. }
  664. };
// Discard the cached fields and read in those of document did.
//
// The document data is parsed one line at a time.  If the "fieldnames"
// option is set, the lines are matched up with the names in that
// tab-separated list; otherwise each line must have the form NAME=VALUE.
void
Fields::read_fields(Xapian::docid did) const
{
    fields.clear();
    // Note that did_cached is updated before the document is fetched.
    did_cached = did;
    const string & data = db.get_document(did).get_data();
    // Parse document data.
    string::size_type i = 0;
    const string & names = option["fieldnames"];
    if (!names.empty()) {
        // Each line is a field, with fieldnames taken from corresponding
        // entries in the tab-separated list specified by $opt{fieldnames}.
        string::size_type n = 0;
        do {
            string::size_type n0 = n;
            n = names.find('\t', n);
            string::size_type i0 = i;
            i = data.find('\n', i);
            // If find() returned npos, the length passed to substr() is
            // huge, which gives everything up to the end of the string.
            // insert() doesn't overwrite, so the first line wins if a name
            // is repeated.
            fields.insert(make_pair(names.substr(n0, n - n0),
                                    data.substr(i0, i - i0)));
            // Incrementing steps past the separator just found.  If find()
            // returned npos then the increment wraps to 0, which ends the
            // loop - so we stop at the end of the names or of the data,
            // whichever comes first.
        } while (++n && ++i);
    } else {
        // Each line is a field, in the format NAME=VALUE. We assume the field
        // name doesn't contain an "=". Lines without an "=" are currently
        // just ignored.
        do {
            string::size_type i0 = i;
            i = data.find('\n', i);
            string line = data.substr(i0, i - i0);
            string::size_type j = line.find('=');
            if (j != string::npos) {
                // The values of a repeated field are joined with tabs.
                string & value = fields[line.substr(0, j)];
                if (!value.empty()) value += '\t';
                value += line.substr(j + 1);
            }
            // As above, ++i wraps npos round to 0 after the last line.
        } while (++i);
    }
}
// Cache of the fields of the document most recently looked up.
static Fields fields;

// NOTE(review): the variables below are presumably the details of the hit
// currently being formatted (docid, position in the hit list, percentage
// score, weight and collapse count) - the code which sets and reads them is
// outside this view; confirm.
static Xapian::docid q0;
static Xapian::doccount hit_no;
static int percent;
static Xapian::weight weight;
static Xapian::doccount collapsed;

static string print_caption(const string &fmt, const vector<string> &param);
// Identifiers for the OmegaScript commands.  The T() macro pairs each
// CMD_xxx value with the command name "xxx" in func_tab, so every command
// listed in that table needs an enumerator here.
enum tagval {
    CMD_, // the command with an empty name ("commented out code" in func_tab)
    CMD_add,
    CMD_addfilter,
    CMD_allterms,
    CMD_and,
    CMD_cgi,
    CMD_cgilist,
    CMD_collapsed,
    CMD_date,
    CMD_dbname,
    CMD_dbsize,
    CMD_def,
    CMD_defaultop,
    CMD_div,
    CMD_eq,
    CMD_emptydocs,
    CMD_env,
    CMD_error,
    CMD_field,
    CMD_filesize,
    CMD_filters,
    CMD_filterterms,
    CMD_find,
    CMD_fmt,
    CMD_freq,
    CMD_ge,
    CMD_gt,
    CMD_highlight,
    CMD_hit,
    CMD_hitlist,
    CMD_hitsperpage,
    CMD_hostname,
    CMD_html,
    CMD_htmlstrip,
    CMD_httpheader,
    CMD_id,
    CMD_if,
    CMD_include,
    CMD_last,
    CMD_lastpage,
    CMD_le,
    CMD_length,
    CMD_list,
    CMD_log,
    CMD_lookup,
    CMD_lower,
    CMD_lt,
    CMD_map,
    CMD_max,
    CMD_min,
    CMD_mod,
    CMD_msize,
    CMD_msizeexact,
    CMD_mul,
    CMD_muldiv,
    CMD_ne,
    CMD_nice,
    CMD_not,
    CMD_now,
    CMD_opt,
    CMD_or,
    CMD_pack,
    CMD_percentage,
    CMD_prettyterm,
    CMD_query,
    CMD_querydescription,
    CMD_queryterms,
    CMD_range,
    CMD_record,
    CMD_relevant,
    CMD_relevants,
    CMD_score,
    CMD_set,
    CMD_setmap,
    CMD_setnumrangefield,
    CMD_setrelevant,
    CMD_slice,
    CMD_split,
    CMD_stoplist,
    CMD_sub,
    CMD_substr,
    CMD_suggestion,
    CMD_terms,
    CMD_thispage,
    CMD_time,
    CMD_topdoc,
    CMD_topterms,
    CMD_transform,
    CMD_uniq,
    CMD_unpack,
    CMD_unstem,
    CMD_upper,
    CMD_url,
    CMD_value,
    CMD_version,
    CMD_weight,
    CMD_MACRO // special tag for macro evaluation
};
// Attributes of an OmegaScript command.
struct func_attrib {
    // Which command this is (a value from enum tagval).
    int tag;
    // Argument counts - see the column headings on func_tab.
    // NOTE(review): -1 is presumably "no limit" for maxargs and "evaluate
    // all arguments" for evalargs - the code which interprets these is
    // outside this view; confirm.
    int minargs, maxargs, evalargs;
    // 'M', 'Q' or 0.  NOTE(review): presumably requests that the match be
    // run ('M') or the query be parsed ('Q') before the command is
    // evaluated - confirm against the evaluator.
    char ensure;
};

// Build the func_tab entry for command F: its name as a string, followed by
// its attributes (the tag is CMD_F).
#define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}

// An entry in func_tab: a command name and the attributes of that command.
struct func_desc {
    const char *name;
    struct func_attrib a;
};
  819. #define N -1
  820. #define M 'M'
  821. #define Q 'Q'
  822. // NB when adding a new command which ensures M or Q, update the list in
  823. // docs/omegascript.rst
  824. static struct func_desc func_tab[] = {
  825. //name minargs maxargs evalargs ensure
  826. {"",{CMD_, N, N, 0, 0}},// commented out code
  827. T(add, 0, N, N, 0), // add a list of numbers
  828. T(addfilter, 1, 1, N, 0), // add filter term
  829. T(allterms, 0, 1, N, 0), // list of all terms matching document
  830. T(and, 1, N, 0, 0), // logical shortcutting and of a list of values
  831. T(cgi, 1, 1, N, 0), // return cgi parameter value
  832. T(cgilist, 1, 1, N, 0), // return list of values for cgi parameter
  833. T(collapsed, 0, 0, N, 0), // return number of hits collapsed into this
  834. T(date, 1, 2, N, 0), // convert time_t to strftime format
  835. // (default: YYYY-MM-DD)
  836. T(dbname, 0, 0, N, 0), // database name
  837. T(dbsize, 0, 0, N, 0), // database size (# of documents)
  838. T(def, 2, 2, 1, 0), // define a macro
  839. T(defaultop, 0, 0, N, 0), // default operator: "and" or "or"
  840. T(div, 2, 2, N, 0), // integer divide
  841. T(emptydocs, 0, 1, N, 0), // list of empty documents
  842. T(env, 1, 1, N, 0), // environment variable
  843. T(error, 0, 0, N, 0), // error message
  844. T(eq, 2, 2, N, 0), // test equality
  845. T(field, 1, 2, N, 0), // lookup field in record
  846. T(filesize, 1, 1, N, 0), // pretty printed filesize
  847. T(filters, 0, 0, N, 0), // serialisation of current filters
  848. T(filterterms, 1, 1, N, 0), // list of terms with a given prefix
  849. T(find, 2, 2, N, 0), // find entry in list
  850. T(fmt, 0, 0, N, 0), // name of current format
  851. T(freq, 1, 1, N, 0), // frequency of a term
  852. T(ge, 2, 2, N, 0), // test >=
  853. T(gt, 2, 2, N, 0), // test >
  854. T(highlight, 2, 4, N, 0), // html escape and highlight words from list
  855. T(hit, 0, 0, N, 0), // hit number of current mset entry (starting
  856. // from 0
  857. T(hitlist, 1, 1, 0, M), // display hitlist using format in argument
  858. T(hitsperpage, 0, 0, N, 0), // hits per page
  859. T(hostname, 1, 1, N, 0), // extract hostname from URL
  860. T(html, 1, 1, N, 0), // html escape string (<>&")
  861. T(htmlstrip, 1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
  862. T(httpheader, 2, 2, N, 0), // arbitrary HTTP header
  863. T(id, 0, 0, N, 0), // docid of current doc
  864. T(if, 2, 3, 1, 0), // conditional
  865. T(include, 1, 1, 1, 0), // include another file
  866. T(last, 0, 0, N, M), // m-set number of last hit on page
  867. T(lastpage, 0, 0, N, M), // number of last hit page
  868. T(le, 2, 2, N, 0), // test <=
  869. T(length, 1, 1, N, 0), // length of list
  870. T(list, 2, 5, N, 0), // pretty print list
  871. T(log, 1, 2, 1, 0), // create a log entry
  872. T(lookup, 2, 2, N, 0), // lookup in named cdb file
  873. T(lower, 1, 1, N, 0), // convert string to lower case
  874. T(lt, 2, 2, N, 0), // test <
  875. T(map, 1, 2, 1, 0), // map a list into another list
  876. T(max, 1, N, N, 0), // maximum of a list of values
  877. T(min, 1, N, N, 0), // minimum of a list of values
  878. T(mod, 2, 2, N, 0), // integer modulus
  879. T(msize, 0, 0, N, M), // number of matches
  880. T(msizeexact, 0, 0, N, M), // is $msize exact?
  881. T(mul, 2, N, N, 0), // multiply a list of numbers
  882. T(muldiv, 3, 3, N, 0), // calculate A*B/C
  883. T(ne, 2, 2, N, 0), // test not equal
  884. T(nice, 1, 1, N, 0), // pretty print integer (with thousands sep)
  885. T(not, 1, 1, N, 0), // logical not
  886. T(now, 0, 0, N, 0), // current date/time as a time_t
  887. T(opt, 1, 2, N, 0), // lookup an option value
  888. T(or, 1, N, 0, 0), // logical shortcutting or of a list of values
  889. T(pack, 1, 1, N, 0), // convert a number to a 4 byte big endian binary string
  890. T(percentage, 0, 0, N, 0), // percentage score of current hit
  891. T(prettyterm, 1, 1, N, Q), // pretty print term name
  892. T(query, 0, 0, N, Q), // query
  893. T(querydescription,0, 0, N, Q), // query.get_description()
  894. T(queryterms, 0, 0, N, Q), // list of query terms
  895. T(range, 2, 2, N, 0), // return list of values between start and end
  896. T(record, 0, 1, N, 0), // record contents of document
  897. T(relevant, 0, 1, N, Q), // is document relevant?
  898. T(relevants, 0, 0, N, Q), // return list of relevant documents
  899. T(score, 0, 0, N, 0), // score (0-10) of current hit
  900. T(set, 2, 2, N, 0), // set option value
  901. T(setmap, 1, N, N, 0), // set map of option values
  902. T(setrelevant, 0, 1, N, Q), // set rset
  903. T(setnumrangefield, 1, 1, N, 0),
  904. T(slice, 2, 2, N, 0), // slice a list using a second list
  905. T(split, 1, 2, N, 0), // split a string to give a list
  906. T(stoplist, 0, 0, N, Q), // return list of stopped terms
  907. T(sub, 2, 2, N, 0), // subtract
  908. T(substr, 2, 3, N, 0), // substring
  909. T(suggestion, 0, 0, N, Q), // misspelled word correction suggestion
  910. T(terms, 0, 0, N, M), // list of matching terms
  911. T(thispage, 0, 0, N, M), // page number of current page
  912. T(time, 0, 0, N, M), // how long the match took (in seconds)
  913. T(topdoc, 0, 0, N, M), // first document on current page of hit list
  914. // (counting from 0)
  915. T(topterms, 0, 1, N, M), // list of up to N top relevance feedback terms
  916. // (default 16)
  917. T(transform, 3, 3, N, 0), // transform with a regexp
  918. T(uniq, 1, 1, N, 0), // removed duplicates from a sorted list
  919. T(unpack, 1, 1, N, 0), // convert 4 byte big endian binary string to a number
  920. T(unstem, 1, 1, N, Q), // return list of probabilistic terms from
  921. // the query which stemmed to this term
  922. T(upper, 1, 1, N, 0), // convert string to upper case
  923. T(url, 1, 1, N, 0), // url encode argument
  924. T(value, 1, 2, N, 0), // return document value
  925. T(version, 0, 0, N, 0), // omega version string
  926. T(weight, 0, 0, N, 0), // weight of the current hit
  927. { NULL,{0, 0, 0, 0, 0}}
  928. };
  929. #undef T // Leaving T defined screws up Sun's C++ compiler!
  // Bodies of the macros defined so far with $def, in definition order.
  // The $def handler in eval() appends each new macro body here and gives
  // the macro the tag CMD_MACRO + its index in this vector.
  930. static vector<string> macros;
  931. // Call write() repeatedly until all data is written or we get a
  932. // non-recoverable error.
  933. static ssize_t
  934. write_all(int fd, const char * buf, size_t count)
  935. {
  936. while (count) {
  937. ssize_t r = write(fd, buf, count);
  938. if (rare(r < 0)) {
  939. if (errno == EINTR) continue;
  940. return r;
  941. }
  942. buf += r;
  943. count -= r;
  944. }
  945. return 0;
  946. }
  947. static string
  948. eval(const string &fmt, const vector<string> &param)
  949. {
  950. static map<string, const struct func_attrib *> func_map;
  951. if (func_map.empty()) {
  952. struct func_desc *p;
  953. for (p = func_tab; p->name != NULL; p++) {
  954. func_map[string(p->name)] = &(p->a);
  955. }
  956. }
  957. string res;
  958. string::size_type p = 0, q;
  959. while ((q = fmt.find('$', p)) != string::npos) try {
  960. res += fmt.substr(p, q - p);
  961. string::size_type code_start = q; // note down for error reporting
  962. q++;
  963. if (q >= fmt.size()) break;
  964. unsigned char ch = fmt[q];
  965. switch (ch) {
  966. // Magic sequences:
  967. // `$$' -> `$', `$(' -> `{', `$)' -> `}', `$.' -> `,'
  968. case '$':
  969. res += '$';
  970. p = q + 1;
  971. continue;
  972. case '(':
  973. res += '{';
  974. p = q + 1;
  975. continue;
  976. case ')':
  977. res += '}';
  978. p = q + 1;
  979. continue;
  980. case '.':
  981. res += ',';
  982. p = q + 1;
  983. continue;
  984. case '_':
  985. ch = '0';
  986. // FALL THRU
  987. case '1': case '2': case '3': case '4': case '5':
  988. case '6': case '7': case '8': case '9':
  989. ch -= '0';
  990. if (ch < param.size()) res += param[ch];
  991. p = q + 1;
  992. continue;
  993. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  994. case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
  995. case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
  996. case 's': case 't': case 'u': case 'v': case 'w': case 'x':
  997. case 'y': case 'z':
  998. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  999. case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
  1000. case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
  1001. case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
  1002. case 'Y': case 'Z':
  1003. case '{':
  1004. break;
  1005. default:
  1006. string msg = "Unknown $ code in: $" + fmt.substr(q);
  1007. throw msg;
  1008. }
  1009. p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
  1010. string var = fmt.substr(q, p - q);
  1011. map<string, const struct func_attrib *>::const_iterator func;
  1012. func = func_map.find(var);
  1013. if (func == func_map.end()) {
  1014. throw "Unknown function `" + var + "'";
  1015. }
  1016. vector<string> args;
  1017. if (fmt[p] == '{') {
  1018. q = p + 1;
  1019. int nest = 1;
  1020. while (true) {
  1021. p = fmt.find_first_of(",{}", p + 1);
  1022. if (p == string::npos)
  1023. throw "missing } in " + fmt.substr(code_start);
  1024. if (fmt[p] == '{') {
  1025. ++nest;
  1026. } else {
  1027. if (nest == 1) {
  1028. // should we split the args
  1029. if (func->second->minargs != N) {
  1030. args.push_back(fmt.substr(q, p - q));
  1031. q = p + 1;
  1032. }
  1033. }
  1034. if (fmt[p] == '}' && --nest == 0) break;
  1035. }
  1036. }
  1037. if (func->second->minargs == N)
  1038. args.push_back(fmt.substr(q, p - q));
  1039. p++;
  1040. }
  1041. if (func->second->minargs != N) {
  1042. if ((int)args.size() < func->second->minargs)
  1043. throw "too few arguments to $" + var;
  1044. if (func->second->maxargs != N &&
  1045. (int)args.size() > func->second->maxargs)
  1046. throw "too many arguments to $" + var;
  1047. vector<string>::size_type n;
  1048. if (func->second->evalargs != N)
  1049. n = func->second->evalargs;
  1050. else
  1051. n = args.size();
  1052. for (vector<string>::size_type j = 0; j < n; j++)
  1053. args[j] = eval(args[j], param);
  1054. }
  1055. if (func->second->ensure == 'Q' || func->second->ensure == 'M')
  1056. ensure_query_parsed();
  1057. if (func->second->ensure == 'M') ensure_match();
  1058. string value;
  1059. switch (func->second->tag) {
  1060. case CMD_:
  1061. break;
  1062. case CMD_setnumrangefield: {
  1063. Xapian::NumberValueRangeProcessor numrange_proc(string_to_int(args[0]));
  1064. qp.add_valuerangeprocessor(&numrange_proc);
  1065. ensure_query_parsed(); /* this is doing what we need, but it's more a hack than a solution
  1066. ensure_query_parsed() won't be called before this step because "ensure" value is 0
  1067. (the 5th arg in the T macro above) */
  1068. break;
  1069. }
  1070. case CMD_add: {
  1071. int total = 0;
  1072. vector<string>::const_iterator i;
  1073. for (i = args.begin(); i != args.end(); i++)
  1074. total += string_to_int(*i);
  1075. value = str(total);
  1076. break;
  1077. }
  1078. case CMD_addfilter:
  1079. add_bterm(args[0]);
  1080. break;
  1081. case CMD_allterms: {
  1082. // list of all terms indexing document
  1083. int id = q0;
  1084. if (!args.empty()) id = string_to_int(args[0]);
  1085. Xapian::TermIterator term = db.termlist_begin(id);
  1086. for ( ; term != db.termlist_end(id); term++)
  1087. value = value + *term + '\t';
  1088. if (!value.empty()) value.erase(value.size() - 1);
  1089. break;
  1090. }
  1091. case CMD_and: {
  1092. value = "true";
  1093. for (vector<string>::const_iterator i = args.begin();
  1094. i != args.end(); i++) {
  1095. if (eval(*i, param).empty()) {
  1096. value.resize(0);
  1097. break;
  1098. }
  1099. }
  1100. break;
  1101. }
  1102. case CMD_cgi: {
  1103. MCI i = cgi_params.find(args[0]);
  1104. if (i != cgi_params.end()) value = i->second;
  1105. break;
  1106. }
  1107. case CMD_cgilist: {
  1108. pair<MCI, MCI> g;
  1109. g = cgi_params.equal_range(args[0]);
  1110. for (MCI i = g.first; i != g.second; i++)
  1111. value = value + i->second + '\t';
  1112. if (!value.empty()) value.erase(value.size() - 1);
  1113. break;
  1114. }
  1115. case CMD_collapsed: {
  1116. value = str(collapsed);
  1117. break;
  1118. }
  1119. case CMD_date:
  1120. value = args[0];
  1121. if (!value.empty()) {
  1122. char buf[64] = "";
  1123. time_t date = string_to_int(value);
  1124. if (date != (time_t)-1) {
  1125. struct tm *then;
  1126. then = gmtime(&date);
  1127. string date_fmt = "%Y-%m-%d";
  1128. if (args.size() > 1) date_fmt = eval(args[1], param);
  1129. strftime(buf, sizeof buf, date_fmt.c_str(), then);
  1130. }
  1131. value = buf;
  1132. }
  1133. break;
  1134. case CMD_dbname:
  1135. value = dbname;
  1136. break;
  1137. case CMD_dbsize: {
  1138. static Xapian::doccount dbsize;
  1139. if (!dbsize) dbsize = db.get_doccount();
  1140. value = str(dbsize);
  1141. break;
  1142. }
  1143. case CMD_def: {
  1144. func_attrib *fa = new func_attrib;
  1145. fa->tag = CMD_MACRO + macros.size();
  1146. fa->minargs = 0;
  1147. fa->maxargs = 9;
  1148. fa->evalargs = N; // FIXME: or 0?
  1149. fa->ensure = 0;
  1150. macros.push_back(args[1]);
  1151. func_map[args[0]] = fa;
  1152. break;
  1153. }
  1154. case CMD_defaultop:
  1155. if (default_op == Xapian::Query::OP_AND) {
  1156. value = "and";
  1157. } else {
  1158. value = "or";
  1159. }
  1160. break;
  1161. case CMD_div: {
  1162. int denom = string_to_int(args[1]);
  1163. if (denom == 0) {
  1164. value = "divide by 0";
  1165. } else {
  1166. value = str(string_to_int(args[0]) /
  1167. string_to_int(args[1]));
  1168. }
  1169. break;
  1170. }
  1171. case CMD_eq:
  1172. if (args[0] == args[1]) value = "true";
  1173. break;
  1174. case CMD_emptydocs: {
  1175. string t;
  1176. if (!args.empty())
  1177. t = args[0];
  1178. Xapian::PostingIterator i;
  1179. for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
  1180. if (i.get_doclength() != 0) continue;
  1181. if (!value.empty()) value += '\t';
  1182. value += str(*i);
  1183. }
  1184. break;
  1185. }
  1186. case CMD_env: {
  1187. char *env = getenv(args[0].c_str());
  1188. if (env != NULL) value = env;
  1189. break;
  1190. }
  1191. case CMD_error:
  1192. if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
  1193. error_msg = "Database `" + dbname + "' couldn't be opened";
  1194. }
  1195. value = error_msg;
  1196. break;
  1197. case CMD_field: {
  1198. Xapian::docid did = q0;
  1199. if (args.size() > 1) did = string_to_int(args[1]);
  1200. value = fields.get_field(did, args[0]);
  1201. break;
  1202. }
  1203. case CMD_filesize: {
  1204. // FIXME: rounding? i18n?
  1205. int size = string_to_int(args[0]);
  1206. int intpart = size;
  1207. int fraction = -1;
  1208. const char * format = 0;
  1209. if (size < 0) {
  1210. // Negative size -> empty result.
  1211. } else if (size == 1) {
  1212. format = "%d byte";
  1213. } else if (size < 1024) {
  1214. format = "%d bytes";
  1215. } else {
  1216. if (size < 1024*1024) {
  1217. format = "%d.%cK";
  1218. } else {
  1219. size /= 1024;
  1220. if (size < 1024*1024) {
  1221. format = "%d.%cM";
  1222. } else {
  1223. size /= 1024;
  1224. format = "%d.%cG";
  1225. }
  1226. }
  1227. intpart = unsigned(size) / 1024;
  1228. fraction = unsigned(size) % 1024;
  1229. }
  1230. if (format) {
  1231. char buf[200];
  1232. int len;
  1233. if (fraction == -1) {
  1234. len = my_snprintf(buf, sizeof(buf), format, intpart);
  1235. } else {
  1236. fraction = (fraction * 10 / 1024) + '0';
  1237. len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
  1238. }
  1239. if (len < 0 || (unsigned)len > sizeof(buf)) len = sizeof(buf);
  1240. value.assign(buf, len);
  1241. }
  1242. break;
  1243. }
  1244. case CMD_filters:
  1245. value = filters;
  1246. break;
  1247. case CMD_filterterms: {
  1248. Xapian::TermIterator term = db.allterms_begin();
  1249. term.skip_to(args[0]);
  1250. while (term != db.allterms_end()) {
  1251. string t = *term;
  1252. if (!startswith(t, args[0])) break;
  1253. value = value + t + '\t';
  1254. ++term;
  1255. }
  1256. if (!value.empty()) value.erase(value.size() - 1);
  1257. break;
  1258. }
  1259. case CMD_find: {
  1260. string l = args[0], s = args[1];
  1261. string::size_type i = 0, j = 0;
  1262. size_t count = 0;
  1263. while (j != l.size()) {
  1264. j = l.find('\t', i);
  1265. if (j == string::npos) j = l.size();
  1266. if (j - i == s.length()) {
  1267. if (memcmp(s.data(), l.data() + i, j - i) == 0) {
  1268. value = str(count);
  1269. break;
  1270. }
  1271. }
  1272. ++count;
  1273. i = j + 1;
  1274. }
  1275. break;
  1276. }
  1277. case CMD_fmt:
  1278. value = fmtname;
  1279. break;
  1280. case CMD_freq:
  1281. try {
  1282. value = str(mset.get_termfreq(args[0]));
  1283. } catch (const Xapian::InvalidOperationError&) {
  1284. // An MSet will raise this error if it's empty and not
  1285. // associated with a search.
  1286. value = str(db.get_termfreq(args[0]));
  1287. }
  1288. break;
  1289. case CMD_ge:
  1290. if (string_to_int(args[0]) >= string_to_int(args[1]))
  1291. value = "true";
  1292. break;
  1293. case CMD_gt:
  1294. if (string_to_int(args[0]) > string_to_int(args[1]))
  1295. value = "true";
  1296. break;
  1297. case CMD_highlight: {
  1298. string bra, ket;
  1299. if (args.size() > 2) {
  1300. bra = args[2];
  1301. if (args.size() > 3) {
  1302. ket = args[3];
  1303. } else {
  1304. string::const_iterator i;
  1305. i = find_if(bra.begin() + 2, bra.end(), p_nottag);
  1306. ket = "</";
  1307. ket += bra.substr(1, i - bra.begin() - 1);
  1308. ket += '>';
  1309. }
  1310. }
  1311. value = html_highlight(args[0], args[1], bra, ket);
  1312. break;
  1313. }
  1314. case CMD_hit:
  1315. // 0-based mset index
  1316. value = str(hit_no);
  1317. break;
  1318. case CMD_hitlist:
  1319. #if 0
  1320. const char *q;
  1321. int ch;
  1322. url_query_string = "?DB=";
  1323. url_query_string += dbname;
  1324. url_query_string += "&P=";
  1325. q = query_string.c_str();
  1326. while ((ch = *q++) != '\0') {
  1327. switch (ch) {
  1328. case '+':
  1329. url_query_string += "%2b";
  1330. break;
  1331. case '"':
  1332. url_query_string += "%22";
  1333. break;
  1334. case ' ':
  1335. ch = '+';
  1336. /* fall through */
  1337. default:
  1338. url_query_string += ch;
  1339. }
  1340. }
  1341. // add any boolean terms
  1342. for (FMCI i = filter_map.begin(); i != filter_map.end(); i++) {
  1343. url_query_string += "&B=";
  1344. url_query_string += i->second;
  1345. }
  1346. #endif
  1347. for (hit_no = topdoc; hit_no < last; hit_no++)
  1348. value += print_caption(args[0], param);
  1349. hit_no = 0;
  1350. break;
  1351. case CMD_hitsperpage:
  1352. value = str(hits_per_page);
  1353. break;
  1354. case CMD_hostname: {
  1355. value = args[0];
  1356. // remove URL scheme and/or path
  1357. string::size_type i = value.find("://");
  1358. if (i == string::npos) i = 0; else i += 3;
  1359. value = value.substr(i, value.find('/', i) - i);
  1360. // remove user@ or user:password@
  1361. i = value.find('@');
  1362. if (i != string::npos) value.erase(0, i + 1);
  1363. // remove :port
  1364. i = value.find(':');
  1365. if (i != string::npos) value.resize(i);
  1366. break;
  1367. }
  1368. case CMD_html:
  1369. value = html_escape(args[0]);
  1370. break;
  1371. case CMD_htmlstrip:
  1372. value = html_strip(args[0]);
  1373. break;
  1374. case CMD_httpheader:
  1375. if (!suppress_http_headers) {
  1376. cout << args[0] << ": " << args[1] << endl;
  1377. if (!set_content_type && args[0].length() == 12 &&
  1378. strcasecmp(args[0].c_str(), "Content-Type") == 0) {
  1379. set_content_type = true;
  1380. }
  1381. }
  1382. break;
  1383. case CMD_id:
  1384. // document id
  1385. value = str(q0);
  1386. break;
  1387. case CMD_if:
  1388. if (!args[0].empty())
  1389. value = eval(args[1], param);
  1390. else if (args.size() > 2)
  1391. value = eval(args[2], param);
  1392. break;
  1393. case CMD_include:
  1394. value = eval_file(args[0]);
  1395. break;
  1396. case CMD_last:
  1397. value = str(last);
  1398. break;
  1399. case CMD_lastpage: {
  1400. int l = mset.get_matches_estimated();
  1401. if (l > 0) l = (l - 1) / hits_per_page + 1;
  1402. value = str(l);
  1403. break;
  1404. }
  1405. case CMD_le:
  1406. if (string_to_int(args[0]) <= string_to_int(args[1]))
  1407. value = "true";
  1408. break;
  1409. case CMD_length:
  1410. if (args[0].empty()) {
  1411. value = "0";
  1412. } else {
  1413. size_t length = count(args[0].begin(), args[0].end(), '\t');
  1414. value = str(length + 1);
  1415. }
  1416. break;
  1417. case CMD_list: {
  1418. if (!args[0].empty()) {
  1419. string pre, inter, interlast, post;
  1420. switch (args.size()) {
  1421. case 2:
  1422. inter = interlast = args[1];
  1423. break;
  1424. case 3:
  1425. inter = args[1];
  1426. interlast = args[2];
  1427. break;
  1428. case 4:
  1429. pre = args[1];
  1430. inter = interlast = args[2];
  1431. post = args[3];
  1432. break;
  1433. case 5:
  1434. pre = args[1];
  1435. inter = args[2];
  1436. interlast = args[3];
  1437. post = args[4];
  1438. break;
  1439. }
  1440. value += pre;
  1441. string list = args[0];
  1442. string::size_type split = 0, split2;
  1443. while ((split2 = list.find('\t', split)) != string::npos) {
  1444. if (split) value += inter;
  1445. value += list.substr(split, split2 - split);
  1446. split = split2 + 1;
  1447. }
  1448. if (split) value += interlast;
  1449. value += list.substr(split);
  1450. value += post;
  1451. }
  1452. break;
  1453. }
  1454. case CMD_log: {
  1455. if (!vet_filename(args[0])) break;
  1456. string logfile = log_dir + args[0];
  1457. int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
  1458. if (fd == -1) break;
  1459. vector<string> noargs;
  1460. noargs.resize(1);
  1461. string line;
  1462. if (args.size() > 1) {
  1463. line = args[1];
  1464. } else {
  1465. line = DEFAULT_LOG_ENTRY;
  1466. }
  1467. line = eval(line, noargs);
  1468. line += '\n';
  1469. (void)write_all(fd, line.data(), line.length());
  1470. close(fd);
  1471. break;
  1472. }
  1473. case CMD_lookup: {
  1474. if (!vet_filename(args[0])) break;
  1475. string cdbfile = cdb_dir + args[0];
  1476. int fd = open(cdbfile.c_str(), O_RDONLY);
  1477. if (fd == -1) break;
  1478. struct cdb cdb;
  1479. cdb_init(&cdb, fd);
  1480. if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
  1481. size_t datalen = cdb_datalen(&cdb);
  1482. const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
  1483. if (q) {
  1484. value = string(static_cast<const char *>(dat), datalen);
  1485. }
  1486. }
  1487. cdb_free(&cdb);
  1488. close(fd); // FIXME: cache fds?
  1489. break;
  1490. }
  1491. case CMD_lower:
  1492. value = Xapian::Unicode::tolower(args[0]);
  1493. break;
  1494. case CMD_lt:
  1495. if (string_to_int(args[0]) < string_to_int(args[1]))
  1496. value = "true";
  1497. break;
  1498. case CMD_map:
  1499. if (!args[0].empty()) {
  1500. string l = args[0], pat = args[1];
  1501. vector<string> new_args(param);
  1502. string::size_type i = 0, j;
  1503. while (true) {
  1504. j = l.find('\t', i);
  1505. new_args[0] = l.substr(i, j - i);
  1506. value += eval(pat, new_args);
  1507. if (j == string::npos) break;
  1508. value += '\t';
  1509. i = j + 1;
  1510. }
  1511. }
  1512. break;
  1513. case CMD_max: {
  1514. vector<string>::const_iterator i = args.begin();
  1515. int val = string_to_int(*i++);
  1516. for (; i != args.end(); i++) {
  1517. int x = string_to_int(*i);
  1518. if (x > val) val = x;
  1519. }
  1520. value = str(val);
  1521. break;
  1522. }
  1523. case CMD_min: {
  1524. vector<string>::const_iterator i = args.begin();
  1525. int val = string_to_int(*i++);
  1526. for (; i != args.end(); i++) {
  1527. int x = string_to_int(*i);
  1528. if (x < val) val = x;
  1529. }
  1530. value = str(val);
  1531. break;
  1532. }
  1533. case CMD_msize:
  1534. // number of matches
  1535. value = str(mset.get_matches_estimated());
  1536. break;
  1537. case CMD_msizeexact:
  1538. // is msize exact?
  1539. if (mset.get_matches_lower_bound()
  1540. == mset.get_matches_upper_bound())
  1541. value = "true";
  1542. break;
  1543. case CMD_mod: {
  1544. int denom = string_to_int(args[1]);
  1545. if (denom == 0) {
  1546. value = "divide by 0";
  1547. } else {
  1548. value = str(string_to_int(args[0]) %
  1549. string_to_int(args[1]));
  1550. }
  1551. break;
  1552. }
  1553. case CMD_mul: {
  1554. vector<string>::const_iterator i = args.begin();
  1555. int total = string_to_int(*i++);
  1556. while (i != args.end())
  1557. total *= string_to_int(*i++);
  1558. value = str(total);
  1559. break;
  1560. }
  1561. case CMD_muldiv: {
  1562. int denom = string_to_int(args[2]);
  1563. if (denom == 0) {
  1564. value = "divide by 0";
  1565. } else {
  1566. int num = string_to_int(args[0]) * string_to_int(args[1]);
  1567. value = str(num / denom);
  1568. }
  1569. break;
  1570. }
  1571. case CMD_ne:
  1572. if (args[0] != args[1]) value = "true";
  1573. break;
  1574. case CMD_nice: {
  1575. string::const_iterator i = args[0].begin();
  1576. int len = args[0].length();
  1577. while (len) {
  1578. value += *i++;
  1579. if (--len && len % 3 == 0) value += option["thousand"];
  1580. }
  1581. break;
  1582. }
  1583. case CMD_not:
  1584. if (args[0].empty()) value = "true";
  1585. break;
  1586. case CMD_now: {
  1587. char buf[64];
  1588. my_snprintf(buf, sizeof(buf), "%lu", (unsigned long)time(NULL));
  1589. // MSVC's snprintf omits the zero byte if the string if
  1590. // sizeof(buf) long.
  1591. buf[sizeof(buf) - 1] = '\0';
  1592. value = buf;
  1593. break;
  1594. }
  1595. case CMD_opt:
  1596. if (args.size() == 2) {
  1597. value = option[args[0] + "," + args[1]];
  1598. } else {
  1599. value = option[args[0]];
  1600. }
  1601. break;
  1602. case CMD_or: {
  1603. for (vector<string>::const_iterator i = args.begin();
  1604. i != args.end(); i++) {
  1605. value = eval(*i, param);
  1606. if (!value.empty()) break;
  1607. }
  1608. break;
  1609. }
  1610. case CMD_pack:
  1611. value = int_to_binary_string(string_to_int(args[0]));
  1612. break;
  1613. case CMD_percentage:
  1614. // percentage score
  1615. value = str(percent);
  1616. break;
  1617. case CMD_prettyterm:
  1618. value = pretty_term(args[0]);
  1619. break;
  1620. case CMD_query:
  1621. value = query_string;
  1622. break;
  1623. case CMD_querydescription:
  1624. value = query.get_description();
  1625. break;
  1626. case CMD_queryterms:
  1627. value = queryterms;
  1628. break;
  1629. case CMD_range: {
  1630. int start = string_to_int(args[0]);
  1631. int end = string_to_int(args[1]);
  1632. while (start <= end) {
  1633. value += str(start);
  1634. if (start < end) value += '\t';
  1635. start++;
  1636. }
  1637. break;
  1638. }
  1639. case CMD_record: {
  1640. int id = q0;
  1641. if (!args.empty()) id = string_to_int(args[0]);
  1642. value = db.get_document(id).get_data();
  1643. break;
  1644. }
  1645. case CMD_relevant: {
  1646. // document id if relevant; empty otherwise
  1647. int id = q0;
  1648. if (!args.empty()) id = string_to_int(args[0]);
  1649. map<Xapian::docid, bool>::iterator i = ticked.find(id);
  1650. if (i != ticked.end()) {
  1651. i->second = false; // icky side-effect
  1652. value = str(id);
  1653. }
  1654. break;
  1655. }
  1656. case CMD_relevants: {
  1657. for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
  1658. i != ticked.end(); i++) {
  1659. if (i->second) {
  1660. value += str(i->first);
  1661. value += '\t';
  1662. }
  1663. }
  1664. if (!value.empty()) value.erase(value.size() - 1);
  1665. break;
  1666. }
  1667. case CMD_score:
  1668. // Score (0 to 10)
  1669. value = str(percent / 10);
  1670. break;
  1671. case CMD_set:
  1672. option[args[0]] = args[1];
  1673. break;
  1674. case CMD_setmap: {
  1675. string base = args[0] + ',';
  1676. if (args.size() % 2 != 1)
  1677. throw string("$setmap requires an odd number of arguments");
  1678. for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
  1679. option[base + args[i]] = args[i + 1];
  1680. }
  1681. break;
  1682. }
  1683. case CMD_setrelevant: {
  1684. string::size_type i = 0, j;
  1685. while (true) {
  1686. j = args[0].find_first_not_of("0123456789", i);
  1687. Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
  1688. if (id) {
  1689. rset.add_document(id);
  1690. ticked[id] = true;
  1691. }
  1692. if (j == string::npos) break;
  1693. i = j + 1;
  1694. }
  1695. break;
  1696. }
  1697. case CMD_slice: {
  1698. string list = args[0], pos = args[1];
  1699. vector<string> items;
  1700. string::size_type i = 0, j;
  1701. while (true) {
  1702. j = list.find('\t', i);
  1703. items.push_back(list.substr(i, j - i));
  1704. if (j == string::npos) break;
  1705. i = j + 1;
  1706. }
  1707. i = 0;
  1708. bool have_added = false;
  1709. while (true) {
  1710. j = pos.find('\t', i);
  1711. int item = string_to_int(pos.substr(i, j - i));
  1712. if (item >= 0 && size_t(item) < items.size()) {
  1713. if (have_added) value += '\t';
  1714. value += items[item];
  1715. have_added = true;
  1716. }
  1717. if (j == string::npos) break;
  1718. i = j + 1;
  1719. }
  1720. break;
  1721. }
  1722. case CMD_split: {
  1723. string split;
  1724. if (args.size() == 1) {
  1725. split = " ";
  1726. value = args[0];
  1727. } else {
  1728. split = args[0];
  1729. value = args[1];
  1730. }
  1731. string::size_type i = 0;
  1732. while (true) {
  1733. if (split.empty()) {
  1734. ++i;
  1735. if (i >= value.size()) break;
  1736. } else {
  1737. i = value.find(split, i);
  1738. if (i == string::npos) break;
  1739. }
  1740. value.replace(i, split.size(), 1, '\t');
  1741. ++i;
  1742. }
  1743. break;
  1744. }
  1745. case CMD_stoplist: {
  1746. Xapian::TermIterator i = qp.stoplist_begin();
  1747. Xapian::TermIterator end = qp.stoplist_end();
  1748. while (i != end) {
  1749. if (!value.empty()) value += '\t';
  1750. value += *i;
  1751. ++i;
  1752. }
  1753. break;
  1754. }
  1755. case CMD_sub:
  1756. value = str(string_to_int(args[0]) - string_to_int(args[1]));
  1757. break;
  1758. case CMD_substr: {
  1759. int start = string_to_int(args[1]);
  1760. if (start < 0) {
  1761. if (static_cast<size_t>(-start) >= args[0].size()) {
  1762. start = 0;
  1763. } else {
  1764. start = static_cast<int>(args[0].size()) + start;
  1765. }
  1766. } else {
  1767. if (static_cast<size_t>(start) >= args[0].size()) break;
  1768. }
  1769. size_t len = string::npos;
  1770. if (args.size() > 2) {

Large files are truncated, but you can click here to view the full file