PageRenderTime 112ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 1ms

/xapian-applications/omega/query.cc

https://github.com/harlentan/xapian
C++ | 2448 lines | 2149 code | 132 blank | 167 comment | 534 complexity | 053a839d997da4d889bc229a0b0492c5 MD5 | raw file
Possible License(s): GPL-2.0

Large files are truncated, but you can click here to view the full file

  1. /* query.cc: query executor for omega
  2. *
  3. * Copyright 1999,2000,2001 BrightStation PLC
  4. * Copyright 2001 James Aylett
  5. * Copyright 2001,2002 Ananova Ltd
  6. * Copyright 2002 Intercede 1749 Ltd
  7. * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014 Olly Betts
  8. * Copyright 2008 Thomas Viehmann
  9. *
  10. * This program is free software; you can redistribute it and/or
  11. * modify it under the terms of the GNU General Public License as
  12. * published by the Free Software Foundation; either version 2 of the
  13. * License, or (at your option) any later version.
  14. *
  15. * This program is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU General Public License
  21. * along with this program; if not, write to the Free Software
  22. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
  23. * USA
  24. */
  25. #include <config.h>
  26. // If we're building against git after the expand API changed but before the
  27. // version gets bumped to 1.3.2, we'll get a deprecation warning from
  28. // get_eset() unless we suppress such warnings here.
  29. #define XAPIAN_DEPRECATED(D) D
  30. #include <algorithm>
  31. #include <iostream>
  32. #include <map>
  33. #include <set>
  34. #include <vector>
  35. #include <cassert>
  36. #include <cctype>
  37. #include "safeerrno.h"
  38. #include <stdio.h>
  39. #include <cstdlib>
  40. #include <cstring>
  41. #include "strcasecmp.h"
  42. #include <ctime>
  43. #include "safeunistd.h"
  44. #include <sys/types.h>
  45. #include "safesysstat.h"
  46. #include "safefcntl.h"
  47. #include "realtime.h"
  48. #include <cdb.h>
  49. #include "date.h"
  50. #include "datematchdecider.h"
  51. #include "jsonescape.h"
  52. #include "utils.h"
  53. #include "omega.h"
  54. #include "query.h"
  55. #include "cgiparam.h"
  56. #include "loadfile.h"
  57. #include "sample.h"
  58. #include "str.h"
  59. #include "stringutils.h"
  60. #include "transform.h"
  61. #include "urldecode.h"
  62. #include "urlencode.h"
  63. #include "unixperm.h"
  64. #include "values.h"
  65. #include "weight.h"
  66. #include "expand.h"
  67. #include <xapian.h>
  68. using namespace std;
  69. using Xapian::Utf8Iterator;
  70. using Xapian::Unicode::is_wordchar;
  71. #ifndef SNPRINTF
  72. #include <cstdarg>
  /** Bounded formatted output, used when no native snprintf() was configured.
   *
   *  Formats into @a str, which has room for @a size bytes including the
   *  terminating NUL.
   *
   *  @param str     Destination buffer.
   *  @param size    Capacity of @a str in bytes.
   *  @param format  printf-style format string, followed by its arguments.
   *
   *  @return The number of characters written, excluding the terminating NUL.
   *
   *  The process is aborted if formatting fails or the output does not fit.
   *  Unlike a vsprintf()-based implementation, nothing is ever written past
   *  the end of the buffer before the problem is detected.
   */
  static int my_snprintf(char *str, size_t size, const char *format, ...)
  {
      // A zero-sized buffer cannot even hold the terminating NUL.
      if (size == 0)
          abort();
      va_list ap;
      va_start(ap, format);
      // vsnprintf() writes at most size bytes and reports the length the
      // complete output would have needed.
      int res = vsnprintf(str, size, format, ap);
      va_end(ap);
      if (res < 0 || size_t(res) >= size)
          abort(); /* Formatting failed or the output was truncated. */
      return res;
  }
  85. #else
  86. #define my_snprintf SNPRINTF
  87. #endif
  // ---- File-scope state shared by the functions in this file ----
  // NOTE(review): the two flags below are presumably maintained by
  // ensure_query_parsed() / ensure_match(), whose bodies are not in view.
  88. static bool query_parsed = false;
  89. static bool done_query = false;
  90. static Xapian::docid last = 0;
  // Match results; assigned from enquire->get_mset() in run_query().
  91. static Xapian::MSet mset;
  92. static map<Xapian::docid, bool> ticked;
  93. static void ensure_query_parsed();
  94. static void ensure_match();
  // The query being built: assigned in set_probabilistic() and then combined
  // with filters in run_query().
  95. static Xapian::Query query;
  96. //static string url_query_string;
  // Not static, so this has external linkage.
  97. Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode
  98. static Xapian::QueryParser qp;
  // Allocated on first use in set_probabilistic().
  99. static Xapian::NumberValueRangeProcessor * size_vrp = NULL;
  // Allocated on first use in html_highlight().
  100. static Xapian::Stem *stemmer = NULL;
  101. static string eval_file(const string &fmtfile);
  // Distinct terms of the parsed query, collected in set_probabilistic().
  102. static set<string> termset;
  103. // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
  104. static map<string, string> termprefix_to_userprefix;
  // Tab-separated list of the terms added to termset, in the order first seen.
  105. static string queryterms;
  // Query parse error message; run_query() returns early while this is
  // non-empty.
  106. static string error_msg;
  // NOTE(review): presumably the match duration in seconds, with -1 meaning
  // "not measured" -- it is not written anywhere in view; confirm.
  107. static double secs = -1;
  // Template for a log line, written as a sequence of $-commands whose
  // outputs are separated by tabs: remote host (falling back to remote
  // address, then "-"), a timestamp, the request kind (add / morelike /
  // query), the database name, the query, the match count and, if present,
  // the HTTP referer.
  // NOTE(review): presumably the default used by the $log command when no
  // format is supplied -- the use site is not in view.
  108. static const char DEFAULT_LOG_ENTRY[] =
  109. "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
  110. "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
  111. "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
  112. "$dbname\t"
  113. "$query\t"
  114. "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
  115. class MyStopper : public Xapian::Stopper {
  116. public:
  117. bool operator()(const string &t) const {
  118. switch (t[0]) {
  119. case 'a':
  120. return (t == "a" || t == "about" || t == "an" || t == "and" ||
  121. t == "are" || t == "as" || t == "at");
  122. case 'b':
  123. return (t == "be" || t == "by");
  124. case 'e':
  125. return (t == "en");
  126. case 'f':
  127. return (t == "for" || t == "from");
  128. case 'h':
  129. return (t == "how");
  130. case 'i':
  131. return (t == "i" || t == "in" || t == "is" || t == "it");
  132. case 'o':
  133. return (t == "of" || t == "on" || t == "or");
  134. case 't':
  135. return (t == "that" || t == "the" || t == "this" || t == "to");
  136. case 'w':
  137. return (t == "was" || t == "what" || t == "when" ||
  138. t == "where" || t == "which" || t == "who" ||
  139. t == "why" || t == "will" || t == "with");
  140. case 'y':
  141. return (t == "you" || t == "your");
  142. default:
  143. return false;
  144. }
  145. }
  146. };
  /** Extract the term prefix from a term.
   *
   *  A term starting with 'X' has a multi-character prefix: the 'X' plus any
   *  upper-case letters which follow it.  Any other non-empty term has its
   *  first character as the prefix.
   *
   *  @param prefix  Set to the prefix (emptied if @a term is empty).
   *  @param term    The term to examine.
   *
   *  @return The offset in @a term just past the prefix; for an 'X' prefix a
   *          ':' directly after the prefix is skipped as well.  0 if @a term
   *          is empty.
   */
  static size_t
  prefix_from_term(string &prefix, const string &term)
  {
      if (term.empty()) {
          prefix.clear();
          return 0;
      }

      if (term[0] != 'X') {
          // Single character prefix.
          prefix.assign(1, term[0]);
          return 1;
      }

      // Extended prefix: 'X' followed by a run of upper-case letters.
      string::size_type pos = 1;
      while (pos < term.size() &&
             isupper(static_cast<unsigned char>(term[pos]))) {
          ++pos;
      }
      prefix.assign(term, 0, pos);
      // An optional ':' separates the prefix from the rest of the term.
      if (pos < term.size() && term[pos] == ':')
          ++pos;
      return pos;
  }
  165. // Don't allow ".." in format names, log file names, etc as this would allow
  166. // people to open a format "../../etc/passwd" or similar.
  167. // FIXME: make this check more exact ("foo..bar" is safe)
  168. // FIXME: log when this check fails
  /** Check that a filename is acceptable to open.
   *
   *  @param filename  The name to check.
   *
   *  @return false if @a filename contains ".." anywhere, true otherwise.
   */
  static bool
  vet_filename(const string &filename)
  {
      const bool has_dotdot = (filename.find("..") != string::npos);
      return !has_dotdot;
  }
  175. // Heuristics:
  176. // * If any terms have been removed, it's a "fresh query" so we discard any
  177. // relevance judgements
  178. // * If all previous terms are there but more have been added then we keep
  179. // the relevance judgements, but return the first page of hits
  180. //
  181. // NEW_QUERY entirely new query
  182. // SAME_QUERY unchanged query
  183. // EXTENDED_QUERY new query, but based on the old one
  184. // BAD_QUERY parse error (message in error_msg)
  // Classification returned by set_probabilistic().
  185. typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;
  // Query strings still to be parsed, keyed by prefix; filled in by
  // set_probabilistic_query() and consumed by set_probabilistic().
  186. static map<string, string> probabilistic_query;
  187. void
  188. set_probabilistic_query(const string & prefix, const string & s)
  189. {
  190. string query_string = s;
  191. // Strip leading and trailing whitespace from query_string.
  192. trim(query_string);
  193. if (!query_string.empty())
  194. probabilistic_query.insert(make_pair(prefix, query_string));
  195. }
  196. static unsigned
  197. read_qp_flags(const string & opt_pfx, unsigned f)
  198. {
  199. map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
  200. for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
  201. unsigned mask = 0;
  202. const char * s = i->first.c_str() + opt_pfx.size();
  203. switch (s[0]) {
  204. case 'a':
  205. if (strcmp(s, "auto_multiword_synonyms") == 0) {
  206. mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
  207. break;
  208. }
  209. if (strcmp(s, "auto_synonyms") == 0) {
  210. mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
  211. break;
  212. }
  213. break;
  214. case 'b':
  215. if (strcmp(s, "boolean") == 0) {
  216. mask = Xapian::QueryParser::FLAG_BOOLEAN;
  217. break;
  218. }
  219. if (strcmp(s, "boolean_any_case") == 0) {
  220. mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
  221. break;
  222. }
  223. break;
  224. case 'd':
  225. if (strcmp(s, "default") == 0) {
  226. mask = Xapian::QueryParser::FLAG_DEFAULT;
  227. break;
  228. }
  229. break;
  230. case 'l':
  231. if (strcmp(s, "lovehate") == 0) {
  232. mask = Xapian::QueryParser::FLAG_LOVEHATE;
  233. break;
  234. }
  235. break;
  236. case 'p':
  237. if (strcmp(s, "partial") == 0) {
  238. mask = Xapian::QueryParser::FLAG_PARTIAL;
  239. break;
  240. }
  241. if (strcmp(s, "phrase") == 0) {
  242. mask = Xapian::QueryParser::FLAG_PHRASE;
  243. break;
  244. }
  245. if (strcmp(s, "pure_not") == 0) {
  246. mask = Xapian::QueryParser::FLAG_PURE_NOT;
  247. break;
  248. }
  249. break;
  250. case 's':
  251. if (strcmp(s, "spelling_correction") == 0) {
  252. mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
  253. break;
  254. }
  255. if (strcmp(s, "synonym") == 0) {
  256. mask = Xapian::QueryParser::FLAG_SYNONYM;
  257. break;
  258. }
  259. break;
  260. case 'w':
  261. if (strcmp(s, "wildcard") == 0) {
  262. mask = Xapian::QueryParser::FLAG_WILDCARD;
  263. break;
  264. }
  265. break;
  266. }
  267. if (i->second.empty()) {
  268. f &= ~mask;
  269. } else {
  270. f |= mask;
  271. }
  272. }
  273. return f;
  274. }
  // Configure the query parser from the options, parse every query string
  // registered with set_probabilistic_query() into the file-scope `query`,
  // and classify the result against the previous query's terms.
  //
  // oldp: the previous query's terms, separated by tabs (empty if none).
  //
  // Returns BAD_QUERY if parsing throws Xapian::QueryParserError (the message
  // is stored in error_msg); otherwise NEW_QUERY, SAME_QUERY or
  // EXTENDED_QUERY according to the heuristics coded below.
  //
  // Side effects: updates qp, size_vrp, termprefix_to_userprefix, query,
  // termset and queryterms.
  275. static querytype
  276. set_probabilistic(const string &oldp)
  277. {
  278. // Parse the query string.
  279. qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
  // NOTE(review): the MyStopper allocated here is never deleted in the code
  // in view.
  280. qp.set_stopper(new MyStopper());
  281. qp.set_default_op(default_op);
  282. qp.set_database(db);
  283. // FIXME: provide a custom VRP which handles size:10..20K, etc.
  284. if (!size_vrp)
  285. size_vrp = new Xapian::NumberValueRangeProcessor(VALUE_SIZE, "size:",
  286. true);
  287. qp.add_valuerangeprocessor(size_vrp);
  // Options named "prefix,<user prefix>" map a user prefix onto a
  // tab-separated list of term prefixes.
  288. map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
  289. for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
  // Skip the 7 characters of "prefix,".
  290. string user_prefix(pfx->first, 7);
  291. const string & term_pfx_list = pfx->second;
  292. string::size_type i = 0;
  293. do {
  294. string::size_type i0 = i;
  295. i = term_pfx_list.find('\t', i);
  296. const string & term_pfx = term_pfx_list.substr(i0, i - i0);
  297. qp.add_prefix(user_prefix, term_pfx);
  298. // std::map::insert() won't overwrite an existing entry, so we'll
  299. // prefer the first user_prefix for which a particular term prefix
  300. // is specified.
  301. termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
  // After the last entry find() returned npos, so ++i wraps round to 0 and
  // ends the loop; otherwise ++i steps past the tab.
  302. } while (++i);
  303. }
  // Options named "boolprefix,<user prefix>" map a user prefix onto a single
  // boolean term prefix.
  304. pfx = option.lower_bound("boolprefix,");
  305. for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
  // Skip the 11 characters of "boolprefix,".
  306. string user_prefix = pfx->first.substr(11);
  307. qp.add_boolean_prefix(user_prefix, pfx->second);
  308. termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
  309. }
  310. try {
  // Flags from the "flag_*" options apply to every input; per-input
  // "<prefix>:flag_*" options are applied on top of them below.
  311. unsigned default_flags = read_qp_flags("flag_", 0);
  312. if (option["spelling"] == "true")
  313. default_flags |= qp.FLAG_SPELLING_CORRECTION;
  314. vector<Xapian::Query> queries;
  315. queries.reserve(probabilistic_query.size());
  316. map<string, string>::const_iterator j;
  317. for (j = probabilistic_query.begin();
  318. j != probabilistic_query.end();
  319. ++j) {
  320. const string & prefix = j->first;
  321. // Choose the stemmer to use for this input.
  322. string stemlang = option[prefix + ":stemmer"];
  323. if (stemlang.empty())
  324. stemlang = option["stemmer"];
  325. qp.set_stemmer(Xapian::Stem(stemlang));
  326. // Work out the flags to use for this input.
  327. unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
  328. const string & query_string = j->second;
  329. queries.push_back(qp.parse_query(query_string, f, prefix));
  330. }
  // AND together the queries parsed from the individual inputs.
  331. query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
  332. } catch (Xapian::QueryParserError &e) {
  333. error_msg = e.get_msg();
  334. return BAD_QUERY;
  335. }
  // Collect the query's terms.  termset/queryterms only gain terms not seen
  // before, while n_new_terms counts every term the iterator yields.
  336. Xapian::termcount n_new_terms = 0;
  337. for (Xapian::TermIterator i = query.get_terms_begin();
  338. i != query.get_terms_end(); ++i) {
  339. if (termset.find(*i) == termset.end()) {
  340. termset.insert(*i);
  341. if (!queryterms.empty()) queryterms += '\t';
  342. queryterms += *i;
  343. }
  344. n_new_terms++;
  345. }
  346. // Check new query against the previous one
  347. if (oldp.empty()) {
  348. // FIXME: should take into account other probabilistic prefixes here...
  // Note operator[] adds an empty entry for the empty prefix if there wasn't
  // one.
  349. return probabilistic_query[string()].empty() ? SAME_QUERY : NEW_QUERY;
  350. }
  351. // The terms in oldp are separated by tabs.
  352. const char oldp_separator = '\t';
  353. size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
  354. // short-cut: if the new query has fewer terms, it must be a new one
  355. if (n_new_terms < n_old_terms) return NEW_QUERY;
  // Every term of the old query must still be present, otherwise this is a
  // new query.
  356. const char *term = oldp.c_str();
  357. const char *pend;
  358. while ((pend = strchr(term, oldp_separator)) != NULL) {
  359. if (termset.find(string(term, pend - term)) == termset.end())
  360. return NEW_QUERY;
  361. term = pend + 1;
  362. }
  // Check the final term, which has no tab after it (skipped if empty).
  363. if (*term) {
  364. if (termset.find(string(term)) == termset.end())
  365. return NEW_QUERY;
  366. }
  367. // Use termset.size() rather than n_new_terms so we correctly handle
  368. // the case when the query has repeated terms.
  369. // This works wrongly in the case when the user extends the query
  370. // by adding a term already in it, but that's unlikely and the behaviour
  371. // isn't too bad (we just don't reset page 1). We also mishandle a few
  372. // other obscure cases e.g. adding quotes to turn a query into a phrase.
  373. if (termset.size() > n_old_terms) return EXTENDED_QUERY;
  374. return SAME_QUERY;
  375. }
  // Boolean filter terms keyed by their term prefix; filled in by add_bterm()
  // and turned into a filter query by run_query().
  376. static multimap<string, string> filter_map;
  377. typedef multimap<string, string>::const_iterator FMCI;
  378. void add_bterm(const string &term) {
  379. string prefix;
  380. if (prefix_from_term(prefix, term) > 0)
  381. filter_map.insert(multimap<string, string>::value_type(prefix, term));
  382. }
  // Combine the parsed query with the boolean filters and any date
  // restriction, configure the enquire object and run the match, storing the
  // result in the file-scope `mset`.
  //
  // The query is still built when `enquire` is null or error_msg is set, but
  // in either case the function returns before touching enquire.
  383. static void
  384. run_query()
  385. {
  386. bool force_boolean = false;
  387. if (!filter_map.empty()) {
  388. // OR together filters with the same prefix, then AND together
  389. vector<Xapian::Query> filter_vec;
  390. vector<string> or_vec;
  391. string current;
  // The loop deliberately runs one step beyond the last entry (over == true)
  // so that the final group of same-prefix terms gets flushed as well.
  392. for (FMCI i = filter_map.begin(); ; i++) {
  393. bool over = (i == filter_map.end());
  394. if (over || i->first != current) {
  395. switch (or_vec.size()) {
  396. case 0:
  397. break;
  398. case 1:
  399. filter_vec.push_back(Xapian::Query(or_vec[0]));
  400. break;
  401. default:
  402. filter_vec.push_back(Xapian::Query(Xapian::Query::OP_OR,
  403. or_vec.begin(),
  404. or_vec.end()));
  405. break;
  406. }
  407. or_vec.clear();
  408. if (over) break;
  409. current = i->first;
  410. }
  411. or_vec.push_back(i->second);
  412. }
  413. Xapian::Query filter(Xapian::Query::OP_AND,
  414. filter_vec.begin(), filter_vec.end());
  415. if (query.empty()) {
  416. // If no probabilistic query is provided then promote the filters
  417. // to be THE query - filtering an empty query will give no
  418. // matches.
  419. std::swap(query, filter);
  // force_boolean is handed to set_weighting_scheme() further down.
  420. if (enquire) force_boolean = true;
  421. } else {
  422. query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
  423. }
  424. }
  // Date restriction: if a DATEVALUE CGI parameter is given, dates are
  // checked by a match decider; otherwise a date range filter (ORed with the
  // term "Dlatest") is applied to the query.
  // NOTE(review): the DateMatchDecider allocated below is not deleted within
  // this function.
  425. Xapian::MatchDecider * mdecider = NULL;
  426. if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
  427. MCI i = cgi_params.find("DATEVALUE");
  428. if (i != cgi_params.end()) {
  429. Xapian::valueno datevalue = string_to_int(i->second);
  430. mdecider = new DateMatchDecider(datevalue, date_start, date_end, date_span);
  431. } else {
  432. Xapian::Query date_filter(Xapian::Query::OP_OR,
  433. date_range_filter(date_start, date_end,
  434. date_span),
  435. Xapian::Query("Dlatest"));
  436. // If no probabilistic query is provided then promote the daterange
  437. // filter to be THE query instead of filtering an empty query.
  438. if (query.empty()) {
  439. query = date_filter;
  440. } else {
  441. query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
  442. }
  443. }
  444. }
  // Nothing to run without an enquire object, or after an earlier error.
  445. if (!enquire || !error_msg.empty()) return;
  446. set_weighting_scheme(*enquire, option, force_boolean);
  447. enquire->set_cutoff(threshold);
  448. if (sort_key != Xapian::BAD_VALUENO) {
  449. if (sort_after) {
  450. enquire->set_sort_by_relevance_then_value(sort_key, sort_ascending);
  451. } else {
  452. enquire->set_sort_by_value_then_relevance(sort_key, sort_ascending);
  453. }
  454. }
  455. enquire->set_docid_order(docid_order);
  456. if (collapse) {
  457. enquire->set_collapse_key(collapse_key);
  458. }
  // With an empty query no match is run and mset is left as it was.
  459. if (!query.empty()) {
  460. #if 0
  461. // FIXME: If we start doing permissions checks based on $REMOTE_USER
  462. // we're going to break some existing setups if users upgrade. We
  463. // probably want a way to set this from OmegaScript.
  464. const char * remote_user = getenv("REMOTE_USER");
  465. if (remote_user)
  466. apply_unix_permissions(query, remote_user);
  467. #endif
  468. enquire->set_query(query);
  469. // We could use the value of topdoc as first parameter, but we
  470. // need to know the first few items in the mset to fake a
  471. // relevance set for topterms.
  472. //
  473. // If min_hits isn't set, check at least one extra result so we
  474. // know if we've reached the end of the matches or not - then we
  475. // can avoid offering a "next" button which leads to an empty page.
  476. mset = enquire->get_mset(0, topdoc + hits_per_page,
  477. topdoc + max(hits_per_page + 1, min_hits),
  478. &rset, mdecider);
  479. }
  480. }
  /** Escape a string for inclusion in HTML.
   *
   *  @param str  The text to escape.
   *
   *  @return A copy of @a str with '<', '>', '&' and '"' replaced by the
   *          entities "&lt;", "&gt;", "&amp;" and "&quot;" respectively.  All
   *          other characters are copied unchanged.
   */
  string
  html_escape(const string &str)
  {
      string escaped;
      for (string::const_iterator it = str.begin(); it != str.end(); ++it) {
          if (*it == '<') {
              escaped += "&lt;";
          } else if (*it == '>') {
              escaped += "&gt;";
          } else if (*it == '&') {
              escaped += "&amp;";
          } else if (*it == '"') {
              escaped += "&quot;";
          } else {
              escaped += *it;
          }
      }
      return escaped;
  }
  /** Remove HTML tags from a string.
   *
   *  Everything from a '<' up to and including the next '>' is dropped.  A
   *  '>' which isn't inside a tag is dropped too, and an unterminated tag
   *  swallows the rest of the string.
   *
   *  @param str  The text to strip.
   *
   *  @return @a str with the tags removed.
   */
  static string
  html_strip(const string &str)
  {
      string stripped;
      bool in_tag = false;
      for (string::size_type pos = 0; pos != str.size(); ++pos) {
          const char ch = str[pos];
          if (ch == '<') {
              in_tag = true;
          } else if (ch == '>') {
              in_tag = false;
          } else if (!in_tag) {
              stripped += ch;
          }
      }
      return stripped;
  }
  528. // FIXME split list into hash or map and use that rather than linear lookup?
  /** Look a word up in a tab-separated list.
   *
   *  @param word  The word to look for.
   *  @param list  The list entries, separated by tabs.
   *
   *  @return The 0-based index of the first entry which is exactly equal to
   *          @a word, or -1 if there is no such entry.
   */
  static int word_in_list(const string& word, const string& list)
  {
      int index = 0;
      string::size_type start = 0;
      for (;;) {
          const string::size_type tab = list.find('\t', start);
          // For the last entry there is no tab; a length of npos makes
          // compare() use everything up to the end of the list.
          const string::size_type len =
              (tab == string::npos) ? string::npos : tab - start;
          if (list.compare(start, len, word) == 0)
              return index;
          if (tab == string::npos)
              return -1;
          start = tab + 1;
          ++index;
      }
  }
  547. // Not a character in an identifier
  /** Predicate: is this NOT a character allowed in an identifier?
   *
   *  @param c  The character; only its low 8 bits are passed to isalnum().
   *
   *  @return false for '_' and for characters which isalnum() accepts, true
   *          otherwise.
   */
  inline static bool
  p_notid(unsigned int c)
  {
      if (c == '_')
          return false;
      return isalnum(static_cast<unsigned char>(c)) == 0;
  }
  553. // Not a character in an HTML tag name
  /** Predicate: is this NOT a character allowed in an HTML tag name?
   *
   *  @param c  The character; only its low 8 bits are passed to isalnum().
   *
   *  @return false for '.', '-' and for characters which isalnum() accepts,
   *          true otherwise.
   */
  inline static bool
  p_nottag(unsigned int c)
  {
      if (c == '.' || c == '-')
          return false;
      return isalnum(static_cast<unsigned char>(c)) == 0;
  }
  559. // FIXME: shares algorithm with indextext.cc!
  // HTML-escape `s`, wrapping every word which matches an entry of `list` in
  // highlighting markup.
  //
  // s:    text to process, iterated as UTF-8.
  // list: tab-separated entries to match against (see word_in_list()).  A
  //       word matches if its lower-cased form is in the list or, failing
  //       that, if "Z" followed by its stem is.
  // bra:  text emitted before a matching word.  If empty, a
  //       <b style="..."> tag is emitted instead, with the colour chosen by
  //       the matching entry's index in the list, and "</b>" closes it.
  // ket:  text emitted after a matching word; only used if `bra` is
  //       non-empty.
  //
  // Returns the escaped and highlighted text.  The file-scope `stemmer` is
  // created on first use from option["stemmer"].
  560. static string
  561. html_highlight(const string &s, const string &list,
  562. const string &bra, const string &ket)
  563. {
  564. if (!stemmer) {
  565. stemmer = new Xapian::Stem(option["stemmer"]);
  566. }
  567. string res;
  568. Utf8Iterator j(s);
  569. const Utf8Iterator s_end;
  570. while (true) {
  // Skip forward to the start of the next word; stop if there isn't one.
  571. Utf8Iterator first = j;
  572. while (first != s_end && !is_wordchar(*first)) ++first;
  573. if (first == s_end) break;
  574. Utf8Iterator term_end;
  575. string term;
  576. string word;
  // l marks the start of the text not yet copied to res.
  577. const char *l = j.raw();
  // First try to read upper-case ASCII letters separated by single '.'
  // characters (the dots aren't added to term).
  578. if (*first < 128 && isupper(*first)) {
  579. j = first;
  580. Xapian::Unicode::append_utf8(term, *j);
  581. while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && isupper(*j)) {
  582. Xapian::Unicode::append_utf8(term, *j);
  583. }
  // Reject it if only one letter was read or a word character follows
  // directly; the word is then re-read by the general case below.
  584. if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
  585. term.resize(0);
  586. }
  587. term_end = j;
  588. }
  589. if (term.empty()) {
  // General case: a run of word characters.  A '&' or '\'' is kept as part of
  // the term only if a word character follows it.
  590. j = first;
  591. while (is_wordchar(*j)) {
  592. Xapian::Unicode::append_utf8(term, *j);
  593. ++j;
  594. if (j == s_end) break;
  595. if (*j == '&' || *j == '\'') {
  596. Utf8Iterator next = j;
  597. ++next;
  598. if (next == s_end || !is_wordchar(*next)) break;
  599. term += *j;
  600. j = next;
  601. }
  602. }
  603. term_end = j;
  // Optional suffix: a run of '#' (added to term as a single '#') or a run of
  // '+'/'-' characters.  The suffix is dropped again if it added more than 3
  // bytes to term or a word character follows it.
  604. if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
  605. string::size_type len = term.length();
  606. if (*j == '#') {
  607. term += '#';
  608. do { ++j; } while (j != s_end && *j == '#');
  609. } else {
  610. while (j != s_end && (*j == '+' || *j == '-')) {
  611. Xapian::Unicode::append_utf8(term, *j);
  612. ++j;
  613. }
  614. }
  615. if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
  616. term.resize(len);
  617. } else {
  618. term_end = j;
  619. }
  620. }
  621. }
  622. j = term_end;
  // Look the lower-cased term up first, then "Z" plus its stem.
  623. term = Xapian::Unicode::tolower(term);
  624. int match = word_in_list(term, list);
  625. if (match == -1) {
  626. string stem = "Z";
  627. stem += (*stemmer)(term);
  628. match = word_in_list(stem, list);
  629. }
  630. if (match >= 0) {
  // Copy the text before the word, then the word wrapped in the markup.
  631. res += html_escape(string(l, first.raw() - l));
  632. if (!bra.empty()) {
  633. res += bra;
  634. } else {
  635. static const char * colours[] = {
  636. "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
  637. "990000", "009900", "996600", "006699", "990099"
  638. };
  639. size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
  640. const char * bg = colours[idx];
  // Backgrounds whose hex code contains an 'f' get black text, the others
  // white text.
  641. if (strchr(bg, 'f')) {
  642. res += "<b style=\"color:black;background-color:#";
  643. } else {
  644. res += "<b style=\"color:white;background-color:#";
  645. }
  646. res += bg;
  647. res += "\">";
  648. }
  // Emit the word exactly as it appears in the input rather than the
  // normalised term.
  649. word = string(first.raw(), j.raw() - first.raw());
  650. res += html_escape(word);
  651. if (!bra.empty()) {
  652. res += ket;
  653. } else {
  654. res += "</b>";
  655. }
  656. } else {
  // No match: copy everything up to the end of the word unchanged.
  657. res += html_escape(string(l, j.raw() - l));
  658. }
  659. }
  // Copy any text left over after the last word.
  660. if (j != s_end) res += html_escape(string(j.raw(), j.left()));
  661. return res;
  662. }
  663. #if 0
  // Disabled code (inside #if 0): it uses url_query_string, whose declaration
  // earlier in this file is commented out.
  //
  // Writes url_query_string to cout.  If `after` starts with "&B=", the
  // character after that is taken as a prefix and -- as far as can be told
  // from reading the loop -- each "B=<prefix>..." parameter which directly
  // follows an '&' is left out of the output.
  // NOTE(review): url_query_string[amp + 1] and [amp + 2] are read without a
  // bounds check; review before re-enabling.
  664. static void
  665. print_query_string(const char *after)
  666. {
  667. if (after && strncmp(after, "&B=", 3) == 0) {
  668. char prefix = after[3];
  669. string::size_type start = 0, amp = 0;
  670. while (true) {
  671. amp = url_query_string.find('&', amp);
  672. if (amp == string::npos) {
  673. cout << url_query_string.substr(start);
  674. return;
  675. }
  676. amp++;
  677. while (url_query_string[amp] == 'B' &&
  678. url_query_string[amp + 1] == '=' &&
  679. url_query_string[amp + 2] == prefix) {
  // Output the text before this parameter (without the '&' introducing it),
  // then skip to the next '&'.
  680. cout << url_query_string.substr(start, amp - start - 1);
  681. start = url_query_string.find('&', amp + 3);
  682. if (start == string::npos) return;
  683. amp = start + 1;
  684. }
  685. }
  686. }
  687. cout << url_query_string;
  688. }
  689. #endif
  690. class Fields {
  691. mutable Xapian::docid did_cached;
  692. mutable map<string, string> fields;
  693. void read_fields(Xapian::docid did) const;
  694. public:
  695. Fields() : did_cached(0) { }
  696. const string & get_field(Xapian::docid did, const string & field) const {
  697. if (did != did_cached) read_fields(did);
  698. return fields[field];
  699. }
  700. };
  // Replace the cached fields with those parsed from the data of document
  // `did`.
  //
  // Two data layouts are handled: if option["fieldnames"] is non-empty, line
  // N of the data is the value of the Nth name in that tab-separated list;
  // otherwise every line is of the form NAME=VALUE.
  //
  // NOTE(review): did_cached is updated before db.get_document() is called,
  // so if that call throws, the cache is left empty but marked as holding
  // `did` -- confirm whether callers rely on this.
  701. void
  702. Fields::read_fields(Xapian::docid did) const
  703. {
  704. fields.clear();
  705. did_cached = did;
  706. const string & data = db.get_document(did).get_data();
  707. // Parse document data.
  708. string::size_type i = 0;
  709. const string & names = option["fieldnames"];
  710. if (!names.empty()) {
  711. // Each line is a field, with fieldnames taken from corresponding
  712. // entries in the tab-separated list specified by $opt{fieldnames}.
  713. string::size_type n = 0;
  714. do {
  715. string::size_type n0 = n;
  716. n = names.find('\t', n);
  717. string::size_type i0 = i;
  718. i = data.find('\n', i);
  // insert() keeps the first value if a name occurs more than once.
  719. fields.insert(make_pair(names.substr(n0, n - n0),
  720. data.substr(i0, i - i0)));
  // find() returns npos once there are no more separators, and incrementing
  // npos wraps round to 0 -- so the loop ends as soon as either the names or
  // the lines of data run out.
  721. } while (++n && ++i);
  722. } else {
  723. // Each line is a field, in the format NAME=VALUE. We assume the field
  724. // name doesn't contain an "=". Lines without an "=" are currently
  725. // just ignored.
  726. do {
  727. string::size_type i0 = i;
  728. i = data.find('\n', i);
  729. string line = data.substr(i0, i - i0);
  730. string::size_type j = line.find('=');
  731. if (j != string::npos) {
  // Values of a repeated field name are joined with tabs.
  732. string & value = fields[line.substr(0, j)];
  733. if (!value.empty()) value += '\t';
  734. value += line.substr(j + 1);
  735. }
  // ++i steps past the newline, or wraps npos round to 0 after the last line
  // to end the loop.
  736. } while (++i);
  737. }
  738. }
  // The single Fields cache instance for this file.
  739. static Fields fields;
  // NOTE(review): the variables below are presumably the state of the hit
  // currently being rendered (docid, hit number, percentage score, weight,
  // collapse count); they are set by code which is not in view -- confirm.
  740. static Xapian::docid q0;
  741. static Xapian::doccount hit_no;
  742. static int percent;
  743. static double weight;
  744. static Xapian::doccount collapsed;
  // Forward declaration; the definition is not in view.
  745. static string print_caption(const string &fmt, const vector<string> &param);
  // Tags identifying the commands listed in func_tab.  The T() macro pairs
  // the command name F with the enumerator CMD_F, so every command added to
  // func_tab via T() needs an enumerator here.  The bare CMD_ is used by the
  // func_tab entry with the empty name.
  746. enum tagval {
  747. CMD_,
  748. CMD_add,
  749. CMD_addfilter,
  750. CMD_allterms,
  751. CMD_and,
  752. CMD_cgi,
  753. CMD_cgilist,
  754. CMD_collapsed,
  755. CMD_date,
  756. CMD_dbname,
  757. CMD_dbsize,
  758. CMD_def,
  759. CMD_defaultop,
  760. CMD_div,
  761. CMD_eq,
  762. CMD_emptydocs,
  763. CMD_env,
  764. CMD_error,
  765. CMD_field,
  766. CMD_filesize,
  767. CMD_filters,
  768. CMD_filterterms,
  769. CMD_find,
  770. CMD_fmt,
  771. CMD_freq,
  772. CMD_ge,
  773. CMD_gt,
  774. CMD_highlight,
  775. CMD_hit,
  776. CMD_hitlist,
  777. CMD_hitsperpage,
  778. CMD_hostname,
  779. CMD_html,
  780. CMD_htmlstrip,
  781. CMD_httpheader,
  782. CMD_id,
  783. CMD_if,
  784. CMD_include,
  785. CMD_json,
  786. CMD_jsonarray,
  787. CMD_last,
  788. CMD_lastpage,
  789. CMD_le,
  790. CMD_length,
  791. CMD_list,
  792. CMD_log,
  793. CMD_lookup,
  794. CMD_lower,
  795. CMD_lt,
  796. CMD_map,
  797. CMD_max,
  798. CMD_min,
  799. CMD_mod,
  800. CMD_msize,
  801. CMD_msizeexact,
  802. CMD_mul,
  803. CMD_muldiv,
  804. CMD_ne,
  805. CMD_nice,
  806. CMD_not,
  807. CMD_now,
  808. CMD_opt,
  809. CMD_or,
  810. CMD_pack,
  811. CMD_percentage,
  812. CMD_prettyterm,
  813. CMD_prettyurl,
  814. CMD_query,
  815. CMD_querydescription,
  816. CMD_queryterms,
  817. CMD_range,
  818. CMD_record,
  819. CMD_relevant,
  820. CMD_relevants,
  821. CMD_score,
  822. CMD_set,
  823. CMD_setmap,
  824. CMD_setrelevant,
  825. CMD_slice,
  826. CMD_snippet,
  827. CMD_split,
  828. CMD_stoplist,
  829. CMD_sub,
  830. CMD_substr,
  831. CMD_suggestion,
  832. CMD_terms,
  833. CMD_thispage,
  834. CMD_time,
  835. CMD_topdoc,
  836. CMD_topterms,
  837. CMD_transform,
  838. CMD_truncate,
  839. CMD_uniq,
  840. CMD_unpack,
  841. CMD_unstem,
  842. CMD_upper,
  843. CMD_url,
  844. CMD_value,
  845. CMD_version,
  846. CMD_weight,
  847. CMD_MACRO // special tag for macro evaluation
  848. };
  // Attributes of a command: its tag (a tagval enumerator) followed by the
  // values from the minargs / maxargs / evalargs / ensure columns of
  // func_tab.
  // NOTE(review): the exact semantics of evalargs and ensure are implemented
  // by code which is not in view; presumably `ensure` says whether the query
  // must have been parsed ('Q') or the match run ('M') first -- confirm.
  849. struct func_attrib {
  850. int tag;
  851. int minargs, maxargs, evalargs;
  852. char ensure;
  853. };
  // Builds a func_desc initialiser for command F: the stringized name,
  // followed by its attributes with the tag CMD_F.
  854. #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
  // One entry of func_tab: a command name and its attributes.
  855. struct func_desc {
  856. const char *name;
  857. struct func_attrib a;
  858. };
  // Shorthands used in the func_tab columns.  N (-1) appears in the
  // maxargs/evalargs columns -- presumably meaning "no limit" / "all";
  // M and Q are the values used in the ensure column.
  859. #define N -1
  860. #define M 'M'
  861. #define Q 'Q'
  862. // NB when adding a new command which ensures M or Q, update the list in
  863. // docs/omegascript.rst
  864. static struct func_desc func_tab[] = {
  865. //name minargs maxargs evalargs ensure
  866. {"",{CMD_, N, N, 0, 0}},// commented out code
  867. T(add, 0, N, N, 0), // add a list of numbers
  868. T(addfilter, 1, 1, N, 0), // add filter term
  869. T(allterms, 0, 1, N, 0), // list of all terms matching document
  870. T(and, 1, N, 0, 0), // logical shortcutting and of a list of values
  871. T(cgi, 1, 1, N, 0), // return cgi parameter value
  872. T(cgilist, 1, 1, N, 0), // return list of values for cgi parameter
  873. T(collapsed, 0, 0, N, 0), // return number of hits collapsed into this
  874. T(date, 1, 2, N, 0), // convert time_t to strftime format
  875. // (default: YYYY-MM-DD)
  876. T(dbname, 0, 0, N, 0), // database name
  877. T(dbsize, 0, 0, N, 0), // database size (# of documents)
  878. T(def, 2, 2, 1, 0), // define a macro
  879. T(defaultop, 0, 0, N, 0), // default operator: "and" or "or"
  880. T(div, 2, 2, N, 0), // integer divide
  881. T(emptydocs, 0, 1, N, 0), // list of empty documents
  882. T(env, 1, 1, N, 0), // environment variable
  883. T(error, 0, 0, N, 0), // error message
  884. T(eq, 2, 2, N, 0), // test equality
  885. T(field, 1, 2, N, 0), // lookup field in record
  886. T(filesize, 1, 1, N, 0), // pretty printed filesize
  887. T(filters, 0, 0, N, 0), // serialisation of current filters
  888. T(filterterms, 1, 1, N, 0), // list of terms with a given prefix
  889. T(find, 2, 2, N, 0), // find entry in list
  890. T(fmt, 0, 0, N, 0), // name of current format
  891. T(freq, 1, 1, N, 0), // frequency of a term
  892. T(ge, 2, 2, N, 0), // test >=
  893. T(gt, 2, 2, N, 0), // test >
  894. T(highlight, 2, 4, N, 0), // html escape and highlight words from list
  895. T(hit, 0, 0, N, 0), // hit number of current mset entry (starting
  896. // from 0
  897. T(hitlist, 1, 1, 0, M), // display hitlist using format in argument
  898. T(hitsperpage, 0, 0, N, 0), // hits per page
  899. T(hostname, 1, 1, N, 0), // extract hostname from URL
  900. T(html, 1, 1, N, 0), // html escape string (<>&")
  901. T(htmlstrip, 1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
  902. T(httpheader, 2, 2, N, 0), // arbitrary HTTP header
  903. T(id, 0, 0, N, 0), // docid of current doc
  904. T(if, 2, 3, 1, 0), // conditional
  905. T(include, 1, 1, 1, 0), // include another file
  906. T(json, 1, 1, N, 0), // JSON string escaping
  907. T(jsonarray, 1, 1, N, 0), // Format list as a JSON array of strings
  908. T(last, 0, 0, N, M), // m-set number of last hit on page
  909. T(lastpage, 0, 0, N, M), // number of last hit page
  910. T(le, 2, 2, N, 0), // test <=
  911. T(length, 1, 1, N, 0), // length of list
  912. T(list, 2, 5, N, 0), // pretty print list
  913. T(log, 1, 2, 1, 0), // create a log entry
  914. T(lookup, 2, 2, N, 0), // lookup in named cdb file
  915. T(lower, 1, 1, N, 0), // convert string to lower case
  916. T(lt, 2, 2, N, 0), // test <
  917. T(map, 1, 2, 1, 0), // map a list into another list
  918. T(max, 1, N, N, 0), // maximum of a list of values
  919. T(min, 1, N, N, 0), // minimum of a list of values
  920. T(mod, 2, 2, N, 0), // integer modulus
  921. T(msize, 0, 0, N, M), // number of matches
  922. T(msizeexact, 0, 0, N, M), // is $msize exact?
  923. T(mul, 2, N, N, 0), // multiply a list of numbers
  924. T(muldiv, 3, 3, N, 0), // calculate A*B/C
  925. T(ne, 2, 2, N, 0), // test not equal
  926. T(nice, 1, 1, N, 0), // pretty print integer (with thousands sep)
  927. T(not, 1, 1, N, 0), // logical not
  928. T(now, 0, 0, N, 0), // current date/time as a time_t
  929. T(opt, 1, 2, N, 0), // lookup an option value
  930. T(or, 1, N, 0, 0), // logical shortcutting or of a list of values
  931. T(pack, 1, 1, N, 0), // convert a number to a 4 byte big endian binary string
  932. T(percentage, 0, 0, N, 0), // percentage score of current hit
  933. T(prettyterm, 1, 1, N, Q), // pretty print term name
  934. T(prettyurl, 1, 1, N, 0), // pretty version of URL
  935. T(query, 0, 1, N, Q), // query
  936. T(querydescription,0, 0, N, Q), // query.get_description()
  937. T(queryterms, 0, 0, N, Q), // list of query terms
  938. T(range, 2, 2, N, 0), // return list of values between start and end
  939. T(record, 0, 1, N, 0), // record contents of document
  940. T(relevant, 0, 1, N, Q), // is document relevant?
  941. T(relevants, 0, 0, N, Q), // return list of relevant documents
  942. T(score, 0, 0, N, 0), // score (0-10) of current hit
  943. T(set, 2, 2, N, 0), // set option value
  944. T(setmap, 1, N, N, 0), // set map of option values
  945. T(setrelevant, 0, 1, N, Q), // set rset
  946. T(slice, 2, 2, N, 0), // slice a list using a second list
  947. T(snippet, 1, 2, N, 0), // generate snippet from text
  948. T(split, 1, 2, N, 0), // split a string to give a list
  949. T(stoplist, 0, 0, N, Q), // return list of stopped terms
  950. T(sub, 2, 2, N, 0), // subtract
  951. T(substr, 2, 3, N, 0), // substring
  952. T(suggestion, 0, 0, N, Q), // misspelled word correction suggestion
  953. T(terms, 0, 0, N, M), // list of matching terms
  954. T(thispage, 0, 0, N, M), // page number of current page
  955. T(time, 0, 0, N, M), // how long the match took (in seconds)
  956. T(topdoc, 0, 0, N, M), // first document on current page of hit list
  957. // (counting from 0)
  958. T(topterms, 0, 1, N, M), // list of up to N top relevance feedback terms
  959. // (default 16)
  960. T(transform, 3, 3, N, 0), // transform with a regexp
  961. T(truncate, 2, 4, N, 0), // truncate after a word
  962. T(uniq, 1, 1, N, 0), // removed duplicates from a sorted list
  963. T(unpack, 1, 1, N, 0), // convert 4 byte big endian binary string to a number
  964. T(unstem, 1, 1, N, Q), // return list of probabilistic terms from
  965. // the query which stemmed to this term
  966. T(upper, 1, 1, N, 0), // convert string to upper case
  967. T(url, 1, 1, N, 0), // url encode argument
  968. T(value, 1, 2, N, 0), // return document value
  969. T(version, 0, 0, N, 0), // omega version string
  970. T(weight, 0, 0, N, 0), // weight of the current hit
  971. { NULL,{0, 0, 0, 0, 0}}
  972. };
  973. #undef T // Leaving T defined screws up Sun's C++ compiler!
  // Bodies of user-defined macros: eval()'s $def handler appends the macro
  // body (its second argument) here, and registers the macro under a
  // func_attrib whose tag is CMD_MACRO plus the body's index in this vector.
  974. static vector<string> macros;
  975. // Call write() repeatedly until all data is written or we get a
  976. // non-recoverable error.
  977. static ssize_t
  978. write_all(int fd, const char * buf, size_t count)
  979. {
  980. while (count) {
  981. ssize_t r = write(fd, buf, count);
  982. if (rare(r < 0)) {
  983. if (errno == EINTR) continue;
  984. return r;
  985. }
  986. buf += r;
  987. count -= r;
  988. }
  989. return 0;
  990. }
  991. static string
  992. eval(const string &fmt, const vector<string> &param)
  993. {
  994. static map<string, const struct func_attrib *> func_map;
  995. if (func_map.empty()) {
  996. struct func_desc *p;
  997. for (p = func_tab; p->name != NULL; p++) {
  998. func_map[string(p->name)] = &(p->a);
  999. }
  1000. }
  1001. string res;
  1002. string::size_type p = 0, q;
  1003. while ((q = fmt.find('$', p)) != string::npos) try {
  1004. res += fmt.substr(p, q - p);
  1005. string::size_type code_start = q; // note down for error reporting
  1006. q++;
  1007. if (q >= fmt.size()) break;
  1008. unsigned char ch = fmt[q];
  1009. switch (ch) {
  1010. // Magic sequences:
  1011. // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
  1012. case '$':
  1013. res += '$';
  1014. p = q + 1;
  1015. continue;
  1016. case '(':
  1017. res += '{';
  1018. p = q + 1;
  1019. continue;
  1020. case ')':
  1021. res += '}';
  1022. p = q + 1;
  1023. continue;
  1024. case '.':
  1025. res += ',';
  1026. p = q + 1;
  1027. continue;
  1028. case '_':
  1029. ch = '0';
  1030. // FALL THRU
  1031. case '1': case '2': case '3': case '4': case '5':
  1032. case '6': case '7': case '8': case '9':
  1033. ch -= '0';
  1034. if (ch < param.size()) res += param[ch];
  1035. p = q + 1;
  1036. continue;
  1037. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  1038. case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
  1039. case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
  1040. case 's': case 't': case 'u': case 'v': case 'w': case 'x':
  1041. case 'y': case 'z':
  1042. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  1043. case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
  1044. case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
  1045. case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
  1046. case 'Y': case 'Z':
  1047. case '{':
  1048. break;
  1049. default:
  1050. string msg = "Unknown $ code in: $" + fmt.substr(q);
  1051. throw msg;
  1052. }
  1053. p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
  1054. string var = fmt.substr(q, p - q);
  1055. map<string, const struct func_attrib *>::const_iterator func;
  1056. func = func_map.find(var);
  1057. if (func == func_map.end()) {
  1058. throw "Unknown function '" + var + "'";
  1059. }
  1060. vector<string> args;
  1061. if (fmt[p] == '{') {
  1062. q = p + 1;
  1063. int nest = 1;
  1064. while (true) {
  1065. p = fmt.find_first_of(",{}", p + 1);
  1066. if (p == string::npos)
  1067. throw "missing } in " + fmt.substr(code_start);
  1068. if (fmt[p] == '{') {
  1069. ++nest;
  1070. } else {
  1071. if (nest == 1) {
  1072. // should we split the args
  1073. if (func->second->minargs != N) {
  1074. args.push_back(fmt.substr(q, p - q));
  1075. q = p + 1;
  1076. }
  1077. }
  1078. if (fmt[p] == '}' && --nest == 0) break;
  1079. }
  1080. }
  1081. if (func->second->minargs == N)
  1082. args.push_back(fmt.substr(q, p - q));
  1083. p++;
  1084. }
  1085. if (func->second->minargs != N) {
  1086. if ((int)args.size() < func->second->minargs)
  1087. throw "too few arguments to $" + var;
  1088. if (func->second->maxargs != N &&
  1089. (int)args.size() > func->second->maxargs)
  1090. throw "too many arguments to $" + var;
  1091. vector<string>::size_type n;
  1092. if (func->second->evalargs != N)
  1093. n = func->second->evalargs;
  1094. else
  1095. n = args.size();
  1096. for (vector<string>::size_type j = 0; j < n; j++)
  1097. args[j] = eval(args[j], param);
  1098. }
  1099. if (func->second->ensure == 'Q' || func->second->ensure == 'M')
  1100. ensure_query_parsed();
  1101. if (func->second->ensure == 'M') ensure_match();
  1102. string value;
  1103. switch (func->second->tag) {
  1104. case CMD_:
  1105. break;
  1106. case CMD_add: {
  1107. int total = 0;
  1108. vector<string>::const_iterator i;
  1109. for (i = args.begin(); i != args.end(); i++)
  1110. total += string_to_int(*i);
  1111. value = str(total);
  1112. break;
  1113. }
  1114. case CMD_addfilter:
  1115. add_bterm(args[0]);
  1116. break;
  1117. case CMD_allterms: {
  1118. // list of all terms indexing document
  1119. int id = q0;
  1120. if (!args.empty()) id = string_to_int(args[0]);
  1121. Xapian::TermIterator term = db.termlist_begin(id);
  1122. for ( ; term != db.termlist_end(id); term++)
  1123. value = value + *term + '\t';
  1124. if (!value.empty()) value.erase(value.size() - 1);
  1125. break;
  1126. }
  1127. case CMD_and: {
  1128. value = "true";
  1129. for (vector<string>::const_iterator i = args.begin();
  1130. i != args.end(); i++) {
  1131. if (eval(*i, param).empty()) {
  1132. value.resize(0);
  1133. break;
  1134. }
  1135. }
  1136. break;
  1137. }
  1138. case CMD_cgi: {
  1139. MCI i = cgi_params.find(args[0]);
  1140. if (i != cgi_params.end()) value = i->second;
  1141. break;
  1142. }
  1143. case CMD_cgilist: {
  1144. pair<MCI, MCI> g;
  1145. g = cgi_params.equal_range(args[0]);
  1146. for (MCI i = g.first; i != g.second; i++)
  1147. value = value + i->second + '\t';
  1148. if (!value.empty()) value.erase(value.size() - 1);
  1149. break;
  1150. }
  1151. case CMD_collapsed: {
  1152. value = str(collapsed);
  1153. break;
  1154. }
  1155. case CMD_date:
  1156. value = args[0];
  1157. if (!value.empty()) {
  1158. char buf[64] = "";
  1159. time_t date = string_to_int(value);
  1160. if (date != (time_t)-1) {
  1161. struct tm *then;
  1162. then = gmtime(&date);
  1163. string date_fmt = "%Y-%m-%d";
  1164. if (args.size() > 1) date_fmt = eval(args[1], param);
  1165. strftime(buf, sizeof buf, date_fmt.c_str(), then);
  1166. }
  1167. value = buf;
  1168. }
  1169. break;
  1170. case CMD_dbname:
  1171. value = dbname;
  1172. break;
  1173. case CMD_dbsize: {
  1174. static Xapian::doccount dbsize;
  1175. if (!dbsize) dbsize = db.get_doccount();
  1176. value = str(dbsize);
  1177. break;
  1178. }
  1179. case CMD_def: {
  1180. func_attrib *fa = new func_attrib;
  1181. fa->tag = CMD_MACRO + macros.size();
  1182. fa->minargs = 0;
  1183. fa->maxargs = 9;
  1184. fa->evalargs = N; // FIXME: or 0?
  1185. fa->ensure = 0;
  1186. macros.push_back(args[1]);
  1187. func_map[args[0]] = fa;
  1188. break;
  1189. }
  1190. case CMD_defaultop:
  1191. if (default_op == Xapian::Query::OP_AND) {
  1192. value = "and";
  1193. } else {
  1194. value = "or";
  1195. }
  1196. break;
  1197. case CMD_div: {
  1198. int denom = string_to_int(args[1]);
  1199. if (denom == 0) {
  1200. value = "divide by 0";
  1201. } else {
  1202. value = str(string_to_int(args[0]) /
  1203. string_to_int(args[1]));
  1204. }
  1205. break;
  1206. }
  1207. case CMD_eq:
  1208. if (args[0] == args[1]) value = "true";
  1209. break;
  1210. case CMD_emptydocs: {
  1211. string t;
  1212. if (!args.empty())
  1213. t = args[0];
  1214. Xapian::PostingIterator i;
  1215. for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
  1216. if (i.get_doclength() != 0) continue;
  1217. if (!value.empty()) value += '\t';
  1218. value += str(*i);
  1219. }
  1220. break;
  1221. }
  1222. case CMD_env: {
  1223. char *env = getenv(args[0].c_str());
  1224. if (env != NULL) value = env;
  1225. break;
  1226. }
  1227. case CMD_error:
  1228. if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
  1229. error_msg = "Database '" + dbname + "' couldn't be opened";
  1230. }
  1231. value = error_msg;
  1232. break;
  1233. case CMD_field: {
  1234. Xapian::docid did = q0;
  1235. if (args.size() > 1) did = string_to_int(args[1]);
  1236. value = fields.get_field(did, args[0]);
  1237. break;
  1238. }
  1239. case CMD_filesize: {
  1240. // FIXME: rounding? i18n?
  1241. int size = string_to_int(args[0]);
  1242. int intpart = size;
  1243. int fraction = -1;
  1244. const char * format = 0;
  1245. if (size < 0) {
  1246. // Negative size -> empty result.
  1247. } else if (size == 1) {
  1248. format = "%d byte";
  1249. } else if (size < 1024) {
  1250. format = "%d bytes";
  1251. } else {
  1252. if (size < 1024*1024) {
  1253. format = "%d.%cK";
  1254. } else {
  1255. size /= 1024;
  1256. if (size < 1024*1024) {
  1257. format = "%d.%cM";
  1258. } else {
  1259. size /= 1024;
  1260. format = "%d.%cG";
  1261. }
  1262. }
  1263. intpart = unsigned(size) / 1024;
  1264. fraction = unsigned(size) % 1024;
  1265. }
  1266. if (format) {
  1267. char buf[200];
  1268. int len;
  1269. if (fraction == -1) {
  1270. len = my_snprintf(buf, sizeof(buf), format, intpart);
  1271. } else {
  1272. fraction = (fraction * 10 / 1024) + '0';
  1273. len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
  1274. }
  1275. if (len < 0 || (unsigned)len > sizeof(buf)) len = sizeof(buf);
  1276. value.assign(buf, len);
  1277. }
  1278. break;
  1279. }
  1280. case CMD_filters:
  1281. value = filters;
  1282. break;
  1283. case CMD_filterterms: {
  1284. Xapian::TermIterator term = db.allterms_begin();
  1285. term.skip_to(args[0]);
  1286. while (term != db.allterms_end()) {
  1287. string t = *term;
  1288. if (!startswith(t, args[0])) break;
  1289. value = value + t + '\t';
  1290. ++term;
  1291. }
  1292. if (!value.empty()) value.erase(value.size() - 1);
  1293. break;
  1294. }
  1295. case CMD_find: {
  1296. string l = args[0], s = args[1];
  1297. string::size_type i = 0, j = 0;
  1298. size_t count = 0;
  1299. while (j != l.size()) {
  1300. j = l.find('\t', i);
  1301. if (j == string::npos) j = l.size();
  1302. if (j - i == s.length()) {
  1303. if (memcmp(s.data(), l.data() + i, j - i) == 0) {
  1304. value = str(count);
  1305. break;
  1306. }
  1307. }
  1308. ++count;
  1309. i = j + 1;
  1310. }
  1311. break;
  1312. }
  1313. case CMD_fmt:
  1314. value = fmtname;
  1315. break;
  1316. case CMD_freq:
  1317. try {
  1318. value = str(mset.get_termfreq(args[0]));
  1319. } catch (const Xapian::InvalidOperationError&) {
  1320. // An MSet will raise this error if it's empty and not
  1321. // associated with a search.
  1322. value = str(db.get_termfreq(args[0]));
  1323. }
  1324. break;
  1325. case CMD_ge:
  1326. if (string_to_int(args[0]) >= string_to_int(args[1]))
  1327. value = "true";
  1328. break;
  1329. case CMD_gt:
  1330. if (string_to_int(args[0]) > string_to_int(args[1]))
  1331. value = "true";
  1332. break;
  1333. case CMD_highlight: {
  1334. string bra, ket;
  1335. if (args.size() > 2) {
  1336. bra = args[2];
  1337. if (args.size() > 3) {
  1338. ket = args[3];
  1339. } else {
  1340. string::const_iterator i;
  1341. i = find_if(bra.begin() + 2, bra.end(), p_nottag);
  1342. ket = "</";
  1343. ket += bra.substr(1, i - bra.begin() - 1);
  1344. ket += '>';
  1345. }
  1346. }
  1347. value = html_highlight(args[0], args[1], bra, ket);
  1348. break;
  1349. }
  1350. case CMD_hit:
  1351. // 0-based mset index
  1352. value = str(hit_no);
  1353. break;
  1354. case CMD_hitlist:
  1355. #if 0
  1356. const char *q;
  1357. int ch;
  1358. url_query_string = "?DB=";
  1359. url_query_string += dbname;
  1360. url_query_string += "&P=";
  1361. q = probabilistic_query[string()].c_str();
  1362. while ((ch = *q++) != '\0') {
  1363. switch (ch) {
  1364. case '+':
  1365. url_query_string += "%2b";
  1366. break;
  1367. case '"':
  1368. url_query_string += "%22";
  1369. break;
  1370. case ' ':
  1371. ch = '+';
  1372. /* fall through */
  1373. default:
  1374. url_query_string += ch;
  1375. }
  1376. }
  1377. // add any boolean terms
  1378. for (FMCI i = filter_map.begin(); i != filter_map.end(); i++) {
  1379. url_query_string += "&B=";
  1380. url_query_string += i->second;
  1381. }
  1382. #endif
  1383. for (hit_no = topdoc; hit_no < last; hit_no++)
  1384. value += print_caption(args[0], param);
  1385. hit_no = 0;
  1386. break;
  1387. case CMD_hitsperpage:
  1388. value = str(hits_per_page);
  1389. break;
  1390. case CMD_hostname: {
  1391. value = args[0];
  1392. // remove URL scheme and/or path
  1393. string::size_type i = value.find("://");
  1394. if (i == string::npos) i = 0; else i += 3;
  1395. value = value.substr(i, value.find('/', i) - i);
  1396. // remove user@ or user:password@
  1397. i = value.find('@');
  1398. if (i != string::npos) value.erase(0, i + 1);
  1399. // remove :port
  1400. i = value.find(':');
  1401. if (i != string::npos) value.resize(i);
  1402. break;
  1403. }
  1404. case CMD_html:
  1405. value = html_escape(args[0]);
  1406. break;
  1407. case CMD_htmlstrip:
  1408. value = html_strip(args[0]);
  1409. break;
  1410. case CMD_httpheader:
  1411. if (!suppress_http_headers) {
  1412. cout << args[0] << ": " << args[1] << endl;
  1413. if (!set_content_type && args[0].length() == 12 &&
  1414. strcasecmp(args[0].c_str(), "Content-Type") == 0) {
  1415. set_content_type = true;
  1416. }
  1417. }
  1418. break;
  1419. case CMD_id:
  1420. // document id
  1421. value = str(q0);
  1422. break;
  1423. case CMD_if:
  1424. if (!args[0].empty())
  1425. value = eval(args[1], param);
  1426. else if (args.size() > 2)
  1427. value = eval(args[2], param);
  1428. break;
  1429. case CMD_include:
  1430. value = eval_file(args[0]);
  1431. break;
  1432. case CMD_json:
  1433. value = args[0];
  1434. json_escape(value);
  1435. break;
  1436. case CMD_jsonarray: {
  1437. const string & l = args[0];
  1438. string::size_type i = 0, j;
  1439. if (l.empty()) {
  1440. value = "[]";
  1441. break;
  1442. }
  1443. value = "[\"]";
  1444. while (true) {
  1445. j = l.find('\t', i);
  1446. string elt(l, i, j - i);
  1447. json_escape(elt);
  1448. value += elt;
  1449. if (j == string::npos) break;
  1450. value += "\",\"";
  1451. i = j + 1;
  1452. }
  1453. value += "\"]";
  1454. break;
  1455. }
  1456. case CMD_last:
  1457. value = str(last);
  1458. break;
  1459. case CMD_lastpage: {
  1460. int l = mset.get_matches_estimated();
  1461. if (l > 0) l = (l - 1) / hits_per_page + 1;
  1462. value = str(l);
  1463. break;
  1464. }
  1465. case CMD_le:
  1466. if (string_to_int(args[0]) <= string_to_int(args[1]))
  1467. value = "true";
  1468. break;
  1469. case CMD_length:
  1470. if (args[0].empty()) {
  1471. value = "0";
  1472. } else {
  1473. size_t length = count(args[0].begin(), args[0].end(), '\t');
  1474. value = str(length + 1);
  1475. }
  1476. break;
  1477. case CMD_list: {
  1478. if (!args[0].empty()) {
  1479. string pre, inter, interlast, post;
  1480. switch (args.size()) {
  1481. case 2:
  1482. inter = interlast = args[1];
  1483. break;
  1484. case 3:
  1485. inter = args[1];
  1486. interlast = args[2];
  1487. break;
  1488. case 4:
  1489. pre = args[1];
  1490. inter = interlast = args[2];
  1491. post = args[3];
  1492. break;
  1493. case 5:
  1494. pre = args[1];
  1495. inter = args[2];
  1496. interlast = args[3];
  1497. post = args[4];
  1498. break;
  1499. }
  1500. value += pre;
  1501. string list = args[0];
  1502. string::size_type split = 0, split2;
  1503. while ((split2 = list.find('\t', split)) != string::npos) {
  1504. if (split) value += inter;
  1505. value += list.substr(split, split2 - split);
  1506. split = split2 + 1;
  1507. }
  1508. if (split) value += interlast;
  1509. value += list.substr(split);
  1510. value += post;
  1511. }
  1512. break;
  1513. }
  1514. case CMD_log: {
  1515. if (!vet_filename(args[0])) break;
  1516. string logfile = log_dir + args[0];
  1517. int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
  1518. if (fd == -1) break;
  1519. vector<string> noargs;
  1520. noargs.resize(1);
  1521. string line;
  1522. if (args.size() > 1) {
  1523. line = args[1];
  1524. } else {
  1525. line = DEFAULT_LOG_ENTRY;
  1526. }
  1527. line = eval(line, noargs);
  1528. line += '\n';
  1529. (void)write_all(fd, line.data(), line.length());
  1530. close(fd);
  1531. break;
  1532. }
  1533. case CMD_lookup: {
  1534. if (!vet_filename(args[0])) break;
  1535. string cdbfile = cdb_dir + args[0];
  1536. int fd = open(cdbfile.c_str(), O_RDONLY);
  1537. if (fd == -1) break;
  1538. struct cdb cdb;
  1539. cdb_init(&cdb, fd);
  1540. if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
  1541. size_t datalen = cdb_datalen(&cdb);
  1542. const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
  1543. if (q) {
  1544. value = string(static_cast<const char *>(dat), datalen);
  1545. }
  1546. }
  1547. cdb_free(&cdb);
  1548. close(fd); // FIXME: cache fds?
  1549. break;
  1550. }
  1551. case CMD_lower:
  1552. value = Xapian::Unicode::tolower(args[0]);
  1553. break;
  1554. case CMD_lt:
  1555. if (string_to_int(args[0]) < string_to_int(args[1]))
  1556. value = "true";
  1557. break;
  1558. case CMD_map:
  1559. if (!args[0].empty()) {
  1560. string l = args[0], pat = args[1];
  1561. vector<string> new_args(param);
  1562. string::size_type i = 0, j;
  1563. while (true) {
  1564. j = l.find('\t', i);
  1565. new_args[0] = l.substr(i, j - i);
  1566. value += eval(pat, new_args);
  1567. if (j == string::npos) break;
  1568. value += '\t';
  1569. i = j + 1;
  1570. }
  1571. }
  1572. break;
  1573. case CMD_max: {
  1574. vector<string>::const_iterator i = args.begin();
  1575. int val = string_to_int(*i++);
  1576. for (; i != args.end(); i++) {
  1577. int x = string_to_int(*i);
  1578. if (x > val) val = x;
  1579. }
  1580. value = str(val);
  1581. break;
  1582. }
  1583. case CMD_min: {
  1584. vector<string>::const_iterator i = args.begin();
  1585. int val = string_to_int(*i++);
  1586. for (; i != args.end(); i++) {
  1587. int x = string_to_int(*i);
  1588. if (x < val) val = x;
  1589. }
  1590. value = str(val);
  1591. break;
  1592. }
  1593. case CMD_msize:
  1594. // number of matches
  1595. value = str(mset.get_matches_estimated());
  1596. break;
  1597. case CMD_msizeexact:
  1598. // is msize exact?
  1599. if (mset.get_matches_lower_bound()
  1600. == mset.get_matches_upper_bound())
  1601. value = "true";
  1602. break;
  1603. case CMD_mod: {
  1604. int denom = string_to_int(args[1]);
  1605. if (denom == 0) {
  1606. value = "divide by 0";
  1607. } else {
  1608. value = str(string_to_int(args[0]) %
  1609. string_to_int(args[1]));
  1610. }
  1611. break;
  1612. }
  1613. case CMD_mul: {
  1614. vector<string>::const_iterator i = args.begin();
  1615. int total = string_to_int(*i++);
  1616. while (i != args.end())
  1617. total *= string_to_int(*i++);
  1618. value = str(total);
  1619. break;
  1620. }
  1621. case CMD_muldiv: {
  1622. int denom = string_to_int(args[2]);
  1623. if (denom == 0) {
  1624. value = "divide by 0";
  1625. } else {
  1626. int num = string_to_int(args[0]) * string_to_int(args[1]);
  1627. value = str(num / denom);
  1628. }
  1629. break;
  1630. }
  1631. case CMD_ne:
  1632. if (args[0] != args[1]) value = "true";
  1633. break;
  1634. case CMD_nice: {
  1635. string::const_iterator i = args[0].begin();
  1636. int len = args[0].length();
  1637. while (len) {
  1638. value += *i++;
  1639. if (--len && len % 3 == 0) value += option["thousand"];
  1640. }
  1641. break;
  1642. }
  1643. case CMD_not:
  1644. if (args[0].empty()) value = "true";
  1645. break;
  1646. case CMD_now: {
  1647. char buf[64];
  1648. my_snprintf(buf, sizeof(buf), "%lu", (unsigned long)time(NULL));
  1649. // MSVC's snprintf omits the zero byte if the string if
  1650. // sizeof(buf) long.
  1651. buf[sizeof(buf) - 1] = '\0';
  1652. value = buf;
  1653. break;
  1654. }
  1655. case CMD_opt:
  1656. if (args.size() == 2) {
  1657. value = option[args[0] + "," + args[1]];
  1658. } else {
  1659. value = option[args[0]];
  1660. }
  1661. break;
  1662. case CMD_or: {
  1663. for (vector<string>::const_iterator i = args.begin();
  1664. i != args.end(); i++) {
  1665. value = eval(*i, param);
  1666. if (!value.empty()) break;
  1667. }
  1668. break;
  1669. }
  1670. case CMD_pack:
  1671. value = int_to_binary_string(string_to_int(args[0]));
  1672. break;
  1673. case CMD_percentage:
  1674. // percentage score
  1675. value = str(percent);
  1676. break;
  1677. case CMD_prettyterm:
  1678. value = pretty_term(args[0]);
  1679. break;
  1680. case CMD_prettyurl:
  1681. value = args[0];
  1682. url_prettify(value);
  1683. break;
  1684. case CMD_query:
  1685. value = probabilistic_query[args.empty() ? string() : args[0]];
  1686. break;
  1687. case CMD_querydescription:
  1688. value = query.get_description();
  1689. break;
  1690. case CMD_queryterms:
  1691. value = queryterms;
  1692. break;
  1693. case CMD_range: {
  1694. int start = string_to_int(args[0]);
  1695. int end = string_to_int(args[1]);
  1696. while (start <= end) {
  1697. value += str(start);
  1698. if (start < end) value += '\t';
  1699. start++;
  1700. }
  1701. break;
  1702. }
  1703. case CMD_record: {
  1704. int id = q0;
  1705. if (!args.empty()) id = string_to_int(args[0]);
  1706. value = db.get_document(id).get_data();
  1707. break;
  1708. }
  1709. case CMD_relevant: {
  1710. // document id if relevant; empty otherwise
  1711. int id = q0;
  1712. if (!args.empty()) id = string_to_int(args[0]);
  1713. map<Xapian::docid, bool>::iterator i = ticked.find(id);
  1714. if (i != ticked.end()) {
  1715. i->second = false; // icky side-effect
  1716. value = str(id);
  1717. }
  1718. break;
  1719. }
  1720. case CMD_relevants: {
  1721. for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
  1722. i != ticked.end(); i++) {
  1723. if (i->second) {
  1724. value += str(i->first);
  1725. value += '\t';
  1726. }
  1727. }
  1728. if (!value.empty()) value.erase(value.size() - 1);
  1729. break;
  1730. }
  1731. case CMD_score:
  1732. // Score (0 to 10)
  1733. value = str(percent / 10);
  1734. break;
  1735. case CMD_set:
  1736. option[args[0]] = args[1];
  1737. break;
  1738. case CMD_setmap: {
  1739. string base = args[0] + ',';
  1740. if (args.size() % 2 != 1)
  1741. throw string("$setmap requires an odd number of arguments");
  1742. for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
  1743. option[base + args[i]] = args[i + 1];
  1744. }
  1745. break;
  1746. }
  1747. case CMD_setrelevant: {
  1748. string::size_type i = 0, j;
  1749. while (true) {
  1750. j = args[0].find_first_not_of("0123456789", i);
  1751. Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
  1752. if (id) {
  1753. rset.add_document(id);
  1754. ticked[id] = true;
  1755. }
  1756. if (j == string::npos) break;
  1757. i = j + 1;
  1758. }
  1759. break;
  1760. }
  1761. case CMD_slice: {
  1762. string list = args[0], pos = args[1];
  1763. vector<string> items;
  1764. string::size_type i = 0, j;
  1765. while (true) {
  1766. j = list.find('\t', i);
  1767. items.push_back(list.substr(i, j - i));
  1768. if (j ==

Large files are truncated, but you can click here to view the full file