PageRenderTime 58ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/kyotocabinet-1.2.70/lab/kcdict/kcdictmgr.cc

#
C++ | 1575 lines | 1432 code | 46 blank | 97 comment | 812 complexity | 0b219fdd75ec2353acb96d2b79d9ad41 MD5 | raw file
Possible License(s): GPL-3.0
  1. /*************************************************************************************************
  2. * The command line utility of the word dictionary
  3. * Copyright (C) 2009-2011 FAL Labs
  4. * This file is part of Kyoto Cabinet.
  5. * This program is free software: you can redistribute it and/or modify it under the terms of
  6. * the GNU General Public License as published by the Free Software Foundation, either version
  7. * 3 of the License, or any later version.
  8. * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
  9. * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. * See the GNU General Public License for more details.
  11. * You should have received a copy of the GNU General Public License along with this program.
  12. * If not, see <http://www.gnu.org/licenses/>.
  13. *************************************************************************************************/
  14. #include <kcutil.h>
  15. #include <kcpolydb.h>
  16. #include <kcdbext.h>
  17. namespace kc = kyotocabinet;
  // enumerations: compression modes selectable on the command line
  enum {
    ZM_DEFAULT,                            // no compression option was given
    ZM_ZLIB,                               // selected by "-cz"
    ZM_LZO,                                // selected by "-co"
    ZM_LZMA                                // selected by "-cx"
  };
  // constants
  const size_t THREADNUM = 8;              // number of threads
  const size_t AMBGRATIO = 3;              // ratio of threshold of ambiguous search
  const size_t AMBGMIN = 3;                // minimum threshold of ambiguous search
  // global variables
  const char* g_progname;                  // program name
  26. // function prototypes
  27. int main(int argc, char** argv);
  28. static void usage();
  29. static void dberrprint(kc::BasicDB* db, const char* info);
  30. static void utftoucs(const char* src, size_t size, uint32_t* dest, size_t* np);
  31. static void normalizequery(const char* qbuf, size_t qsiz, std::string* dest);
  32. static void normalizeucs(uint32_t* ary, size_t onum, size_t* np);
  33. template<class CHARTYPE>
  34. static size_t levdist(const CHARTYPE* abuf, size_t asiz, const CHARTYPE* bbuf, size_t bsiz);
  35. static int32_t runimport(int argc, char** argv);
  36. static int32_t runsearch(int argc, char** argv);
  37. static int32_t runsuggest(int argc, char** argv);
  38. static int32_t procimport(const char* path, const char* srcpath, int32_t zmode);
  39. static int32_t procsearch(const char* path, const char* query, int32_t zmode, int64_t max,
  40. int32_t mode, bool ts, bool pk);
  41. static int32_t procsuggest(const char* path, const char* query, int64_t max, int32_t mode);
  // record of an indexed entry; sorts by ascending rank, ties broken by the text
  struct IndexedRecord {
    int64_t rank;                          // rank read from the source record
    std::string text;                      // remaining fields of the record
    bool operator <(const IndexedRecord& right) const {
      return rank < right.rank || (rank == right.rank && text < right.text);
    }
  };
  // record of an ambiguous search hit; sorts by distance, then key, then order
  struct AmbiguousRecord {
    size_t dist;                           // edit distance from the query
    std::string key;                       // key without the order suffix
    uint32_t order;                        // order number parsed from the key suffix
    std::string text;                      // value of the record
    bool operator <(const AmbiguousRecord& right) const {
      return less(right.dist, right.key, right.order);
    }
    // check whether this record sorts before the given (distance, key, order) triple
    bool less(size_t rdist, const std::string& rkey, uint32_t rorder) const {
      if (dist < rdist) return true;
      if (dist > rdist) return false;
      const int cmp = key.compare(rkey);
      if (cmp != 0) return cmp < 0;
      return order < rorder;
    }
  };
  // record of a plain search hit; sorts by key, ties broken by the order number
  struct PlainRecord {
    std::string key;                       // key without the order suffix
    uint32_t order;                        // order number parsed from the key suffix
    std::string text;                      // value of the record
    bool operator <(const PlainRecord& right) const {
      return less(right.key, right.order);
    }
    // check whether this record sorts before the given (key, order) pair
    bool less(const std::string& rkey, uint32_t rorder) const {
      const int cmp = key.compare(rkey);
      if (cmp != 0) return cmp < 0;
      return order < rorder;
    }
  };
  82. // main routine
  83. int main(int argc, char** argv) {
  84. g_progname = argv[0];
  85. kc::setstdiobin();
  86. if (argc < 2) usage();
  87. int32_t rv = 0;
  88. if (!std::strcmp(argv[1], "import")) {
  89. rv = runimport(argc, argv);
  90. } else if (!std::strcmp(argv[1], "search")) {
  91. rv = runsearch(argc, argv);
  92. } else if (!std::strcmp(argv[1], "suggest")) {
  93. rv = runsuggest(argc, argv);
  94. } else {
  95. usage();
  96. }
  97. return rv;
  98. }
  99. // print the usage and exit
  100. static void usage() {
  101. std::cerr << g_progname << ": the command line utility of the word dictionary" << std::endl;
  102. std::cerr << std::endl;
  103. std::cerr << " " << g_progname << " import [-cz|-co|-cx] path src" << std::endl;
  104. std::cerr << " " << g_progname << " search [-cz|-co|-cx] [-max num] [-f|-a|-m|-r|-tm|-tr]"
  105. " [-ts] [-iu] [-pk] path query" << std::endl;
  106. std::cerr << " " << g_progname << " suggest [-max num] [-m|-r] [-iu] path query" <<
  107. std::endl;
  108. std::cerr << std::endl;
  109. std::exit(1);
  110. }
  111. // print error message of database
  112. static void dberrprint(kc::BasicDB* db, const char* info) {
  113. const kc::BasicDB::Error& err = db->error();
  114. std::cerr << g_progname << ": " << info << ": " << db->path().c_str() << ": " << err.code() <<
  115. ": " << err.name() << ": " << err.message() << std::endl;
  116. }
  117. // convert a UTF-8 string into a UCS-4 array
  118. static void utftoucs(const char* src, size_t size, uint32_t* dest, size_t* np) {
  119. _assert_(src && dest && np);
  120. const unsigned char* rp = (unsigned char*)src;
  121. const unsigned char* ep = rp + size;
  122. size_t dnum = 0;
  123. while (rp < ep) {
  124. uint32_t c = *rp;
  125. if (c < 0x80) {
  126. dest[dnum++] = c;
  127. } else if (c < 0xe0) {
  128. if (c >= 0xc0 && rp + 1 < ep) {
  129. c = ((c & 0x1f) << 6) | (rp[1] & 0x3f);
  130. if (c >= 0x80) dest[dnum++] = c;
  131. rp++;
  132. }
  133. } else if (c < 0xf0) {
  134. if (rp + 2 < ep) {
  135. c = ((c & 0x0f) << 12) | ((rp[1] & 0x3f) << 6) | (rp[2] & 0x3f);
  136. if (c >= 0x800) dest[dnum++] = c;
  137. rp += 2;
  138. }
  139. } else if (c < 0xf8) {
  140. if (rp + 3 < ep) {
  141. c = ((c & 0x07) << 18) | ((rp[1] & 0x3f) << 12) | ((rp[2] & 0x3f) << 6) |
  142. (rp[3] & 0x3f);
  143. if (c >= 0x10000) dest[dnum++] = c;
  144. rp += 3;
  145. }
  146. } else if (c < 0xfc) {
  147. if (rp + 4 < ep) {
  148. c = ((c & 0x03) << 24) | ((rp[1] & 0x3f) << 18) | ((rp[2] & 0x3f) << 12) |
  149. ((rp[3] & 0x3f) << 6) | (rp[4] & 0x3f);
  150. if (c >= 0x200000) dest[dnum++] = c;
  151. rp += 4;
  152. }
  153. } else if (c < 0xfe) {
  154. if (rp + 5 < ep) {
  155. c = ((c & 0x01) << 30) | ((rp[1] & 0x3f) << 24) | ((rp[2] & 0x3f) << 18) |
  156. ((rp[3] & 0x3f) << 12) | ((rp[4] & 0x3f) << 6) | (rp[5] & 0x3f);
  157. if (c >= 0x4000000) dest[dnum++] = c;
  158. rp += 5;
  159. }
  160. }
  161. rp++;
  162. }
  163. *np = dnum;
  164. }
  165. // normalize a query
  166. static void normalizequery(const char* qbuf, size_t qsiz, std::string* dest) {
  167. uint32_t ucsstack[1024];
  168. uint32_t* ucs = qsiz > sizeof(ucsstack) / sizeof(*ucsstack) ? new uint32_t[qsiz] : ucsstack;
  169. size_t unum;
  170. utftoucs(qbuf, qsiz, ucs, &unum);
  171. size_t nnum;
  172. normalizeucs(ucs, unum, &nnum);
  173. qsiz++;
  174. char utfstack[2048];
  175. char* utf = qsiz > sizeof(utfstack) ? new char[qsiz] : utfstack;
  176. qsiz = kc::strucstoutf(ucs, nnum, utf);
  177. dest->append(utf, qsiz);
  178. if (utf != utfstack) delete[] utf;
  179. if (ucs != ucsstack) delete[] ucs;
  180. }
  // fold a character of U+0000 to U+00FF
  static uint32_t nucsfoldlatin1(uint32_t c, bool lowmode, bool nacmode, bool spcmode) {
    if (c < 0x0020 || c == 0x007f) {
      // control characters become a space; tab, LF and CR survive if spaces are not unified
      if (!spcmode && (c == 0x0009 || c == 0x000a || c == 0x000d)) return c;
      return 0x0020;
    }
    // no-break space
    if (c == 0x00a0) return 0x0020;
    if (lowmode) {
      const bool ascupper = c >= 0x0041 && c <= 0x005a;
      const bool supupper = c >= 0x00c0 && c <= 0x00de && c != 0x00d7;
      if (ascupper || supupper) c += 0x20;
    }
    if (nacmode && c >= 0x00c0) {
      // base letters of U+00C0 to U+00FF; '.' marks a character which is left untouched
      static const char bases[] =
        "AAAAAA.CEEEEIIII" "DNOOOOO.OUUUUYYs" "aaaaaa.ceeeeiiii" "dnooooo.ouuuuyyy";
      const char base = bases[c - 0x00c0];
      if (base != '.') c = base;
    }
    return c;
  }
  // fold a character of U+0100 to U+01FF (latin-1 extended)
  static uint32_t nucsfoldlatinext(uint32_t c, bool lowmode, bool nacmode) {
    if (lowmode) {
      if (c == 0x0178) {
        c = 0x00ff;
      } else if (c <= 0x017e && c != 0x0138 && c != 0x0149) {
        // capital letters have even codes except in U+0139..U+0148 and U+0179..U+017E
        const bool oddcap = (c >= 0x0139 && c <= 0x0148) || c >= 0x0179;
        if ((c & 1) == (oddcap ? 1U : 0U)) c++;
      }
    }
    if (nacmode) {
      if (c == 0x00ff) return 'y';
      static const struct {
        uint32_t first;                    // first code of the range
        uint32_t last;                     // last code of the range
        uint32_t capbit;                   // lowest bit of the codes mapped to `cap'
        char cap;                          // replacement for codes whose lowest bit is `capbit'
        char small;                        // replacement for the other codes
      } folds[] = {
        { 0x0100, 0x0105, 0, 'A', 'a' }, { 0x0106, 0x010d, 0, 'C', 'c' },
        { 0x010e, 0x0111, 0, 'D', 'd' }, { 0x0112, 0x011b, 0, 'E', 'e' },
        { 0x011c, 0x0123, 0, 'G', 'g' }, { 0x0124, 0x0127, 0, 'H', 'h' },
        { 0x0128, 0x0131, 0, 'I', 'i' }, { 0x0134, 0x0135, 0, 'J', 'j' },
        { 0x0136, 0x0137, 0, 'K', 'k' }, { 0x0138, 0x0138, 0, 'k', 'k' },
        { 0x0139, 0x0142, 1, 'L', 'l' }, { 0x0143, 0x0148, 1, 'N', 'n' },
        { 0x0149, 0x014b, 0, 'N', 'n' }, { 0x014c, 0x0151, 0, 'O', 'o' },
        { 0x0154, 0x0159, 0, 'R', 'r' }, { 0x015a, 0x0161, 0, 'S', 's' },
        { 0x0162, 0x0167, 0, 'T', 't' }, { 0x0168, 0x0173, 0, 'U', 'u' },
        { 0x0174, 0x0175, 0, 'W', 'w' }, { 0x0176, 0x0178, 0, 'Y', 'y' },
        { 0x0179, 0x017e, 1, 'Z', 'z' }, { 0x017f, 0x017f, 1, 's', 's' }
      };
      for (size_t k = 0; k < sizeof(folds) / sizeof(*folds); k++) {
        if (c >= folds[k].first && c <= folds[k].last) {
          c = (c & 1) == folds[k].capbit ? folds[k].cap : folds[k].small;
          break;
        }
      }
    }
    return c;
  }
  // fold a character of U+0300 to U+03FF (greek)
  static uint32_t nucsfoldgreek(uint32_t c, bool lowmode) {
    if (!lowmode) return c;
    if (c >= 0x0391 && c <= 0x03a9) return c + 0x20;
    if (c >= 0x03d8 && c <= 0x03ef) return c | 1;
    if (c == 0x0374 || c == 0x03f7 || c == 0x03fa) return c + 1;
    return c;
  }
  // fold a character of U+0400 to U+04FF (cyrillic)
  static uint32_t nucsfoldcyrillic(uint32_t c, bool lowmode) {
    if (!lowmode) return c;
    if (c <= 0x040f) return c + 0x50;
    if (c <= 0x042f) return c + 0x20;
    if (c == 0x04c0) return 0x04cf;
    // in U+04C1..U+04CE the odd codes are moved to the following even code
    if (c >= 0x04c1 && c <= 0x04ce) return c + (c & 1);
    // in the other paired ranges the even codes are moved to the following odd code
    if ((c >= 0x0460 && c <= 0x0481) || (c >= 0x048a && c <= 0x04bf) || c >= 0x04d0) {
      return c | 1;
    }
    return c;
  }
  // fold a character of the rows U+20xx, U+22xx and U+30xx
  static uint32_t nucsfoldsymbol(uint32_t c, bool spcmode) {
    switch (c) {
      case 0x2002:                         // en space
      case 0x2003:                         // em space
      case 0x2009:                         // thin space
        return 0x0020;
      case 0x2010:                         // hyphen
      case 0x2015:                         // fullwidth horizontal line
      case 0x2212:                         // minus sign
        return 0x002d;
      case 0x2019:                         // apostrophe
        return 0x0027;
      case 0x2033:                         // double quotes
        return 0x0022;
      case 0x3000:                         // fullwidth space
        return spcmode ? 0x0020 : c;
      default:
        return c;
    }
  }
  // fold the character at `*ip' of the row U+FFxx (fullwidth and halfwidth forms)
  // if a halfwidth voiced or semi-voiced sound mark following the character is merged into
  // the result, `*ip' is advanced so that the caller skips the mark
  static uint32_t nucsfoldwidth(const uint32_t* ary, size_t onum, size_t* ip, bool lowmode) {
    const size_t i = *ip;
    const uint32_t c = ary[i];
    switch (c) {
      // fullwidth punctuation marks which have an ASCII counterpart at a fixed offset
      case 0xff01: case 0xff03: case 0xff04: case 0xff05: case 0xff06:
      case 0xff0a: case 0xff0b: case 0xff0c: case 0xff0e: case 0xff0f:
      case 0xff1a: case 0xff1b: case 0xff1d: case 0xff1f: case 0xff20:
      case 0xff3c: case 0xff3e: case 0xff3f: case 0xff5c:
        return c - 0xfee0;
      default:
        break;
    }
    // fullwidth alphabets
    if (c >= 0xff21 && c <= 0xff3a) return c - 0xfee0 + (lowmode ? 0x20 : 0);
    // fullwidth small alphabets and fullwidth numbers
    if ((c >= 0xff41 && c <= 0xff5a) || (c >= 0xff10 && c <= 0xff19)) return c - 0xfee0;
    if (c < 0xff61 || c > 0xff9d) return c;
    // fullwidth counterparts of the halfwidth forms U+FF61 to U+FF9D
    static const uint16_t kana[] = {
      0x3002, 0x300c, 0x300d, 0x3001, 0x30fb, 0x30f2,      // punctuation marks and wo
      0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9,              // small a-o
      0x30e3, 0x30e5, 0x30e7, 0x30c3, 0x30fc,              // small ya-yo, small tu, prolonged
      0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa,              // a-o
      0x30ab, 0x30ad, 0x30af, 0x30b1, 0x30b3,              // ka-ko
      0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,              // sa-so
      0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8,              // ta-to
      0x30ca, 0x30cb, 0x30cc, 0x30cd, 0x30ce,              // na-no
      0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db,              // ha-ho
      0x30de, 0x30df, 0x30e0, 0x30e1, 0x30e2,              // ma-mo
      0x30e4, 0x30e6, 0x30e8,                              // ya-yo
      0x30e9, 0x30ea, 0x30eb, 0x30ec, 0x30ed,              // ra-ro
      0x30ef, 0x30f3                                       // wa, nn
    };
    uint32_t tc = kana[c - 0xff61];
    if (i + 1 < onum) {
      const uint32_t mark = ary[i+1];
      const bool hrow = c >= 0xff8a && c <= 0xff8e;
      if (mark == 0xff9e) {
        // voiced sound mark
        if (c == 0xff73) {
          tc = 0x30f4;
          (*ip)++;
        } else if (hrow || (c >= 0xff76 && c <= 0xff84)) {
          tc++;
          (*ip)++;
        }
      } else if (mark == 0xff9f && hrow) {
        // semi-voiced sound mark
        tc += 2;
        (*ip)++;
      }
    }
    return tc;
  }
  // normalize a UCS-4 array
  // `ary' is rewritten in place, `onum' is the number of its elements and `np' receives the
  // number of elements after normalization, which never exceeds `onum'
  static void normalizeucs(uint32_t* ary, size_t onum, size_t* np) {
    const bool lowmode = true;             // whether to unify letter case into lower case
    const bool nacmode = true;             // whether to strip accents of latin letters
    const bool spcmode = true;             // whether to unify space-like characters
    size_t nnum = 0;
    for (size_t i = 0; i < onum; i++) {
      uint32_t c = ary[i];
      if (c < 0x10000) {
        switch (c >> 8) {
          case 0x00:
            c = nucsfoldlatin1(c, lowmode, nacmode, spcmode);
            break;
          case 0x01:
            c = nucsfoldlatinext(c, lowmode, nacmode);
            break;
          case 0x03:
            c = nucsfoldgreek(c, lowmode);
            break;
          case 0x04:
            c = nucsfoldcyrillic(c, lowmode);
            break;
          case 0x20:
          case 0x22:
          case 0x30:
            c = nucsfoldsymbol(c, spcmode);
            break;
          case 0xff:
            c = nucsfoldwidth(ary, onum, &i, lowmode);
            break;
          default:
            break;
        }
      }
      ary[nnum++] = c;
    }
    *np = nnum;
  }
  // get the levenshtein distance of two arrays
  // `abuf' and `asiz' are the first array and its number of elements, `bbuf' and `bsiz' the
  // second.  the return value is the minimum number of single element insertions, deletions
  // and substitutions which turn one array into the other.
  // only two rows of the distance table are kept, so the work memory is proportional to `bsiz'.
  // the cells are of size_t, so that neither long inputs nor long distances are truncated.
  template<class CHARTYPE>
  static size_t levdist(const CHARTYPE* abuf, size_t asiz, const CHARTYPE* bbuf, size_t bsiz) {
    const size_t rsiz = bsiz + 1;
    // the stack buffer holds both rows for arrays of up to 255 elements
    size_t rowstack[512];
    size_t* rows = rsiz * 2 > sizeof(rowstack) / sizeof(*rowstack) ?
      new size_t[rsiz*2] : rowstack;
    size_t* prev = rows;
    size_t* cur = rows + rsiz;
    // distances from the empty prefix of the first array
    for (size_t j = 0; j <= bsiz; j++) {
      prev[j] = j;
    }
    for (size_t i = 1; i <= asiz; i++) {
      cur[0] = i;
      for (size_t j = 1; j <= bsiz; j++) {
        const size_t dc = prev[j] + 1;
        const size_t ic = cur[j-1] + 1;
        const size_t sc = prev[j-1] + (abuf[i-1] != bbuf[j-1] ? 1 : 0);
        const size_t mc = dc < ic ? dc : ic;
        cur[j] = mc < sc ? mc : sc;
      }
      // the row just computed becomes the previous row of the next round
      size_t* tmp = prev;
      prev = cur;
      cur = tmp;
    }
    const size_t ed = prev[bsiz];
    if (rows != rowstack) delete[] rows;
    return ed;
  }
  624. // parse arguments of import command
  625. static int32_t runimport(int argc, char** argv) {
  626. bool argbrk = false;
  627. const char* path = NULL;
  628. const char* srcpath = NULL;
  629. int32_t zmode = ZM_DEFAULT;
  630. for (int32_t i = 2; i < argc; i++) {
  631. if (!argbrk && argv[i][0] == '-') {
  632. if (!std::strcmp(argv[i], "--")) {
  633. argbrk = true;
  634. } else if (!std::strcmp(argv[i], "-cz")) {
  635. zmode = ZM_ZLIB;
  636. } else if (!std::strcmp(argv[i], "-co")) {
  637. zmode = ZM_LZO;
  638. } else if (!std::strcmp(argv[i], "-cx")) {
  639. zmode = ZM_LZMA;
  640. } else {
  641. usage();
  642. }
  643. } else if (!path) {
  644. argbrk = true;
  645. path = argv[i];
  646. } else if (!srcpath) {
  647. srcpath = argv[i];
  648. } else {
  649. usage();
  650. }
  651. }
  652. if (!path || !srcpath) usage();
  653. int32_t rv = procimport(path, srcpath, zmode);
  654. return rv;
  655. }
  656. // parse arguments of search command
  657. static int32_t runsearch(int argc, char** argv) {
  658. bool argbrk = false;
  659. const char* path = NULL;
  660. const char* query = NULL;
  661. int32_t zmode = ZM_DEFAULT;
  662. int64_t max = 10;
  663. int32_t mode = 0;
  664. bool ts = false;
  665. bool iu = false;
  666. bool pk = false;
  667. for (int32_t i = 2; i < argc; i++) {
  668. if (!argbrk && argv[i][0] == '-') {
  669. if (!std::strcmp(argv[i], "--")) {
  670. argbrk = true;
  671. } else if (!std::strcmp(argv[i], "-cz")) {
  672. zmode = ZM_ZLIB;
  673. } else if (!std::strcmp(argv[i], "-co")) {
  674. zmode = ZM_LZO;
  675. } else if (!std::strcmp(argv[i], "-cx")) {
  676. zmode = ZM_LZMA;
  677. } else if (!std::strcmp(argv[i], "-max")) {
  678. if (++i >= argc) usage();
  679. max = kc::atoix(argv[i]);
  680. } else if (!std::strcmp(argv[i], "-f")) {
  681. mode = 'f';
  682. } else if (!std::strcmp(argv[i], "-a")) {
  683. mode = 'a';
  684. } else if (!std::strcmp(argv[i], "-m")) {
  685. mode = 'm';
  686. } else if (!std::strcmp(argv[i], "-r")) {
  687. mode = 'r';
  688. } else if (!std::strcmp(argv[i], "-tm")) {
  689. mode = 'M';
  690. } else if (!std::strcmp(argv[i], "-tr")) {
  691. mode = 'R';
  692. } else if (!std::strcmp(argv[i], "-ts")) {
  693. ts = true;
  694. } else if (!std::strcmp(argv[i], "-iu")) {
  695. iu = true;
  696. } else if (!std::strcmp(argv[i], "-pk")) {
  697. pk = true;
  698. } else {
  699. usage();
  700. }
  701. } else if (!path) {
  702. argbrk = true;
  703. path = argv[i];
  704. } else if (!query) {
  705. query = argv[i];
  706. } else {
  707. usage();
  708. }
  709. }
  710. if (!path || !query) usage();
  711. const char* qbuf;
  712. if (iu) {
  713. size_t qsiz;
  714. qbuf = kc::urldecode(query, &qsiz);
  715. query = qbuf;
  716. } else {
  717. qbuf = NULL;
  718. }
  719. int32_t rv = procsearch(path, query, zmode, max, mode, ts, pk);
  720. delete[] qbuf;
  721. return rv;
  722. }
  723. // parse arguments of suggest command
  724. static int32_t runsuggest(int argc, char** argv) {
  725. bool argbrk = false;
  726. const char* path = NULL;
  727. const char* query = NULL;
  728. int64_t max = 10;
  729. int32_t mode = 0;
  730. bool iu = false;
  731. for (int32_t i = 2; i < argc; i++) {
  732. if (!argbrk && argv[i][0] == '-') {
  733. if (!std::strcmp(argv[i], "--")) {
  734. argbrk = true;
  735. } else if (!std::strcmp(argv[i], "-max")) {
  736. if (++i >= argc) usage();
  737. max = kc::atoix(argv[i]);
  738. } else if (!std::strcmp(argv[i], "-f")) {
  739. mode = 'f';
  740. } else if (!std::strcmp(argv[i], "-m")) {
  741. mode = 'm';
  742. } else if (!std::strcmp(argv[i], "-r")) {
  743. mode = 'r';
  744. } else if (!std::strcmp(argv[i], "-iu")) {
  745. iu = true;
  746. } else {
  747. usage();
  748. }
  749. } else if (!path) {
  750. argbrk = true;
  751. path = argv[i];
  752. } else if (!query) {
  753. query = argv[i];
  754. } else {
  755. usage();
  756. }
  757. }
  758. if (!path || !query) usage();
  759. const char* qbuf;
  760. if (iu) {
  761. size_t qsiz;
  762. qbuf = kc::urldecode(query, &qsiz);
  763. query = qbuf;
  764. } else {
  765. qbuf = NULL;
  766. }
  767. int32_t rv = procsuggest(path, query, max, mode);
  768. delete[] qbuf;
  769. return rv;
  770. }
  771. // perform import command
  772. static int32_t procimport(const char* path, const char* srcpath, int32_t zmode) {
  773. kc::TextDB srcdb;
  774. if (!srcdb.open(srcpath, kc::TextDB::OREADER)) {
  775. dberrprint(&srcdb, "DB::open failed");
  776. return 1;
  777. }
  778. kc::TreeDB destdb;
  779. int32_t opts = kc::TreeDB::TSMALL | kc::TreeDB::TLINEAR;
  780. kc::Compressor* zcomp = NULL;
  781. if (zmode != ZM_DEFAULT) {
  782. opts |= kc::TreeDB::TCOMPRESS;
  783. switch (zmode) {
  784. case ZM_LZO: {
  785. zcomp = new kc::LZOCompressor<kc::LZO::RAW>;
  786. break;
  787. }
  788. case ZM_LZMA: {
  789. zcomp = new kc::LZMACompressor<kc::LZMA::RAW>;
  790. break;
  791. }
  792. }
  793. }
  794. destdb.tune_options(opts);
  795. if (zcomp) destdb.tune_compressor(zcomp);
  796. if (!destdb.open(path, kc::TreeDB::OWRITER | kc::TreeDB::OCREATE | kc::TreeDB::OTRUNCATE)) {
  797. dberrprint(&destdb, "DB::open failed");
  798. delete zcomp;
  799. return 1;
  800. }
  801. bool err = false;
  802. class MapReduceImpl : public kc::MapReduce {
  803. public:
  804. MapReduceImpl(kc::TreeDB* destdb) :
  805. destdb_(destdb), lock_(), err_(false), mapcnt_(0), redcnt_(0) {}
  806. bool error() {
  807. return err_;
  808. }
  809. private:
  810. bool map(const char* kbuf, size_t ksiz, const char* vbuf, size_t vsiz) {
  811. bool err = false;
  812. std::vector<std::string> fields;
  813. kc::strsplit(std::string(vbuf, vsiz), '\t', &fields);
  814. if (fields.size() >= 5) {
  815. std::string key;
  816. normalizequery(fields[0].data(), fields[0].size(), &key);
  817. std::string value;
  818. kc::strprintf(&value, "%s\t%s\t%s\t%s",
  819. fields[1].c_str(), fields[2].c_str(),
  820. fields[3].c_str(), fields[4].c_str());
  821. if (!emit(key.data(), key.size(), value.data(), value.size())) err = true;
  822. }
  823. int64_t cnt = mapcnt_.add(1) + 1;
  824. if (cnt % 10000 == 0) {
  825. std::string message = kc::strprintf("processed %lld entries", (long long)cnt);
  826. if (!log("map", message.c_str())) err = true;
  827. }
  828. return !err;
  829. }
  830. bool reduce(const char* kbuf, size_t ksiz, ValueIterator* iter) {
  831. bool err = false;
  832. std::vector<IndexedRecord> records;
  833. const char* vbuf;
  834. size_t vsiz;
  835. while ((vbuf = iter->next(&vsiz)) != NULL) {
  836. std::vector<std::string> fields;
  837. kc::strsplit(std::string(vbuf, vsiz), '\t', &fields);
  838. if (fields.size() >= 4) {
  839. int64_t rank = kc::atoi(fields[0].c_str());
  840. std::string text;
  841. kc::strprintf(&text, "%s\t%s\t%s",
  842. fields[1].c_str(), fields[2].c_str(), fields[3].c_str());
  843. IndexedRecord rec = { rank, text };
  844. records.push_back(rec);
  845. }
  846. }
  847. std::sort(records.begin(), records.end());
  848. if (records.size() > 1000) records.resize(1000);
  849. std::vector<IndexedRecord>::iterator rit = records.begin();
  850. std::vector<IndexedRecord>::iterator ritend = records.end();
  851. int32_t seq = 0;
  852. while (rit != ritend) {
  853. std::string key(kbuf, ksiz);
  854. kc::strprintf(&key, "\t%03lld", (long long)++seq);
  855. if (!destdb_->set(key.data(), key.size(), rit->text.data(), rit->text.size())) {
  856. err = true;
  857. err_ = true;
  858. }
  859. rit++;
  860. }
  861. int64_t cnt = redcnt_.add(1) + 1;
  862. if (cnt % 10000 == 0) {
  863. std::string message = kc::strprintf("processed %lld entries", (long long)cnt);
  864. if (!log("reduce", message.c_str())) err = true;
  865. }
  866. return !err;
  867. }
  868. bool log(const char* name, const char* message) {
  869. kc::ScopedMutex lock(&lock_);
  870. std::cout << name << ": " << message << std::endl;
  871. return true;
  872. }
  873. private:
  874. kc::TreeDB* destdb_;
  875. kc::Mutex lock_;
  876. bool err_;
  877. kc::AtomicInt64 mapcnt_;
  878. kc::AtomicInt64 redcnt_;
  879. };
  880. MapReduceImpl mr(&destdb);
  881. mr.tune_thread(THREADNUM, THREADNUM, THREADNUM);
  882. if (!mr.execute(&srcdb, "", kc::MapReduce::XPARAMAP | kc::MapReduce::XPARAFLS)) {
  883. dberrprint(&srcdb, "MapReduce::execute failed");
  884. err = true;
  885. }
  886. if (mr.error()) {
  887. dberrprint(&srcdb, "MapReduce::execute failed");
  888. err = true;
  889. }
  890. if (!destdb.close()) {
  891. dberrprint(&destdb, "DB::close failed");
  892. err = true;
  893. }
  894. delete zcomp;
  895. if (!srcdb.close()) {
  896. dberrprint(&srcdb, "DB::close failed");
  897. err = true;
  898. }
  899. return err ? 1 : 0;
  900. }
  901. // perform search command
  902. static int32_t procsearch(const char* path, const char* query, int32_t zmode, int64_t max,
  903. int32_t mode, bool ts, bool pk) {
  904. kc::TreeDB db;
  905. kc::Compressor* zcomp = NULL;
  906. if (zmode != ZM_DEFAULT) {
  907. switch (zmode) {
  908. case ZM_LZO: {
  909. zcomp = new kc::LZOCompressor<kc::LZO::RAW>;
  910. break;
  911. }
  912. case ZM_LZMA: {
  913. zcomp = new kc::LZMACompressor<kc::LZMA::RAW>;
  914. break;
  915. }
  916. }
  917. }
  918. if (zcomp) db.tune_compressor(zcomp);
  919. if (!db.open(path, kc::TreeDB::OREADER)) {
  920. dberrprint(&db, "DB::open failed");
  921. delete zcomp;
  922. return 1;
  923. }
  924. std::string nquery;
  925. if (ts) {
  926. nquery = query;
  927. kc::strtolower(&nquery);
  928. } else {
  929. normalizequery(query, std::strlen(query), &nquery);
  930. }
  931. bool err = false;
  932. if (mode == 'a') {
  933. class VisitorImpl : public kc::DB::Visitor {
  934. public:
  935. VisitorImpl(const std::string& query, int64_t max, bool pk) :
  936. qbuf_(), qsiz_(0), max_(max), pk_(pk),
  937. thres_(0), minsiz_(0), maxsiz_(0), lock_(), queue_() {
  938. qsiz_ = query.size();
  939. qbuf_ = new uint32_t[qsiz_+1];
  940. utftoucs(query.data(), query.size(), qbuf_, &qsiz_);
  941. if (qsiz_ > kc::UINT8MAX) qsiz_ = kc::UINT8MAX;
  942. thres_ = qsiz_ / AMBGRATIO;
  943. if (thres_ < AMBGMIN) thres_ = AMBGMIN;
  944. minsiz_ = qsiz_ > thres_ ? qsiz_ - thres_ : 0;
  945. maxsiz_ = qsiz_ + thres_;
  946. }
  947. ~VisitorImpl() {
  948. delete[] qbuf_;
  949. }
  950. private:
  951. const char* visit_full(const char* kbuf, size_t ksiz,
  952. const char* vbuf, size_t vsiz, size_t* sp) {
  953. size_t oksiz = ksiz;
  954. while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
  955. ksiz--;
  956. }
  957. if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
  958. uint32_t ucsstack[kc::UINT8MAX];
  959. uint32_t* ucs = ksiz > sizeof(ucsstack) / sizeof(*ucsstack) ?
  960. new uint32_t[ksiz] : ucsstack;
  961. size_t usiz;
  962. utftoucs(kbuf, ksiz, ucs, &usiz);
  963. if (usiz > kc::UINT8MAX) usiz = kc::UINT8MAX;
  964. if (usiz < minsiz_ || usiz > maxsiz_) {
  965. if (ucs != ucsstack) delete[] ucs;
  966. return NOP;
  967. }
  968. size_t dist = levdist(ucs, usiz, qbuf_, qsiz_);
  969. if (dist <= thres_) {
  970. std::string key(kbuf, ksiz);
  971. uint32_t order = kc::atoin(kbuf + ksiz, oksiz - ksiz);
  972. kc::ScopedMutex lock(&lock_);
  973. if ((int64_t)queue_.size() < max_) {
  974. AmbiguousRecord rec = { dist, key, order, std::string(vbuf, vsiz) };
  975. queue_.push(rec);
  976. } else {
  977. const AmbiguousRecord& top = queue_.top();
  978. if (!top.less(dist, key, order)) {
  979. queue_.pop();
  980. AmbiguousRecord rec = { dist, key, order, std::string(vbuf, vsiz) };
  981. queue_.push(rec);
  982. }
  983. }
  984. }
  985. if (ucs != ucsstack) delete[] ucs;
  986. return NOP;
  987. }
  988. void visit_after() {
  989. std::vector<AmbiguousRecord> recs;
  990. while (!queue_.empty()) {
  991. recs.push_back(queue_.top());
  992. queue_.pop();
  993. }
  994. std::vector<AmbiguousRecord>::reverse_iterator rit = recs.rbegin();
  995. std::vector<AmbiguousRecord>::reverse_iterator ritend = recs.rend();
  996. while (rit != ritend) {
  997. if (pk_) std::cout << rit->key << "\t";
  998. std::cout << rit->text << "\t" << rit->dist << std::endl;
  999. ++rit;
  1000. }
  1001. }
  1002. uint32_t* qbuf_;
  1003. size_t qsiz_;
  1004. int64_t max_;
  1005. bool pk_;
  1006. size_t thres_;
  1007. size_t minsiz_;
  1008. size_t maxsiz_;
  1009. kc::Mutex lock_;
  1010. std::priority_queue<AmbiguousRecord> queue_;
  1011. };
  1012. VisitorImpl visitor(nquery, max, pk);
  1013. if (!db.scan_parallel(&visitor, THREADNUM)) {
  1014. dberrprint(&db, "DB::scan_parallel faileda");
  1015. err = true;
  1016. }
  1017. } else if (mode == 'm') {
  1018. class VisitorImpl : public kc::DB::Visitor {
  1019. public:
  1020. VisitorImpl(const std::string& query, int64_t max, bool pk) :
  1021. query_(query), max_(max), pk_(pk), lock_(), queue_() {}
  1022. private:
  1023. const char* visit_full(const char* kbuf, size_t ksiz,
  1024. const char* vbuf, size_t vsiz, size_t* sp) {
  1025. size_t oksiz = ksiz;
  1026. while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
  1027. ksiz--;
  1028. }
  1029. if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
  1030. std::string key(kbuf, ksiz);
  1031. if (key.find(query_) != std::string::npos) {
  1032. uint32_t order = kc::atoin(kbuf + ksiz, oksiz - ksiz);
  1033. kc::ScopedMutex lock(&lock_);
  1034. if ((int64_t)queue_.size() < max_) {
  1035. PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
  1036. queue_.push(rec);
  1037. } else {
  1038. const PlainRecord& top = queue_.top();
  1039. if (!top.less(key, order)) {
  1040. queue_.pop();
  1041. PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
  1042. queue_.push(rec);
  1043. }
  1044. }
  1045. }
  1046. return NOP;
  1047. }
  1048. void visit_after() {
  1049. std::vector<PlainRecord> recs;
  1050. while (!queue_.empty()) {
  1051. recs.push_back(queue_.top());
  1052. queue_.pop();
  1053. }
  1054. std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
  1055. std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
  1056. while (rit != ritend) {
  1057. if (pk_) std::cout << rit->key << "\t";
  1058. std::cout << rit->text << std::endl;
  1059. ++rit;
  1060. }
  1061. }
  1062. std::string query_;
  1063. int64_t max_;
  1064. bool pk_;
  1065. kc::Mutex lock_;
  1066. std::priority_queue<PlainRecord> queue_;
  1067. };
  1068. VisitorImpl visitor(nquery, max, pk);
  1069. if (!db.scan_parallel(&visitor, THREADNUM)) {
  1070. dberrprint(&db, "DB::scan_parallel failed");
  1071. err = true;
  1072. }
  1073. } else if (mode == 'M') {
  1074. class VisitorImpl : public kc::DB::Visitor {
  1075. public:
  1076. VisitorImpl(const std::string& query, int64_t max, bool ts, bool pk) :
  1077. query_(query), max_(max), ts_(ts), pk_(pk), lock_(), queue_() {}
  1078. private:
  1079. const char* visit_full(const char* kbuf, size_t ksiz,
  1080. const char* vbuf, size_t vsiz, size_t* sp) {
  1081. const char* rbuf = vbuf;
  1082. size_t rsiz = vsiz;
  1083. while (rsiz > 0 && *rbuf != '\t') {
  1084. rbuf++;
  1085. rsiz--;
  1086. }
  1087. if (rsiz > 0 && *rbuf == '\t') {
  1088. rbuf++;
  1089. rsiz--;
  1090. }
  1091. while (rsiz > 0 && *rbuf != '\t') {
  1092. rbuf++;
  1093. rsiz--;
  1094. }
  1095. if (rsiz > 0 && *rbuf == '\t') {
  1096. rbuf++;
  1097. rsiz--;
  1098. }
  1099. bool hit = false;
  1100. if (ts_) {
  1101. hit = kc::memimem(rbuf, rsiz, query_.data(), query_.size()) != NULL;
  1102. } else {
  1103. std::string value;
  1104. normalizequery(rbuf, rsiz, &value);
  1105. hit = value.find(query_) != std::string::npos;
  1106. }
  1107. if (hit) {
  1108. size_t oksiz = ksiz;
  1109. while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
  1110. ksiz--;
  1111. }
  1112. if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
  1113. std::string key(kbuf, ksiz);
  1114. uint32_t order = kc::atoin(kbuf + ksiz, oksiz - ksiz);
  1115. kc::ScopedMutex lock(&lock_);
  1116. if ((int64_t)queue_.size() < max_) {
  1117. PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
  1118. queue_.push(rec);
  1119. } else {
  1120. const PlainRecord& top = queue_.top();
  1121. if (!top.less(key, order)) {
  1122. queue_.pop();
  1123. PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
  1124. queue_.push(rec);
  1125. }
  1126. }
  1127. }
  1128. return NOP;
  1129. }
  1130. void visit_after() {
  1131. std::vector<PlainRecord> recs;
  1132. while (!queue_.empty()) {
  1133. recs.push_back(queue_.top());
  1134. queue_.pop();
  1135. }
  1136. std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
  1137. std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
  1138. while (rit != ritend) {
  1139. if (pk_) std::cout << rit->key << "\t";
  1140. std::cout << rit->text << std::endl;
  1141. ++rit;
  1142. }
  1143. }
  1144. std::string query_;
  1145. int64_t max_;
  1146. bool ts_;
  1147. bool pk_;
  1148. kc::Mutex lock_;
  1149. std::priority_queue<PlainRecord> queue_;
  1150. };
  1151. VisitorImpl visitor(nquery, max, ts, pk);
  1152. if (!db.scan_parallel(&visitor, THREADNUM)) {
  1153. dberrprint(&db, "DB::scan_parallel failed");
  1154. err = true;
  1155. }
  1156. } else if (mode == 'r') {
  1157. class VisitorImpl : public kc::DB::Visitor {
  1158. public:
  1159. VisitorImpl(const std::string& query, int64_t max, bool pk) :
  1160. regex_(), max_(max), pk_(pk), lock_(), queue_() {
  1161. regex_.compile(query, kc::Regex::MATCHONLY);
  1162. }
  1163. private:
  1164. const char* visit_full(const char* kbuf, size_t ksiz,
  1165. const char* vbuf, size_t vsiz, size_t* sp) {
  1166. size_t oksiz = ksiz;
  1167. while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
  1168. ksiz--;
  1169. }
  1170. if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
  1171. std::string key(kbuf, ksiz);
  1172. if (regex_.match(key)) {
  1173. uint32_t order = kc::atoin(kbuf + ksiz, oksiz - ksiz);
  1174. kc::ScopedMutex lock(&lock_);
  1175. if ((int64_t)queue_.size() < max_) {
  1176. PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
  1177. queue_.push(rec);
  1178. } else {
  1179. const PlainRecord& top = queue_.top();
  1180. if (!top.less(key, order)) {
  1181. queue_.pop();
  1182. PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
  1183. queue_.push(rec);
  1184. }
  1185. }
  1186. }
  1187. return NOP;
  1188. }
  1189. void visit_after() {
  1190. std::vector<PlainRecord> recs;
  1191. while (!queue_.empty()) {
  1192. recs.push_back(queue_.top());
  1193. queue_.pop();
  1194. }
  1195. std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
  1196. std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
  1197. while (rit != ritend) {
  1198. if (pk_) std::cout << rit->key << "\t";
  1199. std::cout << rit->text << std::endl;
  1200. ++rit;
  1201. }
  1202. }
  1203. kc::Regex regex_;
  1204. int64_t max_;
  1205. bool pk_;
  1206. kc::Mutex lock_;
  1207. std::priority_queue<PlainRecord> queue_;
  1208. };
  1209. VisitorImpl visitor(nquery, max, pk);
  1210. if (!db.scan_parallel(&visitor, THREADNUM)) {
  1211. dberrprint(&db, "DB::scan_parallel failed");
  1212. err = true;
  1213. }
  1214. } else if (mode == 'R') {
  1215. class VisitorImpl : public kc::DB::Visitor {
  1216. public:
  1217. VisitorImpl(const std::string& query, int64_t max, bool ts, bool pk) :
  1218. regex_(), max_(max), ts_(ts), pk_(pk), lock_(), queue_() {
  1219. regex_.compile(query, kc::Regex::MATCHONLY);
  1220. }
  1221. private:
  1222. const char* visit_full(const char* kbuf, size_t ksiz,
  1223. const char* vbuf, size_t vsiz, size_t* sp) {
  1224. const char* rbuf = vbuf;
  1225. size_t rsiz = vsiz;
  1226. while (rsiz > 0 && *rbuf != '\t') {
  1227. rbuf++;
  1228. rsiz--;
  1229. }
  1230. if (rsiz > 0 && *rbuf == '\t') {
  1231. rbuf++;
  1232. rsiz--;
  1233. }
  1234. while (rsiz > 0 && *rbuf != '\t') {
  1235. rbuf++;
  1236. rsiz--;
  1237. }
  1238. if (rsiz > 0 && *rbuf == '\t') {
  1239. rbuf++;
  1240. rsiz--;
  1241. }
  1242. bool hit = false;
  1243. if (ts_) {
  1244. std::string value(rbuf, rsiz);
  1245. kc::strtolower(&value);
  1246. hit = regex_.match(value);
  1247. } else {
  1248. std::string value(rbuf, rsiz);
  1249. normalizequery(rbuf, rsiz, &value);
  1250. hit = regex_.match(value);
  1251. }
  1252. if (hit) {
  1253. size_t oksiz = ksiz;
  1254. while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
  1255. ksiz--;
  1256. }
  1257. if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
  1258. std::string key(kbuf, ksiz);
  1259. uint32_t order = kc::atoin(kbuf + ksiz, oksiz - ksiz);
  1260. kc::ScopedMutex lock(&lock_);
  1261. if ((int64_t)queue_.size() < max_) {
  1262. PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
  1263. queue_.push(rec);
  1264. } else {
  1265. const PlainRecord& top = queue_.top();
  1266. if (!top.less(key, order)) {
  1267. queue_.pop();
  1268. PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
  1269. queue_.push(rec);
  1270. }
  1271. }
  1272. }
  1273. return NOP;
  1274. }
  1275. void visit_after() {
  1276. std::vector<PlainRecord> recs;
  1277. while (!queue_.empty()) {
  1278. recs.push_back(queue_.top());
  1279. queue_.pop();
  1280. }
  1281. std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
  1282. std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
  1283. while (rit != ritend) {
  1284. if (pk_) std::cout << rit->key << "\t";
  1285. std::cout << rit->text << std::endl;
  1286. ++rit;
  1287. }
  1288. }
  1289. kc::Regex regex_;
  1290. int64_t max_;
  1291. bool ts_;
  1292. bool pk_;
  1293. kc::Mutex lock_;
  1294. std::priority_queue<PlainRecord> queue_;
  1295. };
  1296. VisitorImpl visitor(nquery, max, ts, pk);
  1297. if (!db.scan_parallel(&visitor, THREADNUM)) {
  1298. dberrprint(&db, "DB::scan_parallel failed");
  1299. err = true;
  1300. }
  1301. } else {
  1302. std::string qstr(nquery);
  1303. if (mode == 'f') qstr.append("\t");
  1304. kc::TreeDB::Cursor* cur = db.cursor();
  1305. cur->jump(qstr);
  1306. char* kbuf;
  1307. size_t ksiz;
  1308. const char* vbuf;
  1309. size_t vsiz;
  1310. while (max > 0 && (kbuf = cur->get(&ksiz, &vbuf, &vsiz, true)) != NULL) {
  1311. if (ksiz >= qstr.size() && !std::memcmp(kbuf, qstr.data(), qstr.size())) {
  1312. if (pk) {
  1313. while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
  1314. ksiz--;
  1315. }
  1316. if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
  1317. std::cout.write(kbuf, ksiz);
  1318. std::cout << "\t";
  1319. }
  1320. std::cout.write(vbuf, vsiz);
  1321. std::cout << std::endl;
  1322. } else {
  1323. max = 0;
  1324. }
  1325. delete[] kbuf;
  1326. max--;
  1327. }
  1328. delete cur;
  1329. }
  1330. if (!db.close()) {
  1331. dberrprint(&db, "DB::close failed");
  1332. err = true;
  1333. }
  1334. delete zcomp;
  1335. return err ? 1 : 0;
  1336. }
  1337. // perform suggest command
  1338. static int32_t procsuggest(const char* path, const char* query, int64_t max, int32_t mode) {
  1339. kc::TextDB db;
  1340. if (!db.open(path, kc::TextDB::OREADER)) {
  1341. dberrprint(&db, "DB::open failed");
  1342. return 1;
  1343. }
  1344. std::string nquery;
  1345. normalizequery(query, std::strlen(query), &nquery);
  1346. bool err = false;
  1347. if (mode == 'm') {
  1348. class VisitorImpl : public kc::DB::Visitor {
  1349. public:
  1350. VisitorImpl(const std::string& query, int64_t max) :
  1351. query_(query), max_(max), lock_(), queue_() {}
  1352. private:
  1353. const char* visit_full(const char* kbuf, size_t ksiz,
  1354. const char* vbuf, size_t vsiz, size_t* sp) {
  1355. std::string key(vbuf, vsiz);
  1356. if (key.find(query_) != std::string::npos) {
  1357. kc::ScopedMutex lock(&lock_);
  1358. if ((int64_t)queue_.size() < max_) {
  1359. PlainRecord rec = { key, 0, "" };
  1360. queue_.push(rec);
  1361. } else {
  1362. const PlainRecord& top = queue_.top();
  1363. if (!top.less(key, 0)) {
  1364. queue_.pop();
  1365. PlainRecord rec = { key, 0, "" };
  1366. queue_.push(rec);
  1367. }
  1368. }
  1369. }
  1370. return NOP;
  1371. }
  1372. void visit_after() {
  1373. std::vector<PlainRecord> recs;
  1374. while (!queue_.empty()) {
  1375. recs.push_back(queue_.top());
  1376. queue_.pop();
  1377. }
  1378. std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
  1379. std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
  1380. while (rit != ritend) {
  1381. std::cout << rit->key << std::endl;
  1382. ++rit;
  1383. }
  1384. }
  1385. std::string query_;
  1386. int64_t max_;
  1387. kc::Mutex lock_;
  1388. std::priority_queue<PlainRecord> queue_;
  1389. };
  1390. VisitorImpl visitor(nquery, max);
  1391. if (!db.scan_parallel(&visitor, THREADNUM)) {
  1392. dberrprint(&db, "DB::scan_parallel failed");
  1393. err = true;
  1394. }
  1395. } else if (mode == 'r') {
  1396. class VisitorImpl : public kc::DB::Visitor {
  1397. public:
  1398. VisitorImpl(const std::string& query, int64_t max) :
  1399. regex_(), max_(max), lock_(), queue_() {
  1400. regex_.compile(query, kc::Regex::MATCHONLY);
  1401. }
  1402. private:
  1403. const char* visit_full(const char* kbuf, size_t ksiz,
  1404. const char* vbuf, size_t vsiz, size_t* sp) {
  1405. std::string key(vbuf, vsiz);
  1406. if (regex_.match(key)) {
  1407. kc::ScopedMutex lock(&lock_);
  1408. if ((int64_t)queue_.size() < max_) {
  1409. PlainRecord rec = { key, 0, "" };
  1410. queue_.push(rec);
  1411. } else {
  1412. const PlainRecord& top = queue_.top();
  1413. if (!top.less(key, 0)) {
  1414. queue_.pop();
  1415. PlainRecord rec = { key, 0, "" };
  1416. queue_.push(rec);
  1417. }
  1418. }
  1419. }
  1420. return NOP;
  1421. }
  1422. void visit_after() {
  1423. std::vector<PlainRecord> recs;
  1424. while (!queue_.empty()) {
  1425. recs.push_back(queue_.top());
  1426. queue_.pop();
  1427. }
  1428. std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
  1429. std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
  1430. while (rit != ritend) {
  1431. std::cout << rit->key << std::endl;
  1432. ++rit;
  1433. }
  1434. }
  1435. kc::Regex regex_;
  1436. int64_t max_;
  1437. kc::Mutex lock_;
  1438. std::priority_queue<PlainRecord> queue_;
  1439. };
  1440. VisitorImpl visitor(nquery, max);
  1441. if (!db.scan_parallel(&visitor, THREADNUM)) {
  1442. dberrprint(&db, "DB::scan_parallel failed");
  1443. err = true;
  1444. }
  1445. } else {
  1446. class VisitorImpl : public kc::DB::Visitor {
  1447. public:
  1448. VisitorImpl(const std::string& query, int64_t max) :
  1449. qbuf_(), qsiz_(0), max_(max),
  1450. thres_(0), minsiz_(0), maxsiz_(0), lock_(), queue_() {
  1451. qsiz_ = query.size();
  1452. qbuf_ = new uint32_t[qsiz_+1];
  1453. utftoucs(query.data(), query.size(), qbuf_, &qsiz_);
  1454. if (qsiz_ > kc::UINT8MAX) qsiz_ = kc::UINT8MAX;
  1455. thres_ = qsiz_ / AMBGRATIO;
  1456. if (thres_ < AMBGMIN) thres_ = AMBGMIN;
  1457. minsiz_ = qsiz_ > thres_ ? qsiz_ - thres_ : 0;
  1458. maxsiz_ = qsiz_ + thres_;
  1459. }
  1460. ~VisitorImpl() {
  1461. delete[] qbuf_;
  1462. }
  1463. private:
  1464. const char* visit_full(const char* kbuf, size_t ksiz,
  1465. const char* vbuf, size_t vsiz, size_t* sp) {
  1466. uint32_t ucsstack[kc::UINT8MAX];
  1467. uint32_t* ucs = vsiz > sizeof(ucsstack) / sizeof(*ucsstack) ?
  1468. new uint32_t[vsiz] : ucsstack;
  1469. size_t usiz;
  1470. utftoucs(vbuf, vsiz, ucs, &usiz);
  1471. if (usiz > kc::UINT8MAX) usiz = kc::UINT8MAX;
  1472. if (usiz < minsiz_ || usiz > maxsiz_) {
  1473. if (ucs != ucsstack) delete[] ucs;
  1474. return NOP;
  1475. }
  1476. size_t dist = levdist(ucs, usiz, qbuf_, qsiz_);
  1477. if (dist <= thres_) {
  1478. std::string key(vbuf, vsiz);
  1479. kc::ScopedMutex lock(&lock_);
  1480. if ((int64_t)queue_.size() < max_) {
  1481. AmbiguousRecord rec = { dist, key, 0, "" };
  1482. queue_.push(rec);
  1483. } else {
  1484. const AmbiguousRecord& top = queue_.top();
  1485. if (!top.less(dist, key, 0)) {
  1486. queue_.pop();
  1487. AmbiguousRecord rec = { dist, key, 0, "" };
  1488. queue_.push(rec);
  1489. }
  1490. }
  1491. }
  1492. if (ucs != ucsstack) delete[] ucs;
  1493. return NOP;
  1494. }
  1495. void visit_after() {
  1496. std::vector<AmbiguousRecord> recs;
  1497. while (!queue_.empty()) {
  1498. recs.push_back(queue_.top());
  1499. queue_.pop();
  1500. }
  1501. std::vector<AmbiguousRecord>::reverse_iterator rit = recs.rbegin();
  1502. std::vector<AmbiguousRecord>::reverse_iterator ritend = recs.rend();
  1503. while (rit != ritend) {
  1504. std::cout << rit->key << "\t" << rit->dist << std::endl;
  1505. ++rit;
  1506. }
  1507. }
  1508. uint32_t* qbuf_;
  1509. size_t qsiz_;
  1510. int64_t max_;
  1511. size_t thres_;
  1512. size_t minsiz_;
  1513. size_t maxsiz_;
  1514. kc::Mutex lock_;
  1515. std::priority_queue<AmbiguousRecord> queue_;
  1516. };
  1517. VisitorImpl visitor(nquery, max);
  1518. if (!db.scan_parallel(&visitor, THREADNUM)) {
  1519. dberrprint(&db, "DB::scan_parallel faileda");
  1520. err = true;
  1521. }
  1522. }
  1523. if (!db.close()) {
  1524. dberrprint(&db, "DB::close failed");
  1525. err = true;
  1526. }
  1527. return err ? 1 : 0;
  1528. }
  1529. // END OF FILE