/kyotocabinet-1.2.70/lab/kcdict/kcdictmgr.cc
C++ | 1575 lines | 1432 code | 46 blank | 97 comment | 812 complexity | 0b219fdd75ec2353acb96d2b79d9ad41 MD5 | raw file
Possible License(s): GPL-3.0
- /*************************************************************************************************
- * The command line utility of the word dictionary
- * Copyright (C) 2009-2011 FAL Labs
- * This file is part of Kyoto Cabinet.
- * This program is free software: you can redistribute it and/or modify it under the terms of
- * the GNU General Public License as published by the Free Software Foundation, either version
- * 3 of the License, or any later version.
- * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- * See the GNU General Public License for more details.
- * You should have received a copy of the GNU General Public License along with this program.
- * If not, see <http://www.gnu.org/licenses/>.
- *************************************************************************************************/
- #include <kcutil.h>
- #include <kcpolydb.h>
- #include <kcdbext.h>
- namespace kc = kyotocabinet;
- // enumerations: compression modes; ZM_ZLIB, ZM_LZO and ZM_LZMA are selected by -cz, -co and -cx
- enum { ZM_DEFAULT, ZM_ZLIB, ZM_LZO, ZM_LZMA };
- // constants
- const size_t THREADNUM = 8; // number of threads given to MapReduce and to parallel scans
- const size_t AMBGRATIO = 3; // the edit distance threshold is the query length divided by this
- const size_t AMBGMIN = 3; // lower bound of the edit distance threshold of ambiguous search
- // global variables
- const char* g_progname; // program name; main sets it to argv[0]
- // function prototypes (everything except main is local to this file)
- int main(int argc, char** argv);
- static void usage(); // print the usage and exit
- static void dberrprint(kc::BasicDB* db, const char* info); // print error message of database
- static void utftoucs(const char* src, size_t size, uint32_t* dest, size_t* np); // UTF-8 to UCS-4
- static void normalizequery(const char* qbuf, size_t qsiz, std::string* dest); // appends to dest
- static void normalizeucs(uint32_t* ary, size_t onum, size_t* np); // normalizes ary in place
- template<class CHARTYPE>
- static size_t levdist(const CHARTYPE* abuf, size_t asiz, const CHARTYPE* bbuf, size_t bsiz);
- static int32_t runimport(int argc, char** argv); // parse arguments of import command
- static int32_t runsearch(int argc, char** argv); // parse arguments of search command
- static int32_t runsuggest(int argc, char** argv); // parse arguments of suggest command
- static int32_t procimport(const char* path, const char* srcpath, int32_t zmode);
- static int32_t procsearch(const char* path, const char* query, int32_t zmode, int64_t max,
- int32_t mode, bool ts, bool pk);
- static int32_t procsuggest(const char* path, const char* query, int64_t max, int32_t mode);
// structure for sorting indexed records: ordered by rank, ties are broken by the text
struct IndexedRecord {
  int64_t rank;                            // primary sort key; smaller values come first
  std::string text;                        // record text; secondary sort key
  // strict weak ordering on (rank, text)
  bool operator <(const IndexedRecord& right) const {
    if (rank < right.rank) return true;
    if (right.rank < rank) return false;
    return text < right.text;
  }
};
// structure for sorting ambiguous records: ordered by distance, then key, then order
struct AmbiguousRecord {
  size_t dist;                             // edit distance from the query
  std::string key;                         // key of the record
  uint32_t order;                          // order number of the record
  std::string text;                        // record text; not part of the ordering
  // compare with another record
  bool operator <(const AmbiguousRecord& right) const {
    return less(right.dist, right.key, right.order);
  }
  // compare with the sort fields of a record which has not been built yet
  bool less(size_t rdist, const std::string& rkey, uint32_t rorder) const {
    if (dist < rdist) return true;
    if (rdist < dist) return false;
    const int cmp = key.compare(rkey);
    if (cmp != 0) return cmp < 0;
    return order < rorder;
  }
};
// structure for sorting plain records: ordered by key, ties are broken by the order number
struct PlainRecord {
  std::string key;                         // key of the record
  uint32_t order;                          // order number of the record
  std::string text;                        // record text; not part of the ordering
  // compare with another record
  bool operator <(const PlainRecord& right) const {
    return less(right.key, right.order);
  }
  // compare with the sort fields of a record which has not been built yet
  bool less(const std::string& rkey, uint32_t rorder) const {
    const int cmp = key.compare(rkey);
    if (cmp != 0) return cmp < 0;
    return order < rorder;
  }
};
- // main routine
- int main(int argc, char** argv) {
- g_progname = argv[0];
- kc::setstdiobin();
- if (argc < 2) usage();
- int32_t rv = 0;
- if (!std::strcmp(argv[1], "import")) {
- rv = runimport(argc, argv);
- } else if (!std::strcmp(argv[1], "search")) {
- rv = runsearch(argc, argv);
- } else if (!std::strcmp(argv[1], "suggest")) {
- rv = runsuggest(argc, argv);
- } else {
- usage();
- }
- return rv;
- }
- // print the usage and exit
- static void usage() {
- std::cerr << g_progname << ": the command line utility of the word dictionary" << std::endl;
- std::cerr << std::endl;
- std::cerr << " " << g_progname << " import [-cz|-co|-cx] path src" << std::endl;
- std::cerr << " " << g_progname << " search [-cz|-co|-cx] [-max num] [-f|-a|-m|-r|-tm|-tr]"
- " [-ts] [-iu] [-pk] path query" << std::endl;
- std::cerr << " " << g_progname << " suggest [-max num] [-m|-r] [-iu] path query" <<
- std::endl;
- std::cerr << std::endl;
- std::exit(1);
- }
- // print error message of database
- static void dberrprint(kc::BasicDB* db, const char* info) {
- const kc::BasicDB::Error& err = db->error();
- std::cerr << g_progname << ": " << info << ": " << db->path().c_str() << ": " << err.code() <<
- ": " << err.name() << ": " << err.message() << std::endl;
- }
- // convert a UTF-8 string into a UCS-4 array
- static void utftoucs(const char* src, size_t size, uint32_t* dest, size_t* np) {
- _assert_(src && dest && np);
- const unsigned char* rp = (unsigned char*)src;
- const unsigned char* ep = rp + size;
- size_t dnum = 0;
- while (rp < ep) {
- uint32_t c = *rp;
- if (c < 0x80) {
- dest[dnum++] = c;
- } else if (c < 0xe0) {
- if (c >= 0xc0 && rp + 1 < ep) {
- c = ((c & 0x1f) << 6) | (rp[1] & 0x3f);
- if (c >= 0x80) dest[dnum++] = c;
- rp++;
- }
- } else if (c < 0xf0) {
- if (rp + 2 < ep) {
- c = ((c & 0x0f) << 12) | ((rp[1] & 0x3f) << 6) | (rp[2] & 0x3f);
- if (c >= 0x800) dest[dnum++] = c;
- rp += 2;
- }
- } else if (c < 0xf8) {
- if (rp + 3 < ep) {
- c = ((c & 0x07) << 18) | ((rp[1] & 0x3f) << 12) | ((rp[2] & 0x3f) << 6) |
- (rp[3] & 0x3f);
- if (c >= 0x10000) dest[dnum++] = c;
- rp += 3;
- }
- } else if (c < 0xfc) {
- if (rp + 4 < ep) {
- c = ((c & 0x03) << 24) | ((rp[1] & 0x3f) << 18) | ((rp[2] & 0x3f) << 12) |
- ((rp[3] & 0x3f) << 6) | (rp[4] & 0x3f);
- if (c >= 0x200000) dest[dnum++] = c;
- rp += 4;
- }
- } else if (c < 0xfe) {
- if (rp + 5 < ep) {
- c = ((c & 0x01) << 30) | ((rp[1] & 0x3f) << 24) | ((rp[2] & 0x3f) << 18) |
- ((rp[3] & 0x3f) << 12) | ((rp[4] & 0x3f) << 6) | (rp[5] & 0x3f);
- if (c >= 0x4000000) dest[dnum++] = c;
- rp += 5;
- }
- }
- rp++;
- }
- *np = dnum;
- }
- // normalize a query
- static void normalizequery(const char* qbuf, size_t qsiz, std::string* dest) {
- uint32_t ucsstack[1024];
- uint32_t* ucs = qsiz > sizeof(ucsstack) / sizeof(*ucsstack) ? new uint32_t[qsiz] : ucsstack;
- size_t unum;
- utftoucs(qbuf, qsiz, ucs, &unum);
- size_t nnum;
- normalizeucs(ucs, unum, &nnum);
- qsiz++;
- char utfstack[2048];
- char* utf = qsiz > sizeof(utfstack) ? new char[qsiz] : utfstack;
- qsiz = kc::strucstoutf(ucs, nnum, utf);
- dest->append(utf, qsiz);
- if (utf != utfstack) delete[] utf;
- if (ucs != ucsstack) delete[] ucs;
- }
// normalize a UCS-4 array
// `ary' holds `onum' code points and is rewritten in place; `np' receives the new count, which
// never exceeds `onum'.  Upper case Latin, Greek and Cyrillic letters are folded into lower
// case, accents of Latin letters are removed, variants of spaces, hyphens and quotes are
// unified, fullwidth ASCII becomes ASCII and halfwidth katakana becomes fullwidth katakana.
// A halfwidth katakana followed by a halfwidth sound mark is merged into one character.
static void normalizeucs(uint32_t* ary, size_t onum, size_t* np) {
  // fixed switches of the normalization
  const bool lowmode = true;               // fold upper case letters into lower case
  const bool nacmode = true;               // replace accented Latin letters with plain ones
  const bool spcmode = true;               // replace control characters and U+3000 with U+0020
  // plain letters for U+00C0 to U+00FF; '.' keeps the character
  static const char latin1base[] =
    "AAAAAA.C" "EEEEIIII" "DNOOOOO." "OUUUUYYs"
    "aaaaaa.c" "eeeeiiii" "dnooooo." "ouuuuyyy";
  // plain letters for U+0100 to U+017F; '.' keeps the character
  static const char latinxbase[] =
    "AaAaAaCc" "CcCcCcDd" "DdEeEeEe" "EeEeGgGg"
    "GgGgHhHh" "IiIiIiIi" "Ii..JjKk" "kLlLlLlL"
    "lLlNnNnN" "nnNnOoOo" "Oo..RrRr" "RrSsSsSs"
    "SsTtTtTt" "UuUuUuUu" "UuUuWwYy" "YZzZzZzs";
  size_t nnum = 0;
  for (size_t i = 0; i < onum; i++) {
    uint32_t c = ary[i];
    // characters out of the basic plane pass through; every iteration writes one element
    if (c < 0x10000) {
      switch (c >> 8) {
        case 0x00: {
          if (c < 0x0020 || c == 0x007f) {
            // control characters
            const bool keep = !spcmode && (c == 0x0009 || c == 0x000a || c == 0x000d);
            if (!keep) c = 0x0020;
          } else if (c == 0x00a0) {
            // no-break space
            c = 0x0020;
          } else {
            // ASCII and latin-1 letters
            if (lowmode) {
              const bool upascii = c >= 0x0041 && c <= 0x005a;
              const bool uplatin = c >= 0x00c0 && c <= 0x00de && c != 0x00d7;
              if (upascii || uplatin) c += 0x20;
            }
            if (nacmode && c >= 0x00c0) {
              const char base = latin1base[c - 0x00c0];
              if (base != '.') c = static_cast<unsigned char>(base);
            }
          }
          break;
        }
        case 0x01: {
          // latin extended
          if (lowmode) {
            if (c <= 0x0137 || (c >= 0x014a && c <= 0x0177)) {
              // upper case letters are even here
              c |= 1;
            } else if ((c >= 0x0139 && c <= 0x0148) || (c >= 0x0179 && c <= 0x017e)) {
              // upper case letters are odd here
              c += c & 1;
            } else if (c == 0x0178) {
              c = 0x00ff;
            }
          }
          if (nacmode) {
            if (c == 0x00ff) {
              c = 'y';
            } else if (c <= 0x017f) {
              const char base = latinxbase[c - 0x0100];
              if (base != '.') c = static_cast<unsigned char>(base);
            }
          }
          break;
        }
        case 0x03: {
          // greek
          if (lowmode) {
            if (c >= 0x0391 && c <= 0x03a9) {
              c += 0x20;
            } else if (c >= 0x03d8 && c <= 0x03ef) {
              c |= 1;
            } else if (c == 0x0374 || c == 0x03f7 || c == 0x03fa) {
              c++;
            }
          }
          break;
        }
        case 0x04: {
          // cyrillic
          if (lowmode) {
            if (c <= 0x040f) {
              c += 0x50;
            } else if (c <= 0x042f) {
              c += 0x20;
            } else if ((c >= 0x0460 && c <= 0x0481) || (c >= 0x048a && c <= 0x04bf) ||
                       c >= 0x04d0) {
              // upper case letters are even here
              c |= 1;
            } else if (c == 0x04c0) {
              c = 0x04cf;
            } else if (c >= 0x04c1 && c <= 0x04ce) {
              // upper case letters are odd here
              c += c & 1;
            }
          }
          break;
        }
        case 0x20: {
          switch (c) {
            case 0x2002:                   // en space
            case 0x2003:                   // em space
            case 0x2009:                   // thin space
              c = 0x0020;
              break;
            case 0x2010:                   // hyphen
            case 0x2015:                   // horizontal line
              c = 0x002d;
              break;
            case 0x2019:                   // apostrophe
              c = 0x0027;
              break;
            case 0x2033:                   // double quotes
              c = 0x0022;
              break;
            default:
              break;
          }
          break;
        }
        case 0x22: {
          // minus sign
          if (c == 0x2212) c = 0x002d;
          break;
        }
        case 0x30: {
          // fullwidth space
          if (c == 0x3000 && spcmode) c = 0x0020;
          break;
        }
        case 0xff: {
          // the following element may be a halfwidth voiced or semi-voiced sound mark; it has
          // not been overwritten yet because the write index never passes the read index
          const uint32_t next = i + 1 < onum ? ary[i+1] : 0;
          const bool voiced = next == 0xff9e;
          const bool semivoiced = next == 0xff9f;
          bool merged = false;
          switch (c) {
            case 0xff01:                   // fullwidth exclamation
            case 0xff03:                   // fullwidth igeta
            case 0xff04:                   // fullwidth dollar
            case 0xff05:                   // fullwidth percent
            case 0xff06:                   // fullwidth ampersand
            case 0xff0a:                   // fullwidth asterisk
            case 0xff0b:                   // fullwidth plus
            case 0xff0c:                   // fullwidth comma
            case 0xff0e:                   // fullwidth period
            case 0xff0f:                   // fullwidth slash
            case 0xff1a:                   // fullwidth colon
            case 0xff1b:                   // fullwidth semicolon
            case 0xff1d:                   // fullwidth equal
            case 0xff1f:                   // fullwidth question
            case 0xff20:                   // fullwidth atmark
            case 0xff3c:                   // fullwidth backslash
            case 0xff3e:                   // fullwidth circumflex
            case 0xff3f:                   // fullwidth underscore
            case 0xff5c:                   // fullwidth vertical line
              c -= 0xfee0;
              break;
            case 0xff61:                   // halfwidth full stop
              c = 0x3002;
              break;
            case 0xff62:                   // halfwidth left corner
              c = 0x300c;
              break;
            case 0xff63:                   // halfwidth right corner
              c = 0x300d;
              break;
            case 0xff64:                   // halfwidth comma
              c = 0x3001;
              break;
            case 0xff65:                   // halfwidth middle dot
              c = 0x30fb;
              break;
            case 0xff66:                   // halfwidth wo
              c = 0x30f2;
              break;
            case 0xff6f:                   // halfwidth small tu
              c = 0x30c3;
              break;
            case 0xff70:                   // halfwidth prolonged mark
              c = 0x30fc;
              break;
            case 0xff9c:                   // halfwidth wa
              c = 0x30ef;
              break;
            case 0xff9d:                   // halfwidth nn
              c = 0x30f3;
              break;
            default: {
              if (c >= 0xff21 && c <= 0xff3a) {
                // fullwidth alphabets
                c -= 0xfee0;
                if (lowmode) c += 0x20;
              } else if ((c >= 0xff41 && c <= 0xff5a) || (c >= 0xff10 && c <= 0xff19)) {
                // fullwidth small alphabets and fullwidth numbers
                c -= 0xfee0;
              } else if (c >= 0xff67 && c <= 0xff6b) {
                // halfwidth small a-o
                c = (c - 0xff67) * 2 + 0x30a1;
              } else if (c >= 0xff6c && c <= 0xff6e) {
                // halfwidth small ya-yo
                c = (c - 0xff6c) * 2 + 0x30e3;
              } else if (c >= 0xff71 && c <= 0xff75) {
                // halfwidth a-o; only u takes the voiced sound mark
                if (c == 0xff73 && voiced) {
                  c = 0x30f4;
                  merged = true;
                } else {
                  c = (c - 0xff71) * 2 + 0x30a2;
                }
              } else if (c >= 0xff76 && c <= 0xff7a) {
                // halfwidth ka-ko
                c = (c - 0xff76) * 2 + 0x30ab + (voiced ? 1 : 0);
                merged = voiced;
              } else if (c >= 0xff7b && c <= 0xff7f) {
                // halfwidth sa-so
                c = (c - 0xff7b) * 2 + 0x30b5 + (voiced ? 1 : 0);
                merged = voiced;
              } else if (c >= 0xff80 && c <= 0xff84) {
                // halfwidth ta-to; the fullwidth small tu sits before tu
                c = (c - 0xff80) * 2 + 0x30bf + (c >= 0xff82 ? 1 : 0) + (voiced ? 1 : 0);
                merged = voiced;
              } else if (c >= 0xff85 && c <= 0xff89) {
                // halfwidth na-no
                c -= 0xcebb;
              } else if (c >= 0xff8a && c <= 0xff8e) {
                // halfwidth ha-ho
                c = (c - 0xff8a) * 3 + 0x30cf;
                if (voiced) {
                  c += 1;
                  merged = true;
                } else if (semivoiced) {
                  c += 2;
                  merged = true;
                }
              } else if (c >= 0xff8f && c <= 0xff93) {
                // halfwidth ma-mo
                c -= 0xceb1;
              } else if (c >= 0xff94 && c <= 0xff96) {
                // halfwidth ya-yo
                c = (c - 0xff94) * 2 + 0x30e4;
              } else if (c >= 0xff97 && c <= 0xff9b) {
                // halfwidth ra-ro
                c -= 0xceae;
              }
              break;
            }
          }
          // skip the sound mark which has been merged
          if (merged) i++;
          break;
        }
        default: {
          // otherwise
          break;
        }
      }
    }
    ary[nnum++] = c;
  }
  *np = nnum;
}
// get the levenshtein distance of two arrays
// `abuf' and `bbuf' are arrays of `asiz' and `bsiz' elements compared with operator !=.
// The return value is the minimum number of single element insertions, deletions and
// substitutions which turn one array into the other.
// Only two rows of the distance table are kept, and the cells are size_t, so the result is
// exact for any length.  Arrays of up to 255 elements need no heap allocation.
template<class CHARTYPE>
static size_t levdist(const CHARTYPE* abuf, size_t asiz, const CHARTYPE* bbuf, size_t bsiz) {
  const size_t rowlen = bsiz + 1;
  size_t rowstack[512];
  size_t* rows = rowlen * 2 > sizeof(rowstack) / sizeof(*rowstack) ?
    new size_t[rowlen*2] : rowstack;
  size_t* prev = rows;
  size_t* cur = rows + rowlen;
  // distance from the empty prefix of `abuf' to every prefix of `bbuf'
  for (size_t j = 0; j <= bsiz; j++) {
    prev[j] = j;
  }
  for (size_t i = 1; i <= asiz; i++) {
    cur[0] = i;
    for (size_t j = 1; j <= bsiz; j++) {
      const size_t del = prev[j] + 1;
      const size_t ins = cur[j-1] + 1;
      const size_t sub = prev[j-1] + (abuf[i-1] != bbuf[j-1] ? 1 : 0);
      const size_t best = del < ins ? del : ins;
      cur[j] = best < sub ? best : sub;
    }
    // the row just filled becomes the previous row
    size_t* tmp = prev;
    prev = cur;
    cur = tmp;
  }
  const size_t ed = prev[bsiz];
  if (rows != rowstack) delete[] rows;
  return ed;
}
- // parse arguments of import command
- static int32_t runimport(int argc, char** argv) {
- bool argbrk = false;
- const char* path = NULL;
- const char* srcpath = NULL;
- int32_t zmode = ZM_DEFAULT;
- for (int32_t i = 2; i < argc; i++) {
- if (!argbrk && argv[i][0] == '-') {
- if (!std::strcmp(argv[i], "--")) {
- argbrk = true;
- } else if (!std::strcmp(argv[i], "-cz")) {
- zmode = ZM_ZLIB;
- } else if (!std::strcmp(argv[i], "-co")) {
- zmode = ZM_LZO;
- } else if (!std::strcmp(argv[i], "-cx")) {
- zmode = ZM_LZMA;
- } else {
- usage();
- }
- } else if (!path) {
- argbrk = true;
- path = argv[i];
- } else if (!srcpath) {
- srcpath = argv[i];
- } else {
- usage();
- }
- }
- if (!path || !srcpath) usage();
- int32_t rv = procimport(path, srcpath, zmode);
- return rv;
- }
- // parse arguments of search command
- static int32_t runsearch(int argc, char** argv) {
- bool argbrk = false;
- const char* path = NULL;
- const char* query = NULL;
- int32_t zmode = ZM_DEFAULT;
- int64_t max = 10;
- int32_t mode = 0;
- bool ts = false;
- bool iu = false;
- bool pk = false;
- for (int32_t i = 2; i < argc; i++) {
- if (!argbrk && argv[i][0] == '-') {
- if (!std::strcmp(argv[i], "--")) {
- argbrk = true;
- } else if (!std::strcmp(argv[i], "-cz")) {
- zmode = ZM_ZLIB;
- } else if (!std::strcmp(argv[i], "-co")) {
- zmode = ZM_LZO;
- } else if (!std::strcmp(argv[i], "-cx")) {
- zmode = ZM_LZMA;
- } else if (!std::strcmp(argv[i], "-max")) {
- if (++i >= argc) usage();
- max = kc::atoix(argv[i]);
- } else if (!std::strcmp(argv[i], "-f")) {
- mode = 'f';
- } else if (!std::strcmp(argv[i], "-a")) {
- mode = 'a';
- } else if (!std::strcmp(argv[i], "-m")) {
- mode = 'm';
- } else if (!std::strcmp(argv[i], "-r")) {
- mode = 'r';
- } else if (!std::strcmp(argv[i], "-tm")) {
- mode = 'M';
- } else if (!std::strcmp(argv[i], "-tr")) {
- mode = 'R';
- } else if (!std::strcmp(argv[i], "-ts")) {
- ts = true;
- } else if (!std::strcmp(argv[i], "-iu")) {
- iu = true;
- } else if (!std::strcmp(argv[i], "-pk")) {
- pk = true;
- } else {
- usage();
- }
- } else if (!path) {
- argbrk = true;
- path = argv[i];
- } else if (!query) {
- query = argv[i];
- } else {
- usage();
- }
- }
- if (!path || !query) usage();
- const char* qbuf;
- if (iu) {
- size_t qsiz;
- qbuf = kc::urldecode(query, &qsiz);
- query = qbuf;
- } else {
- qbuf = NULL;
- }
- int32_t rv = procsearch(path, query, zmode, max, mode, ts, pk);
- delete[] qbuf;
- return rv;
- }
- // parse arguments of suggest command
- static int32_t runsuggest(int argc, char** argv) {
- bool argbrk = false;
- const char* path = NULL;
- const char* query = NULL;
- int64_t max = 10;
- int32_t mode = 0;
- bool iu = false;
- for (int32_t i = 2; i < argc; i++) {
- if (!argbrk && argv[i][0] == '-') {
- if (!std::strcmp(argv[i], "--")) {
- argbrk = true;
- } else if (!std::strcmp(argv[i], "-max")) {
- if (++i >= argc) usage();
- max = kc::atoix(argv[i]);
- } else if (!std::strcmp(argv[i], "-f")) {
- mode = 'f';
- } else if (!std::strcmp(argv[i], "-m")) {
- mode = 'm';
- } else if (!std::strcmp(argv[i], "-r")) {
- mode = 'r';
- } else if (!std::strcmp(argv[i], "-iu")) {
- iu = true;
- } else {
- usage();
- }
- } else if (!path) {
- argbrk = true;
- path = argv[i];
- } else if (!query) {
- query = argv[i];
- } else {
- usage();
- }
- }
- if (!path || !query) usage();
- const char* qbuf;
- if (iu) {
- size_t qsiz;
- qbuf = kc::urldecode(query, &qsiz);
- query = qbuf;
- } else {
- qbuf = NULL;
- }
- int32_t rv = procsuggest(path, query, max, mode);
- delete[] qbuf;
- return rv;
- }
- // perform import command
- static int32_t procimport(const char* path, const char* srcpath, int32_t zmode) {
- kc::TextDB srcdb;
- if (!srcdb.open(srcpath, kc::TextDB::OREADER)) {
- dberrprint(&srcdb, "DB::open failed");
- return 1;
- }
- kc::TreeDB destdb;
- int32_t opts = kc::TreeDB::TSMALL | kc::TreeDB::TLINEAR;
- kc::Compressor* zcomp = NULL;
- if (zmode != ZM_DEFAULT) {
- opts |= kc::TreeDB::TCOMPRESS;
- switch (zmode) {
- case ZM_LZO: {
- zcomp = new kc::LZOCompressor<kc::LZO::RAW>;
- break;
- }
- case ZM_LZMA: {
- zcomp = new kc::LZMACompressor<kc::LZMA::RAW>;
- break;
- }
- }
- }
- destdb.tune_options(opts);
- if (zcomp) destdb.tune_compressor(zcomp);
- if (!destdb.open(path, kc::TreeDB::OWRITER | kc::TreeDB::OCREATE | kc::TreeDB::OTRUNCATE)) {
- dberrprint(&destdb, "DB::open failed");
- delete zcomp;
- return 1;
- }
- bool err = false;
- class MapReduceImpl : public kc::MapReduce {
- public:
- MapReduceImpl(kc::TreeDB* destdb) :
- destdb_(destdb), lock_(), err_(false), mapcnt_(0), redcnt_(0) {}
- bool error() {
- return err_;
- }
- private:
- bool map(const char* kbuf, size_t ksiz, const char* vbuf, size_t vsiz) {
- bool err = false;
- std::vector<std::string> fields;
- kc::strsplit(std::string(vbuf, vsiz), '\t', &fields);
- if (fields.size() >= 5) {
- std::string key;
- normalizequery(fields[0].data(), fields[0].size(), &key);
- std::string value;
- kc::strprintf(&value, "%s\t%s\t%s\t%s",
- fields[1].c_str(), fields[2].c_str(),
- fields[3].c_str(), fields[4].c_str());
- if (!emit(key.data(), key.size(), value.data(), value.size())) err = true;
- }
- int64_t cnt = mapcnt_.add(1) + 1;
- if (cnt % 10000 == 0) {
- std::string message = kc::strprintf("processed %lld entries", (long long)cnt);
- if (!log("map", message.c_str())) err = true;
- }
- return !err;
- }
- bool reduce(const char* kbuf, size_t ksiz, ValueIterator* iter) {
- bool err = false;
- std::vector<IndexedRecord> records;
- const char* vbuf;
- size_t vsiz;
- while ((vbuf = iter->next(&vsiz)) != NULL) {
- std::vector<std::string> fields;
- kc::strsplit(std::string(vbuf, vsiz), '\t', &fields);
- if (fields.size() >= 4) {
- int64_t rank = kc::atoi(fields[0].c_str());
- std::string text;
- kc::strprintf(&text, "%s\t%s\t%s",
- fields[1].c_str(), fields[2].c_str(), fields[3].c_str());
- IndexedRecord rec = { rank, text };
- records.push_back(rec);
- }
- }
- std::sort(records.begin(), records.end());
- if (records.size() > 1000) records.resize(1000);
- std::vector<IndexedRecord>::iterator rit = records.begin();
- std::vector<IndexedRecord>::iterator ritend = records.end();
- int32_t seq = 0;
- while (rit != ritend) {
- std::string key(kbuf, ksiz);
- kc::strprintf(&key, "\t%03lld", (long long)++seq);
- if (!destdb_->set(key.data(), key.size(), rit->text.data(), rit->text.size())) {
- err = true;
- err_ = true;
- }
- rit++;
- }
- int64_t cnt = redcnt_.add(1) + 1;
- if (cnt % 10000 == 0) {
- std::string message = kc::strprintf("processed %lld entries", (long long)cnt);
- if (!log("reduce", message.c_str())) err = true;
- }
- return !err;
- }
- bool log(const char* name, const char* message) {
- kc::ScopedMutex lock(&lock_);
- std::cout << name << ": " << message << std::endl;
- return true;
- }
- private:
- kc::TreeDB* destdb_;
- kc::Mutex lock_;
- bool err_;
- kc::AtomicInt64 mapcnt_;
- kc::AtomicInt64 redcnt_;
- };
- MapReduceImpl mr(&destdb);
- mr.tune_thread(THREADNUM, THREADNUM, THREADNUM);
- if (!mr.execute(&srcdb, "", kc::MapReduce::XPARAMAP | kc::MapReduce::XPARAFLS)) {
- dberrprint(&srcdb, "MapReduce::execute failed");
- err = true;
- }
- if (mr.error()) {
- dberrprint(&srcdb, "MapReduce::execute failed");
- err = true;
- }
- if (!destdb.close()) {
- dberrprint(&destdb, "DB::close failed");
- err = true;
- }
- delete zcomp;
- if (!srcdb.close()) {
- dberrprint(&srcdb, "DB::close failed");
- err = true;
- }
- return err ? 1 : 0;
- }
- // perform search command
- static int32_t procsearch(const char* path, const char* query, int32_t zmode, int64_t max,
- int32_t mode, bool ts, bool pk) {
- kc::TreeDB db;
- kc::Compressor* zcomp = NULL;
- if (zmode != ZM_DEFAULT) {
- switch (zmode) {
- case ZM_LZO: {
- zcomp = new kc::LZOCompressor<kc::LZO::RAW>;
- break;
- }
- case ZM_LZMA: {
- zcomp = new kc::LZMACompressor<kc::LZMA::RAW>;
- break;
- }
- }
- }
- if (zcomp) db.tune_compressor(zcomp);
- if (!db.open(path, kc::TreeDB::OREADER)) {
- dberrprint(&db, "DB::open failed");
- delete zcomp;
- return 1;
- }
- std::string nquery;
- if (ts) {
- nquery = query;
- kc::strtolower(&nquery);
- } else {
- normalizequery(query, std::strlen(query), &nquery);
- }
- bool err = false;
- if (mode == 'a') {
- class VisitorImpl : public kc::DB::Visitor {
- public:
- VisitorImpl(const std::string& query, int64_t max, bool pk) :
- qbuf_(), qsiz_(0), max_(max), pk_(pk),
- thres_(0), minsiz_(0), maxsiz_(0), lock_(), queue_() {
- qsiz_ = query.size();
- qbuf_ = new uint32_t[qsiz_+1];
- utftoucs(query.data(), query.size(), qbuf_, &qsiz_);
- if (qsiz_ > kc::UINT8MAX) qsiz_ = kc::UINT8MAX;
- thres_ = qsiz_ / AMBGRATIO;
- if (thres_ < AMBGMIN) thres_ = AMBGMIN;
- minsiz_ = qsiz_ > thres_ ? qsiz_ - thres_ : 0;
- maxsiz_ = qsiz_ + thres_;
- }
- ~VisitorImpl() {
- delete[] qbuf_;
- }
- private:
- const char* visit_full(const char* kbuf, size_t ksiz,
- const char* vbuf, size_t vsiz, size_t* sp) {
- size_t oksiz = ksiz;
- while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
- ksiz--;
- }
- if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
- uint32_t ucsstack[kc::UINT8MAX];
- uint32_t* ucs = ksiz > sizeof(ucsstack) / sizeof(*ucsstack) ?
- new uint32_t[ksiz] : ucsstack;
- size_t usiz;
- utftoucs(kbuf, ksiz, ucs, &usiz);
- if (usiz > kc::UINT8MAX) usiz = kc::UINT8MAX;
- if (usiz < minsiz_ || usiz > maxsiz_) {
- if (ucs != ucsstack) delete[] ucs;
- return NOP;
- }
- size_t dist = levdist(ucs, usiz, qbuf_, qsiz_);
- if (dist <= thres_) {
- std::string key(kbuf, ksiz);
- uint32_t order = kc::atoin(kbuf + ksiz, oksiz - ksiz);
- kc::ScopedMutex lock(&lock_);
- if ((int64_t)queue_.size() < max_) {
- AmbiguousRecord rec = { dist, key, order, std::string(vbuf, vsiz) };
- queue_.push(rec);
- } else {
- const AmbiguousRecord& top = queue_.top();
- if (!top.less(dist, key, order)) {
- queue_.pop();
- AmbiguousRecord rec = { dist, key, order, std::string(vbuf, vsiz) };
- queue_.push(rec);
- }
- }
- }
- if (ucs != ucsstack) delete[] ucs;
- return NOP;
- }
- void visit_after() {
- std::vector<AmbiguousRecord> recs;
- while (!queue_.empty()) {
- recs.push_back(queue_.top());
- queue_.pop();
- }
- std::vector<AmbiguousRecord>::reverse_iterator rit = recs.rbegin();
- std::vector<AmbiguousRecord>::reverse_iterator ritend = recs.rend();
- while (rit != ritend) {
- if (pk_) std::cout << rit->key << "\t";
- std::cout << rit->text << "\t" << rit->dist << std::endl;
- ++rit;
- }
- }
- uint32_t* qbuf_;
- size_t qsiz_;
- int64_t max_;
- bool pk_;
- size_t thres_;
- size_t minsiz_;
- size_t maxsiz_;
- kc::Mutex lock_;
- std::priority_queue<AmbiguousRecord> queue_;
- };
- VisitorImpl visitor(nquery, max, pk);
- if (!db.scan_parallel(&visitor, THREADNUM)) {
- dberrprint(&db, "DB::scan_parallel faileda");
- err = true;
- }
- } else if (mode == 'm') {
- class VisitorImpl : public kc::DB::Visitor {
- public:
- VisitorImpl(const std::string& query, int64_t max, bool pk) :
- query_(query), max_(max), pk_(pk), lock_(), queue_() {}
- private:
- const char* visit_full(const char* kbuf, size_t ksiz,
- const char* vbuf, size_t vsiz, size_t* sp) {
- size_t oksiz = ksiz;
- while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
- ksiz--;
- }
- if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
- std::string key(kbuf, ksiz);
- if (key.find(query_) != std::string::npos) {
- uint32_t order = kc::atoin(kbuf + ksiz, oksiz - ksiz);
- kc::ScopedMutex lock(&lock_);
- if ((int64_t)queue_.size() < max_) {
- PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
- queue_.push(rec);
- } else {
- const PlainRecord& top = queue_.top();
- if (!top.less(key, order)) {
- queue_.pop();
- PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
- queue_.push(rec);
- }
- }
- }
- return NOP;
- }
- void visit_after() {
- std::vector<PlainRecord> recs;
- while (!queue_.empty()) {
- recs.push_back(queue_.top());
- queue_.pop();
- }
- std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
- std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
- while (rit != ritend) {
- if (pk_) std::cout << rit->key << "\t";
- std::cout << rit->text << std::endl;
- ++rit;
- }
- }
- std::string query_;
- int64_t max_;
- bool pk_;
- kc::Mutex lock_;
- std::priority_queue<PlainRecord> queue_;
- };
- VisitorImpl visitor(nquery, max, pk);
- if (!db.scan_parallel(&visitor, THREADNUM)) {
- dberrprint(&db, "DB::scan_parallel failed");
- err = true;
- }
- } else if (mode == 'M') {
- class VisitorImpl : public kc::DB::Visitor {
- public:
- VisitorImpl(const std::string& query, int64_t max, bool ts, bool pk) :
- query_(query), max_(max), ts_(ts), pk_(pk), lock_(), queue_() {}
- private:
- const char* visit_full(const char* kbuf, size_t ksiz,
- const char* vbuf, size_t vsiz, size_t* sp) {
- const char* rbuf = vbuf;
- size_t rsiz = vsiz;
- while (rsiz > 0 && *rbuf != '\t') {
- rbuf++;
- rsiz--;
- }
- if (rsiz > 0 && *rbuf == '\t') {
- rbuf++;
- rsiz--;
- }
- while (rsiz > 0 && *rbuf != '\t') {
- rbuf++;
- rsiz--;
- }
- if (rsiz > 0 && *rbuf == '\t') {
- rbuf++;
- rsiz--;
- }
- bool hit = false;
- if (ts_) {
- hit = kc::memimem(rbuf, rsiz, query_.data(), query_.size()) != NULL;
- } else {
- std::string value;
- normalizequery(rbuf, rsiz, &value);
- hit = value.find(query_) != std::string::npos;
- }
- if (hit) {
- size_t oksiz = ksiz;
- while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
- ksiz--;
- }
- if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
- std::string key(kbuf, ksiz);
- uint32_t order = kc::atoin(kbuf + ksiz, oksiz - ksiz);
- kc::ScopedMutex lock(&lock_);
- if ((int64_t)queue_.size() < max_) {
- PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
- queue_.push(rec);
- } else {
- const PlainRecord& top = queue_.top();
- if (!top.less(key, order)) {
- queue_.pop();
- PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
- queue_.push(rec);
- }
- }
- }
- return NOP;
- }
- void visit_after() {
- std::vector<PlainRecord> recs;
- while (!queue_.empty()) {
- recs.push_back(queue_.top());
- queue_.pop();
- }
- std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
- std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
- while (rit != ritend) {
- if (pk_) std::cout << rit->key << "\t";
- std::cout << rit->text << std::endl;
- ++rit;
- }
- }
- std::string query_;
- int64_t max_;
- bool ts_;
- bool pk_;
- kc::Mutex lock_;
- std::priority_queue<PlainRecord> queue_;
- };
- VisitorImpl visitor(nquery, max, ts, pk);
- if (!db.scan_parallel(&visitor, THREADNUM)) {
- dberrprint(&db, "DB::scan_parallel failed");
- err = true;
- }
- } else if (mode == 'r') {
- class VisitorImpl : public kc::DB::Visitor {
- public:
- VisitorImpl(const std::string& query, int64_t max, bool pk) :
- regex_(), max_(max), pk_(pk), lock_(), queue_() {
- regex_.compile(query, kc::Regex::MATCHONLY);
- }
- private:
- const char* visit_full(const char* kbuf, size_t ksiz,
- const char* vbuf, size_t vsiz, size_t* sp) {
- size_t oksiz = ksiz;
- while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
- ksiz--;
- }
- if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
- std::string key(kbuf, ksiz);
- if (regex_.match(key)) {
- uint32_t order = kc::atoin(kbuf + ksiz, oksiz - ksiz);
- kc::ScopedMutex lock(&lock_);
- if ((int64_t)queue_.size() < max_) {
- PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
- queue_.push(rec);
- } else {
- const PlainRecord& top = queue_.top();
- if (!top.less(key, order)) {
- queue_.pop();
- PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
- queue_.push(rec);
- }
- }
- }
- return NOP;
- }
- void visit_after() {
- std::vector<PlainRecord> recs;
- while (!queue_.empty()) {
- recs.push_back(queue_.top());
- queue_.pop();
- }
- std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
- std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
- while (rit != ritend) {
- if (pk_) std::cout << rit->key << "\t";
- std::cout << rit->text << std::endl;
- ++rit;
- }
- }
- kc::Regex regex_;
- int64_t max_;
- bool pk_;
- kc::Mutex lock_;
- std::priority_queue<PlainRecord> queue_;
- };
- VisitorImpl visitor(nquery, max, pk);
- if (!db.scan_parallel(&visitor, THREADNUM)) {
- dberrprint(&db, "DB::scan_parallel failed");
- err = true;
- }
- } else if (mode == 'R') {
- class VisitorImpl : public kc::DB::Visitor {
- public:
- VisitorImpl(const std::string& query, int64_t max, bool ts, bool pk) :
- regex_(), max_(max), ts_(ts), pk_(pk), lock_(), queue_() {
- regex_.compile(query, kc::Regex::MATCHONLY);
- }
- private:
- const char* visit_full(const char* kbuf, size_t ksiz,
- const char* vbuf, size_t vsiz, size_t* sp) {
- const char* rbuf = vbuf;
- size_t rsiz = vsiz;
- while (rsiz > 0 && *rbuf != '\t') {
- rbuf++;
- rsiz--;
- }
- if (rsiz > 0 && *rbuf == '\t') {
- rbuf++;
- rsiz--;
- }
- while (rsiz > 0 && *rbuf != '\t') {
- rbuf++;
- rsiz--;
- }
- if (rsiz > 0 && *rbuf == '\t') {
- rbuf++;
- rsiz--;
- }
- bool hit = false;
- if (ts_) {
- std::string value(rbuf, rsiz);
- kc::strtolower(&value);
- hit = regex_.match(value);
- } else {
- std::string value(rbuf, rsiz);
- normalizequery(rbuf, rsiz, &value);
- hit = regex_.match(value);
- }
- if (hit) {
- size_t oksiz = ksiz;
- while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
- ksiz--;
- }
- if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
- std::string key(kbuf, ksiz);
- uint32_t order = kc::atoin(kbuf + ksiz, oksiz - ksiz);
- kc::ScopedMutex lock(&lock_);
- if ((int64_t)queue_.size() < max_) {
- PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
- queue_.push(rec);
- } else {
- const PlainRecord& top = queue_.top();
- if (!top.less(key, order)) {
- queue_.pop();
- PlainRecord rec = { key, order, std::string(vbuf, vsiz) };
- queue_.push(rec);
- }
- }
- }
- return NOP;
- }
- void visit_after() {
- std::vector<PlainRecord> recs;
- while (!queue_.empty()) {
- recs.push_back(queue_.top());
- queue_.pop();
- }
- std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
- std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
- while (rit != ritend) {
- if (pk_) std::cout << rit->key << "\t";
- std::cout << rit->text << std::endl;
- ++rit;
- }
- }
- kc::Regex regex_;
- int64_t max_;
- bool ts_;
- bool pk_;
- kc::Mutex lock_;
- std::priority_queue<PlainRecord> queue_;
- };
- VisitorImpl visitor(nquery, max, ts, pk);
- if (!db.scan_parallel(&visitor, THREADNUM)) {
- dberrprint(&db, "DB::scan_parallel failed");
- err = true;
- }
- } else {
- std::string qstr(nquery);
- if (mode == 'f') qstr.append("\t");
- kc::TreeDB::Cursor* cur = db.cursor();
- cur->jump(qstr);
- char* kbuf;
- size_t ksiz;
- const char* vbuf;
- size_t vsiz;
- while (max > 0 && (kbuf = cur->get(&ksiz, &vbuf, &vsiz, true)) != NULL) {
- if (ksiz >= qstr.size() && !std::memcmp(kbuf, qstr.data(), qstr.size())) {
- if (pk) {
- while (ksiz > 0 && kbuf[ksiz-1] != '\t') {
- ksiz--;
- }
- if (ksiz > 0 && kbuf[ksiz-1] == '\t') ksiz--;
- std::cout.write(kbuf, ksiz);
- std::cout << "\t";
- }
- std::cout.write(vbuf, vsiz);
- std::cout << std::endl;
- } else {
- max = 0;
- }
- delete[] kbuf;
- max--;
- }
- delete cur;
- }
- if (!db.close()) {
- dberrprint(&db, "DB::close failed");
- err = true;
- }
- delete zcomp;
- return err ? 1 : 0;
- }
- // perform suggest command
- static int32_t procsuggest(const char* path, const char* query, int64_t max, int32_t mode) {
- kc::TextDB db;
- if (!db.open(path, kc::TextDB::OREADER)) {
- dberrprint(&db, "DB::open failed");
- return 1;
- }
- std::string nquery;
- normalizequery(query, std::strlen(query), &nquery);
- bool err = false;
- if (mode == 'm') {
- class VisitorImpl : public kc::DB::Visitor {
- public:
- VisitorImpl(const std::string& query, int64_t max) :
- query_(query), max_(max), lock_(), queue_() {}
- private:
- const char* visit_full(const char* kbuf, size_t ksiz,
- const char* vbuf, size_t vsiz, size_t* sp) {
- std::string key(vbuf, vsiz);
- if (key.find(query_) != std::string::npos) {
- kc::ScopedMutex lock(&lock_);
- if ((int64_t)queue_.size() < max_) {
- PlainRecord rec = { key, 0, "" };
- queue_.push(rec);
- } else {
- const PlainRecord& top = queue_.top();
- if (!top.less(key, 0)) {
- queue_.pop();
- PlainRecord rec = { key, 0, "" };
- queue_.push(rec);
- }
- }
- }
- return NOP;
- }
- void visit_after() {
- std::vector<PlainRecord> recs;
- while (!queue_.empty()) {
- recs.push_back(queue_.top());
- queue_.pop();
- }
- std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
- std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
- while (rit != ritend) {
- std::cout << rit->key << std::endl;
- ++rit;
- }
- }
- std::string query_;
- int64_t max_;
- kc::Mutex lock_;
- std::priority_queue<PlainRecord> queue_;
- };
- VisitorImpl visitor(nquery, max);
- if (!db.scan_parallel(&visitor, THREADNUM)) {
- dberrprint(&db, "DB::scan_parallel failed");
- err = true;
- }
- } else if (mode == 'r') {
- class VisitorImpl : public kc::DB::Visitor {
- public:
- VisitorImpl(const std::string& query, int64_t max) :
- regex_(), max_(max), lock_(), queue_() {
- regex_.compile(query, kc::Regex::MATCHONLY);
- }
- private:
- const char* visit_full(const char* kbuf, size_t ksiz,
- const char* vbuf, size_t vsiz, size_t* sp) {
- std::string key(vbuf, vsiz);
- if (regex_.match(key)) {
- kc::ScopedMutex lock(&lock_);
- if ((int64_t)queue_.size() < max_) {
- PlainRecord rec = { key, 0, "" };
- queue_.push(rec);
- } else {
- const PlainRecord& top = queue_.top();
- if (!top.less(key, 0)) {
- queue_.pop();
- PlainRecord rec = { key, 0, "" };
- queue_.push(rec);
- }
- }
- }
- return NOP;
- }
- void visit_after() {
- std::vector<PlainRecord> recs;
- while (!queue_.empty()) {
- recs.push_back(queue_.top());
- queue_.pop();
- }
- std::vector<PlainRecord>::reverse_iterator rit = recs.rbegin();
- std::vector<PlainRecord>::reverse_iterator ritend = recs.rend();
- while (rit != ritend) {
- std::cout << rit->key << std::endl;
- ++rit;
- }
- }
- kc::Regex regex_;
- int64_t max_;
- kc::Mutex lock_;
- std::priority_queue<PlainRecord> queue_;
- };
- VisitorImpl visitor(nquery, max);
- if (!db.scan_parallel(&visitor, THREADNUM)) {
- dberrprint(&db, "DB::scan_parallel failed");
- err = true;
- }
- } else {
- class VisitorImpl : public kc::DB::Visitor {
- public:
- VisitorImpl(const std::string& query, int64_t max) :
- qbuf_(), qsiz_(0), max_(max),
- thres_(0), minsiz_(0), maxsiz_(0), lock_(), queue_() {
- qsiz_ = query.size();
- qbuf_ = new uint32_t[qsiz_+1];
- utftoucs(query.data(), query.size(), qbuf_, &qsiz_);
- if (qsiz_ > kc::UINT8MAX) qsiz_ = kc::UINT8MAX;
- thres_ = qsiz_ / AMBGRATIO;
- if (thres_ < AMBGMIN) thres_ = AMBGMIN;
- minsiz_ = qsiz_ > thres_ ? qsiz_ - thres_ : 0;
- maxsiz_ = qsiz_ + thres_;
- }
- ~VisitorImpl() {
- delete[] qbuf_;
- }
- private:
- const char* visit_full(const char* kbuf, size_t ksiz,
- const char* vbuf, size_t vsiz, size_t* sp) {
- uint32_t ucsstack[kc::UINT8MAX];
- uint32_t* ucs = vsiz > sizeof(ucsstack) / sizeof(*ucsstack) ?
- new uint32_t[vsiz] : ucsstack;
- size_t usiz;
- utftoucs(vbuf, vsiz, ucs, &usiz);
- if (usiz > kc::UINT8MAX) usiz = kc::UINT8MAX;
- if (usiz < minsiz_ || usiz > maxsiz_) {
- if (ucs != ucsstack) delete[] ucs;
- return NOP;
- }
- size_t dist = levdist(ucs, usiz, qbuf_, qsiz_);
- if (dist <= thres_) {
- std::string key(vbuf, vsiz);
- kc::ScopedMutex lock(&lock_);
- if ((int64_t)queue_.size() < max_) {
- AmbiguousRecord rec = { dist, key, 0, "" };
- queue_.push(rec);
- } else {
- const AmbiguousRecord& top = queue_.top();
- if (!top.less(dist, key, 0)) {
- queue_.pop();
- AmbiguousRecord rec = { dist, key, 0, "" };
- queue_.push(rec);
- }
- }
- }
- if (ucs != ucsstack) delete[] ucs;
- return NOP;
- }
- void visit_after() {
- std::vector<AmbiguousRecord> recs;
- while (!queue_.empty()) {
- recs.push_back(queue_.top());
- queue_.pop();
- }
- std::vector<AmbiguousRecord>::reverse_iterator rit = recs.rbegin();
- std::vector<AmbiguousRecord>::reverse_iterator ritend = recs.rend();
- while (rit != ritend) {
- std::cout << rit->key << "\t" << rit->dist << std::endl;
- ++rit;
- }
- }
- uint32_t* qbuf_;
- size_t qsiz_;
- int64_t max_;
- size_t thres_;
- size_t minsiz_;
- size_t maxsiz_;
- kc::Mutex lock_;
- std::priority_queue<AmbiguousRecord> queue_;
- };
- VisitorImpl visitor(nquery, max);
- if (!db.scan_parallel(&visitor, THREADNUM)) {
- dberrprint(&db, "DB::scan_parallel faileda");
- err = true;
- }
- }
- if (!db.close()) {
- dberrprint(&db, "DB::close failed");
- err = true;
- }
- return err ? 1 : 0;
- }
- // END OF FILE