/src/_gparser/MyLib.cpp

https://github.com/xiaocao/ltp · C++ · 424 lines · 364 code · 42 blank · 18 comment · 120 complexity · f17564b41496b07625c8f21ad4c2d2ba MD5 · raw file

  1. /////////////////////////////////////////////////////////////////////////////////////
  2. // File Name : MyLib.cpp
  3. // Project Name: IRLAS
  4. // Author : Huipeng Zhang (zhp@ir.hit.edu.cn)
  5. // Environment : Microsoft Visual C++ 6.0
  6. // Description : some utility functions
  7. // Time : 2005.9
  8. // History :
  9. // CopyRight : HIT-IRLab (c) 2001-2005, all rights reserved.
  10. /////////////////////////////////////////////////////////////////////////////////////
  11. #include "MyLib.h"
  // Replaces, in place, every occurrence of the character c1 in str with c2.
  // Characters other than c1 are left untouched.
  void replace_char_by_char(string &str, char c1, char c2)
  {
      for (string::iterator cur = str.begin(); cur != str.end(); ++cur) {
          if (*cur == c1) {
              *cur = c2;
          }
      }
  }
  // Splits str into tokens, treating every character of the C string sep as
  // a one-character delimiter.
  //   str - text to split
  //   vec - receives the tokens; any previous content is discarded
  //   sep - set of delimiter characters
  // Empty tokens (leading, trailing or between adjacent delimiters) are
  // not stored.
  void split_bychars(const string& str, vector<string> & vec, const char *sep)
  {
      vec.clear();
      string::size_type start = 0;
      for (;;) {
          const string::size_type hit = str.find_first_of(sep, start);
          if (hit == string::npos) {
              break;
          }
          // hit == start means two delimiters in a row (or a leading one).
          if (hit > start) {
              vec.push_back(str.substr(start, hit - start));
          }
          start = hit + 1;
      }
      // Whatever follows the last delimiter is the final token.
      if (start < str.size()) {
          vec.push_back(str.substr(start));
      }
  }
  // Trims blanks (space, tab, carriage return, line feed) from both ends of
  // str, in place. A string that contains nothing but blanks becomes empty.
  void clean_str(string &str)
  {
      const char *const blanks = " \t\r\n";
      const string::size_type first = str.find_first_not_of(blanks);
      if (first == string::npos) {
          // No non-blank character at all.
          str = "";
          return;
      }
      const string::size_type last = str.find_last_not_of(blanks);
      str = str.substr(first, last - first + 1);
  }
  // Reads the next line of inf into line and strips every trailing '\r' and
  // '\n' character from it.
  // Returns false when the underlying getline call fails, true otherwise.
  bool my_getline(ifstream &inf, string &line)
  {
      if (!getline(inf, line)) {
          return false;
      }
      // Length of the prefix that survives once the line terminators are cut.
      string::size_type keep = line.size();
      while (keep > 0 && (line[keep - 1] == '\r' || line[keep - 1] == '\n')) {
          --keep;
      }
      line.erase(keep);
      return true;
  }
  // Converts every string of vecStr with atoi and stores the result
  // (converted to unsigned int) at the same index of vecInt.
  // vecInt is resized to the size of vecStr.
  void str2uint_vec(const vector<string> &vecStr, vector<unsigned int> &vecInt)
  {
      const vector<string>::size_type count = vecStr.size();
      vecInt.resize(count);
      for (vector<string>::size_type idx = 0; idx != count; ++idx) {
          vecInt[idx] = atoi(vecStr[idx].c_str());
      }
  }
  // Converts every string of vecStr with atoi and stores the result at the
  // same index of vecInt. vecInt is resized to the size of vecStr.
  void str2int_vec(const vector<string> &vecStr, vector<int> &vecInt)
  {
      vecInt.resize(vecStr.size());
      vector<int>::iterator dst = vecInt.begin();
      for (vector<string>::const_iterator src = vecStr.begin();
           src != vecStr.end(); ++src, ++dst) {
          *dst = atoi(src->c_str());
      }
  }
  // Formats every integer of vecInt as decimal text (via an ostringstream)
  // and stores it at the same index of vecStr.
  // vecStr is resized to the size of vecInt.
  void int2str_vec(const vector<int> &vecInt, vector<string> &vecStr)
  {
      vecStr.resize(vecInt.size());
      vector<string>::iterator dst = vecStr.begin();
      for (vector<int>::const_iterator src = vecInt.begin();
           src != vecInt.end(); ++src, ++dst) {
          // A fresh stream per value, so no state leaks between elements.
          ostringstream formatter;
          formatter << *src;
          *dst = formatter.str();
      }
  }
  // Concatenates the elements of vec into str, inserting sep between
  // consecutive elements. str is overwritten; an empty vec yields "".
  void join_bystr(const vector<string> &vec, string &str, const string &sep)
  {
      str = "";
      vector<string>::const_iterator item = vec.begin();
      if (item == vec.end()) {
          return;
      }
      str = *item;
      for (++item; item != vec.end(); ++item) {
          str += sep + *item;
      }
  }
  // Splits str at every occurrence of the (possibly multi-character)
  // separator string sep.
  //   str - text to split
  //   vec - receives the tokens; any previous content is discarded
  //   sep - separator, matched as a whole string
  // Empty tokens (leading, trailing or between adjacent separators) are
  // not stored.
  // An empty sep matches at every position without consuming any input,
  // which previously made the search loop spin forever. It is now treated
  // as "no separator": str itself is the only token (if it is non-empty).
  void split_bystr(const string &str, vector<string> &vec, const string &sep)
  {
      vec.clear();
      if (sep.empty()) {
          if (!str.empty()) {
              vec.push_back(str);
          }
          return;
      }
      string::size_type pos1 = 0;
      string::size_type pos2 = 0;
      while ((pos2 = str.find(sep, pos1)) != string::npos) {
          if (pos2 > pos1) {
              vec.push_back(str.substr(pos1, pos2 - pos1));
          }
          pos1 = pos2 + sep.size();
      }
      // Remainder after the last separator.
      if (pos1 < str.size()) {
          vec.push_back(str.substr(pos1));
      }
  }
  // Unzips vecPair: the first members go to vecInt and the second members
  // to vecStr, index for index. Both outputs are resized to vecPair's size.
  void split_pair_vector(const vector< pair<int, string> > &vecPair, vector<int> &vecInt, vector<string> &vecStr)
  {
      const vector< pair<int, string> >::size_type count = vecPair.size();
      vecInt.resize(count);
      vecStr.resize(count);
      for (vector< pair<int, string> >::size_type idx = 0; idx != count; ++idx) {
          const pair<int, string> &entry = vecPair[idx];
          vecInt[idx] = entry.first;
          vecStr[idx] = entry.second;
      }
  }
  // Splits str at every occurrence of the single character separator.
  //   str       - text to split
  //   vec       - receives the tokens; any previous content is discarded
  //   separator - delimiter character
  // Empty tokens (leading, trailing or between adjacent delimiters) are
  // not stored.
  void split_bychar(const string& str, vector<string>& vec,
                    const char separator)
  {
      vec.clear();
      string::size_type tokenBeg = 0;
      string::size_type tokenEnd = str.find_first_of(separator, tokenBeg);
      while (tokenEnd != string::npos) {
          if (tokenEnd != tokenBeg) {
              vec.push_back(str.substr(tokenBeg, tokenEnd - tokenBeg));
          }
          tokenBeg = tokenEnd + 1;
          tokenEnd = str.find_first_of(separator, tokenBeg);
      }
      // Text after the last delimiter, if any.
      if (tokenBeg < str.size()) {
          vec.push_back(str.substr(tokenBeg));
      }
  }
  // Splits str at the LAST occurrence of separator.
  //   pairStr.first  - everything before that separator
  //   pairStr.second - everything after it
  // When separator does not occur, first receives the whole of str and
  // second becomes empty.
  void string2pair(const string& str, pair<string, string>& pairStr, const char separator)
  {
      const string::size_type cut = str.find_last_of(separator);
      if (cut != string::npos) {
          pairStr.first = str.substr(0, cut);
          pairStr.second = str.substr(cut + 1);
          return;
      }
      pairStr.first = str;
      pairStr.second = "";
  }
  // Turns every "word/tag" string of vecString into a (word, tag) pair and
  // appends it to vecPair, preserving order.
  // The split happens at the FIRST '/'. A string without '/' gives
  // (string, ""), and a string ending in '/' gives (text before it, "").
  // vecPair must be empty on entry (checked with assert).
  void convert_to_pair(vector<string>& vecString,
                       vector< pair<string, string> >& vecPair)
  {
      assert(vecPair.empty());
      for (vector<string>::const_iterator item = vecString.begin();
           item != vecString.end(); ++item) {
          const string::size_type slash = item->find('/');
          if (slash == string::npos) {
              vecPair.push_back(pair<string, string>(*item, ""));
              continue;
          }
          // substr(slash + 1) is already "" when the slash is the last
          // character, so that case needs no branch of its own.
          vecPair.push_back(pair<string, string>(item->substr(0, slash),
                                                 item->substr(slash + 1)));
      }
  }
  178. void split_to_pair(const string& str, vector< pair<string, string> >& vecPair)
  179. {
  180. assert(vecPair.empty());
  181. vector<string> vec;
  182. split_bychar(str, vec);
  183. convert_to_pair(vec, vecPair);
  184. }
  185. void split_sentence(const string& line, vector<string>& vecSentence)
  186. {
  187. assert(vecSentence.empty());
  188. vector< pair<string, string> > vecPair;
  189. split_to_pair(line, vecPair);
  190. int size = vecPair.size();
  191. string sentence = "";
  192. for(int i = 0; i < size; i++)
  193. {
  194. if (vecPair[i].first == "¡£" || vecPair[i].first == "£¡" || vecPair[i].first == "£¿")
  195. {
  196. sentence += vecPair[i].first + "/" + vecPair[i].second + " ";
  197. if (i+1 < size && vecPair[i+1].first == "¡±")
  198. {
  199. sentence += vecPair[i+1].first + "/" + vecPair[i+1].second + " ";
  200. i++;
  201. }
  202. vecSentence.push_back(sentence);
  203. sentence = "";
  204. }
  205. else
  206. {
  207. sentence += vecPair[i].first + "/" + vecPair[i].second + " ";
  208. }
  209. }
  210. }
  // Trims spaces, tabs and line feeds from both ends of str, in place.
  // Carriage returns are NOT in the trimmed set. A string consisting only
  // of trimmed characters becomes empty.
  void chomp(string& str)
  {
      const string white = " \t\n";
      const string::size_type last = str.find_last_not_of(white);
      const string::size_type first = str.find_first_not_of(white);
      const bool hasContent = (first != string::npos && last != string::npos);
      str = hasContent ? str.substr(first, last - first + 1) : string("");
  }
  // Returns the length, in bytes, of the longest substring that occurs in
  // both str1 and str2 (0 when they share nothing or either is empty).
  // The comparison is byte-wise; multi-byte characters get no special
  // treatment.
  //
  // Fixes over the previous version:
  //  - it fell through to "return 0" whenever its early-exit test never
  //    fired, even if a common substring had been found (e.g. "aXYZ" vs
  //    "aQQQQ" gave 0 instead of 1);
  //  - after a hit at one start position it kept probing shorter lengths
  //    that could not improve the result.
  int common_substr_len(string str1, string str2)
  {
      // Make str1 the shorter string: a common substring cannot be longer.
      if (str2.length() < str1.length()) {
          str1.swap(str2);
      }
      const string::size_type minLen = str1.length();
      string::size_type best = 0;
      // Stop as soon as the remaining suffix cannot beat the best so far.
      for (string::size_type beg = 0; beg < minLen && minLen - beg > best; ++beg) {
          // Try the longest candidate first; only lengths above "best" matter.
          for (string::size_type len = minLen - beg; len > best; --len) {
              if (str2.find(str1.data() + beg, 0, len) != string::npos) {
                  best = len;
                  break;
              }
          }
      }
      return static_cast<int>(best);
  }
  // Maps a two-byte character to a linear index:
  //     (first byte - 176) * 94 + (second byte - 161)
  // with both bytes read as unsigned values. 176 is 0xB0 and 161 is 0xA1,
  // so the pair 0xB0 0xA1 maps to index 0.
  // str must be exactly two bytes long (checked with assert). The byte
  // ranges themselves are not validated here.
  int get_char_index(string& str)
  {
      assert(str.size() == 2);
      const int lead = static_cast<unsigned char>(str[0]);
      const int trail = static_cast<unsigned char>(str[1]);
      return (lead - 176) * 94 + trail - 161;
  }
  // Tells whether str is a single two-byte character inside the
  // 72-row x 94-cell block that starts at 0xB0 0xA1, i.e. lead byte
  // 0xB0..0xF7 and trail byte 0xA1..0xFE (72 * 94 = 6768 code points, the
  // same count the previous index test used).
  // NOTE(review): this block is presumably the GB2312 hanzi area -- confirm.
  //
  // The previous version only range-checked the combined index
  // (lead - 176) * 94 + trail - 161. Invalid byte pairs therefore aliased
  // into the valid range, e.g. 0xB2 'A', 0xB0 0xFF and 0xAF 0xFF were all
  // accepted. Each byte is now checked against its own range.
  // Any string whose length is not 2 is rejected.
  bool is_chinese_char(string& str)
  {
      if (str.size() != 2) {
          return false;
      }
      const unsigned char lead = static_cast<unsigned char>(str[0]);
      const unsigned char trail = static_cast<unsigned char>(str[1]);
      const bool leadOk = (lead >= 0xB0 && lead <= 0xF7);
      const bool trailOk = (trail >= 0xA1 && trail <= 0xFE);
      return leadOk && trailOk;
  }
  // All defined separators, concatenated into one string.
  // NOTE(review): the non-ASCII part is presumably GB-encoded punctuation
  // whose bytes show up mis-decoded here -- confirm the file encoding.
  string separators = "¡££¬£¿£¡¡¢£º¡ª¡°¡±¡¶¡·£¨£©£¥£¤¡æ£¯¡¤\",.?!:'/;£»()%"; //all defined separators

  // Tells whether str (one or two bytes long) occurs inside the separators
  // string.
  // An empty string is now rejected: find("") succeeds at position 0, so
  // the previous version reported "" as a separator.
  // NOTE(review): the lookup is a plain byte-substring search and is not
  // aligned to two-byte character boundaries, so a two-byte str that
  // straddles two adjacent separators also matches -- confirm acceptable.
  bool is_separator(string& str)
  {
      if (str.empty() || str.size() > 2) {
          return false;
      }
      return separators.find(str) != string::npos;
  }
  // Searches str, starting at byte offset begPos, for the two-byte GB
  // character wideChar and returns the byte offset of the first match, or
  // -1 when there is none.
  // The scan is character-aware: a byte with the high bit set is treated
  // as the lead byte of a two-byte character and its trail byte is skipped,
  // so a match can never start on the trail byte of another character
  // (provided begPos itself sits on a character boundary).
  //   wideChar - must be exactly two bytes with the high bit set on the
  //              first one (checked with assert)
  //   begPos   - start offset; values past the end give -1, negative
  //              values are treated as 0 (previously they indexed out of
  //              range)
  // The high bit is tested through unsigned char. The previous "< 0" test
  // only worked where plain char is signed.
  int find_GB_char(const string& str, string wideChar, int begPos)
  {
      assert(wideChar.size() == 2 &&
             (static_cast<unsigned char>(wideChar[0]) & 0x80) != 0); //is a GB char
      const int strLen = static_cast<int>(str.size());
      if (begPos < 0) {
          begPos = 0;
      }
      if (begPos >= strLen) {
          return -1;
      }
      for (int i = begPos; i < strLen - 1; i++) {
          if ((static_cast<unsigned char>(str[i]) & 0x80) == 0) {
              continue; // single-byte character
          }
          if (str[i] == wideChar[0] && str[i + 1] == wideChar[1]) {
              return i;
          }
          i++; // step over the trail byte of this two-byte character
      }
      return -1;
  }
  316. void split_to_sentence_by_period(const string& line, vector<string>& vecSentence)
  317. {
  318. assert(vecSentence.empty());
  319. int pos1 = 0, pos2 = 0;
  320. string sentence;
  321. while((pos2 = find_GB_char(line, "¡£", pos1)) != -1)
  322. {
  323. sentence = line.substr(pos1, pos2-pos1+2);
  324. pos1 = pos2 + 2;
  325. if(!sentence.empty())
  326. vecSentence.push_back(sentence);
  327. }
  328. sentence = line.substr(pos1);
  329. if(!sentence.empty())
  330. vecSentence.push_back(sentence);
  331. }
  332. void split_by_separator(const string& str, vector<string>& vec, const string separator)
  333. {
  334. assert(vec.empty());
  335. string::size_type pos1 = 0, pos2 = 0;
  336. string word;
  337. while((pos2 = find_GB_char(str, separator, pos1)) != -1)
  338. {
  339. word = str.substr(pos1, pos2-pos1);
  340. pos1 = pos2 + separator.size();
  341. if(!word.empty())
  342. vec.push_back(word);
  343. }
  344. word = str.substr(pos1);
  345. if(!word.empty())
  346. vec.push_back(word);
  347. }
  // Tells whether str is exactly equal to one of the numeral literals
  // listed in the table below.
  // NOTE(review): the literals are presumably GB-encoded Chinese numerals
  // whose bytes show up mis-decoded here -- confirm the file encoding
  // before editing any of them.
  bool is_chinese_number(const string& str)
  {
      static const char *const numerals[] = {
          "Ò»", "¶þ", "Èý", "ËÄ", "Îå",
          "Áù", "Æß", "°Ë", "¾Å", "Ê®",
          "Á½", "¼¸", "Áã", "©–", "°Ù",
          "ǧ", "Íò", "ÒÚ"
      };
      const unsigned int count = sizeof(numerals) / sizeof(numerals[0]);
      for (unsigned int idx = 0; idx < count; ++idx) {
          if (str == numerals[idx]) {
              return true;
          }
      }
      return false;
  }
  362. //void compute_time()
  363. //{
  364. // clock_t tick = clock();
  365. // double t = (double)tick / CLK_TCK;
  366. // cout << endl << "The time used: " << t << " seconds." << endl;
  367. //}
  // Returns the part of word_pos that precedes its first "/", or the whole
  // of word_pos when it contains no "/".
  string word(string& word_pos)
  {
      const string::size_type slash = word_pos.find("/");
      if (slash == string::npos) {
          return word_pos;
      }
      return word_pos.substr(0, slash);
  }
  // Tells whether every byte of word is in the 7-bit ASCII range
  // (0x00..0x7F). An empty string counts as ASCII.
  // The bytes are compared as unsigned char. The previous "word[i] < 0"
  // test could never be true where plain char is unsigned, so every string
  // was reported as ASCII there.
  bool is_ascii_string(string& word)
  {
      for (string::size_type i = 0; i < word.size(); ++i) {
          if (static_cast<unsigned char>(word[i]) > 0x7F) {
              return false;
          }
      }
      return true;
  }