/ictclas4j/src/org/ictclas4j/segment/PosTagger.java

http://ictclas4j.googlecode.com/ · Java · 777 lines · 584 code · 76 blank · 117 comment · 233 complexity · d39f00cbe0855bd7189024f7a9582e9a MD5 · raw file

  1. package org.ictclas4j.segment;
  2. import java.util.ArrayList;
  3. import org.ictclas4j.bean.AdjoiningPos;
  4. import org.ictclas4j.bean.DictLib;
  5. import org.ictclas4j.bean.Dictionary;
  6. import org.ictclas4j.bean.POSTag;
  7. import org.ictclas4j.bean.Pos;
  8. import org.ictclas4j.bean.PosContext;
  9. import org.ictclas4j.bean.SegAtom;
  10. import org.ictclas4j.bean.SegNode;
  11. import org.ictclas4j.util.Utility;
  12. import org.ictclas4j.util.Utility.TAG_TYPE;
  13. /**
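  14. * Part-of-speech tagger that also drives unknown-word (person, transliterated person and place name) recognition.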
  15. *
  16. * @author sinboy
  17. * @since 2007.5.17 updated
  18. *
  19. */
  20. public class PosTagger {
  21. private DictLib dictLib;
  22. private Dictionary coreDict;
  23. private Dictionary unknownDict;
  24. private PosContext context;
  25. private int pos;
  26. private TAG_TYPE tagType;
  27. String unknownFlags;
  /**
   * Creates a tagger for one tagging mode and wires up the matching context
   * and unknown-word dictionary from the dictionary library.
   *
   * <p>When {@code dictLib} is {@code null} nothing is initialised; the
   * {@code recognise} methods then do no work because their collaborators
   * are missing.
   *
   * <p>NOTE(review): the {@code "?##?"} flag literals look like characters
   * lost to an encoding error in this copy of the file -- confirm against
   * the upstream source before relying on them.
   *
   * @param type    tagging mode; selects which context/dictionary pair is used
   * @param dictLib dictionary library supplying the core dictionary, contexts
   *                and unknown-word dictionaries
   */
  public PosTagger(TAG_TYPE type, DictLib dictLib) {
    if (dictLib == null) {
      return;
    }

    this.tagType = type;
    this.dictLib = dictLib;
    this.coreDict = dictLib.getCoreDict();

    switch (type) {
      case TT_PERSON:
        this.unknownFlags = "?##?";
        this.pos = -POSTag.NOUN_PERSON;
        this.context = dictLib.getPersonContext();
        this.unknownDict = dictLib.getPersonUnknownDict();
        break;
      case TT_TRANS_PERSON:
        this.unknownFlags = "?##?";
        this.pos = -POSTag.NOUN_PERSON;
        this.context = dictLib.getTransPersonContext();
        this.unknownDict = dictLib.getTransPersonUnknownDict();
        break;
      case TT_PLACE:
        this.unknownFlags = "?##?";
        this.pos = -POSTag.NOUN_SPACE;
        this.context = dictLib.getPlaceContext();
        this.unknownDict = dictLib.getPlaceUnknownDict();
        break;
      default:
        // Every other mode uses the lexical context; unknownFlags stays unset.
        this.pos = 0;
        this.context = dictLib.getLexContext();
        this.unknownDict = dictLib.getLexUnknownDict();
        break;
    }
  }
  60. /**
  61. * ????????????????????????????
  62. *
  63. * @param segGraph
  64. * @param coreDict
  65. * @return
  66. */
  67. public boolean recognise(SegGraph segGraph, ArrayList<SegNode> sns) {
  68. if (segGraph != null && sns != null && coreDict != null && unknownDict != null && context != null) {
  69. posTag(sns);
  70. getBestPos(sns);
  71. // DebugUtil.outputPostag(sns);
  72. switch (tagType) {
  73. case TT_PERSON:// Person recognition
  74. personRecognize(segGraph, sns);
  75. break;
  76. case TT_PLACE:// Place name recognition
  77. case TT_TRANS_PERSON:// Transliteration Person
  78. placeRecognize(segGraph, sns, coreDict);
  79. break;
  80. }
  81. }
  82. return true;
  83. }
  84. public boolean recognise(ArrayList<SegNode> sns) {
  85. if (sns != null && unknownDict != null && context != null) {
  86. posTag(sns);
  87. getBestPos(sns);
  88. // DebugUtil.outputPostag(sns);
  89. switch (tagType) {
  90. case TT_NORMAL:
  91. for (SegNode sn : sns) {
  92. if (sn.getPos() == 0) {
  93. sn.setPos(getBestTag(sn));
  94. }
  95. }
  96. }
  97. }
  98. return true;
  99. }
  100. /**
  101. * ??????????
  102. *
  103. * @param frs
  104. * ???????
  105. * @pararm startIndex ???????????
  106. * @param coreDict
  107. * ?????
  108. * @param unknownDict
  109. * ??????
  110. * @return ??????????
  111. */
  112. public void posTag(ArrayList<SegNode> sns) {
  113. if (sns != null && coreDict != null && unknownDict != null && context != null) {
  114. int i = 0;
  115. String curWord = null;
  116. for (; i < sns.size(); i++) {
  117. SegNode sn = sns.get(i);
  118. sn.setAllPos(null);
  119. curWord = sn.getSrcWord();
  120. int gbkID = sn.getGbkID();// dictLib.getGBKID(curWord);
  121. // if (tagType == Utility.TAG_TYPE.TT_NORMAL ||
  122. // !unknownDict.isExist(sn.getWord(), 44)) {
  123. //
  124. // }
  125. if (tagType != Utility.TAG_TYPE.TT_NORMAL) {
  126. // ????????????
  127. if (tagType == Utility.TAG_TYPE.TT_TRANS_PERSON && i > 0) {
  128. String prevWord = sns.get(i - 1).getSrcWord();
  129. if (Utility.charType(prevWord) == Utility.CT_CHINESE) {
  130. if (".".equals(curWord))
  131. curWord = "?";
  132. else if ("-".equals(curWord))
  133. curWord = "?";
  134. }
  135. }
  136. if (sn.getPos() < 0) {
  137. AdjoiningPos pos = new AdjoiningPos( 0 , 0);
  138. sn.addPos(pos);
  139. } else {
  140. // ?unknownDict?????????????
  141. SegAtom sa = unknownDict.getSegAtom(curWord, gbkID);
  142. for (int j = 0; sa != null && j < sa.getPosCount(); j++) {
  143. Pos pos = sa.getPos(j);
  144. double value = -Math.log((1 + pos.getFreq()));
  145. value += Math.log((context.getFreq(pos.getTag()) + sa.getPosCount() + 1));
  146. AdjoiningPos apos = new AdjoiningPos(pos , value);
  147. sn.addPos(apos);
  148. }
  149. if (Utility.SENTENCE_BEGIN.equals(curWord))
  150. sn.addPos(new AdjoiningPos( 100 , 0));
  151. else if (Utility.SENTENCE_END.equals(curWord))
  152. sn.addPos(new AdjoiningPos( 101 , 0));
  153. else {
  154. int freq = 0;
  155. sa = coreDict.getSegAtom(curWord, gbkID);
  156. if (sa != null) {
  157. double value = -Math.log((double) (1 + freq));
  158. value += Math.log((double) (context.getFreq(0) + sa.getPosCount()));
  159. sn.addPos(new AdjoiningPos( 0 , value));
  160. }
  161. }
  162. }
  163. } else {
  164. if (sn.getPos() > 0) {
  165. int tag = sn.getPos();
  166. double value = -Math.log(1 + sn.getFreq());
  167. value += Math.log(1 + context.getFreq(tag));
  168. if (value < 0)
  169. value = 0;
  170. sn.addPos(new AdjoiningPos( tag, value));
  171. } else {
  172. if (sn.getPos() < 0) {
  173. sn.setPos(-sn.getPos());
  174. sn.addPos(new AdjoiningPos( -sn.getPos(), sn.getFreq()));
  175. }
  176. SegAtom sa = coreDict.getSegAtom(curWord, gbkID);
  177. if (sa != null) {
  178. for (int j = 0; j < sa.getPosCount(); j++) {
  179. Pos pos = sa.getPos(j);
  180. double value = -Math.log(1 + pos.getFreq());
  181. value += Math.log(context.getFreq(pos.getTag()) + sa.getPosCount());
  182. sn.addPos(new AdjoiningPos(pos , value));
  183. }
  184. }
  185. }
  186. }
  187. if (sn.getAllPos() == null)
  188. guessPos(tagType, sn);
  189. // ??????????allPos?null???????????
  190. // ????????????,??????“?##?”??
  191. if (i - 1 >= 0 && sns.get(i - 1).getPosSize() == -1) {
  192. if (sn.getPosSize() > 0) {
  193. Pos pos = sn.getAllPos().get(0).getPos();
  194. int ipos = pos.getTag() == POSTag.SEN_END ? POSTag.UNKNOWN : pos.getTag();
  195. AdjoiningPos apos = new AdjoiningPos( ipos , 0);
  196. sns.get(i - 1).addPos(apos);
  197. }
  198. }
  199. }
  200. // ???????
  201. SegNode last = sns.get(i - 1);
  202. if (last != null) {
  203. SegNode sn = new SegNode();
  204. int tag = 0;
  205. if (tagType != Utility.TAG_TYPE.TT_NORMAL)
  206. tag = 101;
  207. else
  208. tag = 1;
  209. AdjoiningPos pos = new AdjoiningPos( tag, 0);
  210. sn.addPos(pos);
  211. sns.add(sn);
  212. }
  213. }
  214. }
  215. /**
  216. * ???????N??????????????????
  217. */
  218. private void getBestPos(ArrayList<SegNode> sns) {
  219. ArrayList<AdjoiningPos> prevAllPos = null;
  220. ArrayList<AdjoiningPos> allPos = null;
  221. if (sns != null && context != null) {
  222. for (int i = 0; i < sns.size(); i++) {
  223. if (i == 0) {
  224. int pos = tagType != Utility.TAG_TYPE.TT_NORMAL ? 100 : 0;
  225. prevAllPos = new ArrayList<AdjoiningPos>();
  226. prevAllPos.add(new AdjoiningPos(pos, 0));
  227. } else {
  228. prevAllPos = sns.get(i - 1).getAllPos();
  229. }
  230. allPos = sns.get(i).getAllPos();
  231. if (allPos != null)
  232. for (AdjoiningPos pos : allPos) {
  233. // ?????????????????????
  234. int bestPrev = 0;
  235. double minValue = 10000000;
  236. for (int k = 0; prevAllPos != null && k < prevAllPos.size(); k++) {
  237. AdjoiningPos prevPos = prevAllPos.get(k);
  238. double temp = context.computePossibility(prevPos.getPos().getTag(), pos.getPos().getTag());
  239. temp = -Math.log(temp) + prevPos.getValue();
  240. if (temp < minValue) {
  241. minValue = temp;
  242. bestPrev = k;
  243. }
  244. }
  245. pos.setPrev(bestPrev);
  246. pos.setValue(pos.getValue() + minValue);
  247. }
  248. }
  249. tagBest(sns);
  250. // for(SegNode sn:sns){
  251. // String word=sn.getSrcWord();
  252. // System.out.println(word+":");
  253. // for(AdjoiningPos ap:sn.getAllPos()){
  254. // System.out.println("
  255. // "+POSTag.int2str(ap.getPos())+","+ap.getValue()+","+ap.getPrev()+","+ap.isBest());
  256. // }
  257. // }
  258. }
  259. }
  260. // ???????
  261. private int guessPos(TAG_TYPE tagType, SegNode sn) {
  262. int result = -1;
  263. if (sn != null && context != null) {
  264. int charType;
  265. double freq = 0;
  266. String word = sn.getWord();
  267. if (word == null)
  268. return result;
  269. switch (tagType) {
  270. case TT_NORMAL:
  271. break;
  272. case TT_PERSON:
  273. if (word.indexOf("××") != -1) {
  274. freq = (double) 1 / (double) (context.getFreq(6) + 1);
  275. sn.addPos(new AdjoiningPos(6, freq));
  276. } else {
  277. freq = (double) 1 / (double) (context.getFreq(0) + 1);
  278. sn.addPos(new AdjoiningPos(0, freq));
  279. if (sn.getLen() >= 4) {
  280. freq = (double) 1 / (double) (context.getFreq(0) + 1);
  281. sn.addPos(new AdjoiningPos(0, freq));
  282. freq = (double) 1 / (double) (context.getFreq(11) * 8);
  283. sn.addPos(new AdjoiningPos(11, freq));
  284. freq = (double) 1 / (double) (context.getFreq(12) * 8);
  285. sn.addPos(new AdjoiningPos(12, freq));
  286. freq = (double) 1 / (double) (context.getFreq(13) * 8);
  287. sn.addPos(new AdjoiningPos(13, freq));
  288. } else if (sn.getLen() == 2) {
  289. freq = (double) 1 / (double) (context.getFreq(0) + 1);
  290. sn.addPos(new AdjoiningPos(0, freq));
  291. charType = Utility.charType(word);
  292. if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
  293. freq = (double) 1 / (double) (context.getFreq(1) + 1);
  294. sn.addPos(new AdjoiningPos(1, freq));
  295. freq = (double) 1 / (double) (context.getFreq(2) + 1);
  296. sn.addPos(new AdjoiningPos(2, freq));
  297. freq = (double) 1 / (double) (context.getFreq(3) + 1);
  298. sn.addPos(new AdjoiningPos(3, freq));
  299. freq = (double) 1 / (double) (context.getFreq(4) + 1);
  300. sn.addPos(new AdjoiningPos(4, freq));
  301. }
  302. freq = (double) 1 / (double) (context.getFreq(11) * 8);
  303. sn.addPos(new AdjoiningPos(11, freq));
  304. freq = (double) 1 / (double) (context.getFreq(12) * 8);
  305. sn.addPos(new AdjoiningPos(12, freq));
  306. freq = (double) 1 / (double) (context.getFreq(13) * 8);
  307. sn.addPos(new AdjoiningPos(13, freq));
  308. }
  309. }
  310. break;
  311. case TT_PLACE:
  312. freq = (double) 1 / (double) (context.getFreq(0) + 1);
  313. sn.addPos(new AdjoiningPos(0, freq));
  314. if (sn.getLen() >= 4) {
  315. freq = (double) 1 / (double) (context.getFreq(11) * 8);
  316. sn.addPos(new AdjoiningPos(11, freq));
  317. freq = (double) 1 / (double) (context.getFreq(12) * 8);
  318. sn.addPos(new AdjoiningPos(12, freq));
  319. freq = (double) 1 / (double) (context.getFreq(13) * 8);
  320. sn.addPos(new AdjoiningPos(13, freq));
  321. } else if (sn.getLen() == 2) {
  322. freq = (double) 1 / (double) (context.getFreq(0) + 1);
  323. sn.addPos(new AdjoiningPos(0, freq));
  324. charType = Utility.charType(word);
  325. if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
  326. freq = (double) 1 / (double) (context.getFreq(1) + 1);
  327. sn.addPos(new AdjoiningPos(1, freq));
  328. freq = (double) 1 / (double) (context.getFreq(2) + 1);
  329. sn.addPos(new AdjoiningPos(2, freq));
  330. freq = (double) 1 / (double) (context.getFreq(3) + 1);
  331. sn.addPos(new AdjoiningPos(3, freq));
  332. freq = (double) 1 / (double) (context.getFreq(4) + 1);
  333. sn.addPos(new AdjoiningPos(4, freq));
  334. }
  335. freq = (double) 1 / (double) (context.getFreq(11) * 8);
  336. sn.addPos(new AdjoiningPos(11, freq));
  337. freq = (double) 1 / (double) (context.getFreq(12) * 8);
  338. sn.addPos(new AdjoiningPos(12, freq));
  339. freq = (double) 1 / (double) (context.getFreq(13) * 8);
  340. sn.addPos(new AdjoiningPos(13, freq));
  341. }
  342. break;
  343. case TT_TRANS_PERSON:
  344. freq = (double) 1 / (double) (context.getFreq(0) + 1);
  345. sn.addPos(new AdjoiningPos(0, freq));
  346. if (!Utility.isAllChinese(word)) {
  347. if (Utility.isAllLetter(word)) {
  348. freq = (double) 1 / (double) (context.getFreq(1) + 1);
  349. sn.addPos(new AdjoiningPos(1, freq));
  350. freq = (double) 1 / (double) (context.getFreq(11) + 1);
  351. sn.addPos(new AdjoiningPos(11, freq));
  352. freq = (double) 1 / (double) (context.getFreq(2) * 2 + 1);
  353. sn.addPos(new AdjoiningPos(2, freq));
  354. freq = (double) 1 / (double) (context.getFreq(3) * 2 + 1);
  355. sn.addPos(new AdjoiningPos(3, freq));
  356. freq = (double) 1 / (double) (context.getFreq(12) * 2 + 1);
  357. sn.addPos(new AdjoiningPos(12, freq));
  358. freq = (double) 1 / (double) (context.getFreq(13) * 2 + 1);
  359. sn.addPos(new AdjoiningPos(13, freq));
  360. }
  361. freq = (double) 1 / (double) (context.getFreq(41) * 8);
  362. sn.addPos(new AdjoiningPos(41, freq));
  363. freq = (double) 1 / (double) (context.getFreq(42) * 8);
  364. sn.addPos(new AdjoiningPos(42, freq));
  365. freq = (double) 1 / (double) (context.getFreq(43) * 8);
  366. sn.addPos(new AdjoiningPos(43, freq));
  367. } else if (sn.getLen() >= 4) {
  368. freq = (double) 1 / (double) (context.getFreq(41) * 8);
  369. sn.addPos(new AdjoiningPos(41, freq));
  370. freq = (double) 1 / (double) (context.getFreq(42) * 8);
  371. sn.addPos(new AdjoiningPos(42, freq));
  372. freq = (double) 1 / (double) (context.getFreq(43) * 8);
  373. sn.addPos(new AdjoiningPos(43, freq));
  374. } else if (sn.getLen() == 2) {
  375. charType = Utility.charType(word);
  376. if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
  377. freq = (double) 1 / (double) (context.getFreq(1) * 2 + 1);
  378. sn.addPos(new AdjoiningPos(1, freq));
  379. freq = (double) 1 / (double) (context.getFreq(2) * 2 + 1);
  380. sn.addPos(new AdjoiningPos(2, freq));
  381. freq = (double) 1 / (double) (context.getFreq(3) * 2 + 1);
  382. sn.addPos(new AdjoiningPos(3, freq));
  383. freq = (double) 1 / (double) (context.getFreq(30) * 8 + 1);
  384. sn.addPos(new AdjoiningPos(30, freq));
  385. freq = (double) 1 / (double) (context.getFreq(11) * 4 + 1);
  386. sn.addPos(new AdjoiningPos(11, freq));
  387. freq = (double) 1 / (double) (context.getFreq(12) * 4 + 1);
  388. sn.addPos(new AdjoiningPos(12, freq));
  389. freq = (double) 1 / (double) (context.getFreq(13) * 4 + 1);
  390. sn.addPos(new AdjoiningPos(13, freq));
  391. freq = (double) 1 / (double) (context.getFreq(21) * 2 + 1);
  392. sn.addPos(new AdjoiningPos(21, freq));
  393. freq = (double) 1 / (double) (context.getFreq(22) * 2 + 1);
  394. sn.addPos(new AdjoiningPos(22, freq));
  395. freq = (double) 1 / (double) (context.getFreq(23) * 2 + 1);
  396. sn.addPos(new AdjoiningPos(23, freq));
  397. }
  398. freq = (double) 1 / (double) (context.getFreq(41) * 8);
  399. sn.addPos(new AdjoiningPos(41, freq));
  400. freq = (double) 1 / (double) (context.getFreq(42) * 8);
  401. sn.addPos(new AdjoiningPos(42, freq));
  402. freq = (double) 1 / (double) (context.getFreq(43) * 8);
  403. sn.addPos(new AdjoiningPos(43, freq));
  404. }
  405. break;
  406. default:
  407. break;
  408. }
  409. if (sn.getAllPos() != null)
  410. result = sn.getAllPos().size();
  411. }
  412. return result;
  413. }
  414. /**
  415. * ??????
  416. *
  417. * <pre>
  418. *
  419. * BBCD 343 0.003606
  420. * BBC 2 0.000021
  421. * BBE 125 0.001314
  422. * BBZ 30 0.000315
  423. * BCD 62460 0.656624
  424. * BEE 0 0.000000
  425. * BE 13899 0.146116
  426. * BG 869 0.009136
  427. * BXD 4 0.000042
  428. * BZ 3707 0.038971
  429. * CD 8596 0.090367
  430. * EE 26 0.000273
  431. * FB 871 0.009157
  432. * Y 3265 0.034324
  433. * XD 926 0.009735
  434. *
  435. * The person recognition patterns set
  436. * BBCD:?+?+?1+?2;
  437. * BBE: ?+?+??;
  438. * BBZ: ?+?+????;
  439. * BCD: ?+?1+?2;
  440. * BE: ?+??;
  441. * BEE: ?+??+??;???
  442. * BG: ?+??
  443. * BXD: ?+???????+????
  444. * BZ: ?+????;
  445. * B: ?
  446. * CD: ?1+?2;
  447. * EE: ??+??;
  448. * FB: ??+?
  449. * XD: ???????+????
  450. * Y: ?????
  451. * </pre>
  452. */
  453. private void personRecognize(SegGraph segGraph, ArrayList<SegNode> sns) {
  454. String sPos = null;
  455. String personName = null;
  456. // ??????
  457. final String[] patterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
  458. final double[] factor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136, 0.000042, 0.038971, 0, 0.090367,
  459. 0.000273, 0.009157, 0.034324, 0.009735, 0 };
  460. if (segGraph != null && sns != null) {
  461. int j = 1, k, nPos;
  462. boolean bMatched = false;
  463. sPos = word2pattern(sns);
  464. while (sPos != null && j < sPos.length()) {
  465. bMatched = false;
  466. for (k = 0; !bMatched && patterns[k].length() > 0; k++) {
  467. // ?????????????????????????????????????
  468. if (sPos.substring(j).indexOf(patterns[k]) == 0 && !"?".equals(sns.get(j - 1).getWord())
  469. && !"?".equals(sns.get(j + patterns[k].length()))) {// Find
  470. String temp = sPos.substring(j + 2);
  471. if (temp.length() > 1)
  472. temp = temp.substring(0, 1);
  473. // Rule 1 for exclusion:??+?+?1(?2): ??(??+?)???
  474. if ("FB".equals(patterns[k]) && ("E".equals(temp) || "C".equals(temp) || "G".equals(temp))) {
  475. continue;
  476. }
  477. nPos = j;
  478. personName = "";
  479. // Get the possible person name
  480. while (nPos < j + patterns[k].length()) {
  481. SegNode sn = sns.get(nPos);
  482. int gbkID = sn.getGbkID();// dictLib.getGBKID(sn.getSrcWord());
  483. if (sn.getPos() < 4 && unknownDict.getFreq(sn.getSrcWord(), sn.getPos(), gbkID) < Utility.LITTLE_FREQUENCY)
  484. personName += sn.getSrcWord();
  485. nPos += 1;
  486. }
  487. if ("CDCD".equals(patterns[k])) {
  488. if (GetForeignCharCount(personName) > 0)
  489. j += patterns[k].length() - 1;
  490. continue;
  491. }
  492. SegNode usn = new SegNode();
  493. usn.setRow(sns.get(j).getRow());
  494. usn.setCol(sns.get(j + patterns[k].length() - 1).getCol());
  495. usn.setWord(unknownFlags);
  496. usn.setSrcWord(personName);
  497. double value = -Math.log(factor[k]) + computePossibility(j, patterns[k].length(), sns);
  498. usn.setPos(pos);
  499. usn.setWeight(value);
  500. segGraph.insert(usn, true);
  501. j += patterns[k].length();
  502. bMatched = true;
  503. }
  504. }
  505. if (!bMatched)// Not matched, add j by 1
  506. j += 1;
  507. }
  508. }
  509. }
  510. // TODO:
  511. private int GetForeignCharCount(String personName) {
  512. return 0;
  513. }
  514. /**
  515. * ??????
  516. *
  517. */
  518. private void placeRecognize(SegGraph segGraph, ArrayList<SegNode> sns, Dictionary coreDict) {
  519. if (segGraph != null && coreDict != null) {
  520. int start = 1;
  521. int end = 1;
  522. double dPanelty = 1;
  523. String srcWord = "";
  524. for (int i = 1; i < sns.size(); i++) {
  525. start = i;
  526. end = start;
  527. srcWord = sns.get(i).getSrcWord();
  528. if (getBestTag(sns, i) == 1) {
  529. for (end = i + 1; end < sns.size(); end++) {
  530. int bestTag = getBestTag(sns, end);
  531. if (bestTag == -1)
  532. continue;
  533. else if (bestTag == 1 || bestTag == 3) {
  534. if (end > i + 1)
  535. dPanelty += 1;
  536. srcWord += sns.get(end).getSrcWord();
  537. } else if (bestTag == 2)
  538. srcWord += sns.get(end).getSrcWord();
  539. else
  540. break;
  541. }
  542. } else if (getBestTag(sns, i) == 2) {
  543. dPanelty += 1;
  544. for (end = i + 1; end < sns.size(); end++) {
  545. int bestTag = getBestTag(sns, end);
  546. if (bestTag == -1)
  547. continue;
  548. else if (bestTag == 3) {
  549. if (end > i + 1)
  550. dPanelty += 1;
  551. srcWord += sns.get(end).getSrcWord();
  552. } else if (bestTag == 2)
  553. srcWord += sns.get(end).getSrcWord();
  554. else
  555. break;
  556. }
  557. }
  558. if (end > start) {
  559. SegNode newsn = new SegNode();
  560. newsn.setRow(sns.get(start).getRow());
  561. newsn.setCol(sns.get(end - 1).getCol());
  562. newsn.setPos(pos);
  563. newsn.setWord(unknownFlags);
  564. newsn.setSrcWord(srcWord);
  565. double value = computePossibility(start, end - start + 1, sns);
  566. newsn.setWeight(value);
  567. segGraph.insert(newsn, true);
  568. }
  569. }
  570. }
  571. }
  572. private int getBestTag(ArrayList<SegNode> sns, int index) {
  573. if (sns != null && index >= 0 && index < sns.size()) {
  574. SegNode sn = sns.get(index);
  575. return getBestTag(sn);
  576. }
  577. return -1;
  578. }
  579. private int getBestTag(SegNode sn) {
  580. if (sn != null) {
  581. ArrayList<AdjoiningPos> allPos = sn.getAllPos();
  582. if (allPos != null) {
  583. for (AdjoiningPos pos : allPos) {
  584. if (pos.isBest())
  585. return pos.getPos().getTag();
  586. }
  587. }
  588. }
  589. return -1;
  590. }
  591. // Judge whether the name is a given name
  592. public boolean isGivenName(String sName) {
  593. String firstChar;
  594. String secondChar;
  595. // given Name Possibility
  596. double gnp = 0;
  597. // singleNamePossibility
  598. double snp = 0;
  599. if (sName != null) {
  600. if (sName.getBytes().length != 4)
  601. return false;
  602. firstChar = sName.substring(0, 1);
  603. int gbkID1 = dictLib.getGBKID(firstChar);
  604. secondChar = sName.substring(1);
  605. int gbkID2 = dictLib.getGBKID(secondChar);
  606. // The possibility of P(Wi|Ti)
  607. gnp += Math.log((double) unknownDict.getFreq(firstChar, 2, gbkID1) + 1.0);
  608. gnp -= Math.log(context.getFreq(2) + 1.0);
  609. gnp += Math.log((double) unknownDict.getFreq(secondChar, 3, gbkID2) + 1.0);
  610. gnp -= Math.log(context.getFreq(3) + 1.0);
  611. // The possibility of conversion from 2 to 3
  612. gnp += Math.log(context.computePossibility(2, 3) + 1.0);
  613. gnp -= Math.log(context.getFreq(2) + 1.0);
  614. // The possibility of P(Wi|Ti)
  615. snp += Math.log((double) unknownDict.getFreq(firstChar, 1, gbkID1) + 1.0);
  616. snp -= Math.log(context.getFreq(1) + 1.0);
  617. snp += Math.log((double) unknownDict.getFreq(secondChar, 4, gbkID2) + 1.0);
  618. snp -= Math.log(context.getFreq(4) + 1.0);
  619. // The possibility of conversion from 1 to 4
  620. snp += Math.log(context.computePossibility(1, 4) + 1.0);
  621. snp -= Math.log(context.getFreq(1) + 1.0);
  622. // ??||m_dict.getFrequency(sFirstChar,1)/m_dict.getFrequency(sFirstChar,2)>=10
  623. // The possibility being a single given name is more than being a
  624. // 2-char given name
  625. if (snp >= gnp)
  626. return false;
  627. return true;
  628. }
  629. return false;
  630. }
  631. // ??????????????????????
  632. private String word2pattern(ArrayList<SegNode> sns) {
  633. String result = null;
  634. if (sns != null) {
  635. result = "";
  636. for (SegNode sn : sns) {
  637. result += (char) (getBestTag(sn) + 'A');
  638. }
  639. }
  640. return result;
  641. }
  642. /**
  643. * ???????
  644. *
  645. * @param sns
  646. */
  647. private void tagBest(ArrayList<SegNode> sns) {
  648. if (sns != null) {
  649. int size = sns.size();
  650. // ??????????
  651. for (int i = size - 1, j = 0; i >= 0; i--) {
  652. SegNode sn = sns.get(i);
  653. ArrayList<AdjoiningPos> allPos = sn.getAllPos();
  654. if (allPos != null && allPos.size() > j) {
  655. AdjoiningPos pos = allPos.get(j);
  656. pos.setBest(true);
  657. j = pos.getPrev();
  658. } else if (i + 1 < size - 1) {
  659. int tag = getBestTag(sns.get(i + 1));
  660. AdjoiningPos pos = new AdjoiningPos(tag, 0);
  661. pos.setBest(true);
  662. sns.get(i).addPos(pos);
  663. }
  664. // ?????????????????????????????????
  665. if (sn.getPos() == POSTag.NOUN_LETTER || sn.getPos() == POSTag.NUM) {
  666. for (AdjoiningPos pos : allPos) {
  667. if (pos.isBest() && pos.getPos().getTag() > 0) {
  668. sn.setPos(pos.getPos().getTag());
  669. break;
  670. }
  671. }
  672. }
  673. }
  674. // ????????????????????????“????”??????
  675. if (size > 1) {
  676. if (sns.get(size - 1).getWord() == null)
  677. sns.remove(size - 1);
  678. }
  679. }
  680. }
  681. private double computePossibility(int startPos, int length, ArrayList<SegNode> sns) {
  682. double retValue = 0, posPoss;
  683. if (sns != null && unknownDict != null && context != null) {
  684. for (int i = startPos; sns != null && i < startPos + length && i < sns.size(); i++) {
  685. SegNode sn = sns.get(i);
  686. int bestTag = getBestTag(sn);
  687. if (bestTag != -1) {
  688. int gbkID = sn.getGbkID();// dictLib.getGBKID(sn.getSrcWord());
  689. int freq = unknownDict.getFreq(sn.getSrcWord(), bestTag, gbkID);
  690. posPoss = Math.log((double) (context.getFreq(sn.getPos()) + 1));
  691. posPoss += -Math.log((double) (freq + 1));
  692. retValue += posPoss;
  693. }
  694. }
  695. }
  696. return retValue;
  697. }
  698. public Dictionary getUnknownDict() {
  699. return unknownDict;
  700. }
  701. }