/src/org/ictclas4j/bean/Span.java

http://ictclas4j.googlecode.com/ · Java · 659 lines · 520 code · 56 blank · 83 comment · 140 complexity · 99cdb84be0bb0ff2f8ff3942bc454719 MD5 · raw file

  1. package org.ictclas4j.bean;
  2. import java.util.ArrayList;
  3. import org.ictclas4j.utility.Utility;
  4. import org.ictclas4j.utility.Utility.TAG_TYPE;
  /*
   * Span holds the working state used to tag one run of words: the candidate
   * tags and costs per word, the best tag sequence chosen from them, and the
   * unknown-word candidates (position ranges plus scores) found by the
   * recognizers in this class.
   */
  5. public class Span {
  6. public ContextStat context; // context statistics; created by loadContext(), queried via getFreq()/getPossibility()
  7. TAG_TYPE tagType; // selects the tagging mode used by posTagging(), getFrom() and guessPOS()
  8. private int[][] m_nTags; // [word][candidate] candidate tags; each row is terminated by -1
  9. int[][] m_nBestPrev; // [word][candidate] index of the cheapest predecessor candidate, filled by disamb()
  10. int m_nStartPos; // position carried over between spans; copied into m_nWordPosition[1] by reset()
  11. int[] m_nBestTag; // chosen tag per word, terminated by -1 (written by getBestPOS())
  12. int m_nCurLength; // number of rows currently in use (row 0 is the begin entry)
  13. String[] m_sWords; // words of the current span; null marks a virtual begin/end entry
  14. double[][] m_dFrequency; // [word][candidate] cost; disamb() accumulates path costs into it
  15. public int[][] m_nUnknownWords; // [n][0] = start position, [n][1] = end position of a recognized unknown word
  16. public int m_nUnknownIndex; // number of entries used in m_nUnknownWords / m_dWordsPossibility
  17. public int[] m_nWordPosition; // running positions of the words, advanced by getBytes().length of each word
  18. public double[] m_dWordsPossibility; // score stored for each recognized unknown word
  /**
   * Creates a span configured for normal POS tagging and allocates all
   * working arrays at their maximum sizes.
   */
  public Span() {
      // The tag type has to be assigned before the begin tag is chosen.
      // Previously it was still null at the time of the comparison, so the
      // begin tag was always 100 even though the span is set up as TT_NORMAL.
      tagType = Utility.TAG_TYPE.TT_NORMAL;

      m_nTags = new int[Utility.MAX_WORDS_PER_SENTENCE][Utility.MAX_POS_PER_WORD];
      // Same begin-tag rule as reset(false): 100 for the recognition modes,
      // 0 for normal tagging.
      m_nTags[0][0] = (tagType != Utility.TAG_TYPE.TT_NORMAL) ? 100 : 0;
      m_nTags[0][1] = -1; // end flag of the begin row

      m_nBestPrev = new int[Utility.MAX_WORDS_PER_SENTENCE][Utility.MAX_POS_PER_WORD];
      m_nBestTag = new int[Utility.MAX_WORDS_PER_SENTENCE];
      m_sWords = new String[Utility.MAX_WORDS_PER_SENTENCE];
      m_nUnknownWords = new int[Utility.MAX_UNKNOWN_PER_SENTENCE][2];
      m_nWordPosition = new int[Utility.MAX_WORDS_PER_SENTENCE];
      m_dWordsPossibility = new double[Utility.MAX_UNKNOWN_PER_SENTENCE];
      m_dFrequency = new double[Utility.MAX_WORDS_PER_SENTENCE][Utility.MAX_POS_PER_WORD];
  }
  35. public boolean loadContext(String fileName) {
  36. if (fileName != null) {
  37. context = new ContextStat();
  38. return context.load(fileName);
  39. }
  40. return false;
  41. }
  42. public void setType(TAG_TYPE type) {
  43. tagType = type;
  44. }
  45. public boolean posTagging(ArrayList<WordResult> wrList, Dictionary coreDict, Dictionary unknownDict) {
  46. int i = 0;
  47. int j, nStartPos;
  48. reset(false);
  49. while (i > -1 && i < wrList.size()) {
  50. nStartPos = i;// Start Position
  51. i = getFrom(wrList, nStartPos, coreDict, unknownDict);
  52. getBestPOS();
  53. switch (tagType) {
  54. case TT_NORMAL:// normal POS tagging
  55. j = 1;
  56. // Store the best POS tagging
  57. while (m_nBestTag[j] != -1 && j < m_nCurLength) {
  58. WordResult wr = wrList.get(j + nStartPos - 1);
  59. wr.setHandle(m_nBestTag[j]);
  60. // Let ?be 0
  61. // Exist and update its frequncy as a POS value
  62. if (wr.getValue() > 0 && coreDict.isExist(wr.getWord(), -1))
  63. wr.setValue(coreDict.getFreq(wr.getWord(), m_nBestTag[j]));
  64. j += 1;
  65. }
  66. break;
  67. case TT_PERSON:// Person recognition
  68. PersonRecognize(unknownDict);
  69. break;
  70. case TT_PLACE:// Place name recognition
  71. case TT_TRANS_PERSON:// Transliteration Person
  72. PlaceRecognize(coreDict, unknownDict);
  73. break;
  74. default:
  75. break;
  76. }
  77. reset();
  78. }
  79. return true;
  80. }
  81. public boolean reset(boolean isContinue) {
  82. if (!isContinue) {
  83. if (tagType != Utility.TAG_TYPE.TT_NORMAL)
  84. m_nTags[0][0] = 100;// Begin tag
  85. else
  86. m_nTags[0][0] = 0;// Begin tag
  87. m_nUnknownIndex = 0;
  88. m_dFrequency[0][0] = 0;
  89. m_nStartPos = 0;
  90. } else {
  91. // Get the last POS in the last sentence
  92. m_nTags[0][0] = m_nTags[m_nCurLength - 1][0];
  93. m_dFrequency[0][0] = m_dFrequency[m_nCurLength - 1][0];
  94. }
  95. // Get the last POS in the last sentence,set the -1 as end flag
  96. m_nTags[0][1] = -1;
  97. m_nCurLength = 1;
  98. m_nWordPosition[1] = m_nStartPos;
  99. m_sWords[0] = null;
  100. return true;
  101. }
  102. public boolean reset() {
  103. return reset(true);
  104. }
  105. private boolean disamb() {
  106. int i, j, k, nMinCandidate;
  107. double dMinFee = 0;
  108. double dTmp = 0;
  109. for (i = 1; i < m_nCurLength; i++)// For every word
  110. {
  111. for (j = 0; m_nTags[i][j] >= 0; j++)// For every word
  112. {
  113. nMinCandidate = Utility.MAX_POS_PER_WORD + 1;
  114. for (k = 0; m_nTags[i - 1][k] >= 0; k++) {
  115. // ConvertPOS(m_nTags[i-1][k],&nKey,&nPrevPOS);
  116. // ConvertPOS(m_nTags[i][j],&nKey,&nCurPOS);
  117. // dTmp=m_context.GetContextPossibility(nKey,nPrevPOS,nCurPOS);
  118. dTmp = -Math.log(context.getPossibility(0, m_nTags[i - 1][k], m_nTags[i][j]));
  119. dTmp += m_dFrequency[i - 1][k];// Add the fees
  120. if (nMinCandidate > 10 || dTmp < dMinFee)// Get the
  121. // minimum fee
  122. {
  123. nMinCandidate = k;
  124. dMinFee = dTmp;
  125. }
  126. }
  127. m_nBestPrev[i][j] = nMinCandidate;// The best previous for j
  128. m_dFrequency[i][j] = m_dFrequency[i][j] + dMinFee;
  129. }
  130. }
  131. return true;
  132. }
  133. public boolean getBestPOS() {
  134. disamb();
  135. for (int i = m_nCurLength - 1, j = 0; i > 0; i--)// ,j>=0
  136. {
  137. if (m_sWords[i] != null) {// Not virtual ending
  138. m_nBestTag[i] = m_nTags[i][j];// Record the best POS and its
  139. // possibility
  140. }
  141. j = m_nBestPrev[i][j];
  142. }
  143. int nEnd = m_nCurLength;// Set the end of POS tagging
  144. if (m_sWords[m_nCurLength - 1] == null)
  145. nEnd = m_nCurLength - 1;
  146. m_nBestTag[nEnd] = -1;
  147. return true;
  148. }
  149. /**
  150. * ?????dictUnknown????????????
  151. * @param wrList
  152. * @param index
  153. * @param coreDict
  154. * @param unknownDict
  155. * @return
  156. */
  157. public int getFrom(ArrayList<WordResult> wrList, int index, Dictionary coreDict, Dictionary unknownDict) {
  158. int[] aPOS = new int[Utility.MAX_POS_PER_WORD];
  159. int[] aFreq = new int[Utility.MAX_POS_PER_WORD];
  160. int nFreq = 0, j, nRetPos = 0, nWordsIndex = 0;
  161. boolean bSplit = false;// Need to split in Transliteration recognition
  162. int i = 1, nPOSCount;
  163. String sCurWord;// Current word
  164. nWordsIndex = index ;
  165. for (; i < Utility.MAX_WORDS_PER_SENTENCE && nWordsIndex < wrList.size(); i++) {
  166. WordResult wr = wrList.get(nWordsIndex);
  167. String word = wr.getWord();
  168. if (tagType == Utility.TAG_TYPE.TT_NORMAL || !unknownDict.isExist(word, 44)) {
  169. // current word
  170. m_sWords[i] = word;// store
  171. m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].getBytes().length;
  172. }
  173. // Record the position of current word
  174. m_nStartPos = m_nWordPosition[i + 1];
  175. // Move the Start POS to the ending
  176. if (tagType != Utility.TAG_TYPE.TT_NORMAL) {
  177. // Get the POSs from the unknown recognition dictionary
  178. sCurWord = m_sWords[i];
  179. if (tagType == Utility.TAG_TYPE.TT_TRANS_PERSON && i > 0
  180. && Utility.charType(m_sWords[i - 1]) == Utility.CT_CHINESE) {
  181. if (".".equals(m_sWords[i]))
  182. sCurWord = "?";
  183. else if ("-".equals(m_sWords))
  184. sCurWord = "?";
  185. }
  186. ArrayList<WordItem> wis = unknownDict.getHandle(sCurWord);
  187. nPOSCount = wis.size() + 1;
  188. for (j = 0; j < wis.size(); j++) {
  189. aPOS[j] = wis.get(j).getHandle();
  190. aFreq[j] = wis.get(j).getFreq();
  191. m_nTags[i][j] = aPOS[j];
  192. m_dFrequency[i][j] = -Math.log((1 + aFreq[j]));
  193. m_dFrequency[i][j] += Math.log((context.getFreq(0, aPOS[j]) + nPOSCount));
  194. }
  195. if ("?##?".equals(m_sWords[i])) {
  196. m_nTags[i][j] = 100;
  197. m_dFrequency[i][j] = 0;
  198. j++;
  199. } else if ("?##?".equals(m_sWords[i])) {
  200. m_nTags[i][j] = 101;
  201. m_dFrequency[i][j] = 0;
  202. j++;
  203. } else {
  204. wis = coreDict.getHandle(m_sWords[i]);
  205. nFreq = 0;
  206. for (int k = 0; k < wis.size(); k++) {
  207. aFreq[k] = wis.get(k).getFreq();
  208. nFreq += aFreq[k];
  209. }
  210. if (wis.size() > 0) {
  211. m_nTags[i][j] = 0;
  212. m_dFrequency[i][j] = -Math.log((double) (1 + nFreq));
  213. m_dFrequency[i][j] += Math.log((double) (context.getFreq(0, 0) + nPOSCount));
  214. j++;
  215. }
  216. }
  217. } else// For normal POS tagging
  218. {
  219. j = 0;
  220. // Get the POSs from the unknown recognition dictionary
  221. if (wr.getHandle() > 0) {// The word has is only one POS
  222. // value
  223. // We have record its POS and nFrequncy in the items.
  224. m_nTags[i][j] = wr.getHandle();
  225. m_dFrequency[i][j] = -Math.log(wr.getValue())
  226. + Math.log((double) (context.getFreq(0, m_nTags[i][j]) + 1));
  227. // Not permit the value less than 0
  228. if (m_dFrequency[i][j] < 0)
  229. m_dFrequency[i][j] = 0;
  230. j++;
  231. }
  232. // The word has multiple POSs, we should retrieve the
  233. // information from Core Dictionary
  234. else {
  235. if (wr.getHandle() < 0) {// The word has is only one POS
  236. m_nTags[i][j] = -wr.getHandle();
  237. m_dFrequency[i][j++] = wr.getValue();
  238. }
  239. ArrayList<WordItem> wis = coreDict.getHandle(m_sWords[i]);
  240. nPOSCount = wis.size();
  241. for (; j < wis.size(); j++) {
  242. // in the unknown dictionary
  243. aPOS[j] = wis.get(j).getHandle();
  244. aFreq[j] = wis.get(j).getFreq();
  245. m_nTags[i][j] = aPOS[j];
  246. m_dFrequency[i][j] = -Math.log(1 + aFreq[j])
  247. + Math.log(context.getFreq(0, m_nTags[i][j]) + nPOSCount);
  248. }
  249. }
  250. }
  251. // We donot know the POS, so we have to guess them according lexical
  252. // knowledge
  253. if (j == 0) {
  254. j = guessPOS(i);// Guess the POS of current word
  255. }
  256. m_nTags[i][j] = -1;// Set the ending POS
  257. // No ambuguity, so we can break from the loop
  258. if (j == 1 && m_nTags[i][j] != Utility.CT_SENTENCE_BEGIN) {
  259. i++;
  260. m_sWords[i] = null;
  261. break;
  262. }
  263. if (!bSplit)
  264. nWordsIndex++;
  265. }
  266. if (nWordsIndex == wrList.size())
  267. nRetPos = -1;// Reaching ending
  268. if (m_nTags[i - 1][1] != -1)// ||m_sWords[i][0]==0
  269. {// Set end for words like "?/?/?"
  270. if (tagType != Utility.TAG_TYPE.TT_NORMAL)
  271. m_nTags[i][0] = 101;
  272. else
  273. m_nTags[i][0] = 1;
  274. m_dFrequency[i][0] = 0;
  275. m_sWords[i] = null;// Set virtual ending
  276. m_nTags[i++][1] = -1;
  277. }
  278. m_nCurLength = i;// The current word count
  279. if (nRetPos != -1)
  280. return nWordsIndex + 1;// Next start position
  281. return -1;// Reaching ending
  282. }
  283. /**
  284. * <pre>
  285. *
  286. * BBCD 343 0.003606
  287. * BBC 2 0.000021
  288. * BBE 125 0.001314
  289. * BBZ 30 0.000315
  290. * BCD 62460 0.656624
  291. * BEE 0 0.000000
  292. * BE 13899 0.146116
  293. * BG 869 0.009136
  294. * BXD 4 0.000042
  295. * BZ 3707 0.038971
  296. * CD 8596 0.090367
  297. * EE 26 0.000273
  298. * FB 871 0.009157
  299. * Y 3265 0.034324
  300. * XD 926 0.009735
  301. *
  302. * The person recognition patterns set
  303. * BBCD:?+?+?1+?2;
  304. * BBE: ?+?+??;
  305. * BBZ: ?+?+????;
  306. * BCD: ?+?1+?2;
  307. * BE: ?+??;
  308. * BEE: ?+??+??;???
  309. * BG: ?+??
  310. * BXD: ?+???????+????
  311. * BZ: ?+????;
  312. * B: ?
  313. * CD: ?1+?2;
  314. * EE: ??+??;
  315. * FB: ??+?
  316. * XD: ???????+????
  317. * Y: ?????
  318. * </pre>
  319. */
  320. public boolean PersonRecognize(Dictionary personDict) {
  321. String sPOS = "z";
  322. String sPersonName;
  323. // 0 1 2 3 4 5
  324. final String[] patterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD",
  325. "EE", "FB", "Y", "XD", "" };
  326. // BBCD BBC BBE BBZ BCD BEE BE BG
  327. final double[] factor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136,
  328. // BXD BZ CDCD CD EE FB Y XD
  329. 0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };
  330. // About parameter:
  331. final int patternLen[] = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };
  332. int i = 0;
  333. for (i = 1; m_nBestTag[i] > -1; i++)
  334. // Convert to string from POS
  335. sPOS += (char) (m_nBestTag[i] + 'A');
  336. int j = 1, k, nPos;// Find the proper pattern from the first POS
  337. int nLittleFreqCount;// Counter for the person name role with little
  338. // frequecy
  339. boolean bMatched = false;
  340. while (j < i) {
  341. bMatched = false;
  342. for (k = 0; !bMatched && patternLen[k] > 0; k++) {
  343. if (sPOS.substring(j).indexOf(patterns[k]) == 0 && !"?".equals(m_sWords[j - 1])
  344. && !"?".equals(m_sWords[j + patternLen[k]])) {// Find
  345. String temp = sPOS.substring(j + 2);
  346. if (temp.length() > 1)
  347. temp = temp.substring(0, 1);
  348. // Rule 1 for exclusion:??+?+?1(?2): ??(??+?)???
  349. if ("FB".equals(patterns[k]) && ("E".equals(temp) || "C".equals(temp) || "G".equals(temp))) {
  350. continue;
  351. }
  352. nPos = j;// Record the person position in the tag
  353. // sequence
  354. sPersonName = "";
  355. nLittleFreqCount = 0;// Record the number of role with
  356. // little frequency
  357. while (nPos < j + patternLen[k]) {// Get the possible
  358. // person name
  359. if (m_nBestTag[nPos] < 4
  360. && personDict.getFreq(m_sWords[nPos], m_nBestTag[nPos]) < Utility.LITTLE_FREQUENCY)
  361. nLittleFreqCount++;// The counter increase
  362. sPersonName += m_sWords[nPos];
  363. nPos += 1;
  364. }
  365. if ("CDCD".equals(patterns[k])) {
  366. if (GetForeignCharCount(sPersonName) > 0)
  367. j += patternLen[k] - 1;
  368. continue;
  369. }
  370. m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[j];
  371. m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[j + patternLen[k]];
  372. m_dWordsPossibility[m_nUnknownIndex] = -Math.log(factor[k])
  373. + ComputePossibility(j, patternLen[k], personDict);
  374. // Mutiply the factor
  375. m_nUnknownIndex += 1;
  376. j += patternLen[k];
  377. bMatched = true;
  378. }
  379. }
  380. if (!bMatched)// Not matched, add j by 1
  381. j += 1;
  382. }
  383. return true;
  384. }
  385. private int guessPOS(int index) {
  386. int j = 0, i = index, charType;
  387. int nLen;
  388. switch (tagType) {
  389. case TT_NORMAL:
  390. break;
  391. case TT_PERSON:
  392. j = 0;
  393. if (m_sWords[index].indexOf("××") != -1) {
  394. m_nTags[i][j] = 6;
  395. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 6) + 1);
  396. } else {
  397. m_nTags[i][j] = 0;
  398. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 0) + 1);
  399. nLen = m_sWords[index].getBytes().length;
  400. if (nLen >= 4) {
  401. m_nTags[i][j] = 0;
  402. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 0) + 1);
  403. m_nTags[i][j] = 11;
  404. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 11) * 8);
  405. m_nTags[i][j] = 12;
  406. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 12) * 8);
  407. m_nTags[i][j] = 13;
  408. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 13) * 8);
  409. } else if (nLen == 2) {
  410. m_nTags[i][j] = 0;
  411. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 0) + 1);
  412. charType = Utility.charType(m_sWords[index]);
  413. if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
  414. m_nTags[i][j] = 1;
  415. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 1) + 1);
  416. m_nTags[i][j] = 2;
  417. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 2) + 1);
  418. m_nTags[i][j] = 3;
  419. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 3) + 1);
  420. m_nTags[i][j] = 4;
  421. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 4) + 1);
  422. }
  423. m_nTags[i][j] = 11;
  424. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 11) * 8);
  425. m_nTags[i][j] = 12;
  426. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 12) * 8);
  427. m_nTags[i][j] = 13;
  428. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 13) * 8);
  429. }
  430. }
  431. break;
  432. case TT_PLACE:
  433. j = 0;
  434. m_nTags[i][j] = 0;
  435. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 0) + 1);
  436. nLen = m_sWords[index].length();
  437. if (nLen >= 4) {
  438. m_nTags[i][j] = 11;
  439. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 11) * 8);
  440. m_nTags[i][j] = 12;
  441. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 12) * 8);
  442. m_nTags[i][j] = 13;
  443. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 13) * 8);
  444. } else if (nLen == 2) {
  445. m_nTags[i][j] = 0;
  446. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 0) + 1);
  447. charType = Utility.charType(m_sWords[index]);
  448. if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
  449. m_nTags[i][j] = 1;
  450. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 1) + 1);
  451. m_nTags[i][j] = 2;
  452. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 2) + 1);
  453. m_nTags[i][j] = 3;
  454. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 3) + 1);
  455. m_nTags[i][j] = 4;
  456. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 4) + 1);
  457. }
  458. m_nTags[i][j] = 11;
  459. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 11) * 8);
  460. m_nTags[i][j] = 12;
  461. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 12) * 8);
  462. m_nTags[i][j] = 13;
  463. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 13) * 8);
  464. }
  465. break;
  466. case TT_TRANS_PERSON:
  467. j = 0;
  468. nLen = m_sWords[index].length();
  469. m_nTags[i][j] = 0;
  470. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 0) + 1);
  471. if (!Utility.isAllChinese(m_sWords[index])) {
  472. if (Utility.isAllLetter(m_sWords[index])) {
  473. m_nTags[i][j] = 1;
  474. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 1) + 1);
  475. m_nTags[i][j] = 11;
  476. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 11) + 1);
  477. m_nTags[i][j] = 2;
  478. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 2) * 2 + 1);
  479. m_nTags[i][j] = 3;
  480. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 3) * 2 + 1);
  481. m_nTags[i][j] = 12;
  482. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 12) * 2 + 1);
  483. m_nTags[i][j] = 13;
  484. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 13) * 2 + 1);
  485. }
  486. m_nTags[i][j] = 41;
  487. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 41) * 8);
  488. m_nTags[i][j] = 42;
  489. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 42) * 8);
  490. m_nTags[i][j] = 43;
  491. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 43) * 8);
  492. } else if (nLen >= 4) {
  493. m_nTags[i][j] = 41;
  494. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 41) * 8);
  495. m_nTags[i][j] = 42;
  496. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 42) * 8);
  497. m_nTags[i][j] = 43;
  498. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 43) * 8);
  499. } else if (nLen == 2) {
  500. charType = Utility.charType(m_sWords[index]);
  501. if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
  502. m_nTags[i][j] = 1;
  503. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 1) * 2 + 1);
  504. m_nTags[i][j] = 2;
  505. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 2) * 2 + 1);
  506. m_nTags[i][j] = 3;
  507. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 3) * 2 + 1);
  508. m_nTags[i][j] = 30;
  509. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 30) * 8 + 1);
  510. m_nTags[i][j] = 11;
  511. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 11) * 4 + 1);
  512. m_nTags[i][j] = 12;
  513. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 12) * 4 + 1);
  514. m_nTags[i][j] = 13;
  515. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 13) * 4 + 1);
  516. m_nTags[i][j] = 21;
  517. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 21) * 2 + 1);
  518. m_nTags[i][j] = 22;
  519. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 22) * 2 + 1);
  520. m_nTags[i][j] = 23;
  521. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 23) * 2 + 1);
  522. }
  523. m_nTags[i][j] = 41;
  524. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 41) * 8);
  525. m_nTags[i][j] = 42;
  526. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 42) * 8);
  527. m_nTags[i][j] = 43;
  528. m_dFrequency[i][j++] = (double) 1 / (double) (context.getFreq(0, 43) * 8);
  529. }
  530. break;
  531. default:
  532. break;
  533. }
  534. return j;
  535. }
  536. int GetForeignCharCount(String personName) {
  537. return 0;
  538. }
  539. public boolean PlaceRecognize(Dictionary coreDict, Dictionary placeDict) {
  540. int nStart = 1, nEnd = 1, i = 1, nTemp;
  541. double dPanelty = 1.0;// Panelty value
  542. while (m_nBestTag[i] > -1) {
  543. if (m_nBestTag[i] == 1)// 1 Trigger the recognition procession
  544. {
  545. nStart = i;
  546. nEnd = nStart + 1;
  547. while (m_nBestTag[nEnd] == 1)//
  548. {
  549. if (nEnd > nStart + 1)
  550. dPanelty += 1.0;
  551. nEnd++;
  552. }
  553. while (m_nBestTag[nEnd] == 2)
  554. // 2,12,22
  555. nEnd++;
  556. nTemp = nEnd;
  557. while (m_nBestTag[nEnd] == 3) {
  558. if (nEnd > nTemp)
  559. dPanelty += 1.0;
  560. nEnd++;
  561. }
  562. } else if (m_nBestTag[i] == 2)// 1,11,21 Trigger the recognition
  563. {
  564. dPanelty += 1.0;
  565. nStart = i;
  566. nEnd = nStart + 1;
  567. while (m_nBestTag[nEnd] == 2)
  568. // 2
  569. nEnd++;
  570. nTemp = nEnd;
  571. while (m_nBestTag[nEnd] == 3)// 2
  572. {
  573. if (nEnd > nTemp)
  574. dPanelty += 1.0;
  575. nEnd++;
  576. }
  577. }
  578. if (nEnd > nStart) {
  579. m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[nStart];
  580. m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[nEnd];
  581. m_dWordsPossibility[m_nUnknownIndex++] = ComputePossibility(nStart, nEnd - nStart + 1, placeDict)
  582. + Math.log(dPanelty);
  583. nStart = nEnd;
  584. }
  585. if (i < nEnd)
  586. i = nEnd;
  587. else
  588. i = i + 1;
  589. }
  590. return true;
  591. }
  592. private double ComputePossibility(int startPos, int length, Dictionary dict) {
  593. double retValue = 0, posPoss;
  594. int nFreq;
  595. for (int i = startPos; i < startPos + length; i++) {
  596. nFreq = dict.getFreq(m_sWords[i], m_nBestTag[i]);
  597. // nFreq is word being the POS
  598. posPoss = Math.log((double) (context.getFreq(0, m_nBestTag[i]) + 1)) - Math.log((double) (nFreq + 1));
  599. retValue += posPoss;
  600. }
  601. return retValue;
  602. }
  603. }