/src/org/ictclas4j/bean/ContextStat.java

http://ictclas4j.googlecode.com/ · Java · 170 lines · 127 code · 34 blank · 9 comment · 32 complexity · 20c5d64d43a5bcfe5d81ed36ade21468 MD5 · raw file

  1. package org.ictclas4j.bean;
  2. import java.io.DataInputStream;
  3. import java.io.File;
  4. import java.io.FileInputStream;
  5. import java.io.FileNotFoundException;
  6. import java.io.IOException;
  7. import java.util.ArrayList;
  8. import org.apache.log4j.Logger;
  9. import org.ictclas4j.utility.GFCommon;
  10. import org.ictclas4j.utility.Utility;
  11. public class ContextStat {
  12. private int tableLen;
  13. private int[] symbolTable;
  14. private ArrayList<TagContext> tcList;
  15. static Logger logger = Logger.getLogger(ContextStat.class);
  16. public ContextStat() {
  17. tcList = new ArrayList<TagContext>();
  18. }
  19. public boolean load(String fileName) {
  20. return load(fileName, false);
  21. }
  22. public boolean load(String fileName, boolean isReset) {
  23. File file = new File(fileName);
  24. if (!file.canRead())
  25. return false;// fail while opening the file
  26. try {
  27. byte[] b = null;
  28. DataInputStream in = new DataInputStream(new FileInputStream(file));
  29. // ????
  30. tableLen = GFCommon.bytes2int(Utility.readBytes(in, 4), false);
  31. logger.debug("tableLen:" + tableLen);
  32. // ??????
  33. symbolTable = new int[tableLen];
  34. for (int i = 0; i < tableLen; i++) {
  35. b = Utility.readBytes(in, 4);
  36. symbolTable[i] = GFCommon.bytes2int(b, false);
  37. logger.debug("symbolTable[" + i + "]:" + symbolTable[i]);
  38. }
  39. long fileLen = file.length();
  40. long curLen = 4 + tableLen * 4;
  41. while (curLen < fileLen) {
  42. logger.debug("tagContext:");
  43. TagContext tc = new TagContext();
  44. // ?????
  45. b = Utility.readBytes(in, 4);
  46. int key = GFCommon.bytes2int(b);
  47. curLen += 4;
  48. logger.debug("\tkey:" + key);
  49. // ?????
  50. b = Utility.readBytes(in, 4);
  51. curLen += 4;
  52. int totalFreq = GFCommon.bytes2int(b, false);
  53. logger.debug("\ttotalFreq:" + totalFreq);
  54. // ????
  55. int[] tagFreq = new int[tableLen];
  56. for (int i = 0; i < tableLen; i++) {
  57. b = Utility.readBytes(in, 4);
  58. curLen += 4;
  59. tagFreq[i] = GFCommon.bytes2int(b, false);
  60. logger.debug("\ttagFreq[" + i + "]:" + tagFreq[i]);
  61. }
  62. // ???????
  63. int[][] contextArray = new int[tableLen][tableLen];
  64. for (int i = 0; i < tableLen; i++) {
  65. String pr = "";
  66. logger.debug("\tcontextArray[" + i + "]");
  67. for (int j = 0; j < tableLen; j++) {
  68. b = Utility.readBytes(in, 4);
  69. curLen += 4;
  70. contextArray[i][j] = GFCommon.bytes2int(b, false);
  71. pr += " " + contextArray[i][j];
  72. }
  73. logger.debug("\t\t" + pr);
  74. }
  75. tc.setTotalFreq(totalFreq);
  76. tc.setKey(key);
  77. tc.setTagFreq(tagFreq);
  78. tc.setContextArray(contextArray);
  79. tcList.add(tc);
  80. }
  81. in.close();
  82. } catch (FileNotFoundException e) {
  83. logger.debug(e);
  84. } catch (IOException e) {
  85. logger.debug(e);
  86. }
  87. return true;
  88. }
  89. public int getFreq(int key, int symbol) {
  90. TagContext tc = getItem(key);
  91. if (tc == null)
  92. return 0;
  93. int index = Utility.binarySearch(symbol, symbolTable);
  94. if (index == -1)// error finding the symbol
  95. return 0;
  96. // Add the frequency
  97. int frequency = 0;
  98. if (tc.getTagFreq() != null)
  99. frequency = tc.getTagFreq()[index];
  100. return frequency;
  101. }
  102. public double getPossibility(int key, int prev, int cur) {
  103. double result = 0;
  104. int curIndex = Utility.binarySearch(cur, symbolTable);
  105. int prevIndex = Utility.binarySearch(prev, symbolTable);
  106. TagContext tc = getItem(key);
  107. // return a lower value, not 0 to prevent data sparse
  108. if (tc == null || curIndex == -1 || prevIndex == -1
  109. || tc.getContextArray()[prevIndex][curIndex] == 0
  110. || tc.getTagFreq()[prevIndex] == 0)
  111. return 0.000001;
  112. int prevCurConFreq = tc.getContextArray()[prevIndex][curIndex];
  113. int prevFreq = tc.getTagFreq()[prevIndex];
  114. // 0.9 and 0.1 is a value based experience
  115. result = 0.9 * (double) prevCurConFreq;
  116. result /= (double) prevFreq;
  117. result += 0.1 * (double) prevFreq / (double) tc.getTotalFreq();
  118. return result;
  119. }
  120. public TagContext getItem(int key) {
  121. TagContext result = null;
  122. if(tcList==null||tcList.size()==0)
  123. return null;
  124. if (key == 0 )
  125. result = tcList.get(0);
  126. else {
  127. int i=0;
  128. for ( ; i < tcList.size() && tcList.get(i).getKey()<key; i++);
  129. if(i<tcList.size() && tcList.get(i).getKey()==key)
  130. result=tcList.get(i);
  131. else if(i-1<tcList.size())
  132. result=tcList.get(i-1);
  133. }
  134. return result;
  135. }
  136. }