/ictclas4j/src/org/ictclas4j/bean/Dictionary.java

http://ictclas4j.googlecode.com/ · Java · 253 lines · 190 code · 38 blank · 25 comment · 47 complexity · 1199c6838387a4f2586f637187caa395 MD5 · raw file

  1. package org.ictclas4j.bean;
  2. import java.io.DataInputStream;
  3. import java.io.DataOutputStream;
  4. import java.io.File;
  5. import java.io.FileInputStream;
  6. import java.io.FileNotFoundException;
  7. import java.io.FileOutputStream;
  8. import java.io.IOException;
  9. import java.util.Collection;
  10. import java.util.HashMap;
  11. import org.apache.log4j.Logger;
  12. import org.ictclas4j.util.Utility;
  13. import com.gftech.util.GFNet;
  14. import com.gftech.util.GFString;
  15. public class Dictionary {
  16. /**
  17. * ???,?6768?,GB2312??(before) 22034??gbk??+?????now)
  18. */
  19. private WordTable[] wts;
  20. private int wordCount;// ????
  21. private long totalFreq;// ???
  22. private int dict_count;
  23. static Logger logger = Logger.getLogger(Dictionary.class);
  24. public Dictionary() {
  25. this(null,false);
  26. }
  27. public Dictionary(String fileName) {
  28. this(fileName,false);
  29. }
  30. public Dictionary( boolean isExtend) {
  31. this(null,isExtend);
  32. }
  33. public Dictionary(String fileName,boolean isExtend) {
  34. init(isExtend);
  35. load(fileName);
  36. }
  37. public void init(boolean isExtend) {
  38. wordCount = 0;
  39. totalFreq = 0;
  40. dict_count = isExtend ? Utility.GBK_NUM_EXT : Utility.GB_NUM;
  41. wts = new WordTable[dict_count];
  42. }
  43. /**
  44. * ?????????.?6768??????(??5??????),???????????????,
  45. * ???????????,??????????????????.
  46. *
  47. * @param fileName
  48. * ???????
  49. * @return
  50. */
  51. public boolean load(String fileName) {
  52. int i = 0, j = 0;
  53. File file = new File(fileName);
  54. if (!file.canRead())
  55. return false;// fail while opening the file
  56. try {
  57. long offset = 0;
  58. WordTable wt = new WordTable();
  59. SegAtom sa = new SegAtom();
  60. HashMap<String, SegAtom> wordMap = null;
  61. DataInputStream in = new DataInputStream(new FileInputStream(file));
  62. for (i = 0; i < dict_count; i++) {
  63. try {
  64. WordTable wtClone = wt.clone();
  65. logger.debug("?" + i);
  66. // ?????????????????(????)??,??????
  67. int count = GFNet.readInt32(in);
  68. logger.debug(" count:" + count);
  69. wtClone.setWordCount(count);
  70. int wordMaxLen = GFNet.readUInt8(in);
  71. wtClone.setWordMaxLen(wordMaxLen);
  72. offset += 5;
  73. wordMap = new HashMap<String, SegAtom>();
  74. for (j = 0; j < count; j++, wordCount++) {
  75. SegAtom saClone = sa.clone();
  76. int bc = saClone.read(in, 0);
  77. offset += bc;
  78. logger.debug(saClone);
  79. wordMap.put(saClone.getWord(), saClone);
  80. totalFreq += saClone.getTotalFreq();
  81. }
  82. wtClone.setWordMap(wordMap);
  83. wts[i] = wtClone;
  84. } catch (CloneNotSupportedException e) {
  85. logger.fatal("Load dict:", e);
  86. }
  87. }
  88. in.close();
  89. } catch (FileNotFoundException e) {
  90. logger.fatal("load dict " + fileName + ":", e);
  91. } catch (IOException e) {
  92. logger.fatal("load dict " + fileName + ":", e);
  93. logger.fatal("i:" + i + ",j:" + j);
  94. }
  95. return true;
  96. }
  97. /**
  98. *
  99. * @param fileName
  100. * @return
  101. */
  102. public boolean save(String fileName) {
  103. File file = new File(fileName);
  104. try {
  105. DataOutputStream out = new DataOutputStream(new FileOutputStream(file));
  106. for (int i = 0; i < dict_count; i++) {
  107. int count = 0;
  108. WordTable wt = wts[i];
  109. count = wt.getWordCount();
  110. GFNet.writeInt32(out, count);
  111. GFNet.writeInt8(out, wt.getWordMaxLen());
  112. Collection<SegAtom> atoms = wt.getWordMap().values();
  113. for (SegAtom atom : atoms) {
  114. int size = atom.write(out);
  115. System.out.println(i + "," + size);
  116. }
  117. }
  118. out.close();
  119. return true;
  120. } catch (FileNotFoundException e) {
  121. logger.error(e);
  122. } catch (IOException e) {
  123. logger.error(e);
  124. }
  125. return false;
  126. }
  127. public SegAtom getSegAtom(String word, int index) {
  128. SegAtom result = null;
  129. if (word != null && word.length() > 0) {
  130. if (index > 0 && index < wts.length) {
  131. WordTable wt = wts[index];
  132. result = wt.getSegAtom(word);
  133. }
  134. }
  135. return result;
  136. }
  137. public boolean addSegAtom(SegAtom sa,int index){
  138. if(sa!=null && index>=0 && index<dict_count){
  139. if(wts!=null){
  140. WordTable wt=wts[index];
  141. if(wt!=null){
  142. wt.addSegAtom(sa);
  143. }
  144. }
  145. }
  146. return false;
  147. }
  148. // ???????????????????
  149. public int getWordMaxLen(String word, int index) {
  150. int result = 0;
  151. if (word != null && word.length() > 0) {
  152. if (index > 0 && index < wts.length) {
  153. WordTable wt = wts[index];
  154. return wt.getWordMaxLen();
  155. }
  156. }
  157. return result;
  158. }
  159. public boolean strEqual(String b1, String b2) {
  160. if (b1 == null && b2 == null)
  161. return true;
  162. else if (b1 != null && b2 != null) {
  163. return b1.equals(b2);
  164. }
  165. return false;
  166. }
  167. public int getWordType(String word) {
  168. if (word != null) {
  169. int type = Utility.charType(word);
  170. int len = word.length();
  171. if (len > 0 && type == Utility.CT_CHINESE && GFString.isAllChinese(word))
  172. return Utility.WT_CHINESE;
  173. else if (len > 0 && type == Utility.CT_DELIMITER)
  174. return Utility.WT_DELIMITER;
  175. }
  176. return Utility.WT_OTHER;
  177. }
  178. /**
  179. * ????????????
  180. *
  181. * @param word
  182. * @param pos
  183. * @return
  184. */
  185. public boolean isExist(String word, int pos, int index) {
  186. if (word != null) {
  187. SegAtom atom = getSegAtom(word, index);
  188. if (atom != null) {
  189. return atom.hasPos(pos);
  190. }
  191. }
  192. return false;
  193. }
  194. public int getFreq(String word, int pos, int index) {
  195. if (word != null) {
  196. SegAtom atom = getSegAtom(word, index);
  197. if (atom != null) {
  198. return atom.getFreqByPos(pos);
  199. }
  200. }
  201. return 0;
  202. }
  203. public long totalFreq() {
  204. return totalFreq;
  205. }
  206. public int wordCount() {
  207. return wordCount;
  208. }
  209. public WordTable[] getWts() {
  210. return wts;
  211. }
  212. public void setWts(WordTable[] wts) {
  213. this.wts = wts;
  214. }
  215. }