/ictclas4j/src/org/ictclas4j/bean/DictLib.java

http://ictclas4j.googlecode.com/ · Java · 292 lines · 208 code · 48 blank · 36 comment · 30 complexity · f87588478cc53a1ed501ea144d60713e MD5 · raw file

  1. package org.ictclas4j.bean;
  2. import java.io.File;
  3. import java.io.FileInputStream;
  4. import java.io.IOException;
  5. import java.util.ArrayList;
  6. import java.util.HashMap;
  7. import java.util.Properties;
  8. import org.apache.jcs.JCS;
  9. import org.apache.jcs.access.exception.CacheException;
  10. import org.apache.jcs.engine.control.CompositeCacheManager;
  11. import org.apache.log4j.Logger;
  12. import org.ictclas4j.util.Utility;
  13. import org.ictclas4j.util.Utility.TAG_TYPE;
  14. import com.gftech.util.GFFile;
  15. import com.gftech.util.GFFinal;
  16. import com.gftech.util.GFString;
  17. import com.gftech.util.GFUtil;
  18. /**
  19. * Dictionary Library
  20. *
  21. * @author sinboy
  22. * @since 2007.12.6
  23. *
  24. */
  25. public class DictLib {
  26. private Dictionary coreDict;
  27. private Dictionary bigramDict;
  28. private Dictionary personUnknownDict;
  29. private PosContext personContext;
  30. private Dictionary transPersonUnknownDict;
  31. private PosContext transPersonContext;
  32. private Dictionary placeUnknownDict;
  33. private PosContext placeContext;
  34. private Dictionary lexUnknownDict;
  35. private PosContext lexContext;
  36. private JCS segCache;// ????Cache
  37. // GBK??+??????GBK_ID?
  38. private HashMap<String, Integer> idMap;
  39. static Logger logger = Logger.getLogger(DictLib.class);
  40. public DictLib() {
  41. boolean isGBKExtend = false;
  42. idMap = new HashMap<String, Integer>();
  43. for (int i = 0; i < Utility.GBK_NUM_EXT; i++) {
  44. idMap.put(Utility.getGBKWord(i), i);
  45. }
  46. logger.info("Load coreDict ...");
  47. coreDict = new Dictionary("data" + GFFinal.FILE_SEP + "coreDict.dct", isGBKExtend);
  48. logger.info("Load bigramDict ...");
  49. bigramDict = new Dictionary("data" + GFFinal.FILE_SEP + "bigramDict.dct", isGBKExtend);
  50. logger.info("Load tagger dict ...");
  51. personUnknownDict = new Dictionary("data" + GFFinal.FILE_SEP + "nr.dct", isGBKExtend);
  52. personContext = new PosContext("data" + GFFinal.FILE_SEP + "nr.ctx");
  53. transPersonUnknownDict = new Dictionary("data" + GFFinal.FILE_SEP + "tr.dct", isGBKExtend);
  54. transPersonContext = new PosContext("data" + GFFinal.FILE_SEP + "tr.ctx");
  55. placeUnknownDict = new Dictionary("data" + GFFinal.FILE_SEP + "ns.dct", isGBKExtend);
  56. placeContext = new PosContext("data" + GFFinal.FILE_SEP + "ns.ctx");
  57. lexUnknownDict = coreDict;
  58. lexContext = new PosContext("data" + GFFinal.FILE_SEP + "lexical.ctx");
  59. loadMyDict("data"+ GFFinal.FILE_SEP +"myDict.txt");
  60. // personTagger = new PosTagger(Utility.TAG_TYPE.TT_PERSON, "data" +
  61. // GFFinal.FILE_SEP + "nr", coreDict);
  62. // transPersonTagger = new PosTagger(Utility.TAG_TYPE.TT_TRANS_PERSON,
  63. // "data" + GFFinal.FILE_SEP + "tr", coreDict);
  64. // placeTagger = new PosTagger(Utility.TAG_TYPE.TT_TRANS_PERSON, "data"
  65. // + GFFinal.FILE_SEP + "ns", coreDict);
  66. // lexTagger = new PosTagger(Utility.TAG_TYPE.TT_NORMAL, "data" +
  67. // GFFinal.FILE_SEP + "lexical", coreDict);
  68. // pronunDict = new PronunDict("data"+GFFinal.FILE_SEP+"pronun.txt");
  69. logger.info("Load dict is over");
  70. // init Segment Cache
  71. try {
  72. CompositeCacheManager ccm = CompositeCacheManager.getUnconfiguredInstance();
  73. StringBuffer sb = new StringBuffer();
  74. Properties props = new Properties();
  75. sb.append("conf").append(GFFinal.FILE_SEP).append("cache.ccf");
  76. props.load(new FileInputStream(new File(sb.toString())));
  77. ccm.configure(props);
  78. segCache = JCS.getInstance("segCache");
  79. logger.info("init index?info?seg cache");
  80. } catch (CacheException e) {
  81. logger.error("init segment cache is failed", e);
  82. } catch (IOException e) {
  83. logger.error("init segment cache is failed", e);
  84. }
  85. }
  86. public Dictionary getBigramDict() {
  87. return bigramDict;
  88. }
  89. public Dictionary getCoreDict() {
  90. return coreDict;
  91. }
  92. public Dictionary getPersonUnknownDict() {
  93. return personUnknownDict;
  94. }
  95. public PosContext getPersonContext() {
  96. return personContext;
  97. }
  98. public Dictionary getTransPersonUnknownDict() {
  99. return transPersonUnknownDict;
  100. }
  101. public PosContext getTransPersonContext() {
  102. return transPersonContext;
  103. }
  104. public Dictionary getPlaceUnknownDict() {
  105. return placeUnknownDict;
  106. }
  107. public PosContext getPlaceContext() {
  108. return placeContext;
  109. }
  110. public Dictionary getLexUnknownDict() {
  111. return lexUnknownDict;
  112. }
  113. public PosContext getLexContext() {
  114. return lexContext;
  115. }
  116. public Dictionary getUnknownDict(TAG_TYPE type) {
  117. switch (type) {
  118. case TT_PERSON:
  119. return this.personUnknownDict;
  120. case TT_TRANS_PERSON:
  121. return this.transPersonUnknownDict;
  122. case TT_PLACE:
  123. return this.placeUnknownDict;
  124. default:
  125. return this.lexUnknownDict;
  126. }
  127. }
  128. public PosContext getContext(TAG_TYPE type) {
  129. switch (type) {
  130. case TT_PERSON:
  131. return this.personContext;
  132. case TT_TRANS_PERSON:
  133. return this.transPersonContext;
  134. case TT_PLACE:
  135. return this.placeContext;
  136. default:
  137. return this.lexContext;
  138. }
  139. }
  140. // TODO:
  141. public boolean addWordItem(SegAtom wi, boolean isOvercast, boolean isNotSave) {
  142. // if (wi != null && coreDict != null) {
  143. // int handle = wi.getHandle();
  144. // return coreDict.addItem(wi.getWord(), handle, wi.getFreq(), false,
  145. // isOvercast, isNotSave);
  146. // } else
  147. return false;
  148. }
  149. // TODO:
  150. public boolean addBigramWordItem(SegAtom wi, boolean isNotSave) {
  151. // if (wi != null && bigramDict != null) {
  152. // int handle = wi.getHandle();
  153. // return bigramDict.addItem(wi.getWord(), handle, wi.getFreq(), false,
  154. // false, isNotSave);
  155. // } else
  156. return false;
  157. }
  158. // TODO:
  159. public boolean delWordItem(String word, int pos) {
  160. // if (word != null && coreDict != null) {
  161. // return coreDict.delItem(word, pos);
  162. // } else
  163. return false;
  164. }
  165. // ??Cache??????
  166. public SegResult getCachedSeg(String src) {
  167. SegResult result = null;
  168. if (segCache != null && src != null) {
  169. result = (SegResult) segCache.get(src);
  170. }
  171. return result;
  172. }
  173. public void delCachedSeg(String word) {
  174. if (segCache != null && word != null) {
  175. try {
  176. segCache.remove(word);
  177. } catch (CacheException e) {
  178. logger.error(e);
  179. }
  180. }
  181. }
  182. public void addCachedSeg(String src, SegResult result) {
  183. if (segCache != null && src != null && result != null) {
  184. try {
  185. GFUtil.putIntoCache(segCache, src, result);
  186. } catch (CacheException e) {
  187. logger.error(e);
  188. }
  189. }
  190. }
  191. public int getGBKID(String word) {
  192. if (word != null && word.length() > 0) {
  193. String first = GFString.getFirst(word);
  194. if (first != null) {
  195. Integer obj = idMap.get(first);
  196. return obj != null ? obj : -1;
  197. }
  198. }
  199. return -1;
  200. }
  201. // ?????????
  202. private void loadMyDict(String fileName) {
  203. if (fileName != null) {
  204. try {
  205. SegAtom sa = new SegAtom();
  206. ArrayList<String> list = GFFile.readTxtFile2(fileName);
  207. for (String line : list) {
  208. if (line.startsWith("#"))
  209. continue;
  210. line = line.replaceAll("?", ",");
  211. String[] strs = line.split(",");
  212. if (strs.length >= 4) {
  213. SegAtom saClone = sa.clone();
  214. saClone.setWord(strs[0]);
  215. Pos pos=new Pos();
  216. pos.setTag(POSTag.str2int(strs[1]));
  217. pos.setFreq(GFString.cint(strs[2]));
  218. pos.setVisible("1".equals(strs[3])?true:false);
  219. saClone.addPos(pos);
  220. int index=getGBKID(strs[0]);
  221. coreDict.addSegAtom(saClone,index);
  222. if(strs.length==5){
  223. String str=strs[4];
  224. String[] strs2=str.split(" ");
  225. for(String s:strs2){
  226. SegAtom saClone2=sa.clone();
  227. saClone2.setWord(s);
  228. Pos pos2=new Pos();
  229. pos2.setTag(3);
  230. pos2.setFreq(1);
  231. saClone2.addPos(pos2);
  232. bigramDict.addSegAtom(saClone2,index);
  233. }
  234. }
  235. }
  236. }
  237. } catch (IOException e) {
  238. logger.error("load myDict is failed", e);
  239. } catch (CloneNotSupportedException e) {
  240. logger.error(e);
  241. }
  242. }
  243. }
  244. }