PageRenderTime 31ms CodeModel.GetById 16ms app.highlight 11ms RepoModel.GetById 1ms app.codeStats 0ms

/ictclas4j/src/org/ictclas4j/bean/DictLib.java

http://ictclas4j.googlecode.com/
Java | 292 lines | 208 code | 48 blank | 36 comment | 30 complexity | f87588478cc53a1ed501ea144d60713e MD5 | raw file
  1package org.ictclas4j.bean;
  2
  3import java.io.File;
  4import java.io.FileInputStream;
  5import java.io.IOException;
  6import java.util.ArrayList;
  7import java.util.HashMap;
  8import java.util.Properties;
  9
 10import org.apache.jcs.JCS;
 11import org.apache.jcs.access.exception.CacheException;
 12import org.apache.jcs.engine.control.CompositeCacheManager;
 13import org.apache.log4j.Logger;
 14import org.ictclas4j.util.Utility;
 15import org.ictclas4j.util.Utility.TAG_TYPE;
 16
 17import com.gftech.util.GFFile;
 18import com.gftech.util.GFFinal;
 19import com.gftech.util.GFString;
 20import com.gftech.util.GFUtil;
 21
 22/**
 23 * Dictionary Library
 24 * 
 25 * @author sinboy
 26 * @since 2007.12.6
 27 * 
 28 */
 29public class DictLib {
 30	private Dictionary coreDict;
 31
 32	private Dictionary bigramDict;
 33
 34	private Dictionary personUnknownDict;
 35
 36	private PosContext personContext;
 37
 38	private Dictionary transPersonUnknownDict;
 39
 40	private PosContext transPersonContext;
 41
 42	private Dictionary placeUnknownDict;
 43
 44	private PosContext placeContext;
 45
 46	private Dictionary lexUnknownDict;
 47
 48	private PosContext lexContext;
 49
 50	private JCS segCache;// ????Cache
 51
 52	// GBK??+??????GBK_ID?
 53	private HashMap<String, Integer> idMap;
 54
 55	static Logger logger = Logger.getLogger(DictLib.class);
 56
 57	public DictLib() {
 58		boolean isGBKExtend = false;
 59		idMap = new HashMap<String, Integer>();
 60		for (int i = 0; i < Utility.GBK_NUM_EXT; i++) {
 61			idMap.put(Utility.getGBKWord(i), i);
 62		}
 63
 64		logger.info("Load coreDict  ...");
 65		coreDict = new Dictionary("data" + GFFinal.FILE_SEP + "coreDict.dct", isGBKExtend);
 66
 67		logger.info("Load bigramDict ...");
 68		bigramDict = new Dictionary("data" + GFFinal.FILE_SEP + "bigramDict.dct", isGBKExtend);
 69
 70		logger.info("Load tagger dict ...");
 71		personUnknownDict = new Dictionary("data" + GFFinal.FILE_SEP + "nr.dct", isGBKExtend);
 72		personContext = new PosContext("data" + GFFinal.FILE_SEP + "nr.ctx");
 73		transPersonUnknownDict = new Dictionary("data" + GFFinal.FILE_SEP + "tr.dct", isGBKExtend);
 74		transPersonContext = new PosContext("data" + GFFinal.FILE_SEP + "tr.ctx");
 75		placeUnknownDict = new Dictionary("data" + GFFinal.FILE_SEP + "ns.dct", isGBKExtend);
 76		placeContext = new PosContext("data" + GFFinal.FILE_SEP + "ns.ctx");
 77		lexUnknownDict = coreDict;
 78		lexContext = new PosContext("data" + GFFinal.FILE_SEP + "lexical.ctx");
 79
 80		loadMyDict("data"+ GFFinal.FILE_SEP +"myDict.txt");
 81		// personTagger = new PosTagger(Utility.TAG_TYPE.TT_PERSON, "data" +
 82		// GFFinal.FILE_SEP + "nr", coreDict);
 83		// transPersonTagger = new PosTagger(Utility.TAG_TYPE.TT_TRANS_PERSON,
 84		// "data" + GFFinal.FILE_SEP + "tr", coreDict);
 85		// placeTagger = new PosTagger(Utility.TAG_TYPE.TT_TRANS_PERSON, "data"
 86		// + GFFinal.FILE_SEP + "ns", coreDict);
 87		// lexTagger = new PosTagger(Utility.TAG_TYPE.TT_NORMAL, "data" +
 88		// GFFinal.FILE_SEP + "lexical", coreDict);
 89
 90		// pronunDict = new PronunDict("data"+GFFinal.FILE_SEP+"pronun.txt");
 91		logger.info("Load dict is over");
 92
 93		// init Segment Cache
 94		try {
 95			CompositeCacheManager ccm = CompositeCacheManager.getUnconfiguredInstance();
 96			StringBuffer sb = new StringBuffer();
 97			Properties props = new Properties();
 98			sb.append("conf").append(GFFinal.FILE_SEP).append("cache.ccf");
 99			props.load(new FileInputStream(new File(sb.toString())));
100			ccm.configure(props);
101			segCache = JCS.getInstance("segCache");
102			logger.info("init index?info?seg cache");
103		} catch (CacheException e) {
104			logger.error("init segment cache is failed", e);
105		} catch (IOException e) {
106			logger.error("init segment cache is failed", e);
107		}
108	}
109
110	public Dictionary getBigramDict() {
111		return bigramDict;
112	}
113
114	public Dictionary getCoreDict() {
115		return coreDict;
116	}
117
118	public Dictionary getPersonUnknownDict() {
119		return personUnknownDict;
120	}
121
122	public PosContext getPersonContext() {
123		return personContext;
124	}
125
126	public Dictionary getTransPersonUnknownDict() {
127		return transPersonUnknownDict;
128	}
129
130	public PosContext getTransPersonContext() {
131		return transPersonContext;
132	}
133
134	public Dictionary getPlaceUnknownDict() {
135		return placeUnknownDict;
136	}
137
138	public PosContext getPlaceContext() {
139		return placeContext;
140	}
141
142	public Dictionary getLexUnknownDict() {
143		return lexUnknownDict;
144	}
145
146	public PosContext getLexContext() {
147		return lexContext;
148	}
149
150	public Dictionary getUnknownDict(TAG_TYPE type) {
151		switch (type) {
152		case TT_PERSON:
153			return this.personUnknownDict;
154		case TT_TRANS_PERSON:
155			return this.transPersonUnknownDict;
156		case TT_PLACE:
157			return this.placeUnknownDict;
158		default:
159			return this.lexUnknownDict;
160		}
161	}
162
163	public PosContext getContext(TAG_TYPE type) {
164		switch (type) {
165		case TT_PERSON:
166			return this.personContext;
167		case TT_TRANS_PERSON:
168			return this.transPersonContext;
169		case TT_PLACE:
170			return this.placeContext;
171		default:
172			return this.lexContext;
173		}
174	}
175
176	// TODO:
177	public boolean addWordItem(SegAtom wi, boolean isOvercast, boolean isNotSave) {
178		// if (wi != null && coreDict != null) {
179		// int handle = wi.getHandle();
180		// return coreDict.addItem(wi.getWord(), handle, wi.getFreq(), false,
181		// isOvercast, isNotSave);
182		// } else
183		return false;
184	}
185
186	// TODO:
187	public boolean addBigramWordItem(SegAtom wi, boolean isNotSave) {
188		// if (wi != null && bigramDict != null) {
189		// int handle = wi.getHandle();
190		// return bigramDict.addItem(wi.getWord(), handle, wi.getFreq(), false,
191		// false, isNotSave);
192		// } else
193		return false;
194	}
195
196	// TODO:
197	public boolean delWordItem(String word, int pos) {
198		// if (word != null && coreDict != null) {
199		// return coreDict.delItem(word, pos);
200		// } else
201		return false;
202	}
203
204	// ??Cache??????
205	public SegResult getCachedSeg(String src) {
206		SegResult result = null;
207
208		if (segCache != null && src != null) {
209			result = (SegResult) segCache.get(src);
210
211		}
212		return result;
213	}
214
215	public void delCachedSeg(String word) {
216		if (segCache != null && word != null) {
217			try {
218				segCache.remove(word);
219			} catch (CacheException e) {
220				logger.error(e);
221			}
222		}
223	}
224
225	public void addCachedSeg(String src, SegResult result) {
226		if (segCache != null && src != null && result != null) {
227			try {
228				GFUtil.putIntoCache(segCache, src, result);
229			} catch (CacheException e) {
230				logger.error(e);
231			}
232		}
233	}
234
235	public int getGBKID(String word) {
236		if (word != null && word.length() > 0) {
237			String first = GFString.getFirst(word);
238			if (first != null) {
239				Integer obj = idMap.get(first);
240				return obj != null ? obj : -1;
241			}
242		}
243		return -1;
244	}
245
246	// ?????????
247	private void loadMyDict(String fileName) {
248		if (fileName != null) {
249			try {
250				SegAtom sa = new SegAtom();
251				ArrayList<String> list = GFFile.readTxtFile2(fileName);
252				for (String line : list) {
253					if (line.startsWith("#"))
254						continue;
255					line = line.replaceAll("?", ",");
256					String[] strs = line.split(",");
257					if (strs.length >= 4) {
258						SegAtom saClone = sa.clone();
259						saClone.setWord(strs[0]);
260						Pos pos=new Pos();
261						pos.setTag(POSTag.str2int(strs[1]));
262						pos.setFreq(GFString.cint(strs[2]));
263						pos.setVisible("1".equals(strs[3])?true:false);
264						saClone.addPos(pos);
265						int index=getGBKID(strs[0]);
266						coreDict.addSegAtom(saClone,index);
267						
268						if(strs.length==5){
269							String str=strs[4];
270							String[] strs2=str.split(" ");
271							for(String s:strs2){
272								SegAtom saClone2=sa.clone();
273								saClone2.setWord(s); 
274								Pos pos2=new Pos();
275								pos2.setTag(3);
276								pos2.setFreq(1);
277								saClone2.addPos(pos2);
278								bigramDict.addSegAtom(saClone2,index);
279							}
280						}
281					}
282
283				}
284			} catch (IOException e) {
285				logger.error("load myDict is failed", e);
286			} catch (CloneNotSupportedException e) {
287				logger.error(e);
288			}
289		}
290	}
291
292}