PageRenderTime 26ms CodeModel.GetById 9ms app.highlight 14ms RepoModel.GetById 1ms app.codeStats 0ms

/ictclas4j/src/org/ictclas4j/bean/Dictionary.java

http://ictclas4j.googlecode.com/
Java | 253 lines | 190 code | 38 blank | 25 comment | 47 complexity | 1199c6838387a4f2586f637187caa395 MD5 | raw file
  1package org.ictclas4j.bean;
  2
  3import java.io.DataInputStream;
  4import java.io.DataOutputStream;
  5import java.io.File;
  6import java.io.FileInputStream;
  7import java.io.FileNotFoundException;
  8import java.io.FileOutputStream;
  9import java.io.IOException;
 10import java.util.Collection;
 11import java.util.HashMap;
 12
 13import org.apache.log4j.Logger;
 14import org.ictclas4j.util.Utility;
 15
 16import com.gftech.util.GFNet;
 17import com.gftech.util.GFString;
 18
 19public class Dictionary {
 20	/**
 21	 * ???,?6768?,GB2312??(before) 22034??gbk??+?????now)
 22	 */
 23	private WordTable[] wts;
 24
 25	private int wordCount;// ????
 26
 27	private long totalFreq;// ???
 28
 29	private int dict_count;
 30
 31	static Logger logger = Logger.getLogger(Dictionary.class);
 32
 33	public Dictionary() {
 34		this(null,false);
 35	}
 36	public Dictionary(String fileName) {
 37		this(fileName,false);
 38	}
 39	
 40	public Dictionary( boolean isExtend) {
 41		this(null,isExtend);
 42	}
 43	
 44	public Dictionary(String fileName,boolean isExtend) {
 45		init(isExtend);
 46		load(fileName);
 47	}
 48
 49	public void init(boolean isExtend) {
 50		wordCount = 0;
 51		totalFreq = 0;
 52		dict_count = isExtend ? Utility.GBK_NUM_EXT : Utility.GB_NUM;
 53		wts = new WordTable[dict_count];
 54
 55	}
 56
 57	/**
 58	 * ?????????.?6768??????(??5??????),???????????????,
 59	 * ???????????,??????????????????.
 60	 * 
 61	 * @param fileName
 62	 *            ???????
 63	 * @return
 64	 */
 65	public boolean load(String fileName) {
 66		int i = 0, j = 0;
 67		File file = new File(fileName);
 68		if (!file.canRead())
 69			return false;// fail while opening the file
 70
 71		try {
 72			long offset = 0;
 73			WordTable wt = new WordTable();
 74			SegAtom sa = new SegAtom();
 75			HashMap<String, SegAtom> wordMap = null;
 76			DataInputStream in = new DataInputStream(new FileInputStream(file));
 77			for (i = 0; i < dict_count; i++) {
 78				try {
 79					WordTable wtClone = wt.clone();
 80					logger.debug("?" + i);
 81					// ?????????????????(????)??,??????
 82					int count = GFNet.readInt32(in);
 83					logger.debug(" count:" + count);
 84					wtClone.setWordCount(count);
 85					int wordMaxLen = GFNet.readUInt8(in);
 86					wtClone.setWordMaxLen(wordMaxLen);
 87					offset += 5;
 88					wordMap = new HashMap<String, SegAtom>();
 89					for (j = 0; j < count; j++, wordCount++) {
 90						SegAtom saClone = sa.clone();
 91						int bc = saClone.read(in, 0);
 92						offset += bc;
 93						logger.debug(saClone);
 94						wordMap.put(saClone.getWord(), saClone);
 95						totalFreq += saClone.getTotalFreq();
 96					}
 97					wtClone.setWordMap(wordMap);
 98					wts[i] = wtClone;
 99				} catch (CloneNotSupportedException e) {
100					logger.fatal("Load dict:", e);
101				}
102			}
103
104			in.close();
105		} catch (FileNotFoundException e) {
106			logger.fatal("load dict " + fileName + ":", e);
107		} catch (IOException e) {
108			logger.fatal("load dict " + fileName + ":", e);
109			logger.fatal("i:" + i + ",j:" + j);
110		}
111
112		return true;
113	}
114
115	/**
116	 * 
117	 * @param fileName
118	 * @return
119	 */
120	public boolean save(String fileName) {
121
122		File file = new File(fileName);
123		try {
124			DataOutputStream out = new DataOutputStream(new FileOutputStream(file));
125			for (int i = 0; i < dict_count; i++) {
126
127				int count = 0;
128				WordTable wt = wts[i];
129				count = wt.getWordCount();
130				GFNet.writeInt32(out, count);
131				GFNet.writeInt8(out, wt.getWordMaxLen());
132				Collection<SegAtom> atoms = wt.getWordMap().values();
133				for (SegAtom atom : atoms) {
134					int size = atom.write(out);
135					System.out.println(i + "," + size);
136				}
137			}
138			out.close();
139			return true;
140		} catch (FileNotFoundException e) {
141			logger.error(e);
142		} catch (IOException e) {
143			logger.error(e);
144		}
145		return false;
146	}
147
148	public SegAtom getSegAtom(String word, int index) {
149		SegAtom result = null;
150
151		if (word != null && word.length() > 0) {
152			if (index > 0 && index < wts.length) {
153				WordTable wt = wts[index];
154				result = wt.getSegAtom(word);
155
156			}
157		}
158		return result;
159	}
160	
161	public boolean addSegAtom(SegAtom sa,int index){
162		
163		if(sa!=null && index>=0 && index<dict_count){
164			if(wts!=null){
165				WordTable wt=wts[index];
166				if(wt!=null){
167					wt.addSegAtom(sa);
168				}
169			}
170		}
171		return false;
172	}
173
174	// ???????????????????
175	public int getWordMaxLen(String word, int index) {
176		int result = 0;
177		if (word != null && word.length() > 0) {
178			if (index > 0 && index < wts.length) {
179				WordTable wt = wts[index];
180				return wt.getWordMaxLen();
181			}
182		}
183		return result;
184	}
185
186	public boolean strEqual(String b1, String b2) {
187		if (b1 == null && b2 == null)
188			return true;
189		else if (b1 != null && b2 != null) {
190			return b1.equals(b2);
191		}
192		return false;
193	}
194
195	public int getWordType(String word) {
196		if (word != null) {
197			int type = Utility.charType(word);
198			int len = word.length();
199
200			if (len > 0 && type == Utility.CT_CHINESE && GFString.isAllChinese(word))
201				return Utility.WT_CHINESE;
202			else if (len > 0 && type == Utility.CT_DELIMITER)
203				return Utility.WT_DELIMITER;
204
205		}
206		return Utility.WT_OTHER;
207	}
208
209	/**
210	 * ????????????
211	 * 
212	 * @param word
213	 * @param pos
214	 * @return
215	 */
216	public boolean isExist(String word, int pos, int index) {
217		if (word != null) {
218			SegAtom atom = getSegAtom(word, index);
219			if (atom != null) {
220				return atom.hasPos(pos);
221			}
222		}
223
224		return false;
225	}
226
227	public int getFreq(String word, int pos, int index) {
228		if (word != null) {
229			SegAtom atom = getSegAtom(word, index);
230			if (atom != null) {
231				return atom.getFreqByPos(pos);
232			}
233		}
234		return 0;
235	}
236
237	public long totalFreq() {
238		return totalFreq;
239	}
240
241	public int wordCount() {
242		return wordCount;
243	}
244
245	public WordTable[] getWts() {
246		return wts;
247	}
248
249	public void setWts(WordTable[] wts) {
250		this.wts = wts;
251	}
252
253}