/src/org/ictclas4j/bean/ContextStat.java
http://ictclas4j.googlecode.com/ · Java · 170 lines · 127 code · 34 blank · 9 comment · 32 complexity · 20c5d64d43a5bcfe5d81ed36ade21468 MD5 · raw file
- package org.ictclas4j.bean;
-
- import java.io.DataInputStream;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.util.ArrayList;
-
- import org.apache.log4j.Logger;
- import org.ictclas4j.utility.GFCommon;
- import org.ictclas4j.utility.Utility;
-
-
- public class ContextStat {
- private int tableLen;
-
- private int[] symbolTable;
-
- private ArrayList<TagContext> tcList;
-
- static Logger logger = Logger.getLogger(ContextStat.class);
-
- public ContextStat() {
- tcList = new ArrayList<TagContext>();
- }
-
- public boolean load(String fileName) {
- return load(fileName, false);
- }
-
- public boolean load(String fileName, boolean isReset) {
- File file = new File(fileName);
- if (!file.canRead())
- return false;// fail while opening the file
-
- try {
-
- byte[] b = null;
- DataInputStream in = new DataInputStream(new FileInputStream(file));
- // ????
- tableLen = GFCommon.bytes2int(Utility.readBytes(in, 4), false);
- logger.debug("tableLen:" + tableLen);
-
- // ??????
- symbolTable = new int[tableLen];
- for (int i = 0; i < tableLen; i++) {
- b = Utility.readBytes(in, 4);
- symbolTable[i] = GFCommon.bytes2int(b, false);
- logger.debug("symbolTable[" + i + "]:" + symbolTable[i]);
- }
-
- long fileLen = file.length();
- long curLen = 4 + tableLen * 4;
- while (curLen < fileLen) {
- logger.debug("tagContext:");
- TagContext tc = new TagContext();
-
- // ?????
- b = Utility.readBytes(in, 4);
- int key = GFCommon.bytes2int(b);
- curLen += 4;
- logger.debug("\tkey:" + key);
-
- // ?????
- b = Utility.readBytes(in, 4);
- curLen += 4;
- int totalFreq = GFCommon.bytes2int(b, false);
- logger.debug("\ttotalFreq:" + totalFreq);
-
- // ????
- int[] tagFreq = new int[tableLen];
- for (int i = 0; i < tableLen; i++) {
- b = Utility.readBytes(in, 4);
- curLen += 4;
- tagFreq[i] = GFCommon.bytes2int(b, false);
- logger.debug("\ttagFreq[" + i + "]:" + tagFreq[i]);
- }
-
- // ???????
- int[][] contextArray = new int[tableLen][tableLen];
- for (int i = 0; i < tableLen; i++) {
- String pr = "";
- logger.debug("\tcontextArray[" + i + "]");
- for (int j = 0; j < tableLen; j++) {
- b = Utility.readBytes(in, 4);
- curLen += 4;
- contextArray[i][j] = GFCommon.bytes2int(b, false);
- pr += " " + contextArray[i][j];
- }
- logger.debug("\t\t" + pr);
- }
-
- tc.setTotalFreq(totalFreq);
- tc.setKey(key);
- tc.setTagFreq(tagFreq);
- tc.setContextArray(contextArray);
- tcList.add(tc);
- }
- in.close();
- } catch (FileNotFoundException e) {
- logger.debug(e);
- } catch (IOException e) {
- logger.debug(e);
- }
- return true;
- }
-
- public int getFreq(int key, int symbol) {
- TagContext tc = getItem(key);
- if (tc == null)
- return 0;
-
- int index = Utility.binarySearch(symbol, symbolTable);
- if (index == -1)// error finding the symbol
- return 0;
-
- // Add the frequency
- int frequency = 0;
- if (tc.getTagFreq() != null)
- frequency = tc.getTagFreq()[index];
- return frequency;
-
- }
-
- public double getPossibility(int key, int prev, int cur) {
- double result = 0;
-
- int curIndex = Utility.binarySearch(cur, symbolTable);
- int prevIndex = Utility.binarySearch(prev, symbolTable);
-
- TagContext tc = getItem(key);
-
- // return a lower value, not 0 to prevent data sparse
- if (tc == null || curIndex == -1 || prevIndex == -1
- || tc.getContextArray()[prevIndex][curIndex] == 0
- || tc.getTagFreq()[prevIndex] == 0)
- return 0.000001;
-
- int prevCurConFreq = tc.getContextArray()[prevIndex][curIndex];
- int prevFreq = tc.getTagFreq()[prevIndex];
-
- // 0.9 and 0.1 is a value based experience
- result = 0.9 * (double) prevCurConFreq;
- result /= (double) prevFreq;
- result += 0.1 * (double) prevFreq / (double) tc.getTotalFreq();
-
- return result;
- }
-
- public TagContext getItem(int key) {
- TagContext result = null;
-
- if(tcList==null||tcList.size()==0)
- return null;
- if (key == 0 )
- result = tcList.get(0);
- else {
- int i=0;
- for ( ; i < tcList.size() && tcList.get(i).getKey()<key; i++);
- if(i<tcList.size() && tcList.get(i).getKey()==key)
- result=tcList.get(i);
- else if(i-1<tcList.size())
- result=tcList.get(i-1);
- }
-
- return result;
- }
-
- }