PageRenderTime 92ms CodeModel.GetById 2ms app.highlight 83ms RepoModel.GetById 1ms app.codeStats 1ms

/ictclas4j/src/org/ictclas4j/segment/PosTagger.java

http://ictclas4j.googlecode.com/
Java | 777 lines | 584 code | 76 blank | 117 comment | 233 complexity | d39f00cbe0855bd7189024f7a9582e9a MD5 | raw file
  1package org.ictclas4j.segment;
  2
  3import java.util.ArrayList;
  4
  5import org.ictclas4j.bean.AdjoiningPos;
  6import org.ictclas4j.bean.DictLib;
  7import org.ictclas4j.bean.Dictionary;
  8import org.ictclas4j.bean.POSTag;
  9import org.ictclas4j.bean.Pos;
 10import org.ictclas4j.bean.PosContext;
 11import org.ictclas4j.bean.SegAtom;
 12import org.ictclas4j.bean.SegNode;
 13import org.ictclas4j.util.Utility;
 14import org.ictclas4j.util.Utility.TAG_TYPE;
 15
 16
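/**
 * Tags a segmented sentence with part-of-speech or named-entity role tags
 * and, for the person / transliterated-person / place tag types, inserts the
 * recognised unknown words into the segmentation graph.
 * 
 * @author sinboy
 * @since 2007.5.17 updated
 * 
 */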
 24public class PosTagger {
 25	private DictLib dictLib;
 26
 27	private Dictionary coreDict;
 28
 29	private Dictionary unknownDict;
 30
 31	private PosContext context;
 32
 33	private int pos;
 34
 35	private TAG_TYPE tagType;
 36
 37	String unknownFlags;
 38
	/**
	 * Builds a tagger for one tagging mode.
	 * <p>
	 * When {@code dictLib} is {@code null} no field is initialised; the tagging
	 * entry points then do nothing because their dictionary / context checks
	 * fail.
	 * 
	 * @param type
	 *            tagging mode; selects the context, the unknown-word
	 *            dictionary, the placeholder word and the POS assigned to
	 *            recognised nodes
	 * @param dictLib
	 *            library supplying the core dictionary and the per-mode
	 *            resources
	 */
	public PosTagger(TAG_TYPE type, DictLib dictLib) {
		if (dictLib == null) {
			return;
		}

		this.tagType = type;
		this.dictLib = dictLib;
		this.coreDict = dictLib.getCoreDict();

		// NOTE(review): the "?##?" placeholders look like mis-encoded
		// non-ASCII literals; they are kept exactly as found -- confirm
		// against the original source encoding.
		switch (type) {
		case TT_PERSON: {
			unknownFlags = "?##?";
			pos = -POSTag.NOUN_PERSON;
			context = dictLib.getPersonContext();
			unknownDict = dictLib.getPersonUnknownDict();
			break;
		}
		case TT_TRANS_PERSON: {
			unknownFlags = "?##?";
			pos = -POSTag.NOUN_PERSON;
			context = dictLib.getTransPersonContext();
			unknownDict = dictLib.getTransPersonUnknownDict();
			break;
		}
		case TT_PLACE: {
			unknownFlags = "?##?";
			pos = -POSTag.NOUN_SPACE;
			context = dictLib.getPlaceContext();
			unknownDict = dictLib.getPlaceUnknownDict();
			break;
		}
		default: {
			// Plain lexical tagging: no placeholder word is needed.
			pos = 0;
			context = dictLib.getLexContext();
			unknownDict = dictLib.getLexUnknownDict();
			break;
		}
		}
	}
 73
 74	/**
 75	 * ????????????????????????????
 76	 * 
 77	 * @param segGraph
 78	 * @param coreDict
 79	 * @return
 80	 */
 81	public boolean recognise(SegGraph segGraph, ArrayList<SegNode> sns) {
 82
 83		if (segGraph != null && sns != null && coreDict != null && unknownDict != null && context != null) {
 84			posTag(sns);
 85			getBestPos(sns);
 86			// DebugUtil.outputPostag(sns);
 87			switch (tagType) {
 88			case TT_PERSON:// Person recognition
 89				personRecognize(segGraph, sns);
 90				break;
 91			case TT_PLACE:// Place name recognition
 92			case TT_TRANS_PERSON:// Transliteration Person
 93				placeRecognize(segGraph, sns, coreDict);
 94				break;
 95			}
 96		}
 97
 98		return true;
 99	}
100
101	public boolean recognise(ArrayList<SegNode> sns) {
102
103		if (sns != null && unknownDict != null && context != null) {
104			posTag(sns);
105			getBestPos(sns);
106			// DebugUtil.outputPostag(sns);
107			switch (tagType) {
108			case TT_NORMAL:
109				for (SegNode sn : sns) {
110					if (sn.getPos() == 0) {
111						sn.setPos(getBestTag(sn));
112					}
113				}
114			}
115		}
116
117		return true;
118	}
119
120	/**
121	 * ??????????
122	 * 
123	 * @param frs
124	 *            ???????
125	 * @pararm startIndex ???????????
126	 * @param coreDict
127	 *            ?????
128	 * @param unknownDict
129	 *            ??????
130	 * @return ??????????
131	 */
132	public void posTag(ArrayList<SegNode> sns) {
133
134		if (sns != null && coreDict != null && unknownDict != null && context != null) {
135			int i = 0;
136			String curWord = null;
137
138			for (; i < sns.size(); i++) {
139				SegNode sn = sns.get(i);
140				sn.setAllPos(null);
141				curWord = sn.getSrcWord();
142				int gbkID = sn.getGbkID();// dictLib.getGBKID(curWord);
143				// if (tagType == Utility.TAG_TYPE.TT_NORMAL ||
144				// !unknownDict.isExist(sn.getWord(), 44)) {
145				//
146				// }
147
148				if (tagType != Utility.TAG_TYPE.TT_NORMAL) {
149
150					// ????????????
151					if (tagType == Utility.TAG_TYPE.TT_TRANS_PERSON && i > 0) {
152						String prevWord = sns.get(i - 1).getSrcWord();
153						if (Utility.charType(prevWord) == Utility.CT_CHINESE) {
154							if (".".equals(curWord))
155								curWord = "?";
156							else if ("-".equals(curWord))
157								curWord = "?";
158						}
159					}
160
161					if (sn.getPos() < 0) {
162						AdjoiningPos pos = new AdjoiningPos( 0 , 0);
163						sn.addPos(pos);
164					} else {
165						// ?unknownDict?????????????
166						SegAtom sa = unknownDict.getSegAtom(curWord, gbkID);
167						for (int j = 0; sa != null && j < sa.getPosCount(); j++) {
168							Pos pos = sa.getPos(j);
169							double value = -Math.log((1 + pos.getFreq()));
170							value += Math.log((context.getFreq(pos.getTag()) + sa.getPosCount() + 1));
171							AdjoiningPos apos = new AdjoiningPos(pos , value);
172							sn.addPos(apos);
173						}
174
175						if (Utility.SENTENCE_BEGIN.equals(curWord))
176							sn.addPos(new AdjoiningPos( 100 , 0));
177
178						else if (Utility.SENTENCE_END.equals(curWord))
179							sn.addPos(new AdjoiningPos( 101 , 0));
180						else {
181							int freq = 0;
182							sa = coreDict.getSegAtom(curWord, gbkID);
183							if (sa != null) {
184								double value = -Math.log((double) (1 + freq));
185								value += Math.log((double) (context.getFreq(0) + sa.getPosCount()));
186								sn.addPos(new AdjoiningPos( 0 , value));
187
188							}
189						}
190					}
191				} else {
192					if (sn.getPos() > 0) {
193						int tag = sn.getPos();
194						double value = -Math.log(1 + sn.getFreq());
195						value += Math.log(1 + context.getFreq(tag));
196						if (value < 0)
197							value = 0;
198						sn.addPos(new AdjoiningPos( tag,  value));
199					} else {
200						if (sn.getPos() < 0) {
201							sn.setPos(-sn.getPos());
202							sn.addPos(new AdjoiningPos( -sn.getPos(),  sn.getFreq()));
203						}
204						SegAtom sa = coreDict.getSegAtom(curWord, gbkID);
205						if (sa != null) {
206							for (int j = 0; j < sa.getPosCount(); j++) {
207								Pos pos = sa.getPos(j);
208								double value = -Math.log(1 + pos.getFreq());
209								value += Math.log(context.getFreq(pos.getTag()) + sa.getPosCount());
210								sn.addPos(new AdjoiningPos(pos , value));
211							}
212						}
213					}
214				}
215
216				if (sn.getAllPos() == null)
217					guessPos(tagType, sn);
218
219				// ??????????allPos?null???????????
220				// ????????????,??????“?##?”??
221				if (i - 1 >= 0 && sns.get(i - 1).getPosSize() == -1) {
222					if (sn.getPosSize() > 0) {
223						Pos pos = sn.getAllPos().get(0).getPos();
224						int ipos = pos.getTag() == POSTag.SEN_END ? POSTag.UNKNOWN : pos.getTag();
225						AdjoiningPos apos = new AdjoiningPos( ipos , 0);
226						sns.get(i - 1).addPos(apos);
227					}
228				}
229			}
230
231			// ???????
232			SegNode last = sns.get(i - 1);
233			if (last != null) {
234				SegNode sn = new SegNode();
235				int tag = 0;
236				if (tagType != Utility.TAG_TYPE.TT_NORMAL)
237					tag = 101;
238				else
239					tag = 1;
240				AdjoiningPos pos = new AdjoiningPos( tag, 0);
241				sn.addPos(pos);
242				sns.add(sn);
243			}
244		}
245	}
246
247	/**
248	 * ???????N??????????????????
249	 */
250	private void getBestPos(ArrayList<SegNode> sns) {
251		ArrayList<AdjoiningPos> prevAllPos = null;
252		ArrayList<AdjoiningPos> allPos = null;
253		if (sns != null && context != null) {
254			for (int i = 0; i < sns.size(); i++) {
255				if (i == 0) {
256					int pos = tagType != Utility.TAG_TYPE.TT_NORMAL ? 100 : 0;
257					prevAllPos = new ArrayList<AdjoiningPos>();
258					prevAllPos.add(new AdjoiningPos(pos, 0));
259				} else {
260					prevAllPos = sns.get(i - 1).getAllPos();
261				}
262				allPos = sns.get(i).getAllPos();
263				if (allPos != null)
264					for (AdjoiningPos pos : allPos) {
265						// ?????????????????????
266						int bestPrev = 0;
267						double minValue = 10000000;
268						for (int k = 0; prevAllPos != null && k < prevAllPos.size(); k++) {
269							AdjoiningPos prevPos = prevAllPos.get(k);
270							double temp = context.computePossibility(prevPos.getPos().getTag(), pos.getPos().getTag());
271							temp = -Math.log(temp) + prevPos.getValue();
272							if (temp < minValue) {
273								minValue = temp;
274								bestPrev = k;
275							}
276						}
277
278						pos.setPrev(bestPrev);
279						pos.setValue(pos.getValue() + minValue);
280					}
281			}
282
283			tagBest(sns);
284
285			// for(SegNode sn:sns){
286			// String word=sn.getSrcWord();
287			// System.out.println(word+":");
288			// for(AdjoiningPos ap:sn.getAllPos()){
289			// System.out.println("
290			// "+POSTag.int2str(ap.getPos())+","+ap.getValue()+","+ap.getPrev()+","+ap.isBest());
291			// }
292			// }
293		}
294	}
295
296	// ???????
297	private int guessPos(TAG_TYPE tagType, SegNode sn) {
298		int result = -1;
299		if (sn != null && context != null) {
300			int charType;
301			double freq = 0;
302
303			String word = sn.getWord();
304			if (word == null)
305				return result;
306
307			switch (tagType) {
308			case TT_NORMAL:
309				break;
310			case TT_PERSON:
311				if (word.indexOf("××") != -1) {
312					freq = (double) 1 / (double) (context.getFreq(6) + 1);
313					sn.addPos(new AdjoiningPos(6, freq));
314				} else {
315					freq = (double) 1 / (double) (context.getFreq(0) + 1);
316					sn.addPos(new AdjoiningPos(0, freq));
317
318					if (sn.getLen() >= 4) {
319						freq = (double) 1 / (double) (context.getFreq(0) + 1);
320						sn.addPos(new AdjoiningPos(0, freq));
321						freq = (double) 1 / (double) (context.getFreq(11) * 8);
322						sn.addPos(new AdjoiningPos(11, freq));
323						freq = (double) 1 / (double) (context.getFreq(12) * 8);
324						sn.addPos(new AdjoiningPos(12, freq));
325						freq = (double) 1 / (double) (context.getFreq(13) * 8);
326						sn.addPos(new AdjoiningPos(13, freq));
327					} else if (sn.getLen() == 2) {
328						freq = (double) 1 / (double) (context.getFreq(0) + 1);
329						sn.addPos(new AdjoiningPos(0, freq));
330						charType = Utility.charType(word);
331						if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
332							freq = (double) 1 / (double) (context.getFreq(1) + 1);
333							sn.addPos(new AdjoiningPos(1, freq));
334							freq = (double) 1 / (double) (context.getFreq(2) + 1);
335							sn.addPos(new AdjoiningPos(2, freq));
336							freq = (double) 1 / (double) (context.getFreq(3) + 1);
337							sn.addPos(new AdjoiningPos(3, freq));
338							freq = (double) 1 / (double) (context.getFreq(4) + 1);
339							sn.addPos(new AdjoiningPos(4, freq));
340						}
341						freq = (double) 1 / (double) (context.getFreq(11) * 8);
342						sn.addPos(new AdjoiningPos(11, freq));
343						freq = (double) 1 / (double) (context.getFreq(12) * 8);
344						sn.addPos(new AdjoiningPos(12, freq));
345						freq = (double) 1 / (double) (context.getFreq(13) * 8);
346						sn.addPos(new AdjoiningPos(13, freq));
347					}
348				}
349				break;
350			case TT_PLACE:
351				freq = (double) 1 / (double) (context.getFreq(0) + 1);
352				sn.addPos(new AdjoiningPos(0, freq));
353
354				if (sn.getLen() >= 4) {
355					freq = (double) 1 / (double) (context.getFreq(11) * 8);
356					sn.addPos(new AdjoiningPos(11, freq));
357					freq = (double) 1 / (double) (context.getFreq(12) * 8);
358					sn.addPos(new AdjoiningPos(12, freq));
359					freq = (double) 1 / (double) (context.getFreq(13) * 8);
360					sn.addPos(new AdjoiningPos(13, freq));
361				} else if (sn.getLen() == 2) {
362					freq = (double) 1 / (double) (context.getFreq(0) + 1);
363					sn.addPos(new AdjoiningPos(0, freq));
364					charType = Utility.charType(word);
365					if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
366
367						freq = (double) 1 / (double) (context.getFreq(1) + 1);
368						sn.addPos(new AdjoiningPos(1, freq));
369						freq = (double) 1 / (double) (context.getFreq(2) + 1);
370						sn.addPos(new AdjoiningPos(2, freq));
371						freq = (double) 1 / (double) (context.getFreq(3) + 1);
372						sn.addPos(new AdjoiningPos(3, freq));
373						freq = (double) 1 / (double) (context.getFreq(4) + 1);
374						sn.addPos(new AdjoiningPos(4, freq));
375					}
376					freq = (double) 1 / (double) (context.getFreq(11) * 8);
377					sn.addPos(new AdjoiningPos(11, freq));
378					freq = (double) 1 / (double) (context.getFreq(12) * 8);
379					sn.addPos(new AdjoiningPos(12, freq));
380					freq = (double) 1 / (double) (context.getFreq(13) * 8);
381					sn.addPos(new AdjoiningPos(13, freq));
382				}
383				break;
384			case TT_TRANS_PERSON:
385				freq = (double) 1 / (double) (context.getFreq(0) + 1);
386				sn.addPos(new AdjoiningPos(0, freq));
387				if (!Utility.isAllChinese(word)) {
388					if (Utility.isAllLetter(word)) {
389						freq = (double) 1 / (double) (context.getFreq(1) + 1);
390						sn.addPos(new AdjoiningPos(1, freq));
391						freq = (double) 1 / (double) (context.getFreq(11) + 1);
392						sn.addPos(new AdjoiningPos(11, freq));
393						freq = (double) 1 / (double) (context.getFreq(2) * 2 + 1);
394						sn.addPos(new AdjoiningPos(2, freq));
395						freq = (double) 1 / (double) (context.getFreq(3) * 2 + 1);
396						sn.addPos(new AdjoiningPos(3, freq));
397						freq = (double) 1 / (double) (context.getFreq(12) * 2 + 1);
398						sn.addPos(new AdjoiningPos(12, freq));
399						freq = (double) 1 / (double) (context.getFreq(13) * 2 + 1);
400						sn.addPos(new AdjoiningPos(13, freq));
401					}
402					freq = (double) 1 / (double) (context.getFreq(41) * 8);
403					sn.addPos(new AdjoiningPos(41, freq));
404					freq = (double) 1 / (double) (context.getFreq(42) * 8);
405					sn.addPos(new AdjoiningPos(42, freq));
406					freq = (double) 1 / (double) (context.getFreq(43) * 8);
407					sn.addPos(new AdjoiningPos(43, freq));
408				} else if (sn.getLen() >= 4) {
409					freq = (double) 1 / (double) (context.getFreq(41) * 8);
410					sn.addPos(new AdjoiningPos(41, freq));
411					freq = (double) 1 / (double) (context.getFreq(42) * 8);
412					sn.addPos(new AdjoiningPos(42, freq));
413					freq = (double) 1 / (double) (context.getFreq(43) * 8);
414					sn.addPos(new AdjoiningPos(43, freq));
415				} else if (sn.getLen() == 2) {
416					charType = Utility.charType(word);
417					if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
418						freq = (double) 1 / (double) (context.getFreq(1) * 2 + 1);
419						sn.addPos(new AdjoiningPos(1, freq));
420						freq = (double) 1 / (double) (context.getFreq(2) * 2 + 1);
421						sn.addPos(new AdjoiningPos(2, freq));
422						freq = (double) 1 / (double) (context.getFreq(3) * 2 + 1);
423						sn.addPos(new AdjoiningPos(3, freq));
424						freq = (double) 1 / (double) (context.getFreq(30) * 8 + 1);
425						sn.addPos(new AdjoiningPos(30, freq));
426						freq = (double) 1 / (double) (context.getFreq(11) * 4 + 1);
427						sn.addPos(new AdjoiningPos(11, freq));
428						freq = (double) 1 / (double) (context.getFreq(12) * 4 + 1);
429						sn.addPos(new AdjoiningPos(12, freq));
430						freq = (double) 1 / (double) (context.getFreq(13) * 4 + 1);
431						sn.addPos(new AdjoiningPos(13, freq));
432						freq = (double) 1 / (double) (context.getFreq(21) * 2 + 1);
433						sn.addPos(new AdjoiningPos(21, freq));
434						freq = (double) 1 / (double) (context.getFreq(22) * 2 + 1);
435						sn.addPos(new AdjoiningPos(22, freq));
436						freq = (double) 1 / (double) (context.getFreq(23) * 2 + 1);
437						sn.addPos(new AdjoiningPos(23, freq));
438					}
439					freq = (double) 1 / (double) (context.getFreq(41) * 8);
440					sn.addPos(new AdjoiningPos(41, freq));
441					freq = (double) 1 / (double) (context.getFreq(42) * 8);
442					sn.addPos(new AdjoiningPos(42, freq));
443					freq = (double) 1 / (double) (context.getFreq(43) * 8);
444					sn.addPos(new AdjoiningPos(43, freq));
445				}
446				break;
447			default:
448				break;
449			}
450			if (sn.getAllPos() != null)
451				result = sn.getAllPos().size();
452		}
453		return result;
454	}
455
456	/**
457	 * ??????
458	 * 
459	 * <pre>
460	 *           
461	 *           BBCD 343 0.003606 
462	 *           BBC 2 0.000021 
463	 *           BBE 125 0.001314 
464	 *           BBZ 30 0.000315 
465	 *           BCD 62460 0.656624 
466	 *           BEE 0 0.000000 
467	 *           BE 13899 0.146116 
468	 *           BG 869 0.009136 
469	 *           BXD 4 0.000042 
470	 *           BZ 3707 0.038971 
471	 *           CD 8596 0.090367 
472	 *           EE 26 0.000273 
473	 *           FB 871 0.009157 
474	 *           Y 3265 0.034324
475	 *           XD 926 0.009735
476	 *           
477	 *           The person recognition patterns set
478	 *           BBCD:?+?+?1+?2;
479	 *           BBE: ?+?+??;
480	 *           BBZ: ?+?+????;
481	 *           BCD: ?+?1+?2;
482	 *           BE: ?+??;
483	 *           BEE: ?+??+??;???
484	 *           BG: ?+??
485	 *           BXD: ?+???????+????
486	 *           BZ: ?+????;
487	 *           B: ?
488	 *           CD: ?1+?2;
489	 *           EE: ??+??;
490	 *           FB: ??+?
491	 *           XD: ???????+????
492	 *           Y: ?????
493	 * </pre>
494	 */
495	private void personRecognize(SegGraph segGraph, ArrayList<SegNode> sns) {
496		String sPos = null;
497		String personName = null;
498		// ??????
499		final String[] patterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
500		final double[] factor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136, 0.000042, 0.038971, 0, 0.090367,
501				0.000273, 0.009157, 0.034324, 0.009735, 0 };
502
503		if (segGraph != null && sns != null) {
504			int j = 1, k, nPos;
505			boolean bMatched = false;
506
507			sPos = word2pattern(sns);
508			while (sPos != null && j < sPos.length()) {
509				bMatched = false;
510				for (k = 0; !bMatched && patterns[k].length() > 0; k++) {
511					// ?????????????????????????????????????
512					if (sPos.substring(j).indexOf(patterns[k]) == 0 && !"?".equals(sns.get(j - 1).getWord())
513							&& !"?".equals(sns.get(j + patterns[k].length()))) {// Find
514
515						String temp = sPos.substring(j + 2);
516						if (temp.length() > 1)
517							temp = temp.substring(0, 1);
518
519						// Rule 1 for exclusion:??+?+?1(?2): ??(??+?)???
520						if ("FB".equals(patterns[k]) && ("E".equals(temp) || "C".equals(temp) || "G".equals(temp))) {
521							continue;
522						}
523
524						nPos = j;
525						personName = "";
526						// Get the possible person name
527						while (nPos < j + patterns[k].length()) {
528							SegNode sn = sns.get(nPos);
529							int gbkID = sn.getGbkID();// dictLib.getGBKID(sn.getSrcWord());
530							if (sn.getPos() < 4 && unknownDict.getFreq(sn.getSrcWord(), sn.getPos(), gbkID) < Utility.LITTLE_FREQUENCY)
531								personName += sn.getSrcWord();
532							nPos += 1;
533						}
534						if ("CDCD".equals(patterns[k])) {
535							if (GetForeignCharCount(personName) > 0)
536								j += patterns[k].length() - 1;
537							continue;
538						}
539
540						SegNode usn = new SegNode();
541						usn.setRow(sns.get(j).getRow());
542						usn.setCol(sns.get(j + patterns[k].length() - 1).getCol());
543						usn.setWord(unknownFlags);
544						usn.setSrcWord(personName);
545						double value = -Math.log(factor[k]) + computePossibility(j, patterns[k].length(), sns);
546						usn.setPos(pos);
547						usn.setWeight(value);
548						segGraph.insert(usn, true);
549
550						j += patterns[k].length();
551						bMatched = true;
552					}
553				}
554				if (!bMatched)// Not matched, add j by 1
555					j += 1;
556			}
557
558		}
559	}
560
561	// TODO:
562	private int GetForeignCharCount(String personName) {
563		return 0;
564	}
565
566	/**
567	 * ??????
568	 * 
569	 */
570	private void placeRecognize(SegGraph segGraph, ArrayList<SegNode> sns, Dictionary coreDict) {
571		if (segGraph != null && coreDict != null) {
572			int start = 1;
573			int end = 1;
574			double dPanelty = 1;
575			String srcWord = "";
576			for (int i = 1; i < sns.size(); i++) {
577				start = i;
578				end = start;
579				srcWord = sns.get(i).getSrcWord();
580				if (getBestTag(sns, i) == 1) {
581					for (end = i + 1; end < sns.size(); end++) {
582						int bestTag = getBestTag(sns, end);
583						if (bestTag == -1)
584							continue;
585						else if (bestTag == 1 || bestTag == 3) {
586							if (end > i + 1)
587								dPanelty += 1;
588							srcWord += sns.get(end).getSrcWord();
589						} else if (bestTag == 2)
590							srcWord += sns.get(end).getSrcWord();
591						else
592							break;
593					}
594
595				} else if (getBestTag(sns, i) == 2) {
596					dPanelty += 1;
597					for (end = i + 1; end < sns.size(); end++) {
598						int bestTag = getBestTag(sns, end);
599						if (bestTag == -1)
600							continue;
601						else if (bestTag == 3) {
602							if (end > i + 1)
603								dPanelty += 1;
604							srcWord += sns.get(end).getSrcWord();
605						} else if (bestTag == 2)
606							srcWord += sns.get(end).getSrcWord();
607						else
608							break;
609					}
610				}
611				if (end > start) {
612					SegNode newsn = new SegNode();
613					newsn.setRow(sns.get(start).getRow());
614					newsn.setCol(sns.get(end - 1).getCol());
615					newsn.setPos(pos);
616					newsn.setWord(unknownFlags);
617					newsn.setSrcWord(srcWord);
618					double value = computePossibility(start, end - start + 1, sns);
619					newsn.setWeight(value);
620					segGraph.insert(newsn, true);
621				}
622			}
623		}
624	}
625
626	private int getBestTag(ArrayList<SegNode> sns, int index) {
627		if (sns != null && index >= 0 && index < sns.size()) {
628			SegNode sn = sns.get(index);
629			return getBestTag(sn);
630
631		}
632
633		return -1;
634	}
635
636	private int getBestTag(SegNode sn) {
637		if (sn != null) {
638			ArrayList<AdjoiningPos> allPos = sn.getAllPos();
639			if (allPos != null) {
640				for (AdjoiningPos pos : allPos) {
641					if (pos.isBest())
642						return pos.getPos().getTag();
643				}
644			}
645		}
646
647		return -1;
648	}
649
650	// Judge whether the name is a given name
651	public boolean isGivenName(String sName) {
652		String firstChar;
653		String secondChar;
654		// given Name Possibility
655		double gnp = 0;
656		// singleNamePossibility
657		double snp = 0;
658
659		if (sName != null) {
660			if (sName.getBytes().length != 4)
661				return false;
662
663			firstChar = sName.substring(0, 1);
664			int gbkID1 = dictLib.getGBKID(firstChar);
665			secondChar = sName.substring(1);
666			int gbkID2 = dictLib.getGBKID(secondChar);
667
668			// The possibility of P(Wi|Ti)
669			gnp += Math.log((double) unknownDict.getFreq(firstChar, 2, gbkID1) + 1.0);
670			gnp -= Math.log(context.getFreq(2) + 1.0);
671			gnp += Math.log((double) unknownDict.getFreq(secondChar, 3, gbkID2) + 1.0);
672			gnp -= Math.log(context.getFreq(3) + 1.0);
673			// The possibility of conversion from 2 to 3
674			gnp += Math.log(context.computePossibility(2, 3) + 1.0);
675			gnp -= Math.log(context.getFreq(2) + 1.0);
676
677			// The possibility of P(Wi|Ti)
678			snp += Math.log((double) unknownDict.getFreq(firstChar, 1, gbkID1) + 1.0);
679			snp -= Math.log(context.getFreq(1) + 1.0);
680			snp += Math.log((double) unknownDict.getFreq(secondChar, 4, gbkID2) + 1.0);
681			snp -= Math.log(context.getFreq(4) + 1.0);
682			// The possibility of conversion from 1 to 4
683			snp += Math.log(context.computePossibility(1, 4) + 1.0);
684			snp -= Math.log(context.getFreq(1) + 1.0);
685
686			// ??||m_dict.getFrequency(sFirstChar,1)/m_dict.getFrequency(sFirstChar,2)>=10
687			// The possibility being a single given name is more than being a
688			// 2-char given name
689			if (snp >= gnp)
690				return false;
691			return true;
692		}
693
694		return false;
695	}
696
697	// ??????????????????????
698	private String word2pattern(ArrayList<SegNode> sns) {
699		String result = null;
700
701		if (sns != null) {
702			result = "";
703			for (SegNode sn : sns) {
704				result += (char) (getBestTag(sn) + 'A');
705			}
706
707		}
708		return result;
709	}
710
711	/**
712	 * ???????
713	 * 
714	 * @param sns
715	 */
716	private void tagBest(ArrayList<SegNode> sns) {
717
718		if (sns != null) {
719			int size = sns.size();
720
721			// ??????????
722			for (int i = size - 1, j = 0; i >= 0; i--) {
723				SegNode sn = sns.get(i);
724				ArrayList<AdjoiningPos> allPos = sn.getAllPos();
725				if (allPos != null && allPos.size() > j) {
726					AdjoiningPos pos = allPos.get(j);
727					pos.setBest(true);
728					j = pos.getPrev();
729				} else if (i + 1 < size - 1) {
730					int tag = getBestTag(sns.get(i + 1));
731					AdjoiningPos pos = new AdjoiningPos(tag, 0);
732					pos.setBest(true);
733					sns.get(i).addPos(pos);
734				}
735				// ?????????????????????????????????
736				if (sn.getPos() == POSTag.NOUN_LETTER || sn.getPos() == POSTag.NUM) {
737					for (AdjoiningPos pos : allPos) {
738						if (pos.isBest() && pos.getPos().getTag() > 0) {
739							sn.setPos(pos.getPos().getTag());
740							break;
741						}
742					}
743				}
744			}
745			// ????????????????????????“????”??????
746
747			if (size > 1) {
748				if (sns.get(size - 1).getWord() == null)
749					sns.remove(size - 1);
750			}
751		}
752	}
753
754	private double computePossibility(int startPos, int length, ArrayList<SegNode> sns) {
755		double retValue = 0, posPoss;
756
757		if (sns != null && unknownDict != null && context != null) {
758			for (int i = startPos; sns != null && i < startPos + length && i < sns.size(); i++) {
759				SegNode sn = sns.get(i);
760				int bestTag = getBestTag(sn);
761				if (bestTag != -1) {
762					int gbkID = sn.getGbkID();// dictLib.getGBKID(sn.getSrcWord());
763					int freq = unknownDict.getFreq(sn.getSrcWord(), bestTag, gbkID);
764					posPoss = Math.log((double) (context.getFreq(sn.getPos()) + 1));
765					posPoss += -Math.log((double) (freq + 1));
766					retValue += posPoss;
767				}
768			}
769		}
770		return retValue;
771	}
772
773	public Dictionary getUnknownDict() {
774		return unknownDict;
775	}
776
777}