/doc/ICTCLAS_Diary/2006-03-10.rtf
http://ictclas4j.googlecode.com/ · Unknown · 81 lines · 80 code · 1 blank · 0 comment · 0 complexity · 78d80439bba24f682d09fc07f8c166b6 MD5 · raw file
- {\rtf1\ansi\ansicpg1252\uc2\deff0\stshfdbch13\stshfloch0\stshfhich0\stshfbi0\deflang1033\deflangfe2052{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
- {\f3\froman\fcharset2\fprq2{\*\panose 05050102010706020507}Symbol;}{\f13\fnil\fcharset134\fprq2{\*\panose 02010600030101010101}SimSun{\*\falt \'cb\'ce\'cc\'e5};}{\f37\fnil\fcharset134\fprq2{\*\panose 02010600030101010101}@SimSun;}
- {\f38\froman\fcharset238\fprq2 Times New Roman CE;}{\f39\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f41\froman\fcharset161\fprq2 Times New Roman Greek;}{\f42\froman\fcharset162\fprq2 Times New Roman Tur;}
- {\f43\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f44\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f45\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f46\froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
- {\f48\fswiss\fcharset238\fprq2 Arial CE;}{\f49\fswiss\fcharset204\fprq2 Arial Cyr;}{\f51\fswiss\fcharset161\fprq2 Arial Greek;}{\f52\fswiss\fcharset162\fprq2 Arial Tur;}{\f53\fswiss\fcharset177\fprq2 Arial (Hebrew);}
- {\f54\fswiss\fcharset178\fprq2 Arial (Arabic);}{\f55\fswiss\fcharset186\fprq2 Arial Baltic;}{\f56\fswiss\fcharset163\fprq2 Arial (Vietnamese);}{\f170\fnil\fcharset0\fprq2 SimSun Western{\*\falt \'cb\'ce\'cc\'e5};}
- {\f410\fnil\fcharset0\fprq2 @SimSun Western;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;
- \red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
- \fs24\lang1033\langfe3076\loch\f0\hich\af0\dbch\af13\cgrid\langnp1033\langfenp3076 \snext0 Normal;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\*
- \ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv
- \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden Normal Table;}}{\*\latentstyles\lsdstimax156\lsdlockeddef0}{\*\listtable{\list\listtemplateid1776563028
- \listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat0\levelspace0\levelindent0{\leveltext\'01_;}{\levelnumbers;}\f3\fs20 \fi-240\li240\jclisttab\tx390\lin240 }{\listname ;}\listid43587243}}{\*\listoverridetable
- {\listoverride\listid43587243\listoverridecount0\ls1}}{\*\rsidtbl \rsid10966067}{\*\generator Microsoft Word 11.0.6359;}{\info{\author SEEM}{\operator SEEM}{\creatim\yr2006\mo4\dy13\hr23\min21}{\revtim\yr2006\mo4\dy13\hr23\min23}{\version2}{\edmins2}
- {\nofpages1}{\nofwords376}{\nofchars2145}{\*\company CUHK}{\nofcharsws2516}{\vern24703}}\paperw12240\paperh15840\margl1800\margr1800\margt1440\margb1440\gutter0
- \widowctrl\ftnbj\aenddoc\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\hyphcaps0\horzdoc\dghspace120\dgvspace120\dghorigin1701\dgvorigin1984\dghshow0\dgvshow3\jcompress\viewkind1\viewscale100\nolnhtadjtbl\rsidroot10966067 \fet0\sectd
- \linex0\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl4
- \pnlcltr\pnstart1\pnindent720\pnhang {\pntxta \hich )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl7
- \pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}\pard\plain
- \ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 \fs24\lang1033\langfe3076\loch\af0\hich\af0\dbch\af13\cgrid\langnp1033\langfenp3076 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1
- About the symbol set found in lexical.ctx.shw:
- \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 Outputting coreDict.dct to the text file coreDict.txt and searching for the symbol 1 shows that the only entry that has this symbol is }{
- \f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u22987\'ca\'bc}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 ##}{
- \f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u22987\'ca\'bc}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1
- , which means the beginning of a sentence. Moreover, the only entry that has the symbol 4 is }{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u26411\'c4\'a9}{
- \f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 ##}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u26411\'c4\'a9}{
- \f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 , which means the end of a sentence. Yesterday's guess was correct!}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 Other unknown symbols are found in the textfile converted from coreDict.dct. They may\hich\af1\dbch\af13\loch\f1
- be POS tags extended by ICT based on the PKU POS tags. }{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par }\pard \ql \li0\ri0\nowidctlpar{\*\pn \pnlvlcont\ilvl0\ls0\pnrnot0\pndec }\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par \hich\af1\dbch\af13\loch\f1 Question: }{\b\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 when is the information of lexical.ctx used}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \hich\af1\dbch\af13\loch\f1 ?
- \par \hich\af1\dbch\af13\loch\f1 Ans: Probably used only in the last stage, i.e. m_POSTagger.POStagging() in CResult::Processing(), because in the constructor of CResult, "le\hich\af1\dbch\af13\loch\f1
- xical.ctx" is only loaded into m_POSTagger, and m_POSTagger only takes action in the last stage of CResult::Processing().
- \par
- \par \hich\af1\dbch\af13\loch\f1 About .dct and .ctx for named entities (i.e. nr, tr and ns):
- \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The .dct files are converted to text files and the content of .ctx files \hich\af1\dbch\af13\loch\f1
- can be viewed from .ctx.shw files. They are placed inside the folder \hich\af1\dbch\af13\loch\f1 ICTCLAS/Data_covertion/data_in_text. Each pair of .dct and .ctx files works together (e.g. nr.dct and nr.ctx work together).}{
- \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The symbols used in a pair do not have direct relation to\hich\af1\dbch\af13\loch\f1
- those used in other pairs. E.g. the symbol 1 in nr.ctx/nr.dct has a different meaning from the symbol 1 in ns.ctx/ns.dct.}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 Very probably, the symbols represent the roles in Role Model for recognition of unknown named entities.}{
- \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The meaning of the symbol can be guessed\hich\af1\dbch\af13\loch\f1
- , but there is no documentation in the code to explain what the symbols stand for.}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The symbol set can be re-defined and changed if training is needed.}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
-
- \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The .ctx files contain the frequency count for a role to transit to another role.}{
- \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par }\pard \ql \li0\ri0\nowidctlpar{\*\pn \pnlvlcont\ilvl0\ls0\pnrnot0\pndec }\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par \hich\af1\dbch\af13\loch\f1 About how to read \hich\af1\dbch\af13\loch\f1 .dct text file and .ctx.shw files:
- \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 How to read dictionary text file:}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par }\pard \ql \li0\ri0\nowidctlpar{\*\pn \pnlvlcont\ilvl0\ls0\pnrnot0\pndec }\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 E.g. for an entry:
- \par }{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u-26418\'b7\'e7\loch\af1\hich\af1\dbch\f13 \u21270\'bb\'af}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1
- [2] n(28160)[0] v(30208)[0] vn(30318)[0]
- \par
- \par \hich\af1\dbch\af13\loch\f1 inside 1st []: wordLen (not counting the leading Chinese char)
- \par \hich\af1\dbch\af13\loch\f1 before(): POS tag in string / symbol
- \par \hich\af1\dbch\af13\loch\f1 inside(): symbol/handle\hich\af1\dbch\af13\loch\f1 value/POS tag
- \par \hich\af1\dbch\af13\loch\f1 inside[]: frequency
- \par
- \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 How to read .ctx.shw file:}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
- \par }\pard \ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 "Table Len" - the number of symbols/tags
- \par \hich\af1\dbch\af13\loch\f1 "Symbol" - the symbol set
- \par \hich\af1\dbch\af13\loch\f1 "nKey" - its usage is unknown. Can be ignored.
- \par \hich\af1\dbch\af13\loch\f1 "frequency" - the total frequency obtained by adding all the entries of the following transition matrix.
- \par \hich\af1\dbch\af13\loch\f1 A row in the symbol transition matrix:
- \par \hich\af1\dbch\af13\loch\f1 No. [index]= [from_symbol]: [freq of to_symbol 1] [freq of to_symbol 2] ... [freq of to_symbol m] total=[total freq]:
- \par \hich\af1\dbch\af13\loch\f1 T\hich\af1\dbch\af13\loch\f1 he n_th column is the frequency with which the n_th symbol appears before the from_symbol. "total freq" is the sum of the frequencies of all the to_symbols.
- \par
- \par
- \par }}