PageRenderTime 19ms CodeModel.GetById 8ms app.highlight 4ms RepoModel.GetById 1ms app.codeStats 0ms

/doc/ICTCLAS_Diary/2006-03-10.rtf

http://ictclas4j.googlecode.com/
Unknown | 81 lines | 80 code | 1 blank | 0 comment | 0 complexity | 78d80439bba24f682d09fc07f8c166b6 MD5 | raw file
 1{\rtf1\ansi\ansicpg1252\uc2\deff0\stshfdbch13\stshfloch0\stshfhich0\stshfbi0\deflang1033\deflangfe2052{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
 2{\f3\froman\fcharset2\fprq2{\*\panose 05050102010706020507}Symbol;}{\f13\fnil\fcharset134\fprq2{\*\panose 02010600030101010101}SimSun{\*\falt \'cb\'ce\'cc\'e5};}{\f37\fnil\fcharset134\fprq2{\*\panose 02010600030101010101}@SimSun;}
 3{\f38\froman\fcharset238\fprq2 Times New Roman CE;}{\f39\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f41\froman\fcharset161\fprq2 Times New Roman Greek;}{\f42\froman\fcharset162\fprq2 Times New Roman Tur;}
 4{\f43\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f44\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f45\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f46\froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
 5{\f48\fswiss\fcharset238\fprq2 Arial CE;}{\f49\fswiss\fcharset204\fprq2 Arial Cyr;}{\f51\fswiss\fcharset161\fprq2 Arial Greek;}{\f52\fswiss\fcharset162\fprq2 Arial Tur;}{\f53\fswiss\fcharset177\fprq2 Arial (Hebrew);}
 6{\f54\fswiss\fcharset178\fprq2 Arial (Arabic);}{\f55\fswiss\fcharset186\fprq2 Arial Baltic;}{\f56\fswiss\fcharset163\fprq2 Arial (Vietnamese);}{\f170\fnil\fcharset0\fprq2 SimSun Western{\*\falt \'cb\'ce\'cc\'e5};}
 7{\f410\fnil\fcharset0\fprq2 @SimSun Western;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;
 8\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
 9\fs24\lang1033\langfe3076\loch\f0\hich\af0\dbch\af13\cgrid\langnp1033\langfenp3076 \snext0 Normal;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\*
10\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv 
11\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden Normal Table;}}{\*\latentstyles\lsdstimax156\lsdlockeddef0}{\*\listtable{\list\listtemplateid1776563028
12\listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat0\levelspace0\levelindent0{\leveltext\'01_;}{\levelnumbers;}\f3\fs20 \fi-240\li240\jclisttab\tx390\lin240 }{\listname ;}\listid43587243}}{\*\listoverridetable
13{\listoverride\listid43587243\listoverridecount0\ls1}}{\*\rsidtbl \rsid10966067}{\*\generator Microsoft Word 11.0.6359;}{\info{\author SEEM}{\operator SEEM}{\creatim\yr2006\mo4\dy13\hr23\min21}{\revtim\yr2006\mo4\dy13\hr23\min23}{\version2}{\edmins2}
14{\nofpages1}{\nofwords376}{\nofchars2145}{\*\company CUHK}{\nofcharsws2516}{\vern24703}}\paperw12240\paperh15840\margl1800\margr1800\margt1440\margb1440\gutter0 
15\widowctrl\ftnbj\aenddoc\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\hyphcaps0\horzdoc\dghspace120\dgvspace120\dghorigin1701\dgvorigin1984\dghshow0\dgvshow3\jcompress\viewkind1\viewscale100\nolnhtadjtbl\rsidroot10966067 \fet0\sectd 
16\linex0\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl4
17\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta \hich )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl7
18\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}\pard\plain 
19\ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 \fs24\lang1033\langfe3076\loch\af0\hich\af0\dbch\af13\cgrid\langnp1033\langfenp3076 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 
20About the symbol set found in lexical.ctx.shw:
21\par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
22\faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 After outputting coreDict.dct to the text file coreDict.txt and searching for the symbol 1, the only entry found to have this symbol is }{
23\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u22987\'ca\'bc}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 ##}{
24\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u22987\'ca\'bc}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 
25, which means the beginning of a sentence. Moreover, the only entry that has the symbol 4 is }{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u26411\'c4\'a9}{
26\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 ##}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u26411\'c4\'a9}{
27\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 , which means the end of a sentence. Yesterday's guess is correct!}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
28\par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
29\faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 Other unknown symbols are found in the textfile converted from coreDict.dct. They may\hich\af1\dbch\af13\loch\f1 
30 be POS tags extended by ICT based on the PKU POS tags. }{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
31\par }\pard \ql \li0\ri0\nowidctlpar{\*\pn \pnlvlcont\ilvl0\ls0\pnrnot0\pndec }\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
32\par \hich\af1\dbch\af13\loch\f1 Question: }{\b\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 when is the information of lexical.ctx used}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
33\hich\af1\dbch\af13\loch\f1 ? 
34\par \hich\af1\dbch\af13\loch\f1 Ans: Probably used only in the last stage, i.e. m_POSTagger.POStagging() in CResult::Processing(), because in the constructor of CResult, "le\hich\af1\dbch\af13\loch\f1 
35xical.ctx" is loaded only into m_POSTagger, and m_POSTagger only takes action in the last stage of CResult::Processing().
36\par 
37\par \hich\af1\dbch\af13\loch\f1 About .dct and .ctx for named entities (i.e. nr, tr and ns):
38\par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
39\faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The .dct files are converted to text files and the content of .ctx files \hich\af1\dbch\af13\loch\f1 
40can be viewed from .ctx.shw files. Both are placed inside the folder \hich\af1\dbch\af13\loch\f1 ICTCLAS/Data_covertion/data_in_text. Each .dct file works together with its matching .ctx file (e.g. nr.dct and nr.ctx work together).}{
41\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
42\par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
43\faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The symbols used in a pair do not have direct relation to\hich\af1\dbch\af13\loch\f1 
44 those used in other pairs. E.g. the symbol 1 in nr.ctx/nr.dct has a different meaning from the symbol 1 in ns.ctx/ns.dct.}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
45\par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
46\faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 Very probably, the symbols represent the roles in Role Model for recognition of unknown named entities.}{
47\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
48\par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
49\faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The meaning of the symbol can be guessed\hich\af1\dbch\af13\loch\f1 
50, but there is no documentation in the code to explain what the symbols stand for.}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
51\par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
52\faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The symbol set can be re-defined and changed if training is needed.}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
53
54\par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
55\faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The .ctx files contain the frequency count for a role to transit to another role.}{
56\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
57\par }\pard \ql \li0\ri0\nowidctlpar{\*\pn \pnlvlcont\ilvl0\ls0\pnrnot0\pndec }\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
58\par \hich\af1\dbch\af13\loch\f1 About how to read \hich\af1\dbch\af13\loch\f1 .dct text file and .ctx.shw files:
59\par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
60\faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 How to read dictionary text file:}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
61\par }\pard \ql \li0\ri0\nowidctlpar{\*\pn \pnlvlcont\ilvl0\ls0\pnrnot0\pndec }\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 E.g. for an entry:
62\par }{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u-26418\'b7\'e7\loch\af1\hich\af1\dbch\f13 \u21270\'bb\'af}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 
63[2] n(28160)[0] v(30208)[0] vn(30318)[0]
64\par 
65\par \hich\af1\dbch\af13\loch\f1 inside 1st []: wordLen (not counting the leading Chinese char)
66\par \hich\af1\dbch\af13\loch\f1 before(): POS tag in string / symbol
67\par \hich\af1\dbch\af13\loch\f1 inside(): symbol/handle\hich\af1\dbch\af13\loch\f1  value/POS tag
68\par \hich\af1\dbch\af13\loch\f1 inside[]: frequency
69\par 
70\par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
71\faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 How to read .ctx.shw file:}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 
72\par }\pard \ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 "Table Len" - the number of symbols/tags
73\par \hich\af1\dbch\af13\loch\f1 "Symbol" - the symbol set
74\par \hich\af1\dbch\af13\loch\f1 "nKey" - its usage is unknown. It can be ignored.
75\par \hich\af1\dbch\af13\loch\f1 "frequency" - the total frequency, obtained by adding up all the entries of the transition matrix that follows.
76\par \hich\af1\dbch\af13\loch\f1 A row in the symbol transition matrix:
77\par \hich\af1\dbch\af13\loch\f1 No. [index]=  [from_symbol]: [freq of to_symbol 1] [freq of to_symbol 2] ... [freq of to_symbol m] total=[total freq]:
78\par \hich\af1\dbch\af13\loch\f1 T\hich\af1\dbch\af13\loch\f1 he n_th column is the frequency with which the n_th symbol appears before the from_symbol. "total freq" is the sum of the frequencies of all the to_symbols.
79\par 
80\par 
81\par }}