/doc/ICTCLAS_Diary/2006-03-10.rtf

http://ictclas4j.googlecode.com/ · Unknown · 81 lines · 80 code · 1 blank · 0 comment · 0 complexity · 78d80439bba24f682d09fc07f8c166b6 MD5 · raw file

  1. {\rtf1\ansi\ansicpg1252\uc2\deff0\stshfdbch13\stshfloch0\stshfhich0\stshfbi0\deflang1033\deflangfe2052{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
  2. {\f3\froman\fcharset2\fprq2{\*\panose 05050102010706020507}Symbol;}{\f13\fnil\fcharset134\fprq2{\*\panose 02010600030101010101}SimSun{\*\falt \'cb\'ce\'cc\'e5};}{\f37\fnil\fcharset134\fprq2{\*\panose 02010600030101010101}@SimSun;}
  3. {\f38\froman\fcharset238\fprq2 Times New Roman CE;}{\f39\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f41\froman\fcharset161\fprq2 Times New Roman Greek;}{\f42\froman\fcharset162\fprq2 Times New Roman Tur;}
  4. {\f43\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f44\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f45\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f46\froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
  5. {\f48\fswiss\fcharset238\fprq2 Arial CE;}{\f49\fswiss\fcharset204\fprq2 Arial Cyr;}{\f51\fswiss\fcharset161\fprq2 Arial Greek;}{\f52\fswiss\fcharset162\fprq2 Arial Tur;}{\f53\fswiss\fcharset177\fprq2 Arial (Hebrew);}
  6. {\f54\fswiss\fcharset178\fprq2 Arial (Arabic);}{\f55\fswiss\fcharset186\fprq2 Arial Baltic;}{\f56\fswiss\fcharset163\fprq2 Arial (Vietnamese);}{\f170\fnil\fcharset0\fprq2 SimSun Western{\*\falt \'cb\'ce\'cc\'e5};}
  7. {\f410\fnil\fcharset0\fprq2 @SimSun Western;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;
  8. \red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
  9. \fs24\lang1033\langfe3076\loch\f0\hich\af0\dbch\af13\cgrid\langnp1033\langfenp3076 \snext0 Normal;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\*
  10. \ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv
  11. \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden Normal Table;}}{\*\latentstyles\lsdstimax156\lsdlockeddef0}{\*\listtable{\list\listtemplateid1776563028
  12. \listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat0\levelspace0\levelindent0{\leveltext\'01_;}{\levelnumbers;}\f3\fs20 \fi-240\li240\jclisttab\tx390\lin240 }{\listname ;}\listid43587243}}{\*\listoverridetable
  13. {\listoverride\listid43587243\listoverridecount0\ls1}}{\*\rsidtbl \rsid10966067}{\*\generator Microsoft Word 11.0.6359;}{\info{\author SEEM}{\operator SEEM}{\creatim\yr2006\mo4\dy13\hr23\min21}{\revtim\yr2006\mo4\dy13\hr23\min23}{\version2}{\edmins2}
  14. {\nofpages1}{\nofwords376}{\nofchars2145}{\*\company CUHK}{\nofcharsws2516}{\vern24703}}\paperw12240\paperh15840\margl1800\margr1800\margt1440\margb1440\gutter0
  15. \widowctrl\ftnbj\aenddoc\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\hyphcaps0\horzdoc\dghspace120\dgvspace120\dghorigin1701\dgvorigin1984\dghshow0\dgvshow3\jcompress\viewkind1\viewscale100\nolnhtadjtbl\rsidroot10966067 \fet0\sectd
  16. \linex0\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl4
  17. \pnlcltr\pnstart1\pnindent720\pnhang {\pntxta \hich )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl7
  18. \pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}\pard\plain
  19. \ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 \fs24\lang1033\langfe3076\loch\af0\hich\af0\dbch\af13\cgrid\langnp1033\langfenp3076 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1
  20. About the symbol set found in lexical.ctx.shw:
  21. \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
  22. \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 After outputting coreDict.dct to the text file coreDict.txt and searching for the symbol 1, the only entry that has this symbol is }{
  23. \f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u22987\'ca\'bc}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 ##}{
  24. \f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u22987\'ca\'bc}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1
  25. , which means the beginning of a sentence. Moreover, the only entry that has the symbol 4 is }{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u26411\'c4\'a9}{
  26. \f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 ##}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u26411\'c4\'a9}{
  27. \f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 , which means the end of a sentence. The guess of yesterday is correct!}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  28. \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
  29. \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 Other unknown symbols are found in the textfile converted from coreDict.dct. They may\hich\af1\dbch\af13\loch\f1
  30. be POS tags extended by ICT based on the PKU POS tags. }{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  31. \par }\pard \ql \li0\ri0\nowidctlpar{\*\pn \pnlvlcont\ilvl0\ls0\pnrnot0\pndec }\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  32. \par \hich\af1\dbch\af13\loch\f1 Question: }{\b\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 when is the information of lexical.ctx used}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  33. \hich\af1\dbch\af13\loch\f1 ?
  34. \par \hich\af1\dbch\af13\loch\f1 Ans: Probably used only in the last stage, i.e. m_POSTagger.POStagging() in CResult::Processing(), because in the constructor of CResult, "le\hich\af1\dbch\af13\loch\f1
  35. xical.ctx" is only loaded into m_POSTagger, and m_POSTagger only takes action in the last stage of CResult::Processing().
  36. \par
  37. \par \hich\af1\dbch\af13\loch\f1 About .dct and .ctx for named entities (i.e. nr, tr and ns):
  38. \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
  39. \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The .dct files are converted to text files and the content of .ctx files \hich\af1\dbch\af13\loch\f1
  40. can be viewed from .ctx.shw files. These files are placed inside the folder \hich\af1\dbch\af13\loch\f1 ICTCLAS/Data_covertion/data_in_text. A pair of .dct and .ctx files works together (e.g. nr.dct and nr.ctx work together).}{
  41. \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  42. \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
  43. \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The symbols used in a pair do not have direct relation to\hich\af1\dbch\af13\loch\f1
  44. those used in other pairs. E.g. the symbol 1 in nr.ctx/nr.dct has a different meaning from the symbol 1 in ns.ctx/ns.dct.}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  45. \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
  46. \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 Very probably, the symbols represent the roles in Role Model for recognition of unknown named entities.}{
  47. \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  48. \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
  49. \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The meaning of the symbol can be guessed\hich\af1\dbch\af13\loch\f1
  50. , but there is no documentation in the code to explain what the symbols stand for.}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  51. \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
  52. \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The symbol set can be re-defined and changed if training is needed.}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  53. \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
  54. \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 The .ctx files contain the frequency count for a role to transit to another role.}{
  55. \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  56. \par }\pard \ql \li0\ri0\nowidctlpar{\*\pn \pnlvlcont\ilvl0\ls0\pnrnot0\pndec }\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  57. \par \hich\af1\dbch\af13\loch\f1 About how to read \hich\af1\dbch\af13\loch\f1 .dct text files and .ctx.shw files:
  58. \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
  59. \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 How to read dictionary text file:}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  60. \par }\pard \ql \li0\ri0\nowidctlpar{\*\pn \pnlvlcont\ilvl0\ls0\pnrnot0\pndec }\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 E.g. for an entry:
  61. \par }{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \loch\af1\hich\af1\dbch\f13 \u-26418\'b7\'e7\loch\af1\hich\af1\dbch\f13 \u21270\'bb\'af}{\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1
  62. [2] n(28160)[0] v(30208)[0] vn(30318)[0]
  63. \par
  64. \par \hich\af1\dbch\af13\loch\f1 inside 1st []: wordLen (not counting the leading Chinese char)
  65. \par \hich\af1\dbch\af13\loch\f1 before(): POS tag in string / symbol
  66. \par \hich\af1\dbch\af13\loch\f1 inside(): symbol/handle\hich\af1\dbch\af13\loch\f1 value/POS tag
  67. \par \hich\af1\dbch\af13\loch\f1 inside[]: frequency
  68. \par
  69. \par {\pntext\pard\plain\f3\fs20\lang0\langfe2052\langnp0 \hich\af3\dbch\af13\loch\f3 _\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlblt\ilvl0\ls1\pnrnot0\pnf3\pnfs20\pnindent360\pnsp120\pnhang {\pntxtb \hich _}}
  70. \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 How to read .ctx.shw file:}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067
  71. \par }\pard \ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid10966067 \hich\af1\dbch\af13\loch\f1 "Table Len" - the number of symbols/tags
  72. \par \hich\af1\dbch\af13\loch\f1 "Symbol" - the symbol set
  73. \par \hich\af1\dbch\af13\loch\f1 "nKey" - its usage is unknown. Can be ignored.
  74. \par \hich\af1\dbch\af13\loch\f1 "frequency" - the total frequency obtained by adding all the entries of the following transition matrix.
  75. \par \hich\af1\dbch\af13\loch\f1 A row in the symbol transition matrix:
  76. \par \hich\af1\dbch\af13\loch\f1 No. [index]= [from_symbol]: [freq of to_symbol 1] [freq of to_symbol 2] ... [freq of to_symbol m] total=[total freq]:
  77. \par \hich\af1\dbch\af13\loch\f1 T\hich\af1\dbch\af13\loch\f1 he n_th column is the frequency with which the n_th symbol appears before the from_symbol. "total freq" is the sum of the frequencies of all the to_symbols.
  78. \par
  79. \par
  80. \par }}