/doc/ICTCLAS_Diary/2006-02-22.rtf
http://ictclas4j.googlecode.com/ · Unknown · 58 lines · 58 code · 0 blank · 0 comment · 0 complexity · 253b160868d9fd4bca0f2ef21a0bc1e4 MD5 · raw file
- {\rtf1\ansi\ansicpg1252\uc2\deff0\stshfdbch13\stshfloch0\stshfhich0\stshfbi0\deflang1033\deflangfe2052{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
- {\f13\fnil\fcharset134\fprq2{\*\panose 02010600030101010101}SimSun{\*\falt \'cb\'ce\'cc\'e5};}{\f37\fnil\fcharset134\fprq2{\*\panose 02010600030101010101}@SimSun;}{\f38\froman\fcharset238\fprq2 Times New Roman CE;}
- {\f39\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f41\froman\fcharset161\fprq2 Times New Roman Greek;}{\f42\froman\fcharset162\fprq2 Times New Roman Tur;}{\f43\froman\fcharset177\fprq2 Times New Roman (Hebrew);}
- {\f44\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f45\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f46\froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\f48\fswiss\fcharset238\fprq2 Arial CE;}
- {\f49\fswiss\fcharset204\fprq2 Arial Cyr;}{\f51\fswiss\fcharset161\fprq2 Arial Greek;}{\f52\fswiss\fcharset162\fprq2 Arial Tur;}{\f53\fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f54\fswiss\fcharset178\fprq2 Arial (Arabic);}
- {\f55\fswiss\fcharset186\fprq2 Arial Baltic;}{\f56\fswiss\fcharset163\fprq2 Arial (Vietnamese);}{\f170\fnil\fcharset0\fprq2 SimSun Western{\*\falt \'cb\'ce\'cc\'e5};}{\f410\fnil\fcharset0\fprq2 @SimSun Western;}}{\colortbl;\red0\green0\blue0;
- \red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;
- \red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
- \fs24\lang1033\langfe3076\loch\f0\hich\af0\dbch\af13\cgrid\langnp1033\langfenp3076 \snext0 Normal;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\*
- \ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv
- \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden Normal Table;}}{\*\latentstyles\lsdstimax156\lsdlockeddef0}{\*\listtable{\list\listtemplateid128256817
- \listsimple{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'02\'00.;}{\levelnumbers\'01;}\f1\fs20 \fi-240\li240\jclisttab\tx390\lin240 }{\listname ;}\listid629849591}
- {\list\listtemplateid176119771\listsimple{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'02\'00.;}{\levelnumbers\'01;}\f1\fs20 \fi-240\li240\jclisttab\tx390\lin240 }{\listname
- ;}\listid1019612489}{\list\listtemplateid151546482\listsimple{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'02\'00.;}{\levelnumbers\'01;}\f1\fs20 \fi-240\li240\jclisttab\tx390\lin240 }
- {\listname ;}\listid1805721148}}{\*\listoverridetable{\listoverride\listid1019612489\listoverridecount0\ls1}{\listoverride\listid1805721148\listoverridecount0\ls2}{\listoverride\listid629849591\listoverridecount0\ls3}}{\*\rsidtbl \rsid1508032}
- {\*\generator Microsoft Word 11.0.6359;}{\info{\author SEEM}{\operator SEEM}{\creatim\yr2006\mo4\dy13\hr22\min53}{\revtim\yr2006\mo4\dy13\hr22\min55}{\version2}{\edmins2}{\nofpages1}{\nofwords208}{\nofchars1189}{\*\company CUHK}{\nofcharsws1395}
- {\vern24703}}\paperw12240\paperh15840\margl1800\margr1800\margt1440\margb1440\gutter0 \widowctrl\ftnbj\aenddoc\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\hyphcaps0\horzdoc\dghspace120\dgvspace120\dghorigin1701\dgvorigin1984\dghshow0\dgvshow3
- \jcompress\viewkind1\viewscale100\nolnhtadjtbl\rsidroot1508032 \fet0\sectd \linex0\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl3
- \pndec\pnstart1\pnindent720\pnhang {\pntxta \hich .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta \hich )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang
- {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb \hich (}{\pntxta \hich )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang
- {\pntxtb \hich (}{\pntxta \hich )}}\pard\plain \ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 \fs24\lang1033\langfe3076\loch\af0\hich\af0\dbch\af13\cgrid\langnp1033\langfenp3076 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \hich\af1\dbch\af13\loch\f1 Meeting Summary:
- \par {\pntext\pard\plain\f1\fs20\lang0\langfe2052\langnp0 \hich\af1\dbch\af13\loch\f1 1.\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlbody\ilvl0\ls1\pnrnot0\pndec\pnf1\pnfs20\pnstart1\pnindent360\pnsp120\pnhang {\pntxta \hich .}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032 \hich\af1\dbch\af13\loch\f1 Try to find ways to obtain the 6-month PKU corpus. (will be used for training)}{
- \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \par {\pntext\pard\plain\f1\fs20\lang0\langfe2052\langnp0 \hich\af1\dbch\af13\loch\f1 2.\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlbody\ilvl0\ls1\pnrnot0\pndec\pnf1\pnfs20\pnstart1\pnindent360\pnsp120\pnhang {\pntxta \hich .}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032 \hich\af1\dbch\af13\loch\f1 Decided to add a component to the free ICTCLAS in order to recognize nested names (organization names and some location names).
- \line \hich\af1\dbch\af13\loch\f1 The component would be HMM-based and would need training. Training only this component should be enough. }{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \par {\pntext\pard\plain\f1\fs20\lang0\langfe2052\langnp0 \hich\af1\dbch\af13\loch\f1 3.\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlbody\ilvl0\ls1\pnrnot0\pndec\pnf1\pnfs20\pnstart1\pnindent360\pnsp120\pnhang {\pntxta \hich .}}
- \faauto\ls1\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032 \hich\af1\dbch\af13\loch\f1 In the future, we may need to train the whole software with the 6-month PKU corpus. We may need to prepare for it.}{
- \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \par }\pard \ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \par
- \par \hich\af1\dbch\af13\loch\f1 About tag-conversion:
- \par \hich\af1\dbch\af13\loch\f1 No matter which PKU corpus we will use\hich\af1\dbch\af13\loch\f1
- , we have to convert the PKU tags. The conversion is quite troublesome and cannot be done automatically. Training of the Role Model needs a corpus that meets the following requirements (according to the paper "Chinese Named Entity Recognition Using Role Model", page 8):
- \par {\pntext\pard\plain\f1\fs20\lang0\langfe2052\langnp0 \hich\af1\dbch\af13\loch\f1 1.\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlbody\ilvl0\ls2\pnrnot0\pndec\pnf1\pnfs20\pnstart1\pnindent360\pnsp120\pnhang {\pntxta \hich .}}
- \faauto\ls2\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032 \hich\af1\dbch\af13\loch\f1 The surname and\hich\af1\dbch\af13\loch\f1
- given name of a Chinese name should be tagged distinctly (not with the same tag). -- Done in 2003 PKU standard.}{\f1\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \par {\pntext\pard\plain\f1\fs20\lang0\langfe2052\langnp0 \hich\af1\dbch\af13\loch\f1 2.\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlbody\ilvl0\ls2\pnrnot0\pndec\pnf1\pnfs20\pnstart1\pnindent360\pnsp120\pnhang {\pntxta \hich .}}
- \faauto\ls2\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032 \hich\af1\dbch\af13\loch\f1 The transliterated PER and transliterated LOC are not mixed with Chinese PER and LOC. -- Not done in 2003 PKU standard.}{
- \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \par {\pntext\pard\plain\f1\fs20\lang0\langfe2052\langnp0 \hich\af1\dbch\af13\loch\f1 3.\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlbody\ilvl0\ls2\pnrnot0\pndec\pnf1\pnfs20\pnstart1\pnindent360\pnsp120\pnhang {\pntxta \hich .}}
- \faauto\ls2\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032 \hich\af1\dbch\af13\loch\f1 The ambiguous l\hich\af1\dbch\af13\loch\f1 abel "/j" is replaced with its corresponding NE label. -- Not done in 2003 PKU standard.}{
- \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \par {\pntext\pard\plain\f1\fs20\lang0\langfe2052\langnp0 \hich\af1\dbch\af13\loch\f1 4.\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlbody\ilvl0\ls2\pnrnot0\pndec\pnf1\pnfs20\pnstart1\pnindent360\pnsp120\pnhang {\pntxta \hich .}}
- \faauto\ls2\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032 \hich\af1\dbch\af13\loch\f1 Different tags are used to distinguish different punctuation marks. -- Done in 2003 PKU standard.}{
- \f1\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \par }\pard \ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \par }\pard \qj \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032 \hich\af1\dbch\af13\loch\f1 Apart from these tags, it seems that the other tags should not be important.
- \par
- \par }\pard \ql \li0\ri0\nowidctlpar\faauto\rin0\lin0\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032
- \par
- \par \hich\af1\dbch\af13\loch\f1 Other findings:
- \par {\pntext\pard\plain\f1\fs20\lang0\langfe2052\langnp0 \hich\af1\dbch\af13\loch\f1 1.\tab}}\pard \ql \fi-240\li240\ri0\nowidctlpar\jclisttab\tx390{\*\pn \pnlvlbody\ilvl0\ls3\pnrnot0\pndec\pnf1\pnfs20\pnstart1\pnindent360\pnsp120\pnhang {\pntxta \hich .}}
- \faauto\ls3\rin0\lin240\itap0 {\f1\fs20\lang0\langfe2052\langnp0\langfenp2052\insrsid1508032 \hich\af1\dbch\af13\loch\f1 PKU regards Japanese names as foreign names. Therefore, it tags a Japanese name as a whole. It doesn't separate the surname and the given name.}{
- \insrsid1508032
- \par }}