/doc/tagger.extras-pysrc.html

http://github.com/apresta/tagger · HTML · 276 lines · 245 code · 10 blank · 21 comment · 0 complexity · c144541f7783f12ad44a0a71dc5ea828 MD5 · raw file

  1. <?xml version="1.0" encoding="ascii"?>
  2. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  3. "DTD/xhtml1-transitional.dtd">
  4. <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  5. <head>
  6. <title>tagger.extras</title>
  7. <link rel="stylesheet" href="epydoc.css" type="text/css" />
  8. <script type="text/javascript" src="epydoc.js"></script>
  9. </head>
  10. <body bgcolor="white" text="black" link="blue" vlink="#204080"
  11. alink="#204080">
  12. <!-- ==================== NAVIGATION BAR ==================== -->
  13. <table class="navbar" border="0" width="100%" cellpadding="0"
  14. bgcolor="#a0c0ff" cellspacing="0">
  15. <tr valign="middle">
  16. <!-- Tree link -->
  17. <th>&nbsp;&nbsp;&nbsp;<a
  18. href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
  19. <!-- Index link -->
  20. <th>&nbsp;&nbsp;&nbsp;<a
  21. href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
  22. <!-- Help link -->
  23. <th>&nbsp;&nbsp;&nbsp;<a
  24. href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
  25. <!-- Project homepage -->
  26. <th class="navbar" align="right" width="100%">
  27. <table border="0" cellpadding="0" cellspacing="0">
  28. <tr><th class="navbar" align="center"
  29. ><a class="navbar" target="_top" href="http://github.com/apresta/tagger">tagger</a></th>
  30. </tr></table></th>
  31. </tr>
  32. </table>
  33. <table width="100%" cellpadding="0" cellspacing="0">
  34. <tr valign="top">
  35. <td width="100%">
  36. <span class="breadcrumbs">
  37. Package&nbsp;tagger ::
  38. Module&nbsp;extras
  39. </span>
  40. </td>
  41. <td>
  42. <table cellpadding="0" cellspacing="0">
  43. <!-- hide/show private -->
  44. <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
  45. onclick="toggle_private();">hide&nbsp;private</a>]</span></td></tr>
  46. <tr><td align="right"><span class="options"
  47. >[<a href="frames.html" target="_top">frames</a
  48. >]&nbsp;|&nbsp;[<a href="tagger.extras-pysrc.html"
  49. target="_top">no&nbsp;frames</a>]</span></td></tr>
  50. </table>
  51. </td>
  52. </tr>
  53. </table>
  54. <h1 class="epydoc">Source Code for <a href="tagger.extras-module.html">Module tagger.extras</a></h1>
  55. <pre class="py-src">
  56. <a name="L1"></a><tt class="py-lineno"> 1</tt> <tt class="py-line"><tt class="py-comment"># Copyright (C) 2011 by Alessandro Presta</tt> </tt>
  57. <a name="L2"></a><tt class="py-lineno"> 2</tt> <tt class="py-line"> </tt>
  58. <a name="L3"></a><tt class="py-lineno"> 3</tt> <tt class="py-line"><tt class="py-comment"># Permission is hereby granted, free of charge, to any person obtaining a copy</tt> </tt>
  59. <a name="L4"></a><tt class="py-lineno"> 4</tt> <tt class="py-line"><tt class="py-comment"># of this software and associated documentation files (the "Software"), to deal</tt> </tt>
  60. <a name="L5"></a><tt class="py-lineno"> 5</tt> <tt class="py-line"><tt class="py-comment"># in the Software without restriction, including without limitation the rights</tt> </tt>
  61. <a name="L6"></a><tt class="py-lineno"> 6</tt> <tt class="py-line"><tt class="py-comment"># to use, copy, modify, merge, publish, distribute, sublicense, and/or sell</tt> </tt>
  62. <a name="L7"></a><tt class="py-lineno"> 7</tt> <tt class="py-line"><tt class="py-comment"># copies of the Software, and to permit persons to whom the Software is</tt> </tt>
  63. <a name="L8"></a><tt class="py-lineno"> 8</tt> <tt class="py-line"><tt class="py-comment"># furnished to do so, subject to the following conditions:</tt> </tt>
  64. <a name="L9"></a><tt class="py-lineno"> 9</tt> <tt class="py-line"> </tt>
  65. <a name="L10"></a><tt class="py-lineno"> 10</tt> <tt class="py-line"><tt class="py-comment"># The above copyright notice and this permission notice shall be included in</tt> </tt>
  66. <a name="L11"></a><tt class="py-lineno"> 11</tt> <tt class="py-line"><tt class="py-comment"># all copies or substantial portions of the Software.</tt> </tt>
  67. <a name="L12"></a><tt class="py-lineno"> 12</tt> <tt class="py-line"> </tt>
  68. <a name="L13"></a><tt class="py-lineno"> 13</tt> <tt class="py-line"><tt class="py-comment"># THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR</tt> </tt>
  69. <a name="L14"></a><tt class="py-lineno"> 14</tt> <tt class="py-line"><tt class="py-comment"># IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,</tt> </tt>
  70. <a name="L15"></a><tt class="py-lineno"> 15</tt> <tt class="py-line"><tt class="py-comment"># FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE</tt> </tt>
  71. <a name="L16"></a><tt class="py-lineno"> 16</tt> <tt class="py-line"><tt class="py-comment"># AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER</tt> </tt>
  72. <a name="L17"></a><tt class="py-lineno"> 17</tt> <tt class="py-line"><tt class="py-comment"># LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,</tt> </tt>
  73. <a name="L18"></a><tt class="py-lineno"> 18</tt> <tt class="py-line"><tt class="py-comment"># OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN</tt> </tt>
  74. <a name="L19"></a><tt class="py-lineno"> 19</tt> <tt class="py-line"><tt class="py-comment"># THE SOFTWARE</tt> </tt>
  75. <a name="L20"></a><tt class="py-lineno"> 20</tt> <tt class="py-line"> </tt>
  76. <a name="L21"></a><tt class="py-lineno"> 21</tt> <tt class="py-line"> </tt>
  77. <a name="L22"></a><tt class="py-lineno"> 22</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-0" class="py-name" targets="Module tagger.tagger=tagger.tagger-module.html"><a title="tagger.tagger" class="py-name" href="#" onclick="return doclink('link-0', 'tagger', 'link-0');">tagger</a></tt> <tt class="py-keyword">import</tt> <tt class="py-op">*</tt> </tt>
  78. <a name="L23"></a><tt class="py-lineno"> 23</tt> <tt class="py-line"> </tt>
  79. <a name="L24"></a><tt class="py-lineno"> 24</tt> <tt class="py-line"> </tt>
  80. <a name="UnicodeReader"></a><div id="UnicodeReader-def"><a name="L25"></a><tt class="py-lineno"> 25</tt> <a class="py-toggle" href="#" id="UnicodeReader-toggle" onclick="return toggle('UnicodeReader');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="tagger.extras.UnicodeReader-class.html">UnicodeReader</a><tt class="py-op">(</tt><tt class="py-base-class">Reader</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  81. </div><div id="UnicodeReader-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="UnicodeReader-expanded"><a name="L26"></a><tt class="py-lineno"> 26</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  82. <a name="L27"></a><tt class="py-lineno"> 27</tt> <tt class="py-line"><tt class="py-docstring"> Reader subclass that converts Unicode strings to a close ASCII</tt> </tt>
  83. <a name="L28"></a><tt class="py-lineno"> 28</tt> <tt class="py-line"><tt class="py-docstring"> representation</tt> </tt>
  84. <a name="L29"></a><tt class="py-lineno"> 29</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  85. <a name="L30"></a><tt class="py-lineno"> 30</tt> <tt class="py-line"> </tt>
  86. <a name="UnicodeReader.__call__"></a><div id="UnicodeReader.__call__-def"><a name="L31"></a><tt class="py-lineno"> 31</tt> <a class="py-toggle" href="#" id="UnicodeReader.__call__-toggle" onclick="return toggle('UnicodeReader.__call__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.extras.UnicodeReader-class.html#__call__">__call__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">text</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  87. </div><div id="UnicodeReader.__call__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="UnicodeReader.__call__-expanded"><a name="L32"></a><tt class="py-lineno"> 32</tt> <tt class="py-line"> <tt class="py-keyword">import</tt> <tt class="py-name">unicodedata</tt> </tt>
  88. <a name="L33"></a><tt class="py-lineno"> 33</tt> <tt class="py-line"> </tt>
  89. <a name="L34"></a><tt class="py-lineno"> 34</tt> <tt class="py-line"> <tt class="py-name">text</tt> <tt class="py-op">=</tt> <tt class="py-name">unicodedata</tt><tt class="py-op">.</tt><tt class="py-name">normalize</tt><tt class="py-op">(</tt><tt class="py-string">'NFKD'</tt><tt class="py-op">,</tt> <tt class="py-name">text</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">encode</tt><tt class="py-op">(</tt><tt class="py-string">'ascii'</tt><tt class="py-op">,</tt> <tt class="py-string">'ignore'</tt><tt class="py-op">)</tt> </tt>
  90. <a name="L35"></a><tt class="py-lineno"> 35</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt id="link-1" class="py-name" targets="Class tagger.tagger.Reader=tagger.tagger.Reader-class.html"><a title="tagger.tagger.Reader" class="py-name" href="#" onclick="return doclink('link-1', 'Reader', 'link-1');">Reader</a></tt><tt class="py-op">.</tt><tt id="link-2" class="py-name" targets="Method tagger.extras.HTMLReader.__call__()=tagger.extras.HTMLReader-class.html#__call__,Method tagger.extras.NaiveRater.__call__()=tagger.extras.NaiveRater-class.html#__call__,Method tagger.extras.SimpleReader.__call__()=tagger.extras.SimpleReader-class.html#__call__,Method tagger.extras.UnicodeReader.__call__()=tagger.extras.UnicodeReader-class.html#__call__,Method tagger.tagger.Rater.__call__()=tagger.tagger.Rater-class.html#__call__,Method tagger.tagger.Reader.__call__()=tagger.tagger.Reader-class.html#__call__,Method tagger.tagger.Stemmer.__call__()=tagger.tagger.Stemmer-class.html#__call__,Method tagger.tagger.Tagger.__call__()=tagger.tagger.Tagger-class.html#__call__"><a title="tagger.extras.HTMLReader.__call__
  91. tagger.extras.NaiveRater.__call__
  92. tagger.extras.SimpleReader.__call__
  93. tagger.extras.UnicodeReader.__call__
  94. tagger.tagger.Rater.__call__
  95. tagger.tagger.Reader.__call__
  96. tagger.tagger.Stemmer.__call__
  97. tagger.tagger.Tagger.__call__" class="py-name" href="#" onclick="return doclink('link-2', '__call__', 'link-2');">__call__</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">text</tt><tt class="py-op">)</tt> </tt>
  98. </div></div><a name="L36"></a><tt class="py-lineno"> 36</tt> <tt class="py-line"> </tt>
  99. <a name="L37"></a><tt class="py-lineno"> 37</tt> <tt class="py-line"> </tt>
  100. <a name="HTMLReader"></a><div id="HTMLReader-def"><a name="L38"></a><tt class="py-lineno"> 38</tt> <a class="py-toggle" href="#" id="HTMLReader-toggle" onclick="return toggle('HTMLReader');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="tagger.extras.HTMLReader-class.html">HTMLReader</a><tt class="py-op">(</tt><tt class="py-base-class">UnicodeReader</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  101. </div><div id="HTMLReader-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="HTMLReader-expanded"><a name="L39"></a><tt class="py-lineno"> 39</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  102. <a name="L40"></a><tt class="py-lineno"> 40</tt> <tt class="py-line"><tt class="py-docstring"> Reader subclass that can parse HTML code from the input</tt> </tt>
  103. <a name="L41"></a><tt class="py-lineno"> 41</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  104. <a name="L42"></a><tt class="py-lineno"> 42</tt> <tt class="py-line"> </tt>
  105. <a name="HTMLReader.__call__"></a><div id="HTMLReader.__call__-def"><a name="L43"></a><tt class="py-lineno"> 43</tt> <a class="py-toggle" href="#" id="HTMLReader.__call__-toggle" onclick="return toggle('HTMLReader.__call__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.extras.HTMLReader-class.html#__call__">__call__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">html</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  106. </div><div id="HTMLReader.__call__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="HTMLReader.__call__-expanded"><a name="L44"></a><tt class="py-lineno"> 44</tt> <tt class="py-line"> <tt class="py-keyword">import</tt> <tt class="py-name">lxml</tt><tt class="py-op">.</tt><tt class="py-name">html</tt> </tt>
  107. <a name="L45"></a><tt class="py-lineno"> 45</tt> <tt class="py-line"> </tt>
  108. <a name="L46"></a><tt class="py-lineno"> 46</tt> <tt class="py-line"> <tt class="py-name">text</tt> <tt class="py-op">=</tt> <tt class="py-name">lxml</tt><tt class="py-op">.</tt><tt class="py-name">html</tt><tt class="py-op">.</tt><tt class="py-name">fromstring</tt><tt class="py-op">(</tt><tt class="py-name">html</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">text_content</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
  109. <a name="L47"></a><tt class="py-lineno"> 47</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">isinstance</tt><tt class="py-op">(</tt><tt class="py-name">text</tt><tt class="py-op">,</tt> <tt class="py-name">unicode</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  110. <a name="L48"></a><tt class="py-lineno"> 48</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt id="link-3" class="py-name" targets="Class tagger.extras.UnicodeReader=tagger.extras.UnicodeReader-class.html"><a title="tagger.extras.UnicodeReader" class="py-name" href="#" onclick="return doclink('link-3', 'UnicodeReader', 'link-3');">UnicodeReader</a></tt><tt class="py-op">.</tt><tt id="link-4" class="py-name"><a title="tagger.extras.HTMLReader.__call__
  111. tagger.extras.NaiveRater.__call__
  112. tagger.extras.SimpleReader.__call__
  113. tagger.extras.UnicodeReader.__call__
  114. tagger.tagger.Rater.__call__
  115. tagger.tagger.Reader.__call__
  116. tagger.tagger.Stemmer.__call__
  117. tagger.tagger.Tagger.__call__" class="py-name" href="#" onclick="return doclink('link-4', '__call__', 'link-2');">__call__</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">text</tt><tt class="py-op">)</tt> </tt>
  118. <a name="L49"></a><tt class="py-lineno"> 49</tt> <tt class="py-line"> <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
  119. <a name="L50"></a><tt class="py-lineno"> 50</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt id="link-5" class="py-name"><a title="tagger.tagger.Reader" class="py-name" href="#" onclick="return doclink('link-5', 'Reader', 'link-1');">Reader</a></tt><tt class="py-op">.</tt><tt id="link-6" class="py-name"><a title="tagger.extras.HTMLReader.__call__
  120. tagger.extras.NaiveRater.__call__
  121. tagger.extras.SimpleReader.__call__
  122. tagger.extras.UnicodeReader.__call__
  123. tagger.tagger.Rater.__call__
  124. tagger.tagger.Reader.__call__
  125. tagger.tagger.Stemmer.__call__
  126. tagger.tagger.Tagger.__call__" class="py-name" href="#" onclick="return doclink('link-6', '__call__', 'link-2');">__call__</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">text</tt><tt class="py-op">)</tt> </tt>
  127. </div></div><a name="L51"></a><tt class="py-lineno"> 51</tt> <tt class="py-line"> </tt>
  128. <a name="L52"></a><tt class="py-lineno"> 52</tt> <tt class="py-line"> </tt>
  129. <a name="SimpleReader"></a><div id="SimpleReader-def"><a name="L53"></a><tt class="py-lineno"> 53</tt> <a class="py-toggle" href="#" id="SimpleReader-toggle" onclick="return toggle('SimpleReader');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="tagger.extras.SimpleReader-class.html">SimpleReader</a><tt class="py-op">(</tt><tt class="py-base-class">Reader</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  130. </div><div id="SimpleReader-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="SimpleReader-expanded"><a name="L54"></a><tt class="py-lineno"> 54</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  131. <a name="L55"></a><tt class="py-lineno"> 55</tt> <tt class="py-line"><tt class="py-docstring"> Reader subclass that doesn't perform any advanced analysis of the text</tt> </tt>
  132. <a name="L56"></a><tt class="py-lineno"> 56</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  133. <a name="L57"></a><tt class="py-lineno"> 57</tt> <tt class="py-line"> </tt>
  134. <a name="SimpleReader.__call__"></a><div id="SimpleReader.__call__-def"><a name="L58"></a><tt class="py-lineno"> 58</tt> <a class="py-toggle" href="#" id="SimpleReader.__call__-toggle" onclick="return toggle('SimpleReader.__call__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.extras.SimpleReader-class.html#__call__">__call__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">text</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  135. </div><div id="SimpleReader.__call__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SimpleReader.__call__-expanded"><a name="L59"></a><tt class="py-lineno"> 59</tt> <tt class="py-line"> <tt class="py-name">text</tt> <tt class="py-op">=</tt> <tt class="py-name">text</tt><tt class="py-op">.</tt><tt class="py-name">lower</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
  136. <a name="L60"></a><tt class="py-lineno"> 60</tt> <tt class="py-line"> <tt class="py-name">text</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-7" class="py-name" targets="Method tagger.tagger.Reader.preprocess()=tagger.tagger.Reader-class.html#preprocess,Method tagger.tagger.Stemmer.preprocess()=tagger.tagger.Stemmer-class.html#preprocess"><a title="tagger.tagger.Reader.preprocess
  137. tagger.tagger.Stemmer.preprocess" class="py-name" href="#" onclick="return doclink('link-7', 'preprocess', 'link-7');">preprocess</a></tt><tt class="py-op">(</tt><tt class="py-name">text</tt><tt class="py-op">)</tt> </tt>
  138. <a name="L61"></a><tt class="py-lineno"> 61</tt> <tt class="py-line"> <tt class="py-name">words</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-8" class="py-name" targets="Variable tagger.tagger.Reader.match_words=tagger.tagger.Reader-class.html#match_words"><a title="tagger.tagger.Reader.match_words" class="py-name" href="#" onclick="return doclink('link-8', 'match_words', 'link-8');">match_words</a></tt><tt class="py-op">.</tt><tt class="py-name">findall</tt><tt class="py-op">(</tt><tt class="py-name">text</tt><tt class="py-op">)</tt> </tt>
  139. <a name="L62"></a><tt class="py-lineno"> 62</tt> <tt class="py-line"> <tt class="py-name">tags</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt id="link-9" class="py-name" targets="Class tagger.tagger.Tag=tagger.tagger.Tag-class.html"><a title="tagger.tagger.Tag" class="py-name" href="#" onclick="return doclink('link-9', 'Tag', 'link-9');">Tag</a></tt><tt class="py-op">(</tt><tt class="py-name">w</tt><tt class="py-op">)</tt> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt> <tt class="py-keyword">in</tt> <tt class="py-name">words</tt><tt class="py-op">]</tt> </tt>
  140. <a name="L63"></a><tt class="py-lineno"> 63</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">tags</tt> </tt>
  141. </div></div><a name="L64"></a><tt class="py-lineno"> 64</tt> <tt class="py-line"> </tt>
  142. <a name="L65"></a><tt class="py-lineno"> 65</tt> <tt class="py-line"> </tt>
  143. <a name="FastStemmer"></a><div id="FastStemmer-def"><a name="L66"></a><tt class="py-lineno"> 66</tt> <a class="py-toggle" href="#" id="FastStemmer-toggle" onclick="return toggle('FastStemmer');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="tagger.extras.FastStemmer-class.html">FastStemmer</a><tt class="py-op">(</tt><tt class="py-base-class">Stemmer</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  144. </div><div id="FastStemmer-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="FastStemmer-expanded"><a name="L67"></a><tt class="py-lineno"> 67</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  145. <a name="L68"></a><tt class="py-lineno"> 68</tt> <tt class="py-line"><tt class="py-docstring"> Stemmer subclass that uses a much faster, but less correct algorithm</tt> </tt>
  146. <a name="L69"></a><tt class="py-lineno"> 69</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  147. <a name="L70"></a><tt class="py-lineno"> 70</tt> <tt class="py-line"> </tt>
  148. <a name="FastStemmer.__init__"></a><div id="FastStemmer.__init__-def"><a name="L71"></a><tt class="py-lineno"> 71</tt> <a class="py-toggle" href="#" id="FastStemmer.__init__-toggle" onclick="return toggle('FastStemmer.__init__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.extras.FastStemmer-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  149. </div><div id="FastStemmer.__init__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="FastStemmer.__init__-expanded"><a name="L72"></a><tt class="py-lineno"> 72</tt> <tt class="py-line"> <tt class="py-keyword">from</tt> <tt class="py-name">stemming</tt> <tt class="py-keyword">import</tt> <tt class="py-name">porter</tt> </tt>
  150. <a name="L73"></a><tt class="py-lineno"> 73</tt> <tt class="py-line"> </tt>
  151. <a name="L74"></a><tt class="py-lineno"> 74</tt> <tt class="py-line"> <tt id="link-10" class="py-name" targets="Class tagger.tagger.Stemmer=tagger.tagger.Stemmer-class.html"><a title="tagger.tagger.Stemmer" class="py-name" href="#" onclick="return doclink('link-10', 'Stemmer', 'link-10');">Stemmer</a></tt><tt class="py-op">.</tt><tt id="link-11" class="py-name" targets="Method tagger.extras.FastStemmer.__init__()=tagger.extras.FastStemmer-class.html#__init__,Method tagger.tagger.MultiTag.__init__()=tagger.tagger.MultiTag-class.html#__init__,Method tagger.tagger.Rater.__init__()=tagger.tagger.Rater-class.html#__init__,Method tagger.tagger.Stemmer.__init__()=tagger.tagger.Stemmer-class.html#__init__,Method tagger.tagger.Tag.__init__()=tagger.tagger.Tag-class.html#__init__,Method tagger.tagger.Tagger.__init__()=tagger.tagger.Tagger-class.html#__init__"><a title="tagger.extras.FastStemmer.__init__
  152. tagger.tagger.MultiTag.__init__
  153. tagger.tagger.Rater.__init__
  154. tagger.tagger.Stemmer.__init__
  155. tagger.tagger.Tag.__init__
  156. tagger.tagger.Tagger.__init__" class="py-name" href="#" onclick="return doclink('link-11', '__init__', 'link-11');">__init__</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">porter</tt><tt class="py-op">)</tt> </tt>
  157. </div></div><a name="L75"></a><tt class="py-lineno"> 75</tt> <tt class="py-line"> </tt>
  158. <a name="L76"></a><tt class="py-lineno"> 76</tt> <tt class="py-line"> </tt>
  159. <a name="NaiveRater"></a><div id="NaiveRater-def"><a name="L77"></a><tt class="py-lineno"> 77</tt> <a class="py-toggle" href="#" id="NaiveRater-toggle" onclick="return toggle('NaiveRater');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="tagger.extras.NaiveRater-class.html">NaiveRater</a><tt class="py-op">(</tt><tt class="py-base-class">Rater</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  160. </div><div id="NaiveRater-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="NaiveRater-expanded"><a name="L78"></a><tt class="py-lineno"> 78</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  161. <a name="L79"></a><tt class="py-lineno"> 79</tt> <tt class="py-line"><tt class="py-docstring"> Rater subclass that just ranks single-word tags by their frequency and</tt> </tt>
  162. <a name="L80"></a><tt class="py-lineno"> 80</tt> <tt class="py-line"><tt class="py-docstring"> weight</tt> </tt>
  163. <a name="L81"></a><tt class="py-lineno"> 81</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  164. <a name="L82"></a><tt class="py-lineno"> 82</tt> <tt class="py-line"> </tt>
  165. <a name="NaiveRater.__call__"></a><div id="NaiveRater.__call__-def"><a name="L83"></a><tt class="py-lineno"> 83</tt> <a class="py-toggle" href="#" id="NaiveRater.__call__-toggle" onclick="return toggle('NaiveRater.__call__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.extras.NaiveRater-class.html#__call__">__call__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">tags</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  166. </div><div id="NaiveRater.__call__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="NaiveRater.__call__-expanded"><a name="L84"></a><tt class="py-lineno"> 84</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-12" class="py-name" targets="Method tagger.tagger.Rater.rate_tags()=tagger.tagger.Rater-class.html#rate_tags"><a title="tagger.tagger.Rater.rate_tags" class="py-name" href="#" onclick="return doclink('link-12', 'rate_tags', 'link-12');">rate_tags</a></tt><tt class="py-op">(</tt><tt class="py-name">tags</tt><tt class="py-op">)</tt> </tt>
  167. <a name="L85"></a><tt class="py-lineno"> 85</tt> <tt class="py-line"> <tt class="py-comment"># we still get rid of one-character tags and stopwords</tt> </tt>
  168. <a name="L86"></a><tt class="py-lineno"> 86</tt> <tt class="py-line"> <tt class="py-name">unique_tags</tt> <tt class="py-op">=</tt> <tt class="py-name">set</tt><tt class="py-op">(</tt><tt class="py-name">t</tt> <tt class="py-keyword">for</tt> <tt class="py-name">t</tt> <tt class="py-keyword">in</tt> <tt class="py-name">tags</tt> </tt>
  169. <a name="L87"></a><tt class="py-lineno"> 87</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">t</tt><tt class="py-op">.</tt><tt class="py-name">string</tt><tt class="py-op">)</tt> <tt class="py-op">&gt;</tt> <tt class="py-number">1</tt> <tt class="py-keyword">and</tt> <tt class="py-name">t</tt><tt class="py-op">.</tt><tt class="py-name">rating</tt> <tt class="py-op">&gt;</tt> <tt class="py-number">0.0</tt><tt class="py-op">)</tt> </tt>
  170. <a name="L88"></a><tt class="py-lineno"> 88</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">sorted</tt><tt class="py-op">(</tt><tt class="py-name">unique_tags</tt><tt class="py-op">)</tt> </tt>
  171. </div></div><a name="L89"></a><tt class="py-lineno"> 89</tt> <tt class="py-line"> </tt>
  172. <a name="L90"></a><tt class="py-lineno"> 90</tt> <tt class="py-line"> </tt>
  173. <a name="build_dict_from_nltk"></a><div id="build_dict_from_nltk-def"><a name="L91"></a><tt class="py-lineno"> 91</tt> <a class="py-toggle" href="#" id="build_dict_from_nltk-toggle" onclick="return toggle('build_dict_from_nltk');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.extras-module.html#build_dict_from_nltk">build_dict_from_nltk</a><tt class="py-op">(</tt><tt class="py-param">output_file</tt><tt class="py-op">,</tt> <tt class="py-param">corpus</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">,</tt> <tt class="py-param">stopwords</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">,</tt> </tt>
  174. <a name="L92"></a><tt class="py-lineno"> 92</tt> <tt class="py-line"> <tt class="py-param">stemmer</tt><tt class="py-op">=</tt><tt id="link-13" class="py-name"><a title="tagger.tagger.Stemmer" class="py-name" href="#" onclick="return doclink('link-13', 'Stemmer', 'link-10');">Stemmer</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-param">measure</tt><tt class="py-op">=</tt><tt class="py-string">'IDF'</tt><tt class="py-op">,</tt> <tt class="py-param">verbose</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  175. </div><div id="build_dict_from_nltk-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="build_dict_from_nltk-expanded"><a name="L93"></a><tt class="py-lineno"> 93</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  176. <a name="L94"></a><tt class="py-lineno"> 94</tt> <tt class="py-line"><tt class="py-docstring"> @param output_file: the name of the file where the dictionary should be</tt> </tt>
  177. <a name="L95"></a><tt class="py-lineno"> 95</tt> <tt class="py-line"><tt class="py-docstring"> saved</tt> </tt>
  178. <a name="L96"></a><tt class="py-lineno"> 96</tt> <tt class="py-line"><tt class="py-docstring"> @param corpus: the NLTK corpus to use (defaults to nltk.corpus.reuters)</tt> </tt>
  179. <a name="L97"></a><tt class="py-lineno"> 97</tt> <tt class="py-line"><tt class="py-docstring"> @param stopwords: a list of (not stemmed) stopwords (defaults to</tt> </tt>
  180. <a name="L98"></a><tt class="py-lineno"> 98</tt> <tt class="py-line"><tt class="py-docstring"> nltk.corpus.reuters.words('stopwords'))</tt> </tt>
  181. <a name="L99"></a><tt class="py-lineno"> 99</tt> <tt class="py-line"><tt class="py-docstring"> @param stemmer: the L{Stemmer} object to be used</tt> </tt>
  182. <a name="L100"></a><tt class="py-lineno">100</tt> <tt class="py-line"><tt class="py-docstring"> @param measure: the measure used to compute the weights ('IDF'</tt> </tt>
  183. <a name="L101"></a><tt class="py-lineno">101</tt> <tt class="py-line"><tt class="py-docstring"> i.e. 'inverse document frequency' or 'ICF' i.e.</tt> </tt>
  184. <a name="L102"></a><tt class="py-lineno">102</tt> <tt class="py-line"><tt class="py-docstring"> 'inverse collection frequency'; defaults to 'IDF')</tt> </tt>
  185. <a name="L103"></a><tt class="py-lineno">103</tt> <tt class="py-line"><tt class="py-docstring"> @param verbose: whether information on the progress should be printed</tt> </tt>
  186. <a name="L104"></a><tt class="py-lineno">104</tt> <tt class="py-line"><tt class="py-docstring"> on screen</tt> </tt>
  187. <a name="L105"></a><tt class="py-lineno">105</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  188. <a name="L106"></a><tt class="py-lineno">106</tt> <tt class="py-line"> </tt>
  189. <a name="L107"></a><tt class="py-lineno">107</tt> <tt class="py-line"> <tt class="py-keyword">from</tt> <tt id="link-14" class="py-name" targets="Module tagger.build_dict=tagger.build_dict-module.html,Function tagger.build_dict.build_dict()=tagger.build_dict-module.html#build_dict"><a title="tagger.build_dict
  190. tagger.build_dict.build_dict" class="py-name" href="#" onclick="return doclink('link-14', 'build_dict', 'link-14');">build_dict</a></tt> <tt class="py-keyword">import</tt> <tt id="link-15" class="py-name"><a title="tagger.build_dict
  191. tagger.build_dict.build_dict" class="py-name" href="#" onclick="return doclink('link-15', 'build_dict', 'link-14');">build_dict</a></tt> </tt>
  192. <a name="L108"></a><tt class="py-lineno">108</tt> <tt class="py-line"> <tt class="py-keyword">import</tt> <tt class="py-name">nltk</tt> </tt>
  193. <a name="L109"></a><tt class="py-lineno">109</tt> <tt class="py-line"> <tt class="py-keyword">import</tt> <tt class="py-name">pickle</tt> </tt>
  194. <a name="L110"></a><tt class="py-lineno">110</tt> <tt class="py-line"> </tt>
  195. <a name="L111"></a><tt class="py-lineno">111</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-op">(</tt><tt class="py-name">corpus</tt> <tt class="py-keyword">and</tt> <tt class="py-name">stopwords</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  196. <a name="L112"></a><tt class="py-lineno">112</tt> <tt class="py-line"> <tt class="py-name">nltk</tt><tt class="py-op">.</tt><tt class="py-name">download</tt><tt class="py-op">(</tt><tt class="py-string">'reuters'</tt><tt class="py-op">)</tt> </tt>
  197. <a name="L113"></a><tt class="py-lineno">113</tt> <tt class="py-line"> </tt>
  198. <a name="L114"></a><tt class="py-lineno">114</tt> <tt class="py-line"> <tt class="py-name">corpus</tt> <tt class="py-op">=</tt> <tt class="py-name">corpus</tt> <tt class="py-keyword">or</tt> <tt class="py-name">nltk</tt><tt class="py-op">.</tt><tt class="py-name">corpus</tt><tt class="py-op">.</tt><tt class="py-name">reuters</tt> </tt>
  199. <a name="L115"></a><tt class="py-lineno">115</tt> <tt class="py-line"> <tt class="py-name">stopwords</tt> <tt class="py-op">=</tt> <tt class="py-name">stopwords</tt> <tt class="py-keyword">or</tt> <tt class="py-name">nltk</tt><tt class="py-op">.</tt><tt class="py-name">corpus</tt><tt class="py-op">.</tt><tt class="py-name">reuters</tt><tt class="py-op">.</tt><tt class="py-name">words</tt><tt class="py-op">(</tt><tt class="py-string">'stopwords'</tt><tt class="py-op">)</tt> </tt>
  200. <a name="L116"></a><tt class="py-lineno">116</tt> <tt class="py-line"> </tt>
  201. <a name="L117"></a><tt class="py-lineno">117</tt> <tt class="py-line"> <tt class="py-name">corpus_list</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-op">]</tt> </tt>
  202. <a name="L118"></a><tt class="py-lineno">118</tt> <tt class="py-line"> </tt>
  203. <a name="L119"></a><tt class="py-lineno">119</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">verbose</tt><tt class="py-op">:</tt> <tt class="py-keyword">print</tt> <tt class="py-string">'Processing corpus...'</tt> </tt>
  204. <a name="L120"></a><tt class="py-lineno">120</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">file</tt> <tt class="py-keyword">in</tt> <tt class="py-name">corpus</tt><tt class="py-op">.</tt><tt class="py-name">fileids</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  205. <a name="L121"></a><tt class="py-lineno">121</tt> <tt class="py-line"> <tt class="py-name">doc</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-name">stemmer</tt><tt class="py-op">(</tt><tt id="link-16" class="py-name"><a title="tagger.tagger.Tag" class="py-name" href="#" onclick="return doclink('link-16', 'Tag', 'link-9');">Tag</a></tt><tt class="py-op">(</tt><tt class="py-name">w</tt><tt class="py-op">.</tt><tt class="py-name">lower</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt> <tt class="py-keyword">in</tt> <tt class="py-name">corpus</tt><tt class="py-op">.</tt><tt class="py-name">words</tt><tt class="py-op">(</tt><tt class="py-name">file</tt><tt class="py-op">)</tt> </tt>
  206. <a name="L122"></a><tt class="py-lineno">122</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">w</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">isalpha</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">]</tt> </tt>
  207. <a name="L123"></a><tt class="py-lineno">123</tt> <tt class="py-line"> <tt class="py-name">corpus_list</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt class="py-name">doc</tt><tt class="py-op">)</tt> </tt>
  208. <a name="L124"></a><tt class="py-lineno">124</tt> <tt class="py-line"> </tt>
  209. <a name="L125"></a><tt class="py-lineno">125</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">verbose</tt><tt class="py-op">:</tt> <tt class="py-keyword">print</tt> <tt class="py-string">'Processing stopwords...'</tt> </tt>
  210. <a name="L126"></a><tt class="py-lineno">126</tt> <tt class="py-line"> <tt class="py-name">stopwords</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-name">stemmer</tt><tt class="py-op">(</tt><tt id="link-17" class="py-name"><a title="tagger.tagger.Tag" class="py-name" href="#" onclick="return doclink('link-17', 'Tag', 'link-9');">Tag</a></tt><tt class="py-op">(</tt><tt class="py-name">w</tt><tt class="py-op">.</tt><tt class="py-name">lower</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt> <tt class="py-keyword">in</tt> <tt class="py-name">stopwords</tt><tt class="py-op">]</tt> </tt>
  211. <a name="L127"></a><tt class="py-lineno">127</tt> <tt class="py-line"> </tt>
  212. <a name="L128"></a><tt class="py-lineno">128</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">verbose</tt><tt class="py-op">:</tt> <tt class="py-keyword">print</tt> <tt class="py-string">'Building dictionary... '</tt> </tt>
  213. <a name="L129"></a><tt class="py-lineno">129</tt> <tt class="py-line"> <tt class="py-name">dictionary</tt> <tt class="py-op">=</tt> <tt id="link-18" class="py-name"><a title="tagger.build_dict
  214. tagger.build_dict.build_dict" class="py-name" href="#" onclick="return doclink('link-18', 'build_dict', 'link-14');">build_dict</a></tt><tt class="py-op">(</tt><tt class="py-name">corpus_list</tt><tt class="py-op">,</tt> <tt class="py-name">stopwords</tt><tt class="py-op">,</tt> <tt class="py-name">measure</tt><tt class="py-op">)</tt> </tt>
  215. <a name="L130"></a><tt class="py-lineno">130</tt> <tt class="py-line"> <tt class="py-keyword">with</tt> <tt class="py-name">open</tt><tt class="py-op">(</tt><tt class="py-name">output_file</tt><tt class="py-op">,</tt> <tt class="py-string">'wb'</tt><tt class="py-op">)</tt> <tt class="py-keyword">as</tt> <tt class="py-name">out</tt><tt class="py-op">:</tt> </tt>
  216. <a name="L131"></a><tt class="py-lineno">131</tt> <tt class="py-line"> <tt class="py-name">pickle</tt><tt class="py-op">.</tt><tt class="py-name">dump</tt><tt class="py-op">(</tt><tt class="py-name">dictionary</tt><tt class="py-op">,</tt> <tt class="py-name">out</tt><tt class="py-op">,</tt> <tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">)</tt> </tt>
  217. </div><a name="L132"></a><tt class="py-lineno">132</tt> <tt class="py-line"> </tt><script type="text/javascript">
  218. <!--
  219. expandto(location.href);
  220. // -->
  221. </script>
  222. </pre>
  223. <br />
  224. <!-- ==================== NAVIGATION BAR ==================== -->
  225. <table class="navbar" border="0" width="100%" cellpadding="0"
  226. bgcolor="#a0c0ff" cellspacing="0">
  227. <tr valign="middle">
  228. <!-- Tree link -->
  229. <th>&nbsp;&nbsp;&nbsp;<a
  230. href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
  231. <!-- Index link -->
  232. <th>&nbsp;&nbsp;&nbsp;<a
  233. href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
  234. <!-- Help link -->
  235. <th>&nbsp;&nbsp;&nbsp;<a
  236. href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
  237. <!-- Project homepage -->
  238. <th class="navbar" align="right" width="100%">
  239. <table border="0" cellpadding="0" cellspacing="0">
  240. <tr><th class="navbar" align="center"
  241. ><a class="navbar" target="_top" href="http://github.com/apresta/tagger">tagger</a></th>
  242. </tr></table></th>
  243. </tr>
  244. </table>
  245. <table border="0" cellpadding="0" cellspacing="0" width="100%">
  246. <tr>
  247. <td align="left" class="footer">
  248. Generated by Epydoc 3.0.1 on Wed Jun 8 01:57:46 2011
  249. </td>
  250. <td align="right" class="footer">
  251. <a target="mainFrame" href="http://epydoc.sourceforge.net"
  252. >http://epydoc.sourceforge.net</a>
  253. </td>
  254. </tr>
  255. </table>
  256. <script type="text/javascript">
  257. <!--
  258. // Private objects are initially displayed (because if
  259. // javascript is turned off then we want them to be
  260. // visible); but by default, we want to hide them. So hide
  261. // them unless we have a cookie that says to show them.
  262. checkCookie();
  263. // -->
  264. </script>
  265. </body>
  266. </html>