PageRenderTime 736ms CodeModel.GetById 29ms RepoModel.GetById 7ms app.codeStats 1ms

/doc/tagger-pysrc.html

http://github.com/apresta/tagger
HTML | 350 lines | 339 code | 5 blank | 6 comment | 0 complexity | dd3aaf02d228b0ce7e5e9792458d22b6 MD5 | raw file
  1. <?xml version="1.0" encoding="ascii"?>
  2. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  3. "DTD/xhtml1-transitional.dtd">
  4. <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  5. <head>
  6. <title>tagger</title>
  7. <link rel="stylesheet" href="epydoc.css" type="text/css" />
  8. <script type="text/javascript" src="epydoc.js"></script>
  9. </head>
  10. <body bgcolor="white" text="black" link="blue" vlink="#204080"
  11. alink="#204080">
  12. <!-- ==================== NAVIGATION BAR ==================== -->
  13. <table class="navbar" border="0" width="100%" cellpadding="0"
  14. bgcolor="#a0c0ff" cellspacing="0">
  15. <tr valign="middle">
  16. <!-- Tree link -->
  17. <th>&nbsp;&nbsp;&nbsp;<a
  18. href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
  19. <!-- Index link -->
  20. <th>&nbsp;&nbsp;&nbsp;<a
  21. href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
  22. <!-- Help link -->
  23. <th>&nbsp;&nbsp;&nbsp;<a
  24. href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
  25. <!-- Project homepage -->
  26. <th class="navbar" align="right" width="100%">
  27. <table border="0" cellpadding="0" cellspacing="0">
  28. <tr><th class="navbar" align="center"
  29. ><a class="navbar" target="_top" href="http://github.com/apresta/tagger">tagger</a></th>
  30. </tr></table></th>
  31. </tr>
  32. </table>
  33. <table width="100%" cellpadding="0" cellspacing="0">
  34. <tr valign="top">
  35. <td width="100%">
  36. <span class="breadcrumbs">
  37. Module&nbsp;tagger
  38. </span>
  39. </td>
  40. <td>
  41. <table cellpadding="0" cellspacing="0">
  42. <!-- hide/show private -->
  43. <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
  44. onclick="toggle_private();">hide&nbsp;private</a>]</span></td></tr>
  45. <tr><td align="right"><span class="options"
  46. >[<a href="frames.html" target="_top">frames</a
  47. >]&nbsp;|&nbsp;[<a href="tagger-pysrc.html"
  48. target="_top">no&nbsp;frames</a>]</span></td></tr>
  49. </table>
  50. </td>
  51. </tr>
  52. </table>
  53. <h1 class="epydoc">Source Code for <a href="tagger-module.html">Module tagger</a></h1>
  54. <pre class="py-src">
  55. <a name="L1"></a><tt class="py-lineno"> 1</tt> <tt class="py-line"><tt class="py-comment">#!/usr/bin/env python</tt> </tt>
  56. <a name="L2"></a><tt class="py-lineno"> 2</tt> <tt class="py-line"><tt class="py-comment"># -*- coding: utf-8 -*-</tt> </tt>
  57. <a name="L3"></a><tt class="py-lineno"> 3</tt> <tt class="py-line"> </tt>
  58. <a name="L4"></a><tt class="py-lineno"> 4</tt> <tt class="py-line"><tt class="py-comment"># Copyright (C) 2011 by Alessandro Presta</tt> </tt>
  59. <a name="L5"></a><tt class="py-lineno"> 5</tt> <tt class="py-line"> </tt>
  60. <a name="L6"></a><tt class="py-lineno"> 6</tt> <tt class="py-line"><tt class="py-comment"># Permission is hereby granted, free of charge, to any person obtaining a copy</tt> </tt>
  61. <a name="L7"></a><tt class="py-lineno"> 7</tt> <tt class="py-line"><tt class="py-comment"># of this software and associated documentation files (the "Software"), to deal</tt> </tt>
  62. <a name="L8"></a><tt class="py-lineno"> 8</tt> <tt class="py-line"><tt class="py-comment"># in the Software without restriction, including without limitation the rights</tt> </tt>
  63. <a name="L9"></a><tt class="py-lineno"> 9</tt> <tt class="py-line"><tt class="py-comment"># to use, copy, modify, merge, publish, distribute, sublicense, and/or sell</tt> </tt>
  64. <a name="L10"></a><tt class="py-lineno"> 10</tt> <tt class="py-line"><tt class="py-comment"># copies of the Software, and to permit persons to whom the Software is</tt> </tt>
  65. <a name="L11"></a><tt class="py-lineno"> 11</tt> <tt class="py-line"><tt class="py-comment"># furnished to do so, subject to the following conditions:</tt> </tt>
  66. <a name="L12"></a><tt class="py-lineno"> 12</tt> <tt class="py-line"> </tt>
  67. <a name="L13"></a><tt class="py-lineno"> 13</tt> <tt class="py-line"><tt class="py-comment"># The above copyright notice and this permission notice shall be included in</tt> </tt>
  68. <a name="L14"></a><tt class="py-lineno"> 14</tt> <tt class="py-line"><tt class="py-comment"># all copies or substantial portions of the Software.</tt> </tt>
  69. <a name="L15"></a><tt class="py-lineno"> 15</tt> <tt class="py-line"> </tt>
  70. <a name="L16"></a><tt class="py-lineno"> 16</tt> <tt class="py-line"><tt class="py-comment"># THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR</tt> </tt>
  71. <a name="L17"></a><tt class="py-lineno"> 17</tt> <tt class="py-line"><tt class="py-comment"># IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,</tt> </tt>
  72. <a name="L18"></a><tt class="py-lineno"> 18</tt> <tt class="py-line"><tt class="py-comment"># FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE</tt> </tt>
  73. <a name="L19"></a><tt class="py-lineno"> 19</tt> <tt class="py-line"><tt class="py-comment"># AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER</tt> </tt>
  74. <a name="L20"></a><tt class="py-lineno"> 20</tt> <tt class="py-line"><tt class="py-comment"># LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,</tt> </tt>
  75. <a name="L21"></a><tt class="py-lineno"> 21</tt> <tt class="py-line"><tt class="py-comment"># OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN</tt> </tt>
  76. <a name="L22"></a><tt class="py-lineno"> 22</tt> <tt class="py-line"><tt class="py-comment"># THE SOFTWARE</tt> </tt>
  77. <a name="L23"></a><tt class="py-lineno"> 23</tt> <tt class="py-line"> </tt>
  78. <a name="L24"></a><tt class="py-lineno"> 24</tt> <tt class="py-line"> </tt>
  79. <a name="L25"></a><tt class="py-lineno"> 25</tt> <tt class="py-line"><tt class="py-docstring">'''</tt> </tt>
  80. <a name="L26"></a><tt class="py-lineno"> 26</tt> <tt class="py-line"><tt class="py-docstring">======</tt> </tt>
  81. <a name="L27"></a><tt class="py-lineno"> 27</tt> <tt class="py-line"><tt class="py-docstring">tagger</tt> </tt>
  82. <a name="L28"></a><tt class="py-lineno"> 28</tt> <tt class="py-line"><tt class="py-docstring">======</tt> </tt>
  83. <a name="L29"></a><tt class="py-lineno"> 29</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  84. <a name="L30"></a><tt class="py-lineno"> 30</tt> <tt class="py-line"><tt class="py-docstring">Module for extracting tags from text documents.</tt> </tt>
  85. <a name="L31"></a><tt class="py-lineno"> 31</tt> <tt class="py-line"><tt class="py-docstring"> </tt> </tt>
  86. <a name="L32"></a><tt class="py-lineno"> 32</tt> <tt class="py-line"><tt class="py-docstring">Copyright (C) 2011 by Alessandro Presta</tt> </tt>
  87. <a name="L33"></a><tt class="py-lineno"> 33</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  88. <a name="L34"></a><tt class="py-lineno"> 34</tt> <tt class="py-line"><tt class="py-docstring">Configuration</tt> </tt>
  89. <a name="L35"></a><tt class="py-lineno"> 35</tt> <tt class="py-line"><tt class="py-docstring">=============</tt> </tt>
  90. <a name="L36"></a><tt class="py-lineno"> 36</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  91. <a name="L37"></a><tt class="py-lineno"> 37</tt> <tt class="py-line"><tt class="py-docstring">Dependencies:</tt> </tt>
  92. <a name="L38"></a><tt class="py-lineno"> 38</tt> <tt class="py-line"><tt class="py-docstring">python2.7, stemming, nltk (optional), lxml (optional), tkinter (optional)</tt> </tt>
  93. <a name="L39"></a><tt class="py-lineno"> 39</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  94. <a name="L40"></a><tt class="py-lineno"> 40</tt> <tt class="py-line"><tt class="py-docstring">You can install the stemming package with::</tt> </tt>
  95. <a name="L41"></a><tt class="py-lineno"> 41</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  96. <a name="L42"></a><tt class="py-lineno"> 42</tt> <tt class="py-line"><tt class="py-docstring"> $ easy_install stemming</tt> </tt>
  97. <a name="L43"></a><tt class="py-lineno"> 43</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  98. <a name="L44"></a><tt class="py-lineno"> 44</tt> <tt class="py-line"><tt class="py-docstring">Usage</tt> </tt>
  99. <a name="L45"></a><tt class="py-lineno"> 45</tt> <tt class="py-line"><tt class="py-docstring">=====</tt> </tt>
  100. <a name="L46"></a><tt class="py-lineno"> 46</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  101. <a name="L47"></a><tt class="py-lineno"> 47</tt> <tt class="py-line"><tt class="py-docstring">Tagging a text document from Python::</tt> </tt>
  102. <a name="L48"></a><tt class="py-lineno"> 48</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  103. <a name="L49"></a><tt class="py-lineno"> 49</tt> <tt class="py-line"><tt class="py-docstring"> import tagger</tt> </tt>
  104. <a name="L50"></a><tt class="py-lineno"> 50</tt> <tt class="py-line"><tt class="py-docstring"> weights = pickle.load(open('data/dict.pkl', 'rb')) # or your own dictionary</tt> </tt>
  105. <a name="L51"></a><tt class="py-lineno"> 51</tt> <tt class="py-line"><tt class="py-docstring"> myreader = tagger.Reader() # or your own reader class</tt> </tt>
  106. <a name="L52"></a><tt class="py-lineno"> 52</tt> <tt class="py-line"><tt class="py-docstring"> mystemmer = tagger.Stemmer() # or your own stemmer class</tt> </tt>
  107. <a name="L53"></a><tt class="py-lineno"> 53</tt> <tt class="py-line"><tt class="py-docstring"> myrater = tagger.Rater(weights) # or your own... (you get the idea)</tt> </tt>
  108. <a name="L54"></a><tt class="py-lineno"> 54</tt> <tt class="py-line"><tt class="py-docstring"> mytagger = tagger.Tagger(myreader, mystemmer, myrater)</tt> </tt>
  109. <a name="L55"></a><tt class="py-lineno"> 55</tt> <tt class="py-line"><tt class="py-docstring"> best_3_tags = mytagger(text_string, 3)</tt> </tt>
  110. <a name="L56"></a><tt class="py-lineno"> 56</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  111. <a name="L57"></a><tt class="py-lineno"> 57</tt> <tt class="py-line"><tt class="py-docstring">Running the module as a script::</tt> </tt>
  112. <a name="L58"></a><tt class="py-lineno"> 58</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  113. <a name="L59"></a><tt class="py-lineno"> 59</tt> <tt class="py-line"><tt class="py-docstring"> $ ./tagger.py &lt;text document(s) to tag&gt;</tt> </tt>
  114. <a name="L60"></a><tt class="py-lineno"> 60</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  115. <a name="L61"></a><tt class="py-lineno"> 61</tt> <tt class="py-line"><tt class="py-docstring">Example::</tt> </tt>
  116. <a name="L62"></a><tt class="py-lineno"> 62</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  117. <a name="L63"></a><tt class="py-lineno"> 63</tt> <tt class="py-line"><tt class="py-docstring"> $ ./tagger.py tests/*</tt> </tt>
  118. <a name="L64"></a><tt class="py-lineno"> 64</tt> <tt class="py-line"><tt class="py-docstring"> Loading dictionary... </tt> </tt>
  119. <a name="L65"></a><tt class="py-lineno"> 65</tt> <tt class="py-line"><tt class="py-docstring"> Tags for tests/bbc1.txt :</tt> </tt>
  120. <a name="L66"></a><tt class="py-lineno"> 66</tt> <tt class="py-line"><tt class="py-docstring"> ['bin laden', 'obama', 'pakistan', 'killed', 'raid']</tt> </tt>
  121. <a name="L67"></a><tt class="py-lineno"> 67</tt> <tt class="py-line"><tt class="py-docstring"> Tags for tests/bbc2.txt :</tt> </tt>
  122. <a name="L68"></a><tt class="py-lineno"> 68</tt> <tt class="py-line"><tt class="py-docstring"> ['jo yeates', 'bristol', 'vincent tabak', 'murder', 'strangled']</tt> </tt>
  123. <a name="L69"></a><tt class="py-lineno"> 69</tt> <tt class="py-line"><tt class="py-docstring"> Tags for tests/bbc3.txt :</tt> </tt>
  124. <a name="L70"></a><tt class="py-lineno"> 70</tt> <tt class="py-line"><tt class="py-docstring"> ['snp', 'party', 'election', 'scottish', 'labour']</tt> </tt>
  125. <a name="L71"></a><tt class="py-lineno"> 71</tt> <tt class="py-line"><tt class="py-docstring"> Tags for tests/guardian1.txt :</tt> </tt>
  126. <a name="L72"></a><tt class="py-lineno"> 72</tt> <tt class="py-line"><tt class="py-docstring"> ['bin laden', 'al-qaida', 'killed', 'pakistan', 'al-fawwaz']</tt> </tt>
  127. <a name="L73"></a><tt class="py-lineno"> 73</tt> <tt class="py-line"><tt class="py-docstring"> Tags for tests/guardian2.txt :</tt> </tt>
  128. <a name="L74"></a><tt class="py-lineno"> 74</tt> <tt class="py-line"><tt class="py-docstring"> ['clegg', 'tory', 'lib dem', 'party', 'coalition']</tt> </tt>
  129. <a name="L75"></a><tt class="py-lineno"> 75</tt> <tt class="py-line"><tt class="py-docstring"> Tags for tests/post1.txt :</tt> </tt>
  130. <a name="L76"></a><tt class="py-lineno"> 76</tt> <tt class="py-line"><tt class="py-docstring"> ['sony', 'stolen', 'playstation network', 'hacker attack', 'lawsuit']</tt> </tt>
  131. <a name="L77"></a><tt class="py-lineno"> 77</tt> <tt class="py-line"><tt class="py-docstring"> Tags for tests/wikipedia1.txt :</tt> </tt>
  132. <a name="L78"></a><tt class="py-lineno"> 78</tt> <tt class="py-line"><tt class="py-docstring"> ['universe', 'anthropic principle', 'observed', 'cosmological', 'theory']</tt> </tt>
  133. <a name="L79"></a><tt class="py-lineno"> 79</tt> <tt class="py-line"><tt class="py-docstring"> Tags for tests/wikipedia2.txt :</tt> </tt>
  134. <a name="L80"></a><tt class="py-lineno"> 80</tt> <tt class="py-line"><tt class="py-docstring"> ['beetroot', 'beet', 'betaine', 'blood pressure', 'dietary nitrate']</tt> </tt>
  135. <a name="L81"></a><tt class="py-lineno"> 81</tt> <tt class="py-line"><tt class="py-docstring"> Tags for tests/wikipedia3.txt :</tt> </tt>
  136. <a name="L82"></a><tt class="py-lineno"> 82</tt> <tt class="py-line"><tt class="py-docstring"> ['the lounge lizards', 'jazz', 'john lurie', 'musical', 'albums']</tt> </tt>
  137. <a name="L83"></a><tt class="py-lineno"> 83</tt> <tt class="py-line"><tt class="py-docstring">'''</tt> </tt>
  138. <a name="L84"></a><tt class="py-lineno"> 84</tt> <tt class="py-line"> </tt>
  139. <a name="L85"></a><tt class="py-lineno"> 85</tt> <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">collections</tt> </tt>
  140. <a name="L86"></a><tt class="py-lineno"> 86</tt> <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">re</tt> </tt>
  141. <a name="L87"></a><tt class="py-lineno"> 87</tt> <tt class="py-line"> </tt>
  142. <a name="L88"></a><tt class="py-lineno"> 88</tt> <tt class="py-line"> </tt>
  143. <a name="Tag"></a><div id="Tag-def"><a name="L89"></a><tt class="py-lineno"> 89</tt> <a class="py-toggle" href="#" id="Tag-toggle" onclick="return toggle('Tag');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="tagger.Tag-class.html">Tag</a><tt class="py-op">:</tt> </tt>
  144. </div><div id="Tag-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="Tag-expanded"><a name="L90"></a><tt class="py-lineno"> 90</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  145. <a name="L91"></a><tt class="py-lineno"> 91</tt> <tt class="py-line"><tt class="py-docstring"> General class for tags (small units of text)</tt> </tt>
  146. <a name="L92"></a><tt class="py-lineno"> 92</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  147. <a name="L93"></a><tt class="py-lineno"> 93</tt> <tt class="py-line"> </tt>
  148. <a name="Tag.__init__"></a><div id="Tag.__init__-def"><a name="L94"></a><tt class="py-lineno"> 94</tt> <a class="py-toggle" href="#" id="Tag.__init__-toggle" onclick="return toggle('Tag.__init__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.Tag-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">string</tt><tt class="py-op">,</tt> <tt class="py-param">stem</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">,</tt> <tt class="py-param">rating</tt><tt class="py-op">=</tt><tt class="py-number">1.0</tt><tt class="py-op">,</tt> <tt class="py-param">proper</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">,</tt> </tt>
  149. <a name="L95"></a><tt class="py-lineno"> 95</tt> <tt class="py-line"> <tt class="py-param">terminal</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  150. </div><div id="Tag.__init__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="Tag.__init__-expanded"><a name="L96"></a><tt class="py-lineno"> 96</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  151. <a name="L97"></a><tt class="py-lineno"> 97</tt> <tt class="py-line"><tt class="py-docstring"> @param string: the actual representation of the tag</tt> </tt>
  152. <a name="L98"></a><tt class="py-lineno"> 98</tt> <tt class="py-line"><tt class="py-docstring"> @param stem: the internal (usually stemmed) representation;</tt> </tt>
  153. <a name="L99"></a><tt class="py-lineno"> 99</tt> <tt class="py-line"><tt class="py-docstring"> tags with the same stem are regarded as equal</tt> </tt>
  154. <a name="L100"></a><tt class="py-lineno">100</tt> <tt class="py-line"><tt class="py-docstring"> @param rating: a measure of the tag's relevance in the interval [0,1]</tt> </tt>
  155. <a name="L101"></a><tt class="py-lineno">101</tt> <tt class="py-line"><tt class="py-docstring"> @param proper: whether the tag is a proper noun</tt> </tt>
  156. <a name="L102"></a><tt class="py-lineno">102</tt> <tt class="py-line"><tt class="py-docstring"> @param terminal: set to True if the tag is at the end of a phrase</tt> </tt>
  157. <a name="L103"></a><tt class="py-lineno">103</tt> <tt class="py-line"><tt class="py-docstring"> (or otherwise cannot be logically merged with the</tt> </tt>
  158. <a name="L104"></a><tt class="py-lineno">104</tt> <tt class="py-line"><tt class="py-docstring"> following one)</tt> </tt>
  159. <a name="L105"></a><tt class="py-lineno">105</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  160. <a name="L106"></a><tt class="py-lineno">106</tt> <tt class="py-line"><tt class="py-docstring"> @returns: a new L{Tag} object</tt> </tt>
  161. <a name="L107"></a><tt class="py-lineno">107</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  162. <a name="L108"></a><tt class="py-lineno">108</tt> <tt class="py-line"> </tt>
  163. <a name="L109"></a><tt class="py-lineno">109</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">string</tt> <tt class="py-op">=</tt> <tt class="py-name">string</tt> </tt>
  164. <a name="L110"></a><tt class="py-lineno">110</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt> <tt class="py-op">=</tt> <tt class="py-name">stem</tt> <tt class="py-keyword">or</tt> <tt class="py-name">string</tt> </tt>
  165. <a name="L111"></a><tt class="py-lineno">111</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">rating</tt> <tt class="py-op">=</tt> <tt class="py-name">rating</tt> </tt>
  166. <a name="L112"></a><tt class="py-lineno">112</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">proper</tt> <tt class="py-op">=</tt> <tt class="py-name">proper</tt> </tt>
  167. <a name="L113"></a><tt class="py-lineno">113</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">terminal</tt> <tt class="py-op">=</tt> <tt class="py-name">terminal</tt> </tt>
  168. </div><a name="L114"></a><tt class="py-lineno">114</tt> <tt class="py-line"> </tt>
  169. <a name="Tag.__eq__"></a><div id="Tag.__eq__-def"><a name="L115"></a><tt class="py-lineno">115</tt> <a class="py-toggle" href="#" id="Tag.__eq__-toggle" onclick="return toggle('Tag.__eq__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.Tag-class.html#__eq__">__eq__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  170. </div><div id="Tag.__eq__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="Tag.__eq__-expanded"><a name="L116"></a><tt class="py-lineno">116</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt> <tt class="py-op">==</tt> <tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt> </tt>
  171. </div><a name="L117"></a><tt class="py-lineno">117</tt> <tt class="py-line"> </tt>
  172. <a name="Tag.__repr__"></a><div id="Tag.__repr__-def"><a name="L118"></a><tt class="py-lineno">118</tt> <a class="py-toggle" href="#" id="Tag.__repr__-toggle" onclick="return toggle('Tag.__repr__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.Tag-class.html#__repr__">__repr__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  173. </div><div id="Tag.__repr__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="Tag.__repr__-expanded"><a name="L119"></a><tt class="py-lineno">119</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">repr</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">string</tt><tt class="py-op">)</tt> </tt>
  174. </div><a name="L120"></a><tt class="py-lineno">120</tt> <tt class="py-line"> </tt>
  175. <a name="Tag.__lt__"></a><div id="Tag.__lt__-def"><a name="L121"></a><tt class="py-lineno">121</tt> <a class="py-toggle" href="#" id="Tag.__lt__-toggle" onclick="return toggle('Tag.__lt__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.Tag-class.html#__lt__">__lt__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  176. </div><div id="Tag.__lt__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="Tag.__lt__-expanded"><a name="L122"></a><tt class="py-lineno">122</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">rating</tt> <tt class="py-op">&gt;</tt> <tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">rating</tt> </tt>
  177. </div><a name="L123"></a><tt class="py-lineno">123</tt> <tt class="py-line"> </tt>
  178. <a name="Tag.__hash__"></a><div id="Tag.__hash__-def"><a name="L124"></a><tt class="py-lineno">124</tt> <a class="py-toggle" href="#" id="Tag.__hash__-toggle" onclick="return toggle('Tag.__hash__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.Tag-class.html#__hash__">__hash__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  179. </div><div id="Tag.__hash__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="Tag.__hash__-expanded"><a name="L125"></a><tt class="py-lineno">125</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">hash</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt><tt class="py-op">)</tt> </tt>
  180. </div></div><a name="L126"></a><tt class="py-lineno">126</tt> <tt class="py-line"> </tt>
  181. <a name="L127"></a><tt class="py-lineno">127</tt> <tt class="py-line"> </tt>
  182. <a name="MultiTag"></a><div id="MultiTag-def"><a name="L128"></a><tt class="py-lineno">128</tt> <a class="py-toggle" href="#" id="MultiTag-toggle" onclick="return toggle('MultiTag');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="tagger.MultiTag-class.html">MultiTag</a><tt class="py-op">(</tt><tt class="py-base-class">Tag</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  183. </div><div id="MultiTag-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="MultiTag-expanded"><a name="L129"></a><tt class="py-lineno">129</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  184. <a name="L130"></a><tt class="py-lineno">130</tt> <tt class="py-line"><tt class="py-docstring"> Class for aggregates of tags (usually next to each other in the document)</tt> </tt>
  185. <a name="L131"></a><tt class="py-lineno">131</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  186. <a name="L132"></a><tt class="py-lineno">132</tt> <tt class="py-line"> </tt>
  187. <a name="MultiTag.__init__"></a><div id="MultiTag.__init__-def"><a name="L133"></a><tt class="py-lineno">133</tt> <a class="py-toggle" href="#" id="MultiTag.__init__-toggle" onclick="return toggle('MultiTag.__init__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.MultiTag-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">tail</tt><tt class="py-op">,</tt> <tt class="py-param">head</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  188. </div><div id="MultiTag.__init__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="MultiTag.__init__-expanded"><a name="L134"></a><tt class="py-lineno">134</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  189. <a name="L135"></a><tt class="py-lineno">135</tt> <tt class="py-line"><tt class="py-docstring"> @param tail: the L{Tag} object to add to the first part (head)</tt> </tt>
  190. <a name="L136"></a><tt class="py-lineno">136</tt> <tt class="py-line"><tt class="py-docstring"> @param head: the (possibly absent) L{MultiTag} to be extended</tt> </tt>
  191. <a name="L137"></a><tt class="py-lineno">137</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  192. <a name="L138"></a><tt class="py-lineno">138</tt> <tt class="py-line"><tt class="py-docstring"> @returns: a new L{MultiTag} object</tt> </tt>
  193. <a name="L139"></a><tt class="py-lineno">139</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  194. <a name="L140"></a><tt class="py-lineno">140</tt> <tt class="py-line"> </tt>
  195. <a name="L141"></a><tt class="py-lineno">141</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">head</tt><tt class="py-op">:</tt> </tt>
  196. <a name="L142"></a><tt class="py-lineno">142</tt> <tt class="py-line"> <tt id="link-0" class="py-name" targets="Class tagger.Tag=tagger.Tag-class.html"><a title="tagger.Tag" class="py-name" href="#" onclick="return doclink('link-0', 'Tag', 'link-0');">Tag</a></tt><tt class="py-op">.</tt><tt id="link-1" class="py-name" targets="Method extras.FastStemmer.__init__()=extras.FastStemmer-class.html#__init__,Method tagger.MultiTag.__init__()=tagger.MultiTag-class.html#__init__,Method tagger.Rater.__init__()=tagger.Rater-class.html#__init__,Method tagger.Stemmer.__init__()=tagger.Stemmer-class.html#__init__,Method tagger.Tag.__init__()=tagger.Tag-class.html#__init__,Method tagger.Tagger.__init__()=tagger.Tagger-class.html#__init__"><a title="extras.FastStemmer.__init__
  197. tagger.MultiTag.__init__
  198. tagger.Rater.__init__
  199. tagger.Stemmer.__init__
  200. tagger.Tag.__init__
  201. tagger.Tagger.__init__" class="py-name" href="#" onclick="return doclink('link-1', '__init__', 'link-1');">__init__</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">tail</tt><tt class="py-op">.</tt><tt class="py-name">string</tt><tt class="py-op">,</tt> <tt class="py-name">tail</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt><tt class="py-op">,</tt> <tt class="py-name">tail</tt><tt class="py-op">.</tt><tt class="py-name">rating</tt><tt class="py-op">,</tt> </tt>
  202. <a name="L143"></a><tt class="py-lineno">143</tt> <tt class="py-line"> <tt class="py-name">tail</tt><tt class="py-op">.</tt><tt class="py-name">proper</tt><tt class="py-op">,</tt> <tt class="py-name">tail</tt><tt class="py-op">.</tt><tt class="py-name">terminal</tt><tt class="py-op">)</tt> </tt>
  203. <a name="L144"></a><tt class="py-lineno">144</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">size</tt> <tt class="py-op">=</tt> <tt class="py-number">1</tt> </tt>
  204. <a name="L145"></a><tt class="py-lineno">145</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">subratings</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">rating</tt><tt class="py-op">]</tt> </tt>
  205. <a name="L146"></a><tt class="py-lineno">146</tt> <tt class="py-line"> <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
  206. <a name="L147"></a><tt class="py-lineno">147</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">string</tt> <tt class="py-op">=</tt> <tt class="py-string">' '</tt><tt class="py-op">.</tt><tt class="py-name">join</tt><tt class="py-op">(</tt><tt class="py-op">[</tt><tt class="py-name">head</tt><tt class="py-op">.</tt><tt class="py-name">string</tt><tt class="py-op">,</tt> <tt class="py-name">tail</tt><tt class="py-op">.</tt><tt class="py-name">string</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> </tt>
  207. <a name="L148"></a><tt class="py-lineno">148</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt> <tt class="py-op">=</tt> <tt class="py-string">' '</tt><tt class="py-op">.</tt><tt class="py-name">join</tt><tt class="py-op">(</tt><tt class="py-op">[</tt><tt class="py-name">head</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt><tt class="py-op">,</tt> <tt class="py-name">tail</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> </tt>
  208. <a name="L149"></a><tt class="py-lineno">149</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">size</tt> <tt class="py-op">=</tt> <tt class="py-name">head</tt><tt class="py-op">.</tt><tt class="py-name">size</tt> <tt class="py-op">+</tt> <tt class="py-number">1</tt> </tt>
  209. <a name="L150"></a><tt class="py-lineno">150</tt> <tt class="py-line"> </tt>
  210. <a name="L151"></a><tt class="py-lineno">151</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">proper</tt> <tt class="py-op">=</tt> <tt class="py-op">(</tt><tt class="py-name">head</tt><tt class="py-op">.</tt><tt class="py-name">proper</tt> <tt class="py-keyword">and</tt> <tt class="py-name">tail</tt><tt class="py-op">.</tt><tt class="py-name">proper</tt><tt class="py-op">)</tt> </tt>
  211. <a name="L152"></a><tt class="py-lineno">152</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">terminal</tt> <tt class="py-op">=</tt> <tt class="py-name">tail</tt><tt class="py-op">.</tt><tt class="py-name">terminal</tt> </tt>
  212. <a name="L153"></a><tt class="py-lineno">153</tt> <tt class="py-line"> </tt>
  213. <a name="L154"></a><tt class="py-lineno">154</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">subratings</tt> <tt class="py-op">=</tt> <tt class="py-name">head</tt><tt class="py-op">.</tt><tt class="py-name">subratings</tt> <tt class="py-op">+</tt> <tt class="py-op">[</tt><tt class="py-name">tail</tt><tt class="py-op">.</tt><tt class="py-name">rating</tt><tt class="py-op">]</tt> </tt>
  214. <a name="L155"></a><tt class="py-lineno">155</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">rating</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-2" class="py-name" targets="Method tagger.MultiTag.combined_rating()=tagger.MultiTag-class.html#combined_rating"><a title="tagger.MultiTag.combined_rating" class="py-name" href="#" onclick="return doclink('link-2', 'combined_rating', 'link-2');">combined_rating</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
  215. </div><a name="L156"></a><tt class="py-lineno">156</tt> <tt class="py-line"> </tt>
  216. <a name="MultiTag.combined_rating"></a><div id="MultiTag.combined_rating-def"><a name="L157"></a><tt class="py-lineno">157</tt> <a class="py-toggle" href="#" id="MultiTag.combined_rating-toggle" onclick="return toggle('MultiTag.combined_rating');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.MultiTag-class.html#combined_rating">combined_rating</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  217. </div><div id="MultiTag.combined_rating-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="MultiTag.combined_rating-expanded"><a name="L158"></a><tt class="py-lineno">158</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  218. <a name="L159"></a><tt class="py-lineno">159</tt> <tt class="py-line"><tt class="py-docstring"> Method that computes the multitag's rating from the ratings of unit</tt> </tt>
  219. <a name="L160"></a><tt class="py-lineno">160</tt> <tt class="py-line"><tt class="py-docstring"> subtags</tt> </tt>
  220. <a name="L161"></a><tt class="py-lineno">161</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  221. <a name="L162"></a><tt class="py-lineno">162</tt> <tt class="py-line"><tt class="py-docstring"> (the default implementation uses the geometric mean - with a special</tt> </tt>
  222. <a name="L163"></a><tt class="py-lineno">163</tt> <tt class="py-line"><tt class="py-docstring"> treatment for proper nouns - but this method can be overridden)</tt> </tt>
  223. <a name="L164"></a><tt class="py-lineno">164</tt> <tt class="py-line"><tt class="py-docstring"> </tt> </tt>
  224. <a name="L165"></a><tt class="py-lineno">165</tt> <tt class="py-line"><tt class="py-docstring"> @returns: the rating of the multitag</tt> </tt>
  225. <a name="L166"></a><tt class="py-lineno">166</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  226. <a name="L167"></a><tt class="py-lineno">167</tt> <tt class="py-line"> </tt>
  227. <a name="L168"></a><tt class="py-lineno">168</tt> <tt class="py-line"> <tt class="py-comment"># by default, the rating of a multitag is the geometric mean of its</tt> </tt>
  228. <a name="L169"></a><tt class="py-lineno">169</tt> <tt class="py-line"> <tt class="py-comment"># unit subtags' ratings</tt> </tt>
  229. <a name="L170"></a><tt class="py-lineno">170</tt> <tt class="py-line"> <tt class="py-name">product</tt> <tt class="py-op">=</tt> <tt class="py-name">reduce</tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">y</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt> <tt class="py-op">*</tt> <tt class="py-name">y</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">subratings</tt><tt class="py-op">,</tt> <tt class="py-number">1.0</tt><tt class="py-op">)</tt> </tt>
  230. <a name="L171"></a><tt class="py-lineno">171</tt> <tt class="py-line"> <tt class="py-name">root</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">size</tt> </tt>
  231. <a name="L172"></a><tt class="py-lineno">172</tt> <tt class="py-line"> </tt>
  232. <a name="L173"></a><tt class="py-lineno">173</tt> <tt class="py-line"> <tt class="py-comment"># but proper nouns shouldn't be penalized by stopwords</tt> </tt>
  233. <a name="L174"></a><tt class="py-lineno">174</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">product</tt> <tt class="py-op">==</tt> <tt class="py-number">0.0</tt> <tt class="py-keyword">and</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">proper</tt><tt class="py-op">:</tt> </tt>
  234. <a name="L175"></a><tt class="py-lineno">175</tt> <tt class="py-line"> <tt class="py-name">nonzero</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-name">r</tt> <tt class="py-keyword">for</tt> <tt class="py-name">r</tt> <tt class="py-keyword">in</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">subratings</tt> <tt class="py-keyword">if</tt> <tt class="py-name">r</tt> <tt class="py-op">&gt;</tt> <tt class="py-number">0.0</tt><tt class="py-op">]</tt> </tt>
  235. <a name="L176"></a><tt class="py-lineno">176</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">nonzero</tt><tt class="py-op">)</tt> <tt class="py-op">==</tt> <tt class="py-number">0</tt><tt class="py-op">:</tt> </tt>
  236. <a name="L177"></a><tt class="py-lineno">177</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-number">0.0</tt> </tt>
  237. <a name="L178"></a><tt class="py-lineno">178</tt> <tt class="py-line"> <tt class="py-name">product</tt> <tt class="py-op">=</tt> <tt class="py-name">reduce</tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">y</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt> <tt class="py-op">*</tt> <tt class="py-name">y</tt><tt class="py-op">,</tt> <tt class="py-name">nonzero</tt><tt class="py-op">,</tt> <tt class="py-number">1.0</tt><tt class="py-op">)</tt> </tt>
  238. <a name="L179"></a><tt class="py-lineno">179</tt> <tt class="py-line"> <tt class="py-name">root</tt> <tt class="py-op">=</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">nonzero</tt><tt class="py-op">)</tt> </tt>
  239. <a name="L180"></a><tt class="py-lineno">180</tt> <tt class="py-line"> </tt>
  240. <a name="L181"></a><tt class="py-lineno">181</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">product</tt> <tt class="py-op">**</tt> <tt class="py-op">(</tt><tt class="py-number">1.0</tt> <tt class="py-op">/</tt> <tt class="py-name">root</tt><tt class="py-op">)</tt> </tt>
  241. </div></div><a name="L182"></a><tt class="py-lineno">182</tt> <tt class="py-line"> </tt>
  242. <a name="L183"></a><tt class="py-lineno">183</tt> <tt class="py-line"> </tt>
  243. <a name="Reader"></a><div id="Reader-def"><a name="L184"></a><tt class="py-lineno">184</tt> <a class="py-toggle" href="#" id="Reader-toggle" onclick="return toggle('Reader');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="tagger.Reader-class.html">Reader</a><tt class="py-op">:</tt> </tt>
  244. </div><div id="Reader-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="Reader-expanded"><a name="L185"></a><tt class="py-lineno">185</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  245. <a name="L186"></a><tt class="py-lineno">186</tt> <tt class="py-line"><tt class="py-docstring"> Class for parsing a string of text to obtain tags</tt> </tt>
  246. <a name="L187"></a><tt class="py-lineno">187</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  247. <a name="L188"></a><tt class="py-lineno">188</tt> <tt class="py-line"><tt class="py-docstring"> (it just turns the string to lowercase and splits it according to</tt> </tt>
  248. <a name="L189"></a><tt class="py-lineno">189</tt> <tt class="py-line"><tt class="py-docstring"> whitespaces and punctuation, identifying proper nouns and terminal words;</tt> </tt>
  249. <a name="L190"></a><tt class="py-lineno">190</tt> <tt class="py-line"><tt class="py-docstring"> different rules and formats other than plain text could be used)</tt> </tt>
  250. <a name="L191"></a><tt class="py-lineno">191</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  251. <a name="L192"></a><tt class="py-lineno">192</tt> <tt class="py-line"> </tt>
  252. <a name="L193"></a><tt class="py-lineno">193</tt> <tt class="py-line"> <tt id="link-3" class="py-name" targets="Variable tagger.Reader.match_apostrophes=tagger.Reader-class.html#match_apostrophes"><a title="tagger.Reader.match_apostrophes" class="py-name" href="#" onclick="return doclink('link-3', 'match_apostrophes', 'link-3');">match_apostrophes</a></tt> <tt class="py-op">=</tt> <tt class="py-name">re</tt><tt class="py-op">.</tt><tt class="py-name">compile</tt><tt class="py-op">(</tt><tt class="py-string">'`|&#8217;'</tt><tt class="py-op">)</tt> </tt>
  253. <a name="L194"></a><tt class="py-lineno">194</tt> <tt class="py-line"> <tt id="link-4" class="py-name" targets="Variable tagger.Reader.match_paragraphs=tagger.Reader-class.html#match_paragraphs"><a title="tagger.Reader.match_paragraphs" class="py-name" href="#" onclick="return doclink('link-4', 'match_paragraphs', 'link-4');">match_paragraphs</a></tt> <tt class="py-op">=</tt> <tt class="py-name">re</tt><tt class="py-op">.</tt><tt class="py-name">compile</tt><tt class="py-op">(</tt><tt class="py-string">'[\.\?!\t\n\r\f\v]+'</tt><tt class="py-op">)</tt> </tt>
  254. <a name="L195"></a><tt class="py-lineno">195</tt> <tt class="py-line"> <tt id="link-5" class="py-name" targets="Variable tagger.Reader.match_phrases=tagger.Reader-class.html#match_phrases"><a title="tagger.Reader.match_phrases" class="py-name" href="#" onclick="return doclink('link-5', 'match_phrases', 'link-5');">match_phrases</a></tt> <tt class="py-op">=</tt> <tt class="py-name">re</tt><tt class="py-op">.</tt><tt class="py-name">compile</tt><tt class="py-op">(</tt><tt class="py-string">'[,;:\(\)\[\]\{\}&lt;&gt;]+'</tt><tt class="py-op">)</tt> </tt>
  255. <a name="L196"></a><tt class="py-lineno">196</tt> <tt class="py-line"> <tt id="link-6" class="py-name" targets="Variable tagger.Reader.match_words=tagger.Reader-class.html#match_words"><a title="tagger.Reader.match_words" class="py-name" href="#" onclick="return doclink('link-6', 'match_words', 'link-6');">match_words</a></tt> <tt class="py-op">=</tt> <tt class="py-name">re</tt><tt class="py-op">.</tt><tt class="py-name">compile</tt><tt class="py-op">(</tt><tt class="py-string">'[\w\-\'_/&amp;]+'</tt><tt class="py-op">)</tt> </tt>
  256. <a name="L197"></a><tt class="py-lineno">197</tt> <tt class="py-line"> </tt>
  257. <a name="Reader.__call__"></a><div id="Reader.__call__-def"><a name="L198"></a><tt class="py-lineno">198</tt> <a class="py-toggle" href="#" id="Reader.__call__-toggle" onclick="return toggle('Reader.__call__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.Reader-class.html#__call__">__call__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">text</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  258. </div><div id="Reader.__call__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="Reader.__call__-expanded"><a name="L199"></a><tt class="py-lineno">199</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  259. <a name="L200"></a><tt class="py-lineno">200</tt> <tt class="py-line"><tt class="py-docstring"> @param text: the string of text to be tagged</tt> </tt>
  260. <a name="L201"></a><tt class="py-lineno">201</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  261. <a name="L202"></a><tt class="py-lineno">202</tt> <tt class="py-line"><tt class="py-docstring"> @returns: a list of tags respecting the order in the text</tt> </tt>
  262. <a name="L203"></a><tt class="py-lineno">203</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  263. <a name="L204"></a><tt class="py-lineno">204</tt> <tt class="py-line"> </tt>
  264. <a name="L205"></a><tt class="py-lineno">205</tt> <tt class="py-line"> <tt class="py-name">text</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-7" class="py-name" targets="Method tagger.Reader.preprocess()=tagger.Reader-class.html#preprocess,Method tagger.Stemmer.preprocess()=tagger.Stemmer-class.html#preprocess"><a title="tagger.Reader.preprocess
  265. tagger.Stemmer.preprocess" class="py-name" href="#" onclick="return doclink('link-7', 'preprocess', 'link-7');">preprocess</a></tt><tt class="py-op">(</tt><tt class="py-name">text</tt><tt class="py-op">)</tt> </tt>
  266. <a name="L206"></a><tt class="py-lineno">206</tt> <tt class="py-line"> </tt>
  267. <a name="L207"></a><tt class="py-lineno">207</tt> <tt class="py-line"> <tt class="py-comment"># split by full stops, newlines, question marks...</tt> </tt>
  268. <a name="L208"></a><tt class="py-lineno">208</tt> <tt class="py-line"> <tt class="py-name">paragraphs</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-8" class="py-name"><a title="tagger.Reader.match_paragraphs" class="py-name" href="#" onclick="return doclink('link-8', 'match_paragraphs', 'link-4');">match_paragraphs</a></tt><tt class="py-op">.</tt><tt class="py-name">split</tt><tt class="py-op">(</tt><tt class="py-name">text</tt><tt class="py-op">)</tt> </tt>
  269. <a name="L209"></a><tt class="py-lineno">209</tt> <tt class="py-line"> </tt>
  270. <a name="L210"></a><tt class="py-lineno">210</tt> <tt class="py-line"> <tt class="py-name">tags</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-op">]</tt> </tt>
  271. <a name="L211"></a><tt class="py-lineno">211</tt> <tt class="py-line"> </tt>
  272. <a name="L212"></a><tt class="py-lineno">212</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">par</tt> <tt class="py-keyword">in</tt> <tt class="py-name">paragraphs</tt><tt class="py-op">:</tt> </tt>
  273. <a name="L213"></a><tt class="py-lineno">213</tt> <tt class="py-line"> <tt class="py-comment"># split by commas, colons, parentheses...</tt> </tt>
  274. <a name="L214"></a><tt class="py-lineno">214</tt> <tt class="py-line"> <tt class="py-name">phrases</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-9" class="py-name"><a title="tagger.Reader.match_phrases" class="py-name" href="#" onclick="return doclink('link-9', 'match_phrases', 'link-5');">match_phrases</a></tt><tt class="py-op">.</tt><tt class="py-name">split</tt><tt class="py-op">(</tt><tt class="py-name">par</tt><tt class="py-op">)</tt> </tt>
  275. <a name="L215"></a><tt class="py-lineno">215</tt> <tt class="py-line"> </tt>
  276. <a name="L216"></a><tt class="py-lineno">216</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">phrases</tt><tt class="py-op">)</tt> <tt class="py-op">&gt;</tt> <tt class="py-number">0</tt><tt class="py-op">:</tt> </tt>
  277. <a name="L217"></a><tt class="py-lineno">217</tt> <tt class="py-line"> <tt class="py-comment"># first phrase of a paragraph</tt> </tt>
  278. <a name="L218"></a><tt class="py-lineno">218</tt> <tt class="py-line"> <tt class="py-name">words</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-10" class="py-name"><a title="tagger.Reader.match_words" class="py-name" href="#" onclick="return doclink('link-10', 'match_words', 'link-6');">match_words</a></tt><tt class="py-op">.</tt><tt class="py-name">findall</tt><tt class="py-op">(</tt><tt class="py-name">phrases</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> </tt>
  279. <a name="L219"></a><tt class="py-lineno">219</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">words</tt><tt class="py-op">)</tt> <tt class="py-op">&gt;</tt> <tt class="py-number">1</tt><tt class="py-op">:</tt> </tt>
  280. <a name="L220"></a><tt class="py-lineno">220</tt> <tt class="py-line"> <tt class="py-name">tags</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt id="link-11" class="py-name"><a title="tagger.Tag" class="py-name" href="#" onclick="return doclink('link-11', 'Tag', 'link-0');">Tag</a></tt><tt class="py-op">(</tt><tt class="py-name">words</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">lower</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
  281. <a name="L221"></a><tt class="py-lineno">221</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt> <tt class="py-keyword">in</tt> <tt class="py-name">words</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">:</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">:</tt> </tt>
  282. <a name="L222"></a><tt class="py-lineno">222</tt> <tt class="py-line"> <tt class="py-name">tags</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt id="link-12" class="py-name"><a title="tagger.Tag" class="py-name" href="#" onclick="return doclink('link-12', 'Tag', 'link-0');">Tag</a></tt><tt class="py-op">(</tt><tt class="py-name">w</tt><tt class="py-op">.</tt><tt class="py-name">lower</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">proper</tt><tt class="py-op">=</tt><tt class="py-name">w</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">isupper</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
  283. <a name="L223"></a><tt class="py-lineno">223</tt> <tt class="py-line"> <tt class="py-name">tags</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt id="link-13" class="py-name"><a title="tagger.Tag" class="py-name" href="#" onclick="return doclink('link-13', 'Tag', 'link-0');">Tag</a></tt><tt class="py-op">(</tt><tt class="py-name">words</tt><tt class="py-op">[</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">lower</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> </tt>
  284. <a name="L224"></a><tt class="py-lineno">224</tt> <tt class="py-line"> <tt class="py-name">proper</tt><tt class="py-op">=</tt><tt class="py-name">words</tt><tt class="py-op">[</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">isupper</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> </tt>
  285. <a name="L225"></a><tt class="py-lineno">225</tt> <tt class="py-line"> <tt class="py-name">terminal</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
  286. <a name="L226"></a><tt class="py-lineno">226</tt> <tt class="py-line"> <tt class="py-keyword">elif</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">words</tt><tt class="py-op">)</tt> <tt class="py-op">==</tt> <tt class="py-number">1</tt><tt class="py-op">:</tt> </tt>
  287. <a name="L227"></a><tt class="py-lineno">227</tt> <tt class="py-line"> <tt class="py-name">tags</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt id="link-14" class="py-name"><a title="tagger.Tag" class="py-name" href="#" onclick="return doclink('link-14', 'Tag', 'link-0');">Tag</a></tt><tt class="py-op">(</tt><tt class="py-name">words</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">lower</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">terminal</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
  288. <a name="L228"></a><tt class="py-lineno">228</tt> <tt class="py-line"> </tt>
  289. <a name="L229"></a><tt class="py-lineno">229</tt> <tt class="py-line"> <tt class="py-comment"># following phrases</tt> </tt>
  290. <a name="L230"></a><tt class="py-lineno">230</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">phr</tt> <tt class="py-keyword">in</tt> <tt class="py-name">phrases</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">:</tt><tt class="py-op">]</tt><tt class="py-op">:</tt> </tt>
  291. <a name="L231"></a><tt class="py-lineno">231</tt> <tt class="py-line"> <tt class="py-name">words</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-15" class="py-name"><a title="tagger.Reader.match_words" class="py-name" href="#" onclick="return doclink('link-15', 'match_words', 'link-6');">match_words</a></tt><tt class="py-op">.</tt><tt class="py-name">findall</tt><tt class="py-op">(</tt><tt class="py-name">phr</tt><tt class="py-op">)</tt> </tt>
  292. <a name="L232"></a><tt class="py-lineno">232</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">words</tt><tt class="py-op">)</tt> <tt class="py-op">&gt;</tt> <tt class="py-number">1</tt><tt class="py-op">:</tt> </tt>
  293. <a name="L233"></a><tt class="py-lineno">233</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt> <tt class="py-keyword">in</tt> <tt class="py-name">words</tt><tt class="py-op">[</tt><tt class="py-op">:</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">:</tt> </tt>
  294. <a name="L234"></a><tt class="py-lineno">234</tt> <tt class="py-line"> <tt class="py-name">tags</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt id="link-16" class="py-name"><a title="tagger.Tag" class="py-name" href="#" onclick="return doclink('link-16', 'Tag', 'link-0');">Tag</a></tt><tt class="py-op">(</tt><tt class="py-name">w</tt><tt class="py-op">.</tt><tt class="py-name">lower</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">proper</tt><tt class="py-op">=</tt><tt class="py-name">w</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">isupper</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
  295. <a name="L235"></a><tt class="py-lineno">235</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">words</tt><tt class="py-op">)</tt> <tt class="py-op">&gt;</tt> <tt class="py-number">0</tt><tt class="py-op">:</tt> </tt>
  296. <a name="L236"></a><tt class="py-lineno">236</tt> <tt class="py-line"> <tt class="py-name">tags</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt id="link-17" class="py-name"><a title="tagger.Tag" class="py-name" href="#" onclick="return doclink('link-17', 'Tag', 'link-0');">Tag</a></tt><tt class="py-op">(</tt><tt class="py-name">words</tt><tt class="py-op">[</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">lower</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> </tt>
  297. <a name="L237"></a><tt class="py-lineno">237</tt> <tt class="py-line"> <tt class="py-name">proper</tt><tt class="py-op">=</tt><tt class="py-name">words</tt><tt class="py-op">[</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">isupper</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> </tt>
  298. <a name="L238"></a><tt class="py-lineno">238</tt> <tt class="py-line"> <tt class="py-name">terminal</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
  299. <a name="L239"></a><tt class="py-lineno">239</tt> <tt class="py-line"> </tt>
  300. <a name="L240"></a><tt class="py-lineno">240</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">tags</tt> </tt>
  301. </div><a name="L241"></a><tt class="py-lineno">241</tt> <tt class="py-line"> </tt>
  302. <a name="Reader.preprocess"></a><div id="Reader.preprocess-def"><a name="L242"></a><tt class="py-lineno">242</tt> <a class="py-toggle" href="#" id="Reader.preprocess-toggle" onclick="return toggle('Reader.preprocess');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.Reader-class.html#preprocess">preprocess</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">text</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  303. </div><div id="Reader.preprocess-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="Reader.preprocess-expanded"><a name="L243"></a><tt class="py-lineno">243</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  304. <a name="L244"></a><tt class="py-lineno">244</tt> <tt class="py-line"><tt class="py-docstring"> @param text: a string containing the text document to perform any</tt> </tt>
  305. <a name="L245"></a><tt class="py-lineno">245</tt> <tt class="py-line"><tt class="py-docstring"> required transformation before splitting</tt> </tt>
  306. <a name="L246"></a><tt class="py-lineno">246</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  307. <a name="L247"></a><tt class="py-lineno">247</tt> <tt class="py-line"><tt class="py-docstring"> @returns: the processed text</tt> </tt>
  308. <a name="L248"></a><tt class="py-lineno">248</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  309. <a name="L249"></a><tt class="py-lineno">249</tt> <tt class="py-line"> </tt>
  310. <a name="L250"></a><tt class="py-lineno">250</tt> <tt class="py-line"> <tt class="py-name">text</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-18" class="py-name"><a title="tagger.Reader.match_apostrophes" class="py-name" href="#" onclick="return doclink('link-18', 'match_apostrophes', 'link-3');">match_apostrophes</a></tt><tt class="py-op">.</tt><tt class="py-name">sub</tt><tt class="py-op">(</tt><tt class="py-string">'\''</tt><tt class="py-op">,</tt> <tt class="py-name">text</tt><tt class="py-op">)</tt> </tt>
  311. <a name="L251"></a><tt class="py-lineno">251</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">text</tt> </tt>
  312. </div></div><a name="L252"></a><tt class="py-lineno">252</tt> <tt class="py-line"> </tt>
  313. <a name="L253"></a><tt class="py-lineno">253</tt> <tt class="py-line"> </tt>
  314. <a name="Stemmer"></a><div id="Stemmer-def"><a name="L254"></a><tt class="py-lineno">254</tt> <a class="py-toggle" href="#" id="Stemmer-toggle" onclick="return toggle('Stemmer');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="tagger.Stemmer-class.html">Stemmer</a><tt class="py-op">:</tt> </tt>
  315. </div><div id="Stemmer-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="Stemmer-expanded"><a name="L255"></a><tt class="py-lineno">255</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  316. <a name="L256"></a><tt class="py-lineno">256</tt> <tt class="py-line"><tt class="py-docstring"> Class for extracting the stem of a word</tt> </tt>
  317. <a name="L257"></a><tt class="py-lineno">257</tt> <tt class="py-line"><tt class="py-docstring"> </tt> </tt>
  318. <a name="L258"></a><tt class="py-lineno">258</tt> <tt class="py-line"><tt class="py-docstring"> (by default it uses a simple open-source implementation of Porter's</tt> </tt>
  319. <a name="L259"></a><tt class="py-lineno">259</tt> <tt class="py-line"><tt class="py-docstring"> algorithm; this can be improved a lot, so experimenting with different ones</tt> </tt>
  320. <a name="L260"></a><tt class="py-lineno">260</tt> <tt class="py-line"><tt class="py-docstring"> is advisable; nltk.stem provides different algorithms for many languages)</tt> </tt>
  321. <a name="L261"></a><tt class="py-lineno">261</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  322. <a name="L262"></a><tt class="py-lineno">262</tt> <tt class="py-line"> </tt>
  323. <a name="L263"></a><tt class="py-lineno">263</tt> <tt class="py-line"> <tt id="link-19" class="py-name" targets="Variable tagger.Stemmer.match_contractions=tagger.Stemmer-class.html#match_contractions"><a title="tagger.Stemmer.match_contractions" class="py-name" href="#" onclick="return doclink('link-19', 'match_contractions', 'link-19');">match_contractions</a></tt> <tt class="py-op">=</tt> <tt class="py-name">re</tt><tt class="py-op">.</tt><tt class="py-name">compile</tt><tt class="py-op">(</tt><tt class="py-string">'(\w+)\'(m|re|d|ve|s|ll|t)?'</tt><tt class="py-op">)</tt> </tt>
  324. <a name="L264"></a><tt class="py-lineno">264</tt> <tt class="py-line"> </tt>
  325. <a name="Stemmer.__init__"></a><div id="Stemmer.__init__-def"><a name="L265"></a><tt class="py-lineno">265</tt> <a class="py-toggle" href="#" id="Stemmer.__init__-toggle" onclick="return toggle('Stemmer.__init__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.Stemmer-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">stemmer</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  326. </div><div id="Stemmer.__init__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="Stemmer.__init__-expanded"><a name="L266"></a><tt class="py-lineno">266</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  327. <a name="L267"></a><tt class="py-lineno">267</tt> <tt class="py-line"><tt class="py-docstring"> @param stemmer: an object or module with a 'stem' method (defaults to</tt> </tt>
  328. <a name="L268"></a><tt class="py-lineno">268</tt> <tt class="py-line"><tt class="py-docstring"> stemming.porter2)</tt> </tt>
  329. <a name="L269"></a><tt class="py-lineno">269</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  330. <a name="L270"></a><tt class="py-lineno">270</tt> <tt class="py-line"><tt class="py-docstring"> @returns: a new L{Stemmer} object</tt> </tt>
  331. <a name="L271"></a><tt class="py-lineno">271</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  332. <a name="L272"></a><tt class="py-lineno">272</tt> <tt class="py-line"> </tt>
  333. <a name="L273"></a><tt class="py-lineno">273</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">stemmer</tt><tt class="py-op">:</tt> </tt>
  334. <a name="L274"></a><tt class="py-lineno">274</tt> <tt class="py-line"> <tt class="py-keyword">from</tt> <tt class="py-name">stemming</tt> <tt class="py-keyword">import</tt> <tt class="py-name">porter2</tt> </tt>
  335. <a name="L275"></a><tt class="py-lineno">275</tt> <tt class="py-line"> <tt class="py-name">stemmer</tt> <tt class="py-op">=</tt> <tt class="py-name">porter2</tt> </tt>
  336. <a name="L276"></a><tt class="py-lineno">276</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">stemmer</tt> <tt class="py-op">=</tt> <tt class="py-name">stemmer</tt> </tt>
  337. </div><a name="L277"></a><tt class="py-lineno">277</tt> <tt class="py-line"> </tt>
  338. <a name="Stemmer.__call__"></a><div id="Stemmer.__call__-def"><a name="L278"></a><tt class="py-lineno">278</tt> <a class="py-toggle" href="#" id="Stemmer.__call__-toggle" onclick="return toggle('Stemmer.__call__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.Stemmer-class.html#__call__">__call__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">tag</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
  339. </div><div id="Stemmer.__call__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="Stemmer.__call__-expanded"><a name="L279"></a><tt class="py-lineno">279</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt>
  340. <a name="L280"></a><tt class="py-lineno">280</tt> <tt class="py-line"><tt class="py-docstring"> @param tag: the tag to be stemmed</tt> </tt>
  341. <a name="L281"></a><tt class="py-lineno">281</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
  342. <a name="L282"></a><tt class="py-lineno">282</tt> <tt class="py-line"><tt class="py-docstring"> @returns: the stemmed tag</tt> </tt>
  343. <a name="L283"></a><tt class="py-lineno">283</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt>
  344. <a name="L284"></a><tt class="py-lineno">284</tt> <tt class="py-line"> </tt>
  345. <a name="L285"></a><tt class="py-lineno">285</tt> <tt class="py-line"> <tt class="py-name