/doc/build_dict-module.html

http://github.com/apresta/tagger · HTML · 284 lines · 243 code · 19 blank · 22 comment · 0 complexity · 840942380625f2cbec724e73c3024a26 MD5 · raw file

  1. <?xml version="1.0" encoding="ascii"?>
  2. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  3. "DTD/xhtml1-transitional.dtd">
  4. <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  5. <head>
  6. <title>build_dict</title>
  7. <link rel="stylesheet" href="epydoc.css" type="text/css" />
  8. <script type="text/javascript" src="epydoc.js"></script>
  9. </head>
  10. <body bgcolor="white" text="black" link="blue" vlink="#204080"
  11. alink="#204080">
  12. <!-- ==================== NAVIGATION BAR ==================== -->
  13. <table class="navbar" border="0" width="100%" cellpadding="0"
  14. bgcolor="#a0c0ff" cellspacing="0">
  15. <tr valign="middle">
  16. <!-- Tree link -->
  17. <th>&nbsp;&nbsp;&nbsp;<a
  18. href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
  19. <!-- Index link -->
  20. <th>&nbsp;&nbsp;&nbsp;<a
  21. href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
  22. <!-- Help link -->
  23. <th>&nbsp;&nbsp;&nbsp;<a
  24. href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
  25. <!-- Project homepage -->
  26. <th class="navbar" align="right" width="100%">
  27. <table border="0" cellpadding="0" cellspacing="0">
  28. <tr><th class="navbar" align="center"
  29. ><a class="navbar" target="_top" href="http://github.com/apresta/tagger">tagger</a></th>
  30. </tr></table></th>
  31. </tr>
  32. </table>
  33. <table width="100%" cellpadding="0" cellspacing="0">
  34. <tr valign="top">
  35. <td width="100%">
  36. <span class="breadcrumbs">
  37. Module&nbsp;build_dict
  38. </span>
  39. </td>
  40. <td>
  41. <table cellpadding="0" cellspacing="0">
  42. <!-- hide/show private -->
  43. <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
  44. onclick="toggle_private();">hide&nbsp;private</a>]</span></td></tr>
  45. <tr><td align="right"><span class="options"
  46. >[<a href="frames.html" target="_top">frames</a
  47. >]&nbsp;|&nbsp;[<a href="build_dict-module.html"
  48. target="_top">no&nbsp;frames</a>]</span></td></tr>
  49. </table>
  50. </td>
  51. </tr>
  52. </table>
  53. <!-- ==================== MODULE DESCRIPTION ==================== -->
  54. <h1 class="epydoc">Module build_dict</h1><p class="nomargin-top"><span class="codelink"><a href="build_dict-pysrc.html">source&nbsp;code</a></span></p>
  55. <p>Usage: build_dict.py -o &lt;output file&gt; -s &lt;stopwords file&gt;
  56. &lt;list of files&gt;</p>
  57. <!-- ==================== FUNCTIONS ==================== -->
  58. <a name="section-Functions"></a>
  59. <table class="summary" border="1" cellpadding="3"
  60. cellspacing="0" width="100%" bgcolor="white">
  61. <tr bgcolor="#70b0f0" class="table-header">
  62. <td colspan="2" class="table-header">
  63. <table border="0" cellpadding="0" cellspacing="0" width="100%">
  64. <tr valign="top">
  65. <td align="left"><span class="table-header">Functions</span></td>
  66. <td align="right" valign="top"
  67. ><span class="options">[<a href="#section-Functions"
  68. class="privatelink" onclick="toggle_private();"
  69. >hide private</a>]</span></td>
  70. </tr>
  71. </table>
  72. </td>
  73. </tr>
  74. <tr>
  75. <td width="15%" align="right" valign="top" class="summary">
  76. <span class="summary-type">&nbsp;</span>
  77. </td><td class="summary">
  78. <table width="100%" cellpadding="0" cellspacing="0" border="0">
  79. <tr>
  80. <td><span class="summary-sig"><a href="build_dict-module.html#build_dict" class="summary-sig-name">build_dict</a>(<span class="summary-sig-arg">corpus</span>,
  81. <span class="summary-sig-arg">stopwords</span>=<span class="summary-sig-default">None</span>,
  82. <span class="summary-sig-arg">measure</span>=<span class="summary-sig-default"><code class="variable-quote">'</code><code class="variable-string">IDF</code><code class="variable-quote">'</code></span>)</span><br />
  83. Returns:
  84. a dictionary of weights in the interval [0,1]</td>
  85. <td align="right" valign="top">
  86. <span class="codelink"><a href="build_dict-pysrc.html#build_dict">source&nbsp;code</a></span>
  87. </td>
  88. </tr>
  89. </table>
  90. </td>
  91. </tr>
  92. <tr>
  93. <td width="15%" align="right" valign="top" class="summary">
  94. <span class="summary-type">&nbsp;</span>
  95. </td><td class="summary">
  96. <table width="100%" cellpadding="0" cellspacing="0" border="0">
  97. <tr>
  98. <td><span class="summary-sig"><a href="build_dict-module.html#build_dict_from_files" class="summary-sig-name">build_dict_from_files</a>(<span class="summary-sig-arg">output_file</span>,
  99. <span class="summary-sig-arg">corpus_files</span>,
  100. <span class="summary-sig-arg">stopwords_file</span>=<span class="summary-sig-default">None</span>,
  101. <span class="summary-sig-arg">reader</span>=<span class="summary-sig-default">SimpleReader()</span>,
  102. <span class="summary-sig-arg">stemmer</span>=<span class="summary-sig-default">Stemmer()</span>,
  103. <span class="summary-sig-arg">measure</span>=<span class="summary-sig-default"><code class="variable-quote">'</code><code class="variable-string">IDF</code><code class="variable-quote">'</code></span>,
  104. <span class="summary-sig-arg">verbose</span>=<span class="summary-sig-default">False</span>)</span></td>
  105. <td align="right" valign="top">
  106. <span class="codelink"><a href="build_dict-pysrc.html#build_dict_from_files">source&nbsp;code</a></span>
  107. </td>
  108. </tr>
  109. </table>
  110. </td>
  111. </tr>
  112. </table>
  113. <!-- ==================== VARIABLES ==================== -->
  114. <a name="section-Variables"></a>
  115. <table class="summary" border="1" cellpadding="3"
  116. cellspacing="0" width="100%" bgcolor="white">
  117. <tr bgcolor="#70b0f0" class="table-header">
  118. <td colspan="2" class="table-header">
  119. <table border="0" cellpadding="0" cellspacing="0" width="100%">
  120. <tr valign="top">
  121. <td align="left"><span class="table-header">Variables</span></td>
  122. <td align="right" valign="top"
  123. ><span class="options">[<a href="#section-Variables"
  124. class="privatelink" onclick="toggle_private();"
  125. >hide private</a>]</span></td>
  126. </tr>
  127. </table>
  128. </td>
  129. </tr>
  130. <tr>
  131. <td width="15%" align="right" valign="top" class="summary">
  132. <span class="summary-type">&nbsp;</span>
  133. </td><td class="summary">
  134. <a name="__package__"></a><span class="summary-name">__package__</span> = <code title="None">None</code>
  135. </td>
  136. </tr>
  137. </table>
  138. <!-- ==================== FUNCTION DETAILS ==================== -->
  139. <a name="section-FunctionDetails"></a>
  140. <table class="details" border="1" cellpadding="3"
  141. cellspacing="0" width="100%" bgcolor="white">
  142. <tr bgcolor="#70b0f0" class="table-header">
  143. <td colspan="2" class="table-header">
  144. <table border="0" cellpadding="0" cellspacing="0" width="100%">
  145. <tr valign="top">
  146. <td align="left"><span class="table-header">Function Details</span></td>
  147. <td align="right" valign="top"
  148. ><span class="options">[<a href="#section-FunctionDetails"
  149. class="privatelink" onclick="toggle_private();"
  150. >hide private</a>]</span></td>
  151. </tr>
  152. </table>
  153. </td>
  154. </tr>
  155. </table>
  156. <a name="build_dict"></a>
  157. <div>
  158. <table class="details" border="1" cellpadding="3"
  159. cellspacing="0" width="100%" bgcolor="white">
  160. <tr><td>
  161. <table width="100%" cellpadding="0" cellspacing="0" border="0">
  162. <tr valign="top"><td>
  163. <h3 class="epydoc"><span class="sig"><span class="sig-name">build_dict</span>(<span class="sig-arg">corpus</span>,
  164. <span class="sig-arg">stopwords</span>=<span class="sig-default">None</span>,
  165. <span class="sig-arg">measure</span>=<span class="sig-default"><code class="variable-quote">'</code><code class="variable-string">IDF</code><code class="variable-quote">'</code></span>)</span>
  166. </h3>
  167. </td><td align="right" valign="top"
  168. ><span class="codelink"><a href="build_dict-pysrc.html#build_dict">source&nbsp;code</a></span>&nbsp;
  169. </td>
  170. </tr></table>
  171. <dl class="fields">
  172. <dt>Parameters:</dt>
  173. <dd><ul class="nomargin-top">
  174. <li><strong class="pname"><code>corpus</code></strong> - a list of documents, represented as lists of (stemmed) words</li>
  175. <li><strong class="pname"><code>stopwords</code></strong> - the list of (stemmed) words that should have zero weight</li>
  176. <li><strong class="pname"><code>measure</code></strong> - the measure used to compute the weights ('IDF' i.e. 'inverse
  177. document frequency' or 'ICF' i.e. 'inverse collection frequency';
  178. defaults to 'IDF')</li>
  179. </ul></dd>
  180. <dt>Returns:</dt>
  181. <dd>a dictionary of weights in the interval [0,1]</dd>
  182. </dl>
  183. </td></tr></table>
  184. </div>
  185. <a name="build_dict_from_files"></a>
  186. <div>
  187. <table class="details" border="1" cellpadding="3"
  188. cellspacing="0" width="100%" bgcolor="white">
  189. <tr><td>
  190. <table width="100%" cellpadding="0" cellspacing="0" border="0">
  191. <tr valign="top"><td>
  192. <h3 class="epydoc"><span class="sig"><span class="sig-name">build_dict_from_files</span>(<span class="sig-arg">output_file</span>,
  193. <span class="sig-arg">corpus_files</span>,
  194. <span class="sig-arg">stopwords_file</span>=<span class="sig-default">None</span>,
  195. <span class="sig-arg">reader</span>=<span class="sig-default">SimpleReader()</span>,
  196. <span class="sig-arg">stemmer</span>=<span class="sig-default">Stemmer()</span>,
  197. <span class="sig-arg">measure</span>=<span class="sig-default"><code class="variable-quote">'</code><code class="variable-string">IDF</code><code class="variable-quote">'</code></span>,
  198. <span class="sig-arg">verbose</span>=<span class="sig-default">False</span>)</span>
  199. </h3>
  200. </td><td align="right" valign="top"
  201. ><span class="codelink"><a href="build_dict-pysrc.html#build_dict_from_files">source&nbsp;code</a></span>&nbsp;
  202. </td>
  203. </tr></table>
  204. <dl class="fields">
  205. <dt>Parameters:</dt>
  206. <dd><ul class="nomargin-top">
  207. <li><strong class="pname"><code>output_file</code></strong> - the binary stream where the dictionary should be saved</li>
  208. <li><strong class="pname"><code>corpus_files</code></strong> - a list of streams with words to process</li>
  209. <li><strong class="pname"><code>stopwords_file</code></strong> - a stream containing a list of stopwords</li>
  210. <li><strong class="pname"><code>reader</code></strong> - the <a href="tagger.Reader-class.html" class="link">Reader</a>
  211. object to be used</li>
  212. <li><strong class="pname"><code>stemmer</code></strong> - the <a href="tagger.Stemmer-class.html" class="link">Stemmer</a>
  213. object to be used</li>
  214. <li><strong class="pname"><code>measure</code></strong> - the measure used to compute the weights ('IDF' i.e. 'inverse
  215. document frequency' or 'ICF' i.e. 'inverse collection frequency';
  216. defaults to 'IDF')</li>
  217. <li><strong class="pname"><code>verbose</code></strong> - whether information on the progress should be printed on screen</li>
  218. </ul></dd>
  219. </dl>
  220. </td></tr></table>
  221. </div>
  222. <br />
  223. <!-- ==================== NAVIGATION BAR ==================== -->
  224. <table class="navbar" border="0" width="100%" cellpadding="0"
  225. bgcolor="#a0c0ff" cellspacing="0">
  226. <tr valign="middle">
  227. <!-- Tree link -->
  228. <th>&nbsp;&nbsp;&nbsp;<a
  229. href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
  230. <!-- Index link -->
  231. <th>&nbsp;&nbsp;&nbsp;<a
  232. href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
  233. <!-- Help link -->
  234. <th>&nbsp;&nbsp;&nbsp;<a
  235. href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
  236. <!-- Project homepage -->
  237. <th class="navbar" align="right" width="100%">
  238. <table border="0" cellpadding="0" cellspacing="0">
  239. <tr><th class="navbar" align="center"
  240. ><a class="navbar" target="_top" href="http://github.com/apresta/tagger">tagger</a></th>
  241. </tr></table></th>
  242. </tr>
  243. </table>
  244. <table border="0" cellpadding="0" cellspacing="0" width="100%">
  245. <tr>
  246. <td align="left" class="footer">
  247. Generated by Epydoc 3.0.1 on Fri May 13 11:13:02 2011
  248. </td>
  249. <td align="right" class="footer">
  250. <a target="mainFrame" href="http://epydoc.sourceforge.net"
  251. >http://epydoc.sourceforge.net</a>
  252. </td>
  253. </tr>
  254. </table>
  255. <script type="text/javascript">
  256. <!--
  257. // Private objects are initially displayed (because if
  258. // javascript is turned off then we want them to be
  259. // visible); but by default, we want to hide them. So hide
  260. // them unless we have a cookie that says to show them.
  261. checkCookie();
  262. // -->
  263. </script>
  264. </body>
  265. </html>