/doc/tagger.build_dict-pysrc.html
HTML | 250 lines | 219 code | 10 blank | 21 comment | 0 complexity | a85fec16986c0c11d83ce7676af49e6d MD5 | raw file
1<?xml version="1.0" encoding="ascii"?> 2<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 3 "DTD/xhtml1-transitional.dtd"> 4<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> 5<head> 6 <title>tagger.build_dict</title> 7 <link rel="stylesheet" href="epydoc.css" type="text/css" /> 8 <script type="text/javascript" src="epydoc.js"></script> 9</head> 10 11<body bgcolor="white" text="black" link="blue" vlink="#204080" 12 alink="#204080"> 13<!-- ==================== NAVIGATION BAR ==================== --> 14<table class="navbar" border="0" width="100%" cellpadding="0" 15 bgcolor="#a0c0ff" cellspacing="0"> 16 <tr valign="middle"> 17 18 <!-- Tree link --> 19 <th> <a 20 href="module-tree.html">Trees</a> </th> 21 22 <!-- Index link --> 23 <th> <a 24 href="identifier-index.html">Indices</a> </th> 25 26 <!-- Help link --> 27 <th> <a 28 href="help.html">Help</a> </th> 29 30 <!-- Project homepage --> 31 <th class="navbar" align="right" width="100%"> 32 <table border="0" cellpadding="0" cellspacing="0"> 33 <tr><th class="navbar" align="center" 34 ><a class="navbar" target="_top" href="http://github.com/apresta/tagger">tagger</a></th> 35 </tr></table></th> 36 </tr> 37</table> 38<table width="100%" cellpadding="0" cellspacing="0"> 39 <tr valign="top"> 40 <td width="100%"> 41 <span class="breadcrumbs"> 42 Package tagger :: 43 Module build_dict 44 </span> 45 </td> 46 <td> 47 <table cellpadding="0" cellspacing="0"> 48 <!-- hide/show private --> 49 <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink" 50 onclick="toggle_private();">hide private</a>]</span></td></tr> 51 <tr><td align="right"><span class="options" 52 >[<a href="frames.html" target="_top">frames</a 53 > | <a href="tagger.build_dict-pysrc.html" 54 target="_top">no frames</a>]</span></td></tr> 55 </table> 56 </td> 57 </tr> 58</table> 59<h1 class="epydoc">Source Code for <a href="tagger.build_dict-module.html">Module tagger.build_dict</a></h1> 60<pre 
class="py-src"> 61<a name="L1"></a><tt class="py-lineno"> 1</tt> <tt class="py-line"><tt class="py-comment">#!/usr/bin/env python</tt> </tt> 62<a name="L2"></a><tt class="py-lineno"> 2</tt> <tt class="py-line"> </tt> 63<a name="L3"></a><tt class="py-lineno"> 3</tt> <tt class="py-line"><tt class="py-comment"># Copyright (C) 2011 by Alessandro Presta</tt> </tt> 64<a name="L4"></a><tt class="py-lineno"> 4</tt> <tt class="py-line"> </tt> 65<a name="L5"></a><tt class="py-lineno"> 5</tt> <tt class="py-line"><tt class="py-comment"># Permission is hereby granted, free of charge, to any person obtaining a copy</tt> </tt> 66<a name="L6"></a><tt class="py-lineno"> 6</tt> <tt class="py-line"><tt class="py-comment"># of this software and associated documentation files (the "Software"), to deal</tt> </tt> 67<a name="L7"></a><tt class="py-lineno"> 7</tt> <tt class="py-line"><tt class="py-comment"># in the Software without restriction, including without limitation the rights</tt> </tt> 68<a name="L8"></a><tt class="py-lineno"> 8</tt> <tt class="py-line"><tt class="py-comment"># to use, copy, modify, merge, publish, distribute, sublicense, and/or sell</tt> </tt> 69<a name="L9"></a><tt class="py-lineno"> 9</tt> <tt class="py-line"><tt class="py-comment"># copies of the Software, and to permit persons to whom the Software is</tt> </tt> 70<a name="L10"></a><tt class="py-lineno"> 10</tt> <tt class="py-line"><tt class="py-comment"># furnished to do so, subject to the following conditions:</tt> </tt> 71<a name="L11"></a><tt class="py-lineno"> 11</tt> <tt class="py-line"> </tt> 72<a name="L12"></a><tt class="py-lineno"> 12</tt> <tt class="py-line"><tt class="py-comment"># The above copyright notice and this permission notice shall be included in</tt> </tt> 73<a name="L13"></a><tt class="py-lineno"> 13</tt> <tt class="py-line"><tt class="py-comment"># all copies or substantial portions of the Software.</tt> </tt> 74<a name="L14"></a><tt class="py-lineno"> 14</tt> <tt class="py-line"> </tt> 
75<a name="L15"></a><tt class="py-lineno"> 15</tt> <tt class="py-line"><tt class="py-comment"># THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR</tt> </tt> 76<a name="L16"></a><tt class="py-lineno"> 16</tt> <tt class="py-line"><tt class="py-comment"># IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,</tt> </tt> 77<a name="L17"></a><tt class="py-lineno"> 17</tt> <tt class="py-line"><tt class="py-comment"># FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE</tt> </tt> 78<a name="L18"></a><tt class="py-lineno"> 18</tt> <tt class="py-line"><tt class="py-comment"># AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER</tt> </tt> 79<a name="L19"></a><tt class="py-lineno"> 19</tt> <tt class="py-line"><tt class="py-comment"># LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,</tt> </tt> 80<a name="L20"></a><tt class="py-lineno"> 20</tt> <tt class="py-line"><tt class="py-comment"># OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN</tt> </tt> 81<a name="L21"></a><tt class="py-lineno"> 21</tt> <tt class="py-line"><tt class="py-comment"># THE SOFTWARE</tt> </tt> 82<a name="L22"></a><tt class="py-lineno"> 22</tt> <tt class="py-line"> </tt> 83<a name="L23"></a><tt class="py-lineno"> 23</tt> <tt class="py-line"> </tt> 84<a name="L24"></a><tt class="py-lineno"> 24</tt> <tt class="py-line"><tt class="py-docstring">'''</tt> </tt> 85<a name="L25"></a><tt class="py-lineno"> 25</tt> <tt class="py-line"><tt class="py-docstring">Usage: build_dict.py -o &lt;output file&gt; -s &lt;stopwords file&gt; &lt;list of files&gt;</tt> </tt> 86<a name="L26"></a><tt class="py-lineno"> 26</tt> <tt class="py-line"><tt class="py-docstring">'''</tt> </tt> 87<a name="L27"></a><tt class="py-lineno"> 27</tt> <tt class="py-line"> </tt> 88<a name="L28"></a><tt class="py-lineno"> 28</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">__future__</tt> <tt 
class="py-keyword">import</tt> <tt class="py-name">division</tt> </tt> 89<a name="L29"></a><tt class="py-lineno"> 29</tt> <tt class="py-line"> </tt> 90<a name="L30"></a><tt class="py-lineno"> 30</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-0" class="py-name" targets="Module tagger.tagger=tagger.tagger-module.html"><a title="tagger.tagger" class="py-name" href="#" onclick="return doclink('link-0', 'tagger', 'link-0');">tagger</a></tt> <tt class="py-keyword">import</tt> <tt id="link-1" class="py-name" targets="Class tagger.tagger.Stemmer=tagger.tagger.Stemmer-class.html"><a title="tagger.tagger.Stemmer" class="py-name" href="#" onclick="return doclink('link-1', 'Stemmer', 'link-1');">Stemmer</a></tt> </tt> 91<a name="L31"></a><tt class="py-lineno"> 31</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-2" class="py-name" targets="Module tagger.extras=tagger.extras-module.html"><a title="tagger.extras" class="py-name" href="#" onclick="return doclink('link-2', 'extras', 'link-2');">extras</a></tt> <tt class="py-keyword">import</tt> <tt id="link-3" class="py-name" targets="Class tagger.extras.SimpleReader=tagger.extras.SimpleReader-class.html"><a title="tagger.extras.SimpleReader" class="py-name" href="#" onclick="return doclink('link-3', 'SimpleReader', 'link-3');">SimpleReader</a></tt> </tt> 92<a name="L32"></a><tt class="py-lineno"> 32</tt> <tt class="py-line"> </tt> 93<a name="build_dict"></a><div id="build_dict-def"><a name="L33"></a><tt class="py-lineno"> 33</tt> <a class="py-toggle" href="#" id="build_dict-toggle" onclick="return toggle('build_dict');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.build_dict-module.html#build_dict">build_dict</a><tt class="py-op">(</tt><tt class="py-param">corpus</tt><tt class="py-op">,</tt> <tt class="py-param">stopwords</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">,</tt> <tt class="py-param">measure</tt><tt 
class="py-op">=</tt><tt class="py-string">'IDF'</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt> 94</div><div id="build_dict-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="build_dict-expanded"><a name="L34"></a><tt class="py-lineno"> 34</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt> 95<a name="L35"></a><tt class="py-lineno"> 35</tt> <tt class="py-line"><tt class="py-docstring"> @param corpus: a list of documents, represented as lists of (stemmed)</tt> </tt> 96<a name="L36"></a><tt class="py-lineno"> 36</tt> <tt class="py-line"><tt class="py-docstring"> words</tt> </tt> 97<a name="L37"></a><tt class="py-lineno"> 37</tt> <tt class="py-line"><tt class="py-docstring"> @param stopwords: the list of (stemmed) words that should have zero weight</tt> </tt> 98<a name="L38"></a><tt class="py-lineno"> 38</tt> <tt class="py-line"><tt class="py-docstring"> @param measure: the measure used to compute the weights ('IDF'</tt> </tt> 99<a name="L39"></a><tt class="py-lineno"> 39</tt> <tt class="py-line"><tt class="py-docstring"> i.e. 
'inverse document frequency' or 'ICF' i.e.</tt> </tt> 100<a name="L40"></a><tt class="py-lineno"> 40</tt> <tt class="py-line"><tt class="py-docstring"> 'inverse collection frequency'; defaults to 'IDF')</tt> </tt> 101<a name="L41"></a><tt class="py-lineno"> 41</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt> 102<a name="L42"></a><tt class="py-lineno"> 42</tt> <tt class="py-line"><tt class="py-docstring"> @returns: a dictionary of weights in the interval [0,1]</tt> </tt> 103<a name="L43"></a><tt class="py-lineno"> 43</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt> 104<a name="L44"></a><tt class="py-lineno"> 44</tt> <tt class="py-line"> </tt> 105<a name="L45"></a><tt class="py-lineno"> 45</tt> <tt class="py-line"> <tt class="py-keyword">import</tt> <tt class="py-name">collections</tt> </tt> 106<a name="L46"></a><tt class="py-lineno"> 46</tt> <tt class="py-line"> <tt class="py-keyword">import</tt> <tt class="py-name">math</tt> </tt> 107<a name="L47"></a><tt class="py-lineno"> 47</tt> <tt class="py-line"> </tt> 108<a name="L48"></a><tt class="py-lineno"> 48</tt> <tt class="py-line"> <tt class="py-name">dictionary</tt> <tt class="py-op">=</tt> <tt class="py-op">{</tt><tt class="py-op">}</tt> </tt> 109<a name="L49"></a><tt class="py-lineno"> 49</tt> <tt class="py-line"> </tt> 110<a name="L50"></a><tt class="py-lineno"> 50</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">measure</tt> <tt class="py-op">==</tt> <tt class="py-string">'ICF'</tt><tt class="py-op">:</tt> </tt> 111<a name="L51"></a><tt class="py-lineno"> 51</tt> <tt class="py-line"> <tt class="py-name">words</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-name">w</tt> <tt class="py-keyword">for</tt> <tt class="py-name">doc</tt> <tt class="py-keyword">in</tt> <tt class="py-name">corpus</tt> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt> <tt class="py-keyword">in</tt> <tt class="py-name">doc</tt><tt class="py-op">]</tt> 
</tt> 112<a name="L52"></a><tt class="py-lineno"> 52</tt> <tt class="py-line"> </tt> 113<a name="L53"></a><tt class="py-lineno"> 53</tt> <tt class="py-line"> <tt class="py-name">term_count</tt> <tt class="py-op">=</tt> <tt class="py-name">collections</tt><tt class="py-op">.</tt><tt class="py-name">Counter</tt><tt class="py-op">(</tt><tt class="py-name">words</tt><tt class="py-op">)</tt> </tt> 114<a name="L54"></a><tt class="py-lineno"> 54</tt> <tt class="py-line"> <tt class="py-name">total_count</tt> <tt class="py-op">=</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">words</tt><tt class="py-op">)</tt> </tt> 115<a name="L55"></a><tt class="py-lineno"> 55</tt> <tt class="py-line"> <tt class="py-name">scale</tt> <tt class="py-op">=</tt> <tt class="py-name">math</tt><tt class="py-op">.</tt><tt class="py-name">log</tt><tt class="py-op">(</tt><tt class="py-name">total_count</tt><tt class="py-op">)</tt> </tt> 116<a name="L56"></a><tt class="py-lineno"> 56</tt> <tt class="py-line"> </tt> 117<a name="L57"></a><tt class="py-lineno"> 57</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt><tt class="py-op">,</tt> <tt class="py-name">cnt</tt> <tt class="py-keyword">in</tt> <tt class="py-name">term_count</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt> 118<a name="L58"></a><tt class="py-lineno"> 58</tt> <tt class="py-line"> <tt class="py-name">dictionary</tt><tt class="py-op">[</tt><tt class="py-name">w</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">math</tt><tt class="py-op">.</tt><tt class="py-name">log</tt><tt class="py-op">(</tt><tt class="py-name">total_count</tt> <tt class="py-op">/</tt> <tt class="py-op">(</tt><tt class="py-name">cnt</tt> <tt class="py-op">+</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> <tt class="py-op">/</tt> <tt class="py-name">scale</tt> 
</tt> 119<a name="L59"></a><tt class="py-lineno"> 59</tt> <tt class="py-line"> </tt> 120<a name="L60"></a><tt class="py-lineno"> 60</tt> <tt class="py-line"> <tt class="py-keyword">elif</tt> <tt class="py-name">measure</tt> <tt class="py-op">==</tt> <tt class="py-string">'IDF'</tt><tt class="py-op">:</tt> </tt> 121<a name="L61"></a><tt class="py-lineno"> 61</tt> <tt class="py-line"> <tt class="py-name">corpus_size</tt> <tt class="py-op">=</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">corpus</tt><tt class="py-op">)</tt> </tt> 122<a name="L62"></a><tt class="py-lineno"> 62</tt> <tt class="py-line"> <tt class="py-name">scale</tt> <tt class="py-op">=</tt> <tt class="py-name">math</tt><tt class="py-op">.</tt><tt class="py-name">log</tt><tt class="py-op">(</tt><tt class="py-name">corpus_size</tt><tt class="py-op">)</tt> </tt> 123<a name="L63"></a><tt class="py-lineno"> 63</tt> <tt class="py-line"> </tt> 124<a name="L64"></a><tt class="py-lineno"> 64</tt> <tt class="py-line"> <tt class="py-name">term_count</tt> <tt class="py-op">=</tt> <tt class="py-name">collections</tt><tt class="py-op">.</tt><tt class="py-name">defaultdict</tt><tt class="py-op">(</tt><tt class="py-name">int</tt><tt class="py-op">)</tt> </tt> 125<a name="L65"></a><tt class="py-lineno"> 65</tt> <tt class="py-line"> </tt> 126<a name="L66"></a><tt class="py-lineno"> 66</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">doc</tt> <tt class="py-keyword">in</tt> <tt class="py-name">corpus</tt><tt class="py-op">:</tt> </tt> 127<a name="L67"></a><tt class="py-lineno"> 67</tt> <tt class="py-line"> <tt class="py-name">words</tt> <tt class="py-op">=</tt> <tt class="py-name">set</tt><tt class="py-op">(</tt><tt class="py-name">doc</tt><tt class="py-op">)</tt> </tt> 128<a name="L68"></a><tt class="py-lineno"> 68</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt> <tt class="py-keyword">in</tt> <tt class="py-name">words</tt><tt 
class="py-op">:</tt> </tt> 129<a name="L69"></a><tt class="py-lineno"> 69</tt> <tt class="py-line"> <tt class="py-name">term_count</tt><tt class="py-op">[</tt><tt class="py-name">w</tt><tt class="py-op">]</tt> <tt class="py-op">+=</tt> <tt class="py-number">1</tt> </tt> 130<a name="L70"></a><tt class="py-lineno"> 70</tt> <tt class="py-line"> </tt> 131<a name="L71"></a><tt class="py-lineno"> 71</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt><tt class="py-op">,</tt> <tt class="py-name">cnt</tt> <tt class="py-keyword">in</tt> <tt class="py-name">term_count</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt> 132<a name="L72"></a><tt class="py-lineno"> 72</tt> <tt class="py-line"> <tt class="py-name">dictionary</tt><tt class="py-op">[</tt><tt class="py-name">w</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">math</tt><tt class="py-op">.</tt><tt class="py-name">log</tt><tt class="py-op">(</tt><tt class="py-name">corpus_size</tt> <tt class="py-op">/</tt> <tt class="py-op">(</tt><tt class="py-name">cnt</tt> <tt class="py-op">+</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> <tt class="py-op">/</tt> <tt class="py-name">scale</tt> </tt> 133<a name="L73"></a><tt class="py-lineno"> 73</tt> <tt class="py-line"> </tt> 134<a name="L74"></a><tt class="py-lineno"> 74</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">stopwords</tt><tt class="py-op">:</tt> </tt> 135<a name="L75"></a><tt class="py-lineno"> 75</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt> <tt class="py-keyword">in</tt> <tt class="py-name">stopwords</tt><tt class="py-op">:</tt> </tt> 136<a name="L76"></a><tt class="py-lineno"> 76</tt> <tt class="py-line"> <tt class="py-name">dictionary</tt><tt class="py-op">[</tt><tt class="py-name">w</tt><tt class="py-op">]</tt> <tt 
class="py-op">=</tt> <tt class="py-number">0.0</tt> </tt> 137<a name="L77"></a><tt class="py-lineno"> 77</tt> <tt class="py-line"> </tt> 138<a name="L78"></a><tt class="py-lineno"> 78</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">dictionary</tt> </tt> 139</div><a name="L79"></a><tt class="py-lineno"> 79</tt> <tt class="py-line"> </tt> 140<a name="L80"></a><tt class="py-lineno"> 80</tt> <tt class="py-line"> </tt> 141<a name="build_dict_from_files"></a><div id="build_dict_from_files-def"><a name="L81"></a><tt class="py-lineno"> 81</tt> <a class="py-toggle" href="#" id="build_dict_from_files-toggle" onclick="return toggle('build_dict_from_files');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="tagger.build_dict-module.html#build_dict_from_files">build_dict_from_files</a><tt class="py-op">(</tt><tt class="py-param">output_file</tt><tt class="py-op">,</tt> <tt class="py-param">corpus_files</tt><tt class="py-op">,</tt> <tt class="py-param">stopwords_file</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">,</tt> </tt> 142<a name="L82"></a><tt class="py-lineno"> 82</tt> <tt class="py-line"> <tt class="py-param">reader</tt><tt class="py-op">=</tt><tt id="link-4" class="py-name"><a title="tagger.extras.SimpleReader" class="py-name" href="#" onclick="return doclink('link-4', 'SimpleReader', 'link-3');">SimpleReader</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-param">stemmer</tt><tt class="py-op">=</tt><tt id="link-5" class="py-name"><a title="tagger.tagger.Stemmer" class="py-name" href="#" onclick="return doclink('link-5', 'Stemmer', 'link-1');">Stemmer</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> </tt> 143<a name="L83"></a><tt class="py-lineno"> 83</tt> <tt class="py-line"> <tt class="py-param">measure</tt><tt class="py-op">=</tt><tt class="py-string">'IDF'</tt><tt class="py-op">,</tt> <tt 
class="py-param">verbose</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt> 144</div><div id="build_dict_from_files-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="build_dict_from_files-expanded"><a name="L84"></a><tt class="py-lineno"> 84</tt> <tt class="py-line"> <tt class="py-docstring">'''</tt> </tt> 145<a name="L85"></a><tt class="py-lineno"> 85</tt> <tt class="py-line"><tt class="py-docstring"> @param output_file: the name of the file where the dictionary should be</tt> </tt> 146<a name="L86"></a><tt class="py-lineno"> 86</tt> <tt class="py-line"><tt class="py-docstring"> saved</tt> </tt> 147<a name="L87"></a><tt class="py-lineno"> 87</tt> <tt class="py-line"><tt class="py-docstring"> @param corpus_files: a list of files with words to process</tt> </tt> 148<a name="L88"></a><tt class="py-lineno"> 88</tt> <tt class="py-line"><tt class="py-docstring"> @param stopwords_file: a file containing a list of stopwords</tt> </tt> 149<a name="L89"></a><tt class="py-lineno"> 89</tt> <tt class="py-line"><tt class="py-docstring"> @param reader: the L{Reader} object to be used</tt> </tt> 150<a name="L90"></a><tt class="py-lineno"> 90</tt> <tt class="py-line"><tt class="py-docstring"> @param stemmer: the L{Stemmer} object to be used</tt> </tt> 151<a name="L91"></a><tt class="py-lineno"> 91</tt> <tt class="py-line"><tt class="py-docstring"> @param measure: the measure used to compute the weights ('IDF'</tt> </tt> 152<a name="L92"></a><tt class="py-lineno"> 92</tt> <tt class="py-line"><tt class="py-docstring"> i.e. 
'inverse document frequency' or 'ICF' i.e.</tt> </tt> 153<a name="L93"></a><tt class="py-lineno"> 93</tt> <tt class="py-line"><tt class="py-docstring"> 'inverse collection frequency'; defaults to 'IDF')</tt> </tt> 154<a name="L94"></a><tt class="py-lineno"> 94</tt> <tt class="py-line"><tt class="py-docstring"> @param verbose: whether information on the progress should be</tt> </tt> 155<a name="L95"></a><tt class="py-lineno"> 95</tt> <tt class="py-line"><tt class="py-docstring"> printed on screen</tt> </tt> 156<a name="L96"></a><tt class="py-lineno"> 96</tt> <tt class="py-line"><tt class="py-docstring"> '''</tt> </tt> 157<a name="L97"></a><tt class="py-lineno"> 97</tt> <tt class="py-line"> </tt> 158<a name="L98"></a><tt class="py-lineno"> 98</tt> <tt class="py-line"> <tt class="py-keyword">import</tt> <tt class="py-name">pickle</tt> </tt> 159<a name="L99"></a><tt class="py-lineno"> 99</tt> <tt class="py-line"> </tt> 160<a name="L100"></a><tt class="py-lineno">100</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">verbose</tt><tt class="py-op">:</tt> <tt class="py-keyword">print</tt> <tt class="py-string">'Processing corpus...'</tt> </tt> 161<a name="L101"></a><tt class="py-lineno">101</tt> <tt class="py-line"> <tt class="py-name">corpus</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-op">]</tt> </tt> 162<a name="L102"></a><tt class="py-lineno">102</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">filename</tt> <tt class="py-keyword">in</tt> <tt class="py-name">corpus_files</tt><tt class="py-op">:</tt> </tt> 163<a name="L103"></a><tt class="py-lineno">103</tt> <tt class="py-line"> <tt class="py-keyword">with</tt> <tt class="py-name">open</tt><tt class="py-op">(</tt><tt class="py-name">filename</tt><tt class="py-op">,</tt> <tt class="py-string">'r'</tt><tt class="py-op">)</tt> <tt class="py-keyword">as</tt> <tt class="py-name">doc</tt><tt class="py-op">:</tt> </tt> 164<a name="L104"></a><tt 
class="py-lineno">104</tt> <tt class="py-line"> <tt class="py-name">corpus</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt class="py-name">reader</tt><tt class="py-op">(</tt><tt class="py-name">doc</tt><tt class="py-op">.</tt><tt class="py-name">read</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt> 165<a name="L105"></a><tt class="py-lineno">105</tt> <tt class="py-line"> <tt class="py-name">corpus</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-op">[</tt><tt class="py-name">w</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt> <tt class="py-keyword">in</tt> <tt class="py-name">map</tt><tt class="py-op">(</tt><tt class="py-name">stemmer</tt><tt class="py-op">,</tt> <tt class="py-name">doc</tt><tt class="py-op">)</tt><tt class="py-op">]</tt> <tt class="py-keyword">for</tt> <tt class="py-name">doc</tt> <tt class="py-keyword">in</tt> <tt class="py-name">corpus</tt><tt class="py-op">]</tt> </tt> 166<a name="L106"></a><tt class="py-lineno">106</tt> <tt class="py-line"> </tt> 167<a name="L107"></a><tt class="py-lineno">107</tt> <tt class="py-line"> <tt class="py-name">stopwords</tt> <tt class="py-op">=</tt> <tt class="py-name">None</tt> </tt> 168<a name="L108"></a><tt class="py-lineno">108</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">stopwords_file</tt><tt class="py-op">:</tt> </tt> 169<a name="L109"></a><tt class="py-lineno">109</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">verbose</tt><tt class="py-op">:</tt> <tt class="py-keyword">print</tt> <tt class="py-string">'Processing stopwords...'</tt> </tt> 170<a name="L110"></a><tt class="py-lineno">110</tt> <tt class="py-line"> <tt class="py-keyword">with</tt> <tt class="py-name">open</tt><tt class="py-op">(</tt><tt class="py-name">stopwords_file</tt><tt class="py-op">,</tt> <tt 
class="py-string">'r'</tt><tt class="py-op">)</tt> <tt class="py-keyword">as</tt> <tt class="py-name">sw</tt><tt class="py-op">:</tt> </tt> 171<a name="L111"></a><tt class="py-lineno">111</tt> <tt class="py-line"> <tt class="py-name">stopwords</tt> <tt class="py-op">=</tt> <tt class="py-name">reader</tt><tt class="py-op">(</tt><tt class="py-name">sw</tt><tt class="py-op">.</tt><tt class="py-name">read</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt> 172<a name="L112"></a><tt class="py-lineno">112</tt> <tt class="py-line"> <tt class="py-name">stopwords</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-name">w</tt><tt class="py-op">.</tt><tt class="py-name">stem</tt> <tt class="py-keyword">for</tt> <tt class="py-name">w</tt> <tt class="py-keyword">in</tt> <tt class="py-name">map</tt><tt class="py-op">(</tt><tt class="py-name">stemmer</tt><tt class="py-op">,</tt> <tt class="py-name">stopwords</tt><tt class="py-op">)</tt><tt class="py-op">]</tt> </tt> 173<a name="L113"></a><tt class="py-lineno">113</tt> <tt class="py-line"> </tt> 174<a name="L114"></a><tt class="py-lineno">114</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">verbose</tt><tt class="py-op">:</tt> <tt class="py-keyword">print</tt> <tt class="py-string">'Building dictionary... 
'</tt> </tt> 175<a name="L115"></a><tt class="py-lineno">115</tt> <tt class="py-line"> <tt class="py-name">dictionary</tt> <tt class="py-op">=</tt> <tt id="link-6" class="py-name" targets="Module tagger.build_dict=tagger.build_dict-module.html,Function tagger.build_dict.build_dict()=tagger.build_dict-module.html#build_dict"><a title="tagger.build_dict 176tagger.build_dict.build_dict" class="py-name" href="#" onclick="return doclink('link-6', 'build_dict', 'link-6');">build_dict</a></tt><tt class="py-op">(</tt><tt class="py-name">corpus</tt><tt class="py-op">,</tt> <tt class="py-name">stopwords</tt><tt class="py-op">,</tt> <tt class="py-name">measure</tt><tt class="py-op">)</tt> </tt> 177<a name="L116"></a><tt class="py-lineno">116</tt> <tt class="py-line"> <tt class="py-keyword">with</tt> <tt class="py-name">open</tt><tt class="py-op">(</tt><tt class="py-name">output_file</tt><tt class="py-op">,</tt> <tt class="py-string">'wb'</tt><tt class="py-op">)</tt> <tt class="py-keyword">as</tt> <tt class="py-name">out</tt><tt class="py-op">:</tt> </tt> 178<a name="L117"></a><tt class="py-lineno">117</tt> <tt class="py-line"> <tt class="py-name">pickle</tt><tt class="py-op">.</tt><tt class="py-name">dump</tt><tt class="py-op">(</tt><tt class="py-name">dictionary</tt><tt class="py-op">,</tt> <tt class="py-name">out</tt><tt class="py-op">,</tt> <tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">)</tt> </tt> 179</div><a name="L118"></a><tt class="py-lineno">118</tt> <tt class="py-line"> </tt> 180<a name="L119"></a><tt class="py-lineno">119</tt> <tt class="py-line"> </tt> 181<a name="L120"></a><tt class="py-lineno">120</tt> <tt class="py-line"><tt class="py-keyword">if</tt> <tt class="py-name">__name__</tt> <tt class="py-op">==</tt> <tt class="py-string">'__main__'</tt><tt class="py-op">:</tt> </tt> 182<a name="L121"></a><tt class="py-lineno">121</tt> <tt class="py-line"> </tt> 183<a name="L122"></a><tt class="py-lineno">122</tt> <tt class="py-line"> <tt 
class="py-keyword">import</tt> <tt class="py-name">getopt</tt> </tt> 184<a name="L123"></a><tt class="py-lineno">123</tt> <tt class="py-line"> <tt class="py-keyword">import</tt> <tt class="py-name">sys</tt> </tt> 185<a name="L124"></a><tt class="py-lineno">124</tt> <tt class="py-line"> </tt> 186<a name="L125"></a><tt class="py-lineno">125</tt> <tt class="py-line"> <tt class="py-keyword">try</tt><tt class="py-op">:</tt> </tt> 187<a name="L126"></a><tt class="py-lineno">126</tt> <tt class="py-line"> <tt class="py-name">options</tt> <tt class="py-op">=</tt> <tt class="py-name">getopt</tt><tt class="py-op">.</tt><tt class="py-name">getopt</tt><tt class="py-op">(</tt><tt class="py-name">sys</tt><tt class="py-op">.</tt><tt class="py-name">argv</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">:</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-string">'o:s:'</tt><tt class="py-op">)</tt> </tt> 188<a name="L127"></a><tt class="py-lineno">127</tt> <tt class="py-line"> <tt class="py-name">output_file</tt> <tt class="py-op">=</tt> <tt class="py-name">options</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">]</tt> </tt> 189<a name="L128"></a><tt class="py-lineno">128</tt> <tt class="py-line"> <tt class="py-name">stopwords_file</tt> <tt class="py-op">=</tt> <tt class="py-name">options</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">]</tt> </tt> 190<a name="L129"></a><tt class="py-lineno">129</tt> <tt class="py-line"> <tt class="py-name">corpus</tt> <tt class="py-op">=</tt> <tt class="py-name">options</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">]</tt> </tt> 
191<a name="L130"></a><tt class="py-lineno">130</tt> <tt class="py-line"> <tt class="py-keyword">except</tt><tt class="py-op">:</tt> </tt> 192<a name="L131"></a><tt class="py-lineno">131</tt> <tt class="py-line"> <tt class="py-keyword">print</tt> <tt class="py-name">__doc__</tt> </tt> 193<a name="L132"></a><tt class="py-lineno">132</tt> <tt class="py-line"> <tt class="py-name">exit</tt><tt class="py-op">(</tt><tt class="py-number">1</tt><tt class="py-op">)</tt> </tt> 194<a name="L133"></a><tt class="py-lineno">133</tt> <tt class="py-line"> </tt> 195<a name="L134"></a><tt class="py-lineno">134</tt> <tt class="py-line"> <tt id="link-7" class="py-name" targets="Function tagger.build_dict.build_dict_from_files()=tagger.build_dict-module.html#build_dict_from_files"><a title="tagger.build_dict.build_dict_from_files" class="py-name" href="#" onclick="return doclink('link-7', 'build_dict_from_files', 'link-7');">build_dict_from_files</a></tt><tt class="py-op">(</tt><tt class="py-name">output_file</tt><tt class="py-op">,</tt> <tt class="py-name">corpus</tt><tt class="py-op">,</tt> <tt class="py-name">stopwords_file</tt><tt class="py-op">,</tt> <tt class="py-name">verbose</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt> </tt> 196<a name="L135"></a><tt class="py-lineno">135</tt> <tt class="py-line"> </tt><script type="text/javascript"> 197<!-- 198expandto(location.href); 199// --> 200</script> 201</pre> 202<br /> 203<!-- ==================== NAVIGATION BAR ==================== --> 204<table class="navbar" border="0" width="100%" cellpadding="0" 205 bgcolor="#a0c0ff" cellspacing="0"> 206 <tr valign="middle"> 207 208 <!-- Tree link --> 209 <th> <a 210 href="module-tree.html">Trees</a> </th> 211 212 <!-- Index link --> 213 <th> <a 214 href="identifier-index.html">Indices</a> </th> 215 216 <!-- Help link --> 217 <th> <a 218 href="help.html">Help</a> </th> 219 220 <!-- Project homepage --> 221 <th class="navbar" align="right" width="100%"> 222 
<table border="0" cellpadding="0" cellspacing="0"> 223 <tr><th class="navbar" align="center" 224 ><a class="navbar" target="_top" href="http://github.com/apresta/tagger">tagger</a></th> 225 </tr></table></th> 226 </tr> 227</table> 228<table border="0" cellpadding="0" cellspacing="0" width="100%"> 229 <tr> 230 <td align="left" class="footer"> 231 Generated by Epydoc 3.0.1 on Wed Jun 8 01:57:46 2011 232 </td> 233 <td align="right" class="footer"> 234 <a target="mainFrame" href="http://epydoc.sourceforge.net" 235 >http://epydoc.sourceforge.net</a> 236 </td> 237 </tr> 238</table> 239 240<script type="text/javascript"> 241 <!-- 242 // Private objects are initially displayed (because if 243 // javascript is turned off then we want them to be 244 // visible); but by default, we want to hide them. So hide 245 // them unless we have a cookie that says to show them. 246 checkCookie(); 247 // --> 248</script> 249</body> 250</html>