PageRenderTime 49ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/src/php/setup/populate-tf-idf.php

https://bitbucket.org/silverasm/wordseer
PHP | 56 lines | 43 code | 5 blank | 8 comment | 1 complexity | 9bd7a595a385fabc9a51a908e6b5f953 MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0, BSD-3-Clause
  1. <?php
  2. /* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */
  3. /**************************************************************
  4. populate-tf-idf.php
  5. Called once, after the main database has been created, to populate tables of
  6. TF-IDF values for words in paragraphs.
  7. ***********************************************************/
  8. include_once 'dbsetup.php';
  // Step 1 (currently disabled): fill the word_idf table.
  // populateParagraphTFs() looks up paragraph_frequency in word_idf, so
  // re-enable this call if word_idf has not been populated yet.
  9. //populateWordIDFs();
  // Step 2: fill the paragraph_tf table from per-paragraph word counts and
  // the frequencies stored in word_idf.
  10. populateParagraphTFs();
  /**
   * Runs one SQL statement for populateWordIDFs() and aborts on failure.
   *
   * Uses the default connection of the legacy mysql_* extension
   * (presumably opened by dbsetup.php -- confirm).
   *
   * @param string $sql Complete SQL statement to execute.
   * @return mixed Whatever mysql_query() returned on success (a result
   *               resource for SELECT, true for INSERT).
   */
  function runWordIdfQuery($sql){
      $result = mysql_query($sql);
      if ($result === false) {
          // Same message format as the INSERT failure in populateParagraphTFs().
          die("<b>A fatal MySQL error occured</b>.\n<br/> Query: " . $sql . "\n<br/> Error: (" . mysql_errno() . ") " . mysql_error());
      }
      return $result;
  }

  /**
   * Populates word_idf with paragraph and sentence frequencies for every word.
   *
   * For each row of `word`, counts the distinct paragraphs and the distinct
   * sentences that reference the word (sentence_xref_word joined to sentence)
   * and inserts one row (word_id, word, paragraph_frequency,
   * sentence_frequency) into word_idf.
   *
   * Progress is echoed as "<word>\t <paragraph count>\n" per word.
   * The script terminates with the MySQL error if any query fails.
   *
   * @return void
   */
  function populateWordIDFs(){
      $words = runWordIdfQuery("SELECT * from word;");
      while ($row = mysql_fetch_array($words)) {
          echo $row['word']."\t ";

          // Aggregates without GROUP BY always yield exactly one row. The
          // unused, non-aggregated word_id column is intentionally not selected.
          $countQuery = "SELECT COUNT(distinct paragraph_id) as p, COUNT(distinct sentence_id) as s FROM sentence, sentence_xref_word WHERE sentence.id = sentence_id AND word_id = ".$row['id'].";";
          $counts = mysql_fetch_array(runWordIdfQuery($countQuery));

          // Words may contain quotes or backslashes (e.g. "don't"); escape
          // them so the INSERT stays well-formed.
          $word = mysql_real_escape_string($row['word']);
          $insertQuery = "INSERT INTO word_idf (word_id, word, paragraph_frequency, sentence_frequency) VALUES(".$row['id'].", '".$word."', ".$counts['p'].", ".$counts['s'].");";
          runWordIdfQuery($insertQuery);

          echo $counts['p'],"\n";
      }
  }
  /**
   * Runs one SQL statement for populateParagraphTFs() and aborts on failure.
   *
   * Uses the default connection of the legacy mysql_* extension
   * (presumably opened by dbsetup.php -- confirm).
   *
   * @param string $sql Complete SQL statement to execute.
   * @return mixed Whatever mysql_query() returned on success (a result
   *               resource for SELECT, true for INSERT).
   */
  function runParagraphTfQuery($sql){
      $result = mysql_query($sql);
      if ($result === false) {
          die("<b>A fatal MySQL error occured</b>.\n<br/> Query: " . $sql . "\n<br/> Error: (" . mysql_errno() . ") " . mysql_error());
      }
      return $result;
  }

  /**
   * Populates paragraph_tf with TF and TF-IDF values per (paragraph, word).
   *
   * For every paragraph:
   *   - tf  = number of sentence_xref_word rows for the word among the
   *           sentences of that paragraph;
   *   - idf = log(N / paragraph_frequency), natural logarithm, where N is the
   *           number of rows in `paragraph` and paragraph_frequency is read
   *           from word_idf (so word_idf must already be populated);
   *   - tf_idf = tf * idf.
   *
   * Words with no word_idf row, or with a zero/empty paragraph_frequency, are
   * skipped. Each paragraph id is echoed followed by a newline once its words
   * are processed. The script terminates with the MySQL error if any query
   * fails.
   *
   * @return void
   */
  function populateParagraphTFs(){
      $paragraphs = runParagraphTfQuery("SELECT * from paragraph;");
      $N = mysql_num_rows($paragraphs);

      while ($row = mysql_fetch_array($paragraphs)) {
          echo $row['id'];

          // Term counts for this paragraph. The unused, non-aggregated
          // paragraph_id column is intentionally not selected.
          $countQuery = "SELECT COUNT(word_id) as count, word_id FROM sentence, sentence_xref_word WHERE sentence.id = sentence_id AND paragraph_id = ".$row['id']." GROUP BY word_id;";
          $counts = runParagraphTfQuery($countQuery);

          while ($count = mysql_fetch_array($counts)) {
              $idfQuery = "SELECT paragraph_frequency, word from word_idf WHERE word_id = ".$count['word_id'].";";
              $idfRow = mysql_fetch_array(runParagraphTfQuery($idfQuery));

              // No word_idf entry for this word: nothing to weight it with.
              if (!$idfRow) {
                  continue;
              }

              // A zero/empty frequency would make N/df undefined; skip it.
              $df = $idfRow['paragraph_frequency'];
              if (!$df) {
                  continue;
              }

              $idf = log($N/$df);
              $tf_idf = $count['count']*$idf;
              $word = mysql_real_escape_string($idfRow['word']);

              $insertQuery = "INSERT INTO paragraph_tf (paragraph_id, word_id, tf, tf_idf, word) VALUES (".$row['id'].", ".$count['word_id'].", ".$count['count'].", ".$tf_idf.",'".$word."');";
              runParagraphTfQuery($insertQuery);
          }
          echo "\n";
      }
  }
  50. ?>