PageRenderTime 51ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/src/php/setup/linSimilarity.php

https://bitbucket.org/silverasm/wordseer
PHP | 322 lines | 251 code | 9 blank | 62 comment | 22 complexity | d4abcf4403d18e8cc9f0538ba3ef55ac MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0, BSD-3-Clause
  1. <?php
  2. /* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */
  3. /**************************************************************
  4. linSimilarity.php
  5. Calculates statistics relating words to each other
  6. according to the Lin similarity metric described in
  7. Dekang Lin, 1998, Automatic Retrieval and Clustering of Similar Words
  8. All the functions needed to calculate lin similarity are in this file
  9. albeit in a little disorganized way that needs supervision to execute
  10. 1. Calculate the pointwise mutual information of a word
  11. with a particular dependency relationship.
  12. The formula is:
  13. I(w, r, w') = log( (|w, r, w'| x |*, r, *|) / (|w, r, *| x |*, r, w'|) )
  14. in order to be fast, this requires you to expand dependency to include
  15. counts of how often each relationship occurs and how often each
  16. (dep, rel) and (gov, rel) occurs.
  17. >> expand()
  18. >> calculateRelationshipCounts()
  19. Then, you can run
  20. >> calculateDependencyInformation()
  21. 2. Calculate word information: the denominator in the lin similarity calculation
  22. >> calculateWordInformation();
  23. This takes a few hours
  24. 3. Calculate similarity.
  25. >> calculateSimilarity();
  26. This takes 2 days, excluding stopwords and infrequent words
  27. 4. Group words together into synsets for easy access later on.
  28. >> makeSynsets()
  29. ***********************************************************/
  30. include_once '../dbsetup.php';
  31. include_once '../util.php';
  32. include_once '../synonym_groups.php';
  // Turn on PHP's cyclic garbage collector before any of the batch steps below run
  // (the header comment notes that some of them run for hours or days).
  33. gc_enable();
  /**
   * Fills in the gov_count and dep_count columns of the dependency table.
   *
   * For every dependency id in [$startId, $endId) whose relation is not 11,
   * any count that is not yet positive is recomputed from
   * dependency_xref_sentence and written back to every dependency row that
   * shares the same (word id, relation id) pair.
   *
   * @param int $startId First dependency id to process (inclusive). Lets an
   *                     interrupted run be resumed from the last echoed id.
   * @param int $endId   Dependency id to stop at (exclusive).
   * @return void        Progress (one id per line) is echoed; any MySQL error
   *                     aborts the script via die().
   */
  function expand($startId = 0, $endId = 3000000){
      // Run one statement; abort with the diagnostic used throughout this file on failure.
      $run = function ($sql) {
          $res = mysql_query($sql) or die("<b>A fatal MySQL error occured</b>.\n<br/> Query: " . $sql . "\n<br/> Error: (" . mysql_errno() . ") " . mysql_error());
          return $res;
      };

      // The governor and dependent sides are handled identically, so they share one loop.
      $roles = array('gov', 'dep');

      for ($id = (int) $startId; $id < $endId; $id++) {
          $result = $run("SELECT * from dependency WHERE id =".$id.";");
          while ($row = mysql_fetch_array($result)) {
              // Relation 11 is excluded everywhere in this file.
              if ($row['relation_id'] == 11) {
                  continue;
              }
              foreach ($roles as $role) {
                  // Already filled in (possibly by an earlier row with the same pair).
                  if ($row[$role.'_count'] > 0) {
                      continue;
                  }
                  $where = " WHERE ".$role."_id = ".$row[$role.'_id']." AND relation_id = ".$row['relation_id'].";";
                  $counted = mysql_fetch_array($run("SELECT COUNT(*) as c from dependency_xref_sentence".$where));
                  $run("UPDATE dependency SET ".$role."_count = ".$counted['c'].$where);
              }
          }
          echo $id."\n";
      }
  }
  81. //expand();
  /**
   * Stores, for every row of the relationship table, how many
   * dependency_xref_sentence rows refer to a dependency with that relation.
   *
   * @return void Echoes each relationship id once it has been updated; any
   *              MySQL error aborts the script via die().
   */
  function calculateRelationshipCounts(){
      // Run one statement; abort with the diagnostic used throughout this file on failure.
      // Previously no result was checked here, so a failed SELECT produced an
      // empty count and a malformed UPDATE that was silently ignored.
      $run = function ($sql) {
          $res = mysql_query($sql) or die("<b>A fatal MySQL error occured</b>\n<br/> Query: " . $sql . "\n<br/> Error: (" . mysql_errno() . ") " . mysql_error());
          return $res;
      };

      $relations = $run("SELECT id from relationship;");
      while ($relation = mysql_fetch_array($relations)) {
          $relationId = $relation['id'];
          // relation_id is qualified with the dependency table: other queries in
          // this file read a relation_id column from dependency_xref_sentence as
          // well, which would make the bare column name ambiguous in this join.
          $counted = mysql_fetch_array($run(
              "SELECT count(sentence_id) as c from dependency_xref_sentence, dependency"
              ." WHERE dependency_id = dependency.id AND dependency.relation_id = ".$relationId.";"
          ));
          $run("UPDATE relationship SET count =".$counted['c']." WHERE id = ".$relationId.";");
          echo $relationId."\n";
      }
  }
  /**
   * Calculates the information carried by each dependency relationship and
   * stores it in dependency.information:
   *
   *   I(w, r, w') = log( (|w, r, w'| x |*, r, *|) / (|w, r, *| x |*, r, w'|) )
   *
   * where |w, r, w'| is dependency.frequency, |*, r, *| is relationship.count,
   * and the two denominator terms are dependency.gov_count and
   * dependency.dep_count.
   *
   * @param int $relationIdLimit Relation ids 0 .. $relationIdLimit - 1 are
   *                             processed (relation 11 is always skipped).
   * @return void Progress is echoed; any MySQL error aborts the script via die().
   */
  function calculateDependencyInformation($relationIdLimit = 368){
      // Run one statement; abort with the diagnostic used throughout this file on failure.
      $run = function ($sql) {
          $res = mysql_query($sql) or die("<b>A fatal MySQL error occured</b>\n<br/> Query: " . $sql . "\n<br/> Error: (" . mysql_errno() . ") " . mysql_error());
          return $res;
      };

      $seen = 0; // dependencies with a positive frequency, across all relations

      for ($relId = 0; $relId < $relationIdLimit; $relId++) {
          echo "-----------------------".$relId."----------------------";
          if ($relId == 11) {
              continue;
          }

          // |*, r, *| : total occurrences of this relation.
          $relation = mysql_fetch_array($run("SELECT * from relationship WHERE id = ".$relId.";"));
          $countRel = $relation ? $relation['count'] : 0;
          if (!($countRel > 0)) {
              continue;
          }

          $deps = $run("SELECT id, dep_count, gov_count, frequency from dependency WHERE relation_id = ".$relId.";");
          while ($dep = mysql_fetch_array($deps)) {
              $countBoth = $dep['frequency'];
              $countGov = $dep['gov_count'];
              $countDep = $dep['dep_count'];
              if (!($countBoth > 0)) {
                  continue;
              }

              $seen += 1;
              if ($seen % 10000 == 0) echo $seen."\n";

              // A missing or zero gov_count/dep_count would make the denominator
              // zero; skip the row rather than divide by zero.
              if (!($countGov > 0) || !($countDep > 0)) {
                  continue;
              }

              $information = log(($countBoth * $countRel) / ($countDep * $countGov));
              $run("UPDATE dependency SET information =".$information." WHERE id = ".$dep['id'].";");
          }
      }
  }
  /**
   * Calculates word information: for every word whose part of speech is not
   * one of 'X', '-LRB-', '-RRB-', '.', ',', sums the positive information of
   * all dependencies in which the word is governor or dependent and stores
   * the total in word.information.
   *
   * @return void Echoes each word with the UPDATE statement issued for it;
   *              any MySQL error aborts the script via die().
   */
  function calculateWordInformation(){
      // Run one statement; abort with the diagnostic used throughout this file on failure.
      // (The original attached "or die" to the string assignment of the UPDATE,
      // where it could never fire, and left the query itself unchecked.)
      $run = function ($sql) {
          $res = mysql_query($sql) or die("<b>A fatal MySQL error occured</b>\n<br/> Query: " . $sql . "\n<br/> Error: (" . mysql_errno() . ") " . mysql_error());
          return $res;
      };

      $words = $run(" SELECT * from word WHERE pos NOT IN ('X', '-LRB-', '-RRB-', '.', ',');");
      while ($word = mysql_fetch_array($words)) {
          $wordId = $word['id'];

          // The parentheses matter: AND binds tighter than OR in SQL, so without
          // them the "information > 0" filter applied to the dep_id side only.
          $sumRow = mysql_fetch_array($run(
              "SELECT SUM(information) as i FROM dependency where (gov_id = ".$wordId." OR dep_id = ".$wordId.") AND information > 0;"
          ));

          // SUM() over no rows is NULL; store 0 instead of emitting a malformed UPDATE.
          $information = ($sumRow && $sumRow['i'] !== null) ? $sumRow['i'] : 0;

          $query = "UPDATE word SET information = ".$information." WHERE id = ".$wordId.";";
          $run($query);
          echo $word['word'],": ",$query,"\n";
      }
  }
  /**
   * Calculates the Lin similarity between words and stores it in the
   * similarity table.
   *
   * For each candidate word (selected by part of speech, id range and sentence
   * frequency) that is neither a stop word nor already present in the
   * similarity table, two passes are made:
   *   1. words sharing (relation, governor) pairs with it as dependent;
   *   2. words sharing (relation, dependent) pairs with it as governor.
   * In each pass only the rows with the largest numerators are considered, and
   *   similarity = numerator / (information(word1) + information(word2)).
   * The second pass adds its value to any row written by the first.
   *
   * @param int $maxNeighbours How many of the highest-numerator rows to
   *                           consider per word and per pass.
   * @return void Progress is echoed; any MySQL error aborts the script via die().
   */
  function calculateSimilarity($maxNeighbours = 100){
      // Run one statement; abort with the diagnostic used throughout this file on failure.
      $run = function ($sql) {
          $res = mysql_query($sql) or die("<b>A fatal MySQL error occured</b>\n<br/> Query: " . $sql . "\n<br/> Error: (" . mysql_errno() . ") " . mysql_error());
          return $res;
      };

      // Get word 1, excluding stop words, proper nouns and infrequent words.
      /* Batch 1: < 645 - >= 300
         Batch 2: < 300 >= 25 */
      $query = "SELECT id, word.word, pos, information, sentence_frequency
  from word, word_idf
  WHERE pos IN ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'RB', 'RBS', 'RP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
  AND word.id = word_id
  AND sentence_frequency >= 25
  AND word.id <= 40000
  AND word.id >= 3518
  AND sentence_frequency < 300
  AND sentence_frequency >= 2
  ORDER BY sentence_frequency desc, word_id asc;";
      $candidates = $run($query);

      while ($w1 = mysql_fetch_array($candidates)) {
          $pos = $w1['pos'];
          $id = $w1['id'];
          $information1 = $w1['information'];

          // Both checks must be applied to their own argument. The original
          // wrote !stopword($w1['word'] && !done($id)), handing a boolean to
          // stopword(), so no word was ever skipped and a re-run added the
          // second-pass values on top of the existing rows again.
          if (stopword($w1['word']) || done($id)) {
              continue;
          }
          echo $w1['sentence_frequency'],", ",$id,": ",$w1['word'], " ",$pos," ";

          // Pass 1: dependents that share (relation, governor) with word 1.
          $query = "SELECT SUM(w1.information + w2.information) as numerator, w2.dep_id as word2, word.information as information2 from word,
  (SELECT gov_id, relation_id, information from dependency WHERE relation_id != 11 AND information > 0 AND dep_id = ".$id.") as w1
  JOIN
  (SELECT gov_id, relation_id, information, dep_id from dependency WHERE relation_id != 11 AND information > 0 AND dep_id != ".$id." AND dep_pos = '".$pos."') as w2
  ON
  (w1.relation_id = w2.relation_id AND w1.gov_id = w2.gov_id) WHERE word.id = w2.dep_id GROUP BY w2.dep_id ORDER BY numerator desc;";
          $r = $run($query);
          echo "g:",mysql_num_rows($r)," d:";

          $rank = 0;
          while ($sim = mysql_fetch_array($r)) {
              $rank += 1;
              if ($rank > $maxNeighbours) {
                  break; // nothing past the cap is ever stored
              }
              $denominator = $information1 + $sim['information2'];
              // A zero denominator is treated as "no similarity" instead of dividing by zero.
              $similarity = ($denominator != 0) ? $sim['numerator'] / $denominator : 0;
              if (!($similarity > 0)) {
                  break;
              }
              $run("INSERT IGNORE INTO similarity (word1_id, word2_id, lin_similarity) VALUES (".$id.", ".$sim['word2'].", ".$similarity.");");
          }

          // Pass 2: governors that share (relation, dependent) with word 1.
          $query = "SELECT SUM(w1.information + w2.information) as numerator, w2.gov_id as word2, word.information as information2 from word,
  (SELECT dep_id, relation_id, information from dependency WHERE relation_id != 11 AND information > 0 AND gov_id = ".$id.") as w1
  JOIN
  (SELECT dep_id, relation_id, information, gov_id from dependency WHERE relation_id != 11 AND information > 0 AND gov_id != ".$id." AND gov_pos = '".$pos."') as w2
  ON
  (w1.relation_id = w2.relation_id AND w1.dep_id = w2.dep_id) WHERE w2.gov_id=word.id GROUP BY w2.gov_id ORDER BY numerator desc;";
          $r = $run($query);
          echo mysql_num_rows($r),"\n";

          $rank = 0;
          while ($sim = mysql_fetch_array($r)) {
              $rank += 1;
              if ($rank > $maxNeighbours) {
                  break;
              }
              $denominator = $information1 + $sim['information2'];
              $similarity = ($denominator != 0) ? $sim['numerator'] / $denominator : 0;
              // As in the original, a non-positive value is skipped here rather than ending the pass.
              if ($similarity > 0) {
                  $run("INSERT INTO similarity (word1_id, word2_id, lin_similarity) VALUES (".$id.", ".$sim['word2'].", ".$similarity.") ON DUPLICATE KEY UPDATE lin_similarity = lin_similarity + VALUES(lin_similarity);");
              }
          }
      }
  }
  // Comma-separated list of English stop words; stopword() below reads it through `global $stopwords`.
  259. $stopwords = "a's, able, about, above, according, accordingly, across, actually, after, afterwards, again, against, ain't, all, allow, allows, almost, alone, along, already, also, although, always, am, among, amongst, an, and, another, any, anybody, anyhow, anyone, anything, anyway, anyways, anywhere, apart, appear, appreciate, appropriate, are, aren't, around, as, aside, ask, asking, associated, at, available, away, awfully, be, became, because, become, becomes, becoming, been, before, beforehand, behind, being, believe, below, beside, besides, best, better, between, beyond, both, brief, but, by, c'mon, c's, came, can, can't, cannot, cant, cause, causes, certain, certainly, changes, clearly, co, com, come, comes, concerning, consequently, consider, considering, contain, containing, contains, corresponding, could, couldn't, course, currently, definitely, described, despite, did, didn't, different, do, does, doesn't, doing, don't, done, down, downwards, during, each, edu, eg, eight, either, else, elsewhere, enough, entirely, especially, et, etc, even, ever, every, everybody, everyone, everything, everywhere, ex, exactly, example, except, far, few, fifth, first, five, followed, following, follows, for, former, formerly, forth, four, from, further, furthermore, get, gets, getting, given, gives, go, goes, going, gone, got, gotten, greetings, had, hadn't, happens, hardly, has, hasn't, have, haven't, having, he, he's, hello, help, hence, her, here, here's, hereafter, hereby, herein, hereupon, hers, herself, hi, him, himself, his, hither, hopefully, how, howbeit, however, i'd, i'll, i'm, i've, ie, if, ignored, immediate, in, inasmuch, inc, indeed, indicate, indicated, indicates, inner, insofar, instead, into, inward, is, isn't, it, it'd, it'll, it's, its, itself, just, keep, keeps, kept, know, knows, known, last, lately, later, latter, latterly, least, less, lest, let, let's, like, liked, likely, little, look, looking, looks, ltd, mainly, many, may, maybe, me, 
mean, meanwhile, merely, might, more, moreover, most, mostly, much, must, my, myself, name, namely, nd, near, nearly, necessary, need, needs, neither, never, nevertheless, new, next, nine, no, nobody, non, none, noone, nor, normally, not, nothing, novel, now, nowhere, obviously, of, off, often, oh, ok, okay, old, on, once, one, ones, only, onto, or, other, others, otherwise, ought, our, ours, ourselves, out, outside, over, overall, own, particular, particularly, per, perhaps, placed, please, plus, possible, presumably, probably, provides, que, quite, qv, rather, rd, re, really, reasonably, regarding, regardless, regards, relatively, respectively, right, said, same, saw, say, saying, says, second, secondly, see, seeing, seem, seemed, seeming, seems, seen, self, selves, sensible, sent, serious, seriously, seven, several, shall, she, should, shouldn't, since, six, so, some, somebody, somehow, someone, something, sometime, sometimes, somewhat, somewhere, soon, sorry, specified, specify, specifying, still, sub, such, sup, sure, t's, take, taken, tell, tends, th, than, thank, thanks, thanx, that, that's, thats, the, their, theirs, them, themselves, then, thence, there, there's, thereafter, thereby, therefore, therein, theres, thereupon, these, they, they'd, they'll, they're, they've, think, third, this, thorough, thoroughly, those, though, three, through, throughout, thru, thus, to, together, too, took, toward, towards, tried, tries, truly, try, trying, twice, two, un, under, unfortunately, unless, unlikely, until, unto, up, upon, us, use, used, useful, uses, using, usually, value, various, very, via, viz, vs, want, wants, was, wasn't, way, we, we'd, we'll, we're, we've, welcome, well, went, were, weren't, what, what's, whatever, when, whence, whenever, where, where's, whereafter, whereas, whereby, wherein, whereupon, wherever, whether, which, while, whither, who, who's, whoever, whole, whom, whose, why, will, willing, wish, with, within, without, won't, wonder, would, 
would, wouldn't, yes, yet, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves, zero";
  /**
   * Tells whether a word is in the global comma-separated $stopwords list.
   *
   * The comparison is case-insensitive and matches whole entries only, so
   * "he" does not match just because "the" is in the list.
   *
   * The original called strstr() with its arguments swapped (searching for
   * the entire list inside the single word) and therefore always returned
   * false.
   *
   * @param string $w Word to test.
   * @return bool True when $w is a stop word.
   */
  function stopword($w){
      global $stopwords;
      // Lookup set built from the list, rebuilt only if the list itself changes.
      static $parsedFrom = null;
      static $lookup = array();

      if ($parsedFrom !== $stopwords) {
          $lookup = array();
          // Split on commas and swallow surrounding whitespace (including line
          // breaks), so the first, last and wrapped entries match like any other.
          $terms = preg_split('/\s*,\s*/', strtolower(trim((string) $stopwords)), -1, PREG_SPLIT_NO_EMPTY);
          foreach ($terms as $term) {
              $lookup[$term] = true;
          }
          $parsedFrom = $stopwords;
      }

      return isset($lookup[strtolower((string) $w)]);
  }
  /**
   * Checks whether an id is done, i.e. whether the similarity table already
   * holds at least one row with this id as word1_id.
   *
   * @param int|string $id Word id, inserted into the query as given.
   * @return bool True when such a row exists. A MySQL error aborts the script
   *              via die() instead of being reported as "not done".
   */
  function done($id){
      $query = "SELECT * from similarity where word1_id = ".$id." LIMIT 1;";
      $result = mysql_query($query) or die("<b>A fatal MySQL error occured</b>\n<br/> Query: " . $query . "\n<br/> Error: (" . mysql_errno() . ") ". mysql_error());
      return (mysql_num_rows($result) > 0);
  }
  /**
   * Once all the Lin similarities have been calculated, builds the synsets
   * table for easy access later on.
   *
   * Every distinct word2_id >= 4625 found in the similarity table is passed
   * to synset(); each entry of the returned list is stored as one synsets
   * row (duplicates are ignored by the INSERT IGNORE).
   *
   * @return void Echoes a banner and then each word as it is processed; any
   *              MySQL error aborts the script via die().
   */
  function makeSynsets(){
      echo "MAKING SYNSETS\n";

      $query = "SELECT distinct word2_id, word
  from similarity join word ON word2_id = id
  WHERE word2_id >= 4625;";
      $words = mysql_query($query) or die("<b>A fatal MySQL error occured</b>\n<br/> Query: " . $query . "\n<br/> Error: (" . mysql_errno() . ") ". mysql_error());

      for ($word = mysql_fetch_array($words); $word; $word = mysql_fetch_array($words)) {
          $wordId = $word['word2_id'];
          echo $word['word'],"\n";

          // Fixed part of every row written for this word.
          $rowPrefix = "(".$wordId.", '".mysql_real_escape_string($word['word'])."', ";

          foreach (synset($wordId, "", "") as $member) {
              $rowSuffix = $member['id'].", '".mysql_real_escape_string($member['word'])."', ".$member['similarity'].");";
              $query = "INSERT IGNORE INTO synsets (word1_id, word1, word2_id, word2, similarity)
  VALUES ".$rowPrefix.$rowSuffix;
              mysql_query($query) or die("<b>A fatal MySQL error occured</b>\n<br/> Query: " . $query . "\n<br/> Error: (" . mysql_errno() . ") ". mysql_error());
          }
      }
  }
  // Script entry point: of the steps defined in this file, only the synset-building
  // step is invoked when the file is executed; the others have to be called by hand.
  296. makeSynsets();
  297. ?>