PageRenderTime 45ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/src/php/phrases/get-phrases.php

https://bitbucket.org/silverasm/wordseer
PHP | 381 lines | 351 code | 16 blank | 14 comment | 41 complexity | 40afa0bd4ed414310442f9ef5dd2977c MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0, BSD-3-Clause
  1. <?php
  2. /* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */
  3. /*******************************************************************************
  4. get-phrases.php
  5. Utilities for fetching frequent phrases.
  6. *******************************************************************************/
  7. include_once "../util.php";
  8. include_once "../grammaticalsearch/get-search-results.php";
  9. include_once '../document/get-metadata.php';
  10. //Query parameters
  11. $wordseer_instance = getGetParam('instance');
  12. $gov = '';
  13. $dep = '';
  14. $relation = '';
  15. $govtype = 'word';
  16. $deptype = 'word';
  17. $searches = decodeGetJson('search');
  18. if (count($searches) <= 1) {
  19. $gov = getGetParam('gov');
  20. $govtype = getGetParam('govtype');
  21. $dep = getGetParam('dep');
  22. $deptype = getGetParam('deptype');
  23. $relation = getGetParam('relation');
  24. }
  25. $collection = getGetParam('collection');
  26. $statistics = getGetParam('statistics');
  27. $start = getGetParam('start');
  28. $limit = getGetParam('limit');
  29. $metadata = decodeGetJson('metadata');
  30. $phrases = decodeGetJson('phrases');
  31. $timing = getGetParam('timing');
  32. $path = '../../../instances/'.$wordseer_instance.'/config.php';
  33. include_once $path;
  34. include_once '../subsets/read.php';
  35. $sql = "SET session tmp_table_size = 1073741824;";
  36. mysql_query($sql);
  37. $sql = "SET session max_heap_table_size = 1073741824;";
  38. mysql_query($sql);
  39. if (strstr($_SERVER['REQUEST_URI'], 'get-phrases.php')) {
  40. dispatch_phrases($gov, $govtype, $dep, $deptype, $relation,
  41. $collection, $metadata, $phrases);
  42. }
  43. /** Returns a list of sentence ID's in which all the given words or phrases
  44. occur.
  45. @param {Array{String}} An array of Word or phrase specifiers in the format
  46. word_<id> or phrase_<id>
  47. @return {Array[Number]} phrase ID's
  48. */
  49. function getSentenceIDsForPhrases($phrases) {
  50. global $timing;
  51. global $num_filter_conditions;
  52. $table_identifier = 'filtered_sent_ids';
  53. $insertion_fields = '(id, document_id )';
  54. $field_identifier = "DISTINCT sentence_id, document_id ";
  55. $query_id_where = '';
  56. global $cache_results;
  57. global $query_id;
  58. if ($cache_results || $query_id) {
  59. $table_identifier = 'cached_filtered_sent_ids';
  60. $insertion_fields = '(id, document_id, query_id)';
  61. $field_identifier = "DISTINCT sentence_id, document_id $query_id";
  62. $query_id_where = " AND query_id = $query_id ";
  63. }
  64. if ($timing != 0) {
  65. echo "<br>Phrases: ".json_encode($phrases)."
  66. <br>";
  67. }
  68. if (count($phrases) == 0) {
  69. return "all";
  70. } else{
  71. if (!$query_id || $cache_results) {
  72. $t1 = time();
  73. $sentence_ids = array();
  74. $first = true;
  75. foreach ($phrases as $phrase) {
  76. $num_filter_conditions += 1;
  77. if (strlen(trim($phrase)) > 0) {
  78. $phrase_ids = getPhraseIDs($phrase);
  79. if (count($phrase_ids) > 0) {
  80. $phrase_id_string = join(", ", $phrase_ids);
  81. $sql = "INSERT INTO $table_identifier $insertion_fields
  82. SELECT $field_identifier FROM sequence_xref_sentence
  83. WHERE sequence_id in ($phrase_id_string)
  84. ON DUPLICATE KEY update num_matched = num_matched + 1;";
  85. if ($timing) {
  86. echo $sql;
  87. }
  88. mysql_query($sql) or die (mysql_error()." On: <br> $sql
  89. <br> while getting sentence_ids matching phrases
  90. $phrase_id_string <br> at get-phrases.php l.64");
  91. }
  92. }
  93. }
  94. updateSentenceFilterTable();
  95. }
  96. $sentence_ids = array();
  97. if (!cache_results) {
  98. $sql = "SELECT * from $table_identifier $query_id_where ";
  99. $result = mysql_query($sql) or die (mysql_error()."<br>
  100. at get-phrases.php l.74");
  101. while ($row = mysql_fetch_assoc($result)) {
  102. array_push($sentence_ids, $row['id']);
  103. }
  104. $t2 = time();
  105. if ($timing) {
  106. echo "<br>Time to get ".count($sentence_ids)." sentence ID's
  107. matching phrases: ".($t2-$t1)."s<br>";
  108. }
  109. }
  110. return $sentence_ids;
  111. }
  112. }
  113. /** Gets the ID's of the phrases that match a particular phrase filter sent
  114. by the server.
  115. @param {String} $phrase A filter parameter sent by the client in the form
  116. class_id where class is either "word" or "phrase" and "id" is the id of the
  117. word or phrase.
  118. */
  119. function getPhraseIDs($phrase) {
  120. $components = explode("_", $phrase);
  121. $type = $components[0];
  122. $id = $components[1];
  123. $ids = array();
  124. if ($type == "phrase") {
  125. array_push($ids, "'".$id."'");
  126. } else if ($type == 'word') {
  127. $word_ids = explode(".", $id);
  128. foreach ($word_ids as $id) {
  129. array_push($ids, "'.$id.'");
  130. array_push($ids, "'l.$id.'");
  131. }
  132. }
  133. return $ids;
  134. }
  135. function dispatch_phrases($gov, $govtype, $dep, $deptype, $relation,
  136. $collection, $metadata, $phrases) {
  137. global $timing;
  138. $length = getGetParam('length');
  139. $function_words_value = getGetParam('has_function_words')== 'true'? 1: 0;
  140. $lemmatized = getGetParam('lemmatized') == 'true'? 1: 0;
  141. $sentence_ids = array(-2);
  142. if ($relation == "") {
  143. $results = getSentenceSearchResults($gov, $govtype, $collection, $metadata, $phrases);
  144. } else {
  145. $results = getDependencySentenceResults($gov, $govtype, $dp, $deptype,
  146. $relation, $collection, false, $metadata, $phrases);
  147. if ($timing != 0) {
  148. echo "<br> Number of matched sentences: ".mysql_num_rows($results);
  149. }
  150. }
  151. $first = true;
  152. echo "[
  153. ";
  154. if ($results == 'all') {
  155. $sentence_count_where_clause = " AND sentence_count > 1";
  156. $final_results = array();
  157. $ids = array();
  158. $results = array();
  159. $sql = "SELECT sentence_count as count,
  160. 0 as document_count,
  161. id
  162. FROM sequence USE INDEX(for_counts)
  163. WHERE length = $length
  164. AND NOT all_function_words
  165. AND sentence_count > 0
  166. AND has_function_words = $function_words_value
  167. ORDER BY sentence_count DESC
  168. LIMIT 150";
  169. $result = mysql_query($sql) or die("Error getting sequences for
  170. sentence IDs, l. 180:
  171. <br> ".mysql_error()."</br>
  172. <br> on query
  173. <br> $sql");
  174. $ids = array();
  175. while ($row = mysql_fetch_assoc($result)) {
  176. $id = "'".$row['id']."'";
  177. $results[$id] = $row;
  178. array_push($ids, $id);
  179. }
  180. $t3 = time();
  181. if ($timing != 0) {
  182. echo "<br>Time to count most frequent sequences of length $length: ".
  183. ($t3-$t2)."s";
  184. }
  185. if (count($ids) > 0) {
  186. $t3 = time();
  187. $id_string = join(", ", $ids);
  188. $sql = "SELECT sequence, length, lemmatized, has_function_words,
  189. all_function_words, id
  190. FROM sequence WHERE id in ($id_string);";
  191. $result = mysql_query($sql) or die("Error getting sequences for
  192. sequence IDs, l. 157:
  193. <br> ".mysql_error()."</br>
  194. <br> on query
  195. <br> $sql");
  196. while ($row = mysql_fetch_assoc($result)) {
  197. $id = "'".$row['id']."'";
  198. $results[$id] = array_merge($results[$id], $row);
  199. }
  200. foreach($ids as $id) {
  201. if ($results[$id]["lemmatized"] == 0) {
  202. if (!$first) {
  203. echo ",
  204. ";
  205. }
  206. echo json_encode($results[$id]);
  207. $first = false;
  208. }
  209. }
  210. $t4 = time();
  211. if ($timing != 0) {
  212. echo $sql;
  213. echo "<br> Time to get sequences for sequence id's: ".
  214. ($t4-$t3)."s <br>";
  215. }
  216. }
  217. } else {
  218. $table_identifier = 'filtered_sent_ids';
  219. $query_id_where = '';
  220. global $cache_results;
  221. global $query_id;
  222. if ($cache_results || $query_id) {
  223. $table_identifier = 'cached_filtered_sent_ids';
  224. $query_id_where = " query_id = $query_id ";
  225. }
  226. $sql = "SELECT sequence.id as id, sequence, length, lemmatized,
  227. has_function_words, all_function_words,
  228. count($table_identifier.id) as count,
  229. count(distinct $table_identifier.document_id) as document_count
  230. FROM
  231. $table_identifier, sequence_xref_sentence, sequence
  232. WHERE $table_identifier.id = sentence_id
  233. AND $query_id_where
  234. AND sequence_id = sequence.id
  235. AND length = $length
  236. AND has_function_words = $function_words_value
  237. AND sentence_count > 0
  238. AND NOT all_function_words
  239. GROUP BY sequence_id
  240. ORDER BY COUNT desc LIMIT 150;";
  241. $results = array();
  242. $t1 = time();
  243. $result = mysql_query($sql) or die("Error getting sequences for
  244. sentence IDs, l. 180:
  245. <br> ".mysql_error()."</br>
  246. <br> on query
  247. <br> $sql");
  248. $t2 = time();
  249. if ($timing != 0) {
  250. echo "<br> Time to count most frequent sequences of length $length: ".
  251. ($t2-$t1)."s <br>";
  252. }
  253. $ids = array();
  254. $first = true;
  255. while ($row = mysql_fetch_assoc($result)) {
  256. if($row['lemmatized'] == 0) {
  257. if (!$first) {
  258. echo ",
  259. ";
  260. }
  261. echo json_encode($row);
  262. $first = false;
  263. }
  264. }
  265. }
  266. echo "
  267. ]";
  268. }
  269. function getPhrase($id) {
  270. $sql = "SELECT sequence from sequence where id = '".$id."';";
  271. $result = mysql_query($sql) or die(mysql_error()."
  272. <br> on query $sql
  273. <br> at get-phrases.php l. 268");
  274. if (mysql_num_rows($result) > 0) {
  275. $row = mysql_fetch_assoc($result);
  276. return $row['sequence'];
  277. } else {
  278. return " ";
  279. }
  280. }
  281. function getMostFrequentContentPhrase($sentence_ids) {
  282. global $timing;
  283. $t1 = time();
  284. $id = getMostFrequentContentPhraseID($sentence_ids);
  285. return getPhrase($id);
  286. }
  287. function getMostFrequentContentPhraseID($sentence_ids) {
  288. global $timing;
  289. $table_identifier = 'filtered_sent_ids';
  290. $query_id_where = '';
  291. global $cache_results;
  292. global $query_id;
  293. if ($cache_results || $query_id) {
  294. $table_identifier = 'cached_filtered_sent_ids';
  295. $query_id_where = " AND query_id = $query_id ";
  296. }
  297. $t1 = time();
  298. if ($sentence_ids == "all") {
  299. $sql = "SELECT id as sequence_id, sentence_count as count
  300. FROM sequence USE INDEX(for_counts)
  301. WHERE NOT all_function_words
  302. AND length = 1
  303. ORDER BY count DESC
  304. LIMIT 1;";
  305. } else {
  306. $sql = "SELECT sequence.id as sequence_id, sequence
  307. count($table_identifier.id) as count,
  308. count(distinct $table_identifier.document_id) as document_count
  309. FROM
  310. $table_identifier, sequence_xref_sentence, sequence
  311. WHERE $table_identifier.id = sentence_id
  312. AND $query_id_where
  313. AND sequence_id = sequence.id
  314. AND length = 1
  315. AND NOT all_function_words
  316. GROUP BY sequence_id
  317. ORDER BY COUNT desc LIMIT 1;";
  318. }
  319. $result = mysql_query($sql) or die("Error getting sequences for
  320. sentence IDs:
  321. <br> ".mysql_error()."</br>
  322. <br> on query
  323. <br> $sql");
  324. $t2 = time();
  325. if ($timing != 0) {
  326. echo "<br> Time to get most frequent content phrase ID:
  327. ".($t2-$t1)."s<br>";
  328. }
  329. if ($result) {
  330. $row = mysql_fetch_assoc($result);
  331. return $row['sequence_id'];
  332. } else {
  333. return "";
  334. }
  335. }
  336. function makeTemporaryFilteredSequencesTable() {
  337. /** Create a temporary table to hold the results of sentence ID's that match
  338. various filters.
  339. */
  340. $sql = "DROP TEMPORARY TABLE IF EXISTS `filtered_sequences`;";
  341. mysql_query($sql) or die (mysql_error(). " On <br> $sql");
  342. $sql = "CREATE TEMPORARY TABLE `filtered_sequences` (
  343. `seq_id` varchar(50) NOT NULL DEFAULT '0',
  344. `count` int NOT NULL DEFAULT '1',
  345. `document_count` int NOT NULL DEFAULT '1',
  346. `ok` boolean NOT NULL DEFAULT '0',
  347. PRIMARY KEY (`seq_id`),
  348. KEY `count` (`ok`, `count` DESC, `seq_id`)
  349. ) ENGINE=MEMORY DEFAULT CHARSET=utf8";
  350. $result = mysql_query($sql) or die("Error creating temporary
  351. sequences table, get-phrases.php l. 151:
  352. <br> ".mysql_error()."</br>
  353. <br> on query
  354. <br> $sql");
  355. }
  356. ?>