PageRenderTime 75ms CodeModel.GetById 41ms RepoModel.GetById 2ms app.codeStats 0ms

/src/php/wordtree/get-tree.php

https://bitbucket.org/silverasm/wordseer
PHP | 472 lines | 441 code | 10 blank | 21 comment | 22 complexity | fed0e4123f5301212a02ebab308bd5b9 MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0, BSD-3-Clause
  1. <?php
  2. /* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */
  3. /****************************************************************************
  4. getcontext.php
  5. Called by getContext() in heatmap.js in service of heatmap.php
  6. Gets the concordance and grammatical context in which a heat map
  7. query occurs.
  8. ****************************************************************************/
  9. include_once '../util.php';
  10. include_once '../subsets/read.php';
  11. include_once '../document/get-metadata.php';
  12. $wordseer_instance = getGetParam('instance');
  13. $path = '../../../instances/'.$wordseer_instance.'/config.php';
  14. include_once $path;
  15. $gov = getGetParam('gov');
  16. $govtype = getGetParam('govtype');
  17. $dep = getGetParam('dep');
  18. $deptype = getGetParam('deptype');
  19. $relation = getGetParam('relation');
  20. $collection = getGetParam('collection');
  21. $statistics = getGetParam('statistics');
  22. $start = getGetParam('start');
  23. $limit = getGetParam('limit');
  24. $metadata = decodeGetJson('metadata');
  25. $phrases = decodeGetJson('phrases');
  26. $timing = (getGetParam('timing') == 1);
  27. $query = "";
  28. $context = array();
  29. $matches = getConcordance( $gov, $govtype, $dep, $deptype, $relation,
  30. $collection, $metadata, $phrases);
  31. $context["concordance"] = array();
  32. $context["concordance"]["num"] = $matches['numMatches'];
  33. $context["concordance"]["docs"] = (array_key_exists('numDocuments', $matches))?($matches['numDocuments']):(0);
  34. $context["concordance"]["matches"] = $matches['matches'];
  35. $context['concordance']['lefts'] = getSentences($context['concordance']['matches'], "left");
  36. $context['concordance']['rights'] = getSentences($context['concordance']['matches'], "right");
  37. $g = getGetParam('gov');
  38. $d = getGetParam('dep');
  39. if($govtype == 'word-set'){
  40. $g = getWordsFromWordSet($gov);
  41. }
  42. if($deptype == 'word-set'){
  43. $d = getWordsFromWordSet($dep);
  44. }
  45. if (!$query_string){
  46. $query_string = $g.' '.$d;
  47. if (!$gov && !$dep) {
  48. $query_string = $phrases[0];
  49. }
  50. }
  51. $context["query"] = $query_string;
  52. echo json_encode($context);
  53. /** Returns a list of concordances for the given search query and filters.
  54. @return A list of concordances: associative arrays with the following fields:
  55. - number -- The sentence number
  56. - left -- The text to the left of the match
  57. - right -- The text to the right of the match
  58. - id -- The ID of the sentence
  59. - match -- the matched text
  60. */
  61. function getConcordance($gov, $govtype, $dep, $deptype, $relation, $collection,
  62. $metadata, $phrases){
  63. global $q;
  64. global $query_string;
  65. global $timing;
  66. $table_identifier = 'filtered_sent_ids';
  67. $query_id_where = '';
  68. global $cache_results;
  69. global $query_id;
  70. if ($cache_results || $query_id) {
  71. $table_identifier = 'cached_filtered_sent_ids';
  72. $query_id_where = " AND query_id = $query_id ";
  73. }
  74. // Appply the filters.
  75. $filtered = getSentenceIDsForFilters($metadata, $collection, $phrases);
  76. $filtersAreActive = ($filtered != 'all');
  77. $results = null;
  78. $pattern = null;
  79. $t1 = time();
  80. // if just a regular word search or a phrase search
  81. if(strlen(trim($relation)) == 0){
  82. //default query
  83. $words = "";
  84. // If a search query has been specified, use it as the center
  85. // of the word tree.
  86. if ($gov) {
  87. $words = $gov;
  88. $q = $gov;
  89. if($govtype == 'word-set'){
  90. $words = explode(" ", getWordsFromWordSet($gov));
  91. $word_id_string = getWordIDsFromWordSet($gov);
  92. $q = "(".join("|", $words).")";
  93. $query_string = getSetName($gov);
  94. $words = join(" OR ", $words);
  95. $query = "SELECT sentence_id as id, sentence, number,
  96. sentence.document_id
  97. FROM sentence, sentence_xref_word
  98. WHERE sentence.id = sentence_id
  99. AND word_id in ($word_id_string) ";
  100. if ($filtersAreActive) {
  101. $query = "SELECT sentence_id, sentence
  102. FROM sentence_xref_word, $table_identifier, sentence
  103. WHERE sentence.id = sentence_id
  104. AND sentence_id = $table_identifier.id $query_id_where
  105. AND word_id in ($word_id_string) ";
  106. }
  107. } else if (strstr($words, " ") && !strstr($words, ",") && !strstr($words, "*")
  108. ) {
  109. $q = $words;
  110. $query_string = $words;
  111. if (!strstr($words, "\"")) {
  112. $words = "\"$words\"";
  113. }
  114. $query = "SELECT id, sentence, number, sentence.document_id
  115. FROM sentence
  116. WHERE match sentence against('$words' IN BOOLEAN MODE) ";
  117. if ($filtersAreActive) {
  118. $query = "SELECT sentence.id, sentence
  119. FROM sentence, $table_identifier
  120. WHERE match sentence against('$words' IN BOOLEAN MODE)
  121. AND sentence.id = $table_identifier.id $query_id_where ";
  122. }
  123. $q = $words;
  124. } else if (strstr($words, ",") || strstr($words, "*")) {
  125. $query_string = $words;
  126. $word_array = explode(",", $words);
  127. $word_strings = array();
  128. $word_ids = array();
  129. foreach($word_array as $word) {
  130. array_push($word_strings, trim($word));
  131. $ids = getWordIDs(trim($word));
  132. foreach($ids as $id) {
  133. array_push($word_ids, $id);
  134. }
  135. }
  136. $q = "(".join("|", $word_strings).")";
  137. $word_id_string = join(",", $word_ids);
  138. $query = "SELECT sentence_id as id, sentence, number,
  139. sentence.document_id
  140. FROM sentence, sentence_xref_word
  141. WHERE sentence.id = sentence_id
  142. AND word_id in ($word_id_string) ";
  143. if ($filtersAreActive) {
  144. $query = "SELECT sentence_id as id, sentence, number,
  145. sentence.document_id
  146. FROM sentence_xref_word, $table_identifier, sentence
  147. WHERE sentence.id = sentence_id
  148. AND sentence_id = $table_identifier.id $query_id_where
  149. AND word_id in ($word_id_string) ";
  150. }
  151. } else {
  152. $query_string = $words;
  153. $word_id_string = getWordID($words);
  154. $query = "SELECT sentence_id as id, sentence, number,
  155. sentence.document_id
  156. FROM sentence, sentence_xref_word
  157. WHERE sentence.id = sentence_id
  158. AND word_id in ($word_id_string) ";
  159. if ($filtersAreActive) {
  160. $query = "SELECT sentence_id as id, sentence, number,
  161. sentence.document_id
  162. FROM sentence_xref_word, $table_identifier, sentence
  163. WHERE sentence.id = sentence_id
  164. AND sentence_id = $table_identifier.id $query_id_where
  165. AND word_id in ($word_id_string) ";
  166. }
  167. $q = $words;
  168. }
  169. }
  170. // If no search query has been specified, but there are phrases
  171. // acting as filters, then use the first filter phrase as the
  172. // search query for the center of the word tree.
  173. else if (count($phrases) > 0) {
  174. $phraseIDs = getPhraseIDs($phrases[0]);
  175. $sql = "SELECT sequence from sequence
  176. WHERE id
  177. IN (".join(",", $phraseIDs).");";
  178. if ($timing) {
  179. echo $sql;
  180. }
  181. $result = mysql_query($sql) or die ("Error getting phrase with ID
  182. ".$phrases[0].";");
  183. $q = "";
  184. while ($row = mysql_fetch_assoc($result)) {
  185. $q = explode(" ", $row['sequence']);
  186. $q = $q[0];
  187. $query_string = $q;
  188. }
  189. $query = "SELECT * FROM $table_identifier, sentence
  190. WHERE $table_identifier.id = sentence.id
  191. $query_id_where";
  192. }
  193. // If no search query has been specified get the most frequent
  194. // content word (not stop word) from the set of documents matching
  195. // the filters (if any) or the whole collection (if there are no
  196. // filters), and use that as the search query for the center of
  197. // the word tree.
  198. else {
  199. $word_id = getMostFrequentContentWordID($filtered);
  200. $query_string = getWord($word_id); //util.php
  201. if ($timing) {
  202. echo "<br> Most frequent phrase: $query_string <br>";
  203. }
  204. if (strlen($query_string) > 0) {
  205. $q = $query_string;
  206. $cache_results = true;
  207. $sentence_ids = getSentenceIDsForWord($word_id);
  208. $cache_results = false;
  209. $query = "SELECT
  210. sentence.id, sentence.document_id, number, sentence
  211. FROM $table_identifier, sentence
  212. WHERE sentence.id = $table_identifier.id
  213. $query_id_where ";
  214. } else {
  215. // No search query.
  216. $q = "";
  217. $query_string = " ";
  218. $query = "SELECT * from sentence where FALSE ";
  219. }
  220. }
  221. // Get the sentences that match the query constructed above.
  222. $results = mysql_query($query." LIMIT 1000 ;") or die("<b>A fatal MySQL error occured</b>.
  223. <br/> Query: " . $query . "
  224. <br/> Error: (" . mysql_errno() . ") " . mysql_error());
  225. // Construct the pattern for the regular expression.
  226. $query = remove_spaces_before_punctuation($q);
  227. $query = str_replace( "*", "\w*",
  228. str_replace("\\", "",
  229. str_replace('"', "", $query)));
  230. $pattern = " ";
  231. if ($query) {
  232. $pattern = "/\b".$query."\b/i";
  233. global $PUNCTUATION;
  234. if (strstr($PUNCTUATION, $query)) {
  235. $pattern = "/$query/i";
  236. }
  237. }
  238. }
  239. // If it's a grammatical search, extract the gov, dep, and relation from
  240. // the GET parameters and issue a dependency relationship search.
  241. else{
  242. $govIDs = wordIDList($gov);
  243. if($govtype == "word-set"){
  244. $govIDs = getWordIDsFromWordSet($gov);
  245. }
  246. $depIDs = wordIDList($dep);
  247. if($deptype == 'word-set'){
  248. $depIDs = getWordIDsFromWordSet($dep);
  249. }
  250. $relations = relationshipIDList($relation);
  251. if($filtersAreActive){
  252. // To stave off syntax errors if its empty, put a -1 in the list.
  253. array_push($filtered, "-1");
  254. $withinSentence = true;
  255. $within = join(", ", $filtered);
  256. $sentence_where_clause = " AND id IN ($within) ";
  257. }
  258. $dependency_id_results = null;
  259. $dependency_id_results = getDependencyIDs($govIDs,
  260. $depIDs,
  261. $relations,
  262. false,
  263. $withinSentence,
  264. $within,
  265. false,
  266. false);
  267. $sentence_ids = array();
  268. while($row = mysql_fetch_array($dependency_id_results)){
  269. array_push($sentence_ids, $row['sentence_id']);
  270. }
  271. if(count($sentence_ids) > 0){
  272. $id_string = join(", ", $sentence_ids);
  273. $sql = "SELECT
  274. sentence.id, sentence.document_id, number, sentence
  275. FROM
  276. sentence
  277. WHERE id IN (".$id_string.");";
  278. $results = mysql_query($sql) or die("<b>Fatal MySQL error</b>.
  279. <br/> Query: " . $sql . "
  280. <br/> Error: (" . mysql_errno() . ") " . mysql_error());
  281. // pattern
  282. $govs = $gov;
  283. if($govtype == "word-set"){
  284. $govs = getWordsFromWordSet($gov);
  285. }
  286. $deps = $dep;
  287. if($deptype == 'word-set'){
  288. $deps = getWordsFromWordSet($dep);
  289. }
  290. if(strlen($govs) > 0){
  291. $query = str_replace(" ", "|", $govs);
  292. }else if(strlen($deps) > 0){
  293. $query = str_replace(" ", "|", $deps);
  294. }
  295. $pattern = "/\b(".$query.")\b/i";
  296. global $PUNCTUATION;
  297. if (strstr($PUNCTUATION, $query)) {
  298. $pattern = "/$query/i";
  299. }
  300. }
  301. }
  302. $pattern = str_replace(".", "", $pattern);
  303. $matched = array();
  304. $matched['numMatches'] = mysql_num_rows($results);
  305. $matched['matches'] = array();
  306. $split = array();
  307. $t2 = time();
  308. if ($timing != 0) {
  309. echo "<br> Time to get sentences matching the query: ".($t2-$t1)."s<br>";
  310. echo "<br>$pattern<br><br>";
  311. }
  312. $t1 = time();
  313. if($results){
  314. while($row = mysql_fetch_array($results)){
  315. $split = array();
  316. $matches = array();
  317. $sentence = preg_replace("/\s/", " ", $row['sentence']);
  318. $has_match = preg_match($pattern, $sentence, $matches,
  319. PREG_OFFSET_CAPTURE);
  320. if ($timing != 0) {
  321. if (!$has_match) {
  322. echo "<br>".$sentence;
  323. }
  324. }
  325. $match = ($has_match)?($matches[0][0]):('');
  326. $matched_text = preg_replace("/\W/", "", $match);
  327. $match_length = strlen($matched_text);
  328. $left = ($has_match)?(substr($sentence, 0, $matches[0][1])):('');
  329. $right = ($has_match)?(substr($sentence, $matches[0][1] + $match_length + 1)):('');
  330. $split['number'] = $row['number'];
  331. $split['document'] = $row['document_id'];
  332. $split['id'] = $row['id'];
  333. $split['left'] = utf8_encode($left);
  334. $split['right'] = utf8_encode($right);
  335. //$split['match'] = $match;
  336. $split['match'] = $matched_text;
  337. $split['match'] = utf8_encode(str_replace("|", "/", $split['match']));
  338. array_push($matched['matches'], $split);
  339. }
  340. }
  341. $t2 = time();
  342. if ($timing != 0) {
  343. echo "<br> Time to count match patterns for word tree: ".($t2-$t1)."s.
  344. <br>";
  345. }
  346. return $matched;
  347. }
  348. function getMostFrequentContentWordID($filtered) {
  349. global $timing;
  350. $table_identifier = 'filtered_sent_ids';
  351. $query_id_where = '';
  352. global $cache_results;
  353. global $query_id;
  354. if ($cache_results || $query_id) {
  355. $table_identifier = 'cached_filtered_sent_ids';
  356. $query_id_where = " AND query_id = $query_id ";
  357. }
  358. $t1 = time();
  359. if ($filtered == "all") {
  360. $sql = "SELECT word, id as word_id, sentence_count as count
  361. FROM word
  362. WHERE ((pos like 'N%') OR (pos like 'V%') OR (pos like 'J%'))
  363. ORDER BY count DESC
  364. LIMIT 100;";
  365. } else {
  366. $sql = "SELECT word, word_id, count(distinct sentence_id) as count
  367. FROM sentence_xref_word, $table_identifier, word
  368. WHERE word.id = word_id
  369. AND sentence_id = $table_identifier.id
  370. $query_id_where
  371. AND ((pos like 'N%') OR (pos like 'V%') OR (pos like 'J%'))
  372. GROUP BY word
  373. ORDER BY count DESC
  374. LIMIT 100;";
  375. }
  376. $result = mysql_query($sql) or die("Error getting sequences for
  377. sentence IDs:
  378. <br> ".mysql_error()."</br>
  379. <br> on query
  380. <br> $sql");
  381. $t2 = time();
  382. if ($timing != 0) {
  383. echo "<br> Time to get most frequent content phrase ID:
  384. ".($t2-$t1)."s<br>";
  385. }
  386. if ($result) {
  387. global $STOPWORDS;
  388. while($row = mysql_fetch_assoc($result)){
  389. $word = $row['word'];
  390. if (!in_array($word, $STOPWORDS)) {
  391. return $row['word_id'];
  392. }
  393. }
  394. } else {
  395. return "";
  396. }
  397. }
  398. function getSentenceIDsForWord($word_id) {
  399. global $num_filter_conditions;
  400. global $timing;
  401. $table_identifier = 'filtered_sent_ids';
  402. $insertion_fields = '(id, document_id)';
  403. $field_identifier = "DISTINCT sentence_id, document_id";
  404. $query_id_where = '';
  405. global $cache_results;
  406. global $query_id;
  407. if ($cache_results || $query_id) {
  408. $table_identifier = 'cached_filtered_sent_ids';
  409. $insertion_fields = '(id, document_id, query_id)';
  410. $field_identifier = "DISTINCT sentence_id, document_id, $query_id";
  411. $query_id_where = " AND query_id = $query_id ";
  412. }
  413. $num_filter_conditions += 1;
  414. $sql = "INSERT INTO $table_identifier $insertion_fields
  415. SELECT $field_identifier FROM sentence_xref_word
  416. WHERE word_id = $word_id
  417. ON DUPLICATE KEY update num_matched = num_matched + 1;";
  418. if ($timing) {
  419. echo $sql;
  420. }
  421. mysql_query($sql) or die (mysql_error()." On: <br> $sql
  422. <br> while getting sentence_ids matching word id
  423. $word_id <br> at get-tree.php l.428");
  424. updateSentenceFilterTable();
  425. }
  426. //Part 2: word tree
  427. function getSentences($matches, $which){
  428. $sentences = array();
  429. $length = 10;
  430. $i = 0;
  431. global $timing;
  432. $t1 = time();
  433. foreach($matches as $match){
  434. //TODO verify this regex
  435. $sentence = (strlen($match[$which]) > 0)?(
  436. preg_split('( )', $match[$which], null, PREG_SPLIT_NO_EMPTY)):
  437. (null);
  438. if(!is_array($sentence)){
  439. $sentence = array( 0 => '' );
  440. }
  441. if($which == 'left'){
  442. $sentence = array_reverse($sentence);
  443. }
  444. array_push($sentences, array("id"=>$match['id'], "sentence"=>$sentence));
  445. }
  446. $t2 = time();
  447. if ($timing != 0) {
  448. echo "<br> Time to split match patterns for word tree: ".($t2-$t1)."s.
  449. <br>";
  450. }
  451. return $sentences;
  452. }
  453. ?>