/src/php/phrases/get-phrases.php
PHP | 381 lines | 351 code | 16 blank | 14 comment | 41 complexity | 40afa0bd4ed414310442f9ef5dd2977c MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0, BSD-3-Clause
- <?php
- /* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */
- /*******************************************************************************
- get-phrases.php
- Utilities for fetching frequent phrases.
- *******************************************************************************/
- include_once "../util.php";
- include_once "../grammaticalsearch/get-search-results.php";
- include_once '../document/get-metadata.php';
// Query parameters.
$wordseer_instance = getGetParam('instance');

// Grammatical-search parameters default to "no search"; they are only read
// from the query string when at most one search was sent in `search`.
$gov = '';
$dep = '';
$relation = '';
$govtype = 'word';
$deptype = 'word';
$searches = decodeGetJson('search');
if (count($searches) <= 1) {
    $gov = getGetParam('gov');
    $govtype = getGetParam('govtype');
    $dep = getGetParam('dep');
    $deptype = getGetParam('deptype');
    $relation = getGetParam('relation');
}
$collection = getGetParam('collection');
$statistics = getGetParam('statistics');
$start = getGetParam('start');
$limit = getGetParam('limit');
$metadata = decodeGetJson('metadata');
$phrases = decodeGetJson('phrases');
$timing = getGetParam('timing');

// The instance name arrives from the client and is spliced into an include
// path, so refuse anything that could step outside the instances directory
// (path separators, parent-directory references, embedded NUL bytes).
$instance_name = (string) $wordseer_instance;
if (strpos($instance_name, '..') !== false
        || strpbrk($instance_name, "/\\\0") !== false) {
    die("Invalid instance name.");
}
$path = '../../../instances/'.$wordseer_instance.'/config.php';
include_once $path;
include_once '../subsets/read.php';

// Raise the per-session limits for in-memory temporary tables to 1 GiB.
$sql = "SET session tmp_table_size = 1073741824;";
mysql_query($sql);
$sql = "SET session max_heap_table_size = 1073741824;";
mysql_query($sql);

// Only emit a response when this script is the one that was requested, not
// when it has been included by another script for its functions.
if (strstr($_SERVER['REQUEST_URI'], 'get-phrases.php')) {
    dispatch_phrases($gov, $govtype, $dep, $deptype, $relation,
        $collection, $metadata, $phrases);
}
/** Records, and optionally returns, the ID's of the sentences that contain
    the given words or phrases.

    For every non-blank filter, the matching (sentence, document) pairs from
    sequence_xref_sentence are inserted into the sentence filter table
    (`filtered_sent_ids`, or `cached_filtered_sent_ids` when result caching or
    a query ID is in use); rows that already exist get their `num_matched`
    incremented. The insert phase is skipped when a query ID is set and
    caching is off.

    @param {Array{String}} $phrases An array of word or phrase specifiers in
    the format word_<id> or phrase_<id>.
    @return {String|Array[Number]} The string "all" when no filters were
    given; otherwise an array of sentence ID's read back from the filter
    table. The array is left empty when $cache_results is set.
*/
function getSentenceIDsForPhrases($phrases) {
    global $timing;
    global $num_filter_conditions;
    global $cache_results;
    global $query_id;

    $table_identifier = 'filtered_sent_ids';
    $insertion_fields = '(id, document_id )';
    $field_identifier = "DISTINCT sentence_id, document_id ";
    $select_where = '';
    if ($cache_results || $query_id) {
        $table_identifier = 'cached_filtered_sent_ids';
        $insertion_fields = '(id, document_id, query_id)';
        // Three insertion fields need three selected values, so the query ID
        // must be its own comma-separated column.
        $field_identifier = "DISTINCT sentence_id, document_id, $query_id";
        $select_where = " WHERE query_id = $query_id ";
    }

    if ($timing != 0) {
        echo "<br>Phrases: ".json_encode($phrases)."\n<br>";
    }

    // No filters at all (including a missing/undecodable parameter).
    if ($phrases === null || count($phrases) == 0) {
        return "all";
    }

    // Started here so the timing report below is defined even when the
    // insert phase is skipped.
    $t1 = time();
    if (!$query_id || $cache_results) {
        foreach ($phrases as $phrase) {
            $num_filter_conditions += 1;
            if (strlen(trim($phrase)) == 0) {
                continue;
            }
            $phrase_ids = getPhraseIDs($phrase);
            if (count($phrase_ids) == 0) {
                continue;
            }
            $phrase_id_string = join(", ", $phrase_ids);
            $sql = "INSERT INTO $table_identifier $insertion_fields
                SELECT $field_identifier FROM sequence_xref_sentence
                WHERE sequence_id in ($phrase_id_string)
                ON DUPLICATE KEY update num_matched = num_matched + 1;";
            if ($timing) {
                echo $sql;
            }
            mysql_query($sql) or die (mysql_error()." On: <br> $sql"
                ."\n<br> while getting sentence_ids matching phrases"
                ."\n$phrase_id_string <br> at get-phrases.php l.64");
        }
        updateSentenceFilterTable();
    }

    $sentence_ids = array();
    if (!$cache_results) {
        $sql = "SELECT * from $table_identifier $select_where";
        $result = mysql_query($sql) or die (mysql_error()."<br>"
            ."\nat get-phrases.php l.74");
        while ($row = mysql_fetch_assoc($result)) {
            array_push($sentence_ids, $row['id']);
        }
        $t2 = time();
        if ($timing) {
            echo "<br>Time to get ".count($sentence_ids)." sentence ID's"
                ."\nmatching phrases: ".($t2-$t1)."s<br>";
        }
    }
    return $sentence_ids;
}
/** Gets the quoted sequence ID's that match a particular phrase filter sent
    by the client.

    A "phrase" filter yields its id as a single quoted value. A "word" filter
    may carry several word ids separated by "."; each one yields two quoted
    values, '.<id>.' and 'l.<id>.'. Any other filter class, or a filter with
    no "_" separator, yields no ID's.

    @param {String} $phrase A filter parameter sent by the client in the form
    class_id where class is either "word" or "phrase" and "id" is the id of the
    word or phrase.
    @return {Array{String}} Single-quoted SQL string literals, ready to be
    joined into an IN (...) list.
*/
function getPhraseIDs($phrase) {
    $components = explode("_", $phrase);
    if (count($components) < 2) {
        // Malformed filter: there is no id part to look up.
        return array();
    }
    $type = $components[0];
    // The id is client-supplied and ends up inside a quoted SQL literal, so
    // quotes and backslashes must not be able to terminate that literal.
    // NOTE(review): mysql_real_escape_string() is the charset-aware choice
    // when a connection is guaranteed to be open at this point.
    $id = addslashes($components[1]);

    $ids = array();
    if ($type === "phrase") {
        array_push($ids, "'".$id."'");
    } else if ($type === 'word') {
        foreach (explode(".", $id) as $word_id) {
            array_push($ids, "'.$word_id.'");
            array_push($ids, "'l.$word_id.'");
        }
    }
    return $ids;
}
/** Echoes, as a JSON array, the most frequent non-lemmatized sequences of the
    requested length among the sentences matching the current search.

    The sequence length and the function-word flag are read from the `length`
    and `has_function_words` query parameters. When the search matches every
    sentence (the search helper returned the string "all"), the precomputed
    counts on the `sequence` table are used; otherwise counts are computed by
    joining the sentence filter table against sequence_xref_sentence. At most
    150 sequences are considered in either case.

    @param {String} $gov The governor of the grammatical search.
    @param {String} $govtype The type of $gov, as passed on to the search.
    @param {String} $dep The dependent of the grammatical search.
    @param {String} $deptype The type of $dep, as passed on to the search.
    @param {String} $relation The grammatical relation; "" selects a plain
    sentence search on $gov.
    @param $collection Passed through to the search helpers.
    @param $metadata Passed through to the search helpers.
    @param $phrases Passed through to the search helpers.
*/
function dispatch_phrases($gov, $govtype, $dep, $deptype, $relation,
        $collection, $metadata, $phrases) {
    global $timing;
    global $cache_results;
    global $query_id;

    // `length` comes straight from the query string and is interpolated into
    // the SQL below, so force it to an integer.
    $length = (int) getGetParam('length');
    $function_words_value = getGetParam('has_function_words') == 'true' ? 1 : 0;

    if ($relation == "") {
        $results = getSentenceSearchResults($gov, $govtype, $collection,
            $metadata, $phrases);
    } else {
        $results = getDependencySentenceResults($gov, $govtype, $dep, $deptype,
            $relation, $collection, false, $metadata, $phrases);
        if ($timing != 0) {
            echo "<br> Number of matched sentences: ".mysql_num_rows($results);
        }
    }

    $first = true;
    echo "[\n";
    if ($results === 'all') {
        // Unfiltered: rank sequences by their precomputed sentence counts.
        $results = array();
        $ids = array();
        $sql = "SELECT sentence_count as count,
            0 as document_count,
            id
            FROM sequence USE INDEX(for_counts)
            WHERE length = $length
            AND NOT all_function_words
            AND sentence_count > 0
            AND has_function_words = $function_words_value
            ORDER BY sentence_count DESC
            LIMIT 150";
        $t2 = time();
        $result = mysql_query($sql) or die("Error getting sequences for"
            ."\nsentence IDs, l. 180:"
            ."\n<br> ".mysql_error()."</br>"
            ."\n<br> on query"
            ."\n<br> $sql");
        while ($row = mysql_fetch_assoc($result)) {
            $id = "'".$row['id']."'";
            $results[$id] = $row;
            array_push($ids, $id);
        }
        $t3 = time();
        if ($timing != 0) {
            echo "<br>Time to count most frequent sequences of length $length: ".
                ($t3-$t2)."s";
        }
        if (count($ids) > 0) {
            // Second pass: fetch the sequence text and flags for the ranked
            // ID's, then emit them in ranked order.
            $t3 = time();
            $id_string = join(", ", $ids);
            $sql = "SELECT sequence, length, lemmatized, has_function_words,
                all_function_words, id
                FROM sequence WHERE id in ($id_string);";
            $result = mysql_query($sql) or die("Error getting sequences for"
                ."\nsequence IDs, l. 157:"
                ."\n<br> ".mysql_error()."</br>"
                ."\n<br> on query"
                ."\n<br> $sql");
            while ($row = mysql_fetch_assoc($result)) {
                $id = "'".$row['id']."'";
                $results[$id] = array_merge($results[$id], $row);
            }
            foreach ($ids as $id) {
                if ($results[$id]["lemmatized"] == 0) {
                    if (!$first) {
                        echo ",\n";
                    }
                    echo json_encode($results[$id]);
                    $first = false;
                }
            }
            $t4 = time();
            if ($timing != 0) {
                echo $sql;
                echo "<br> Time to get sequences for sequence id's: ".
                    ($t4-$t3)."s <br>";
            }
        }
    } else {
        // Filtered: count sequences over the sentences in the filter table.
        $table_identifier = 'filtered_sent_ids';
        $query_id_where = '';
        if ($cache_results || $query_id) {
            $table_identifier = 'cached_filtered_sent_ids';
            // Carries its own AND so the WHERE clause stays valid when this
            // condition is absent.
            $query_id_where = " AND query_id = $query_id ";
        }
        $sql = "SELECT sequence.id as id, sequence, length, lemmatized,
            has_function_words, all_function_words,
            count($table_identifier.id) as count,
            count(distinct $table_identifier.document_id) as document_count
            FROM
            $table_identifier, sequence_xref_sentence, sequence
            WHERE $table_identifier.id = sentence_id
            $query_id_where
            AND sequence_id = sequence.id
            AND length = $length
            AND has_function_words = $function_words_value
            AND sentence_count > 0
            AND NOT all_function_words
            GROUP BY sequence_id
            ORDER BY COUNT desc LIMIT 150;";
        $t1 = time();
        $result = mysql_query($sql) or die("Error getting sequences for"
            ."\nsentence IDs, l. 180:"
            ."\n<br> ".mysql_error()."</br>"
            ."\n<br> on query"
            ."\n<br> $sql");
        $t2 = time();
        if ($timing != 0) {
            echo "<br> Time to count most frequent sequences of length $length: ".
                ($t2-$t1)."s <br>";
        }
        $first = true;
        while ($row = mysql_fetch_assoc($result)) {
            if ($row['lemmatized'] == 0) {
                if (!$first) {
                    echo ",\n";
                }
                echo json_encode($row);
                $first = false;
            }
        }
    }
    echo "\n]";
}
/** Looks up the text of the sequence with the given ID.
    @param {String} $id The ID of a row in the `sequence` table.
    @return {String} The `sequence` column of that row, or a single space
    when no row has that ID.
*/
function getPhrase($id) {
    $query = "SELECT sequence from sequence where id = '".$id."';";
    $rows = mysql_query($query) or die(mysql_error()."
<br> on query $query
<br> at get-phrases.php l. 268");
    // Guard clause: nothing matched, so hand back the blank placeholder.
    if (!(mysql_num_rows($rows) > 0)) {
        return " ";
    }
    $record = mysql_fetch_assoc($rows);
    return $record['sequence'];
}
/** Returns the text of the most frequent content phrase for the given
    sentences, by resolving the ID found by getMostFrequentContentPhraseID()
    through getPhrase().
    @param $sentence_ids Passed unchanged to getMostFrequentContentPhraseID().
    @return {String} Whatever getPhrase() returns for that ID.
*/
function getMostFrequentContentPhrase($sentence_ids) {
    return getPhrase(getMostFrequentContentPhraseID($sentence_ids));
}
/** Finds the ID of the most frequent length-1 sequence that is not made up
    entirely of function words.

    When $sentence_ids is the string "all", the precomputed sentence counts on
    the `sequence` table are used. Otherwise the count is taken over the
    sentences currently held in the sentence filter table (`filtered_sent_ids`,
    or `cached_filtered_sent_ids` restricted to the current query ID when
    caching or a query ID is in use); the contents of $sentence_ids itself are
    not consulted in that case.

    @param {String|Array} $sentence_ids "all", or any other value to count
    over the filter table.
    @return {String} The sequence ID, or "" when no sequence qualifies.
*/
function getMostFrequentContentPhraseID($sentence_ids) {
    global $timing;
    global $cache_results;
    global $query_id;

    $table_identifier = 'filtered_sent_ids';
    $query_id_where = '';
    if ($cache_results || $query_id) {
        $table_identifier = 'cached_filtered_sent_ids';
        $query_id_where = " AND query_id = $query_id ";
    }

    $t1 = time();
    if ($sentence_ids === "all") {
        $sql = "SELECT id as sequence_id, sentence_count as count
            FROM sequence USE INDEX(for_counts)
            WHERE NOT all_function_words
            AND length = 1
            ORDER BY count DESC
            LIMIT 1;";
    } else {
        // $query_id_where already carries its own leading AND (or is empty),
        // so it is placed in the WHERE clause without another AND.
        $sql = "SELECT sequence.id as sequence_id, sequence,
            count($table_identifier.id) as count,
            count(distinct $table_identifier.document_id) as document_count
            FROM
            $table_identifier, sequence_xref_sentence, sequence
            WHERE $table_identifier.id = sentence_id
            $query_id_where
            AND sequence_id = sequence.id
            AND length = 1
            AND NOT all_function_words
            GROUP BY sequence_id
            ORDER BY COUNT desc LIMIT 1;";
    }
    $result = mysql_query($sql) or die("Error getting sequences for"
        ."\nsentence IDs:"
        ."\n<br> ".mysql_error()."</br>"
        ."\n<br> on query"
        ."\n<br> $sql");
    $t2 = time();
    if ($timing != 0) {
        echo "<br> Time to get most frequent content phrase ID:"
            ."\n".($t2-$t1)."s<br>";
    }

    // An empty result set makes mysql_fetch_assoc() return false.
    $row = mysql_fetch_assoc($result);
    if (!$row) {
        return "";
    }
    return $row['sequence_id'];
}
/** (Re)creates the `filtered_sequences` temporary table: any existing
    temporary table of that name is dropped first, then a fresh MEMORY-engine
    table keyed on `seq_id` is created. Either statement failing ends the
    script with the MySQL error and the offending query.
*/
function makeTemporaryFilteredSequencesTable() {
    $drop_statement = "DROP TEMPORARY TABLE IF EXISTS `filtered_sequences`;";
    mysql_query($drop_statement) or die (mysql_error(). " On <br> $drop_statement");

    $create_statement = "CREATE TEMPORARY TABLE `filtered_sequences` (
`seq_id` varchar(50) NOT NULL DEFAULT '0',
`count` int NOT NULL DEFAULT '1',
`document_count` int NOT NULL DEFAULT '1',
`ok` boolean NOT NULL DEFAULT '0',
PRIMARY KEY (`seq_id`),
KEY `count` (`ok`, `count` DESC, `seq_id`)
) ENGINE=MEMORY DEFAULT CHARSET=utf8";
    mysql_query($create_statement) or die("Error creating temporary
sequences table, get-phrases.php l. 151:
<br> ".mysql_error()."</br>
<br> on query
<br> $create_statement");
}
- ?>