/src/php/grammaticalsearch/get-search-results.php
PHP | 689 lines | 666 code | 9 blank | 14 comment | 35 complexity | 8cf9a18150cf5c80e53837813bd91662 MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0, BSD-3-Clause
- <?php
- /* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */
- include_once "../util.php";
- include_once '../document/get-metadata.php';
// ---------------------------------------------------------------------------
// Query parameters.
//
// Everything below runs at include time. The variables are file-level state:
// $wordseer_instance is read by dispatch(), $searches by the search functions
// and $timing by getDependencySearchResults(), all through `global`.
// ---------------------------------------------------------------------------
$wordseer_instance = getGetParam('instance');

// Defaults for the single-search parameters; they stay at these values when
// the request carries more than one entry in 'search'.
$gov = '';
$dep = '';
$relation = '';
$govtype = 'word';
$deptype = 'word';
$searches = decodeGetJson('search');
// A missing or malformed 'search' parameter is treated like an empty list
// instead of being handed to count(), which rejects non-arrays on newer PHP.
if (!is_array($searches) || count($searches) <= 1) {
    $gov = getGetParam('gov');
    $govtype = getGetParam('govtype');
    $dep = getGetParam('dep');
    $deptype = getGetParam('deptype');
    $relation = getGetParam('relation');
}
$collection = getGetParam('collection');
$statistics = getGetParam('statistics');
// Fix: this value used to be stored in the misspelled $phrasess, so the
// $phrases handed to dispatch() below was undefined and phrase filters were
// silently dropped.
$phrases = decodeGetJson('phrases');
// Legacy alias kept in case an including script still reads the old name.
// TODO(review): remove once confirmed unused.
$phrasess = $phrases;
$metadata = decodeGetJson('metadata');
$timing = getGetParam('timing');

// NOTE(review): the include path is built from the 'instance' request
// parameter. Unless getGetParam() restricts it, a crafted value can point the
// include at another directory; confirm and whitelist instance names.
$path = '../../../instances/'.$wordseer_instance.'/config.php';
include_once $path;
include_once '../subsets/read.php';

// A variable that controls whether the results of searching (as opposed to
// earlier steps like metadata filtering) are cached. Useful in cases such as
// the column vis or the word frequencies where there can be multiple searches
// over the same base filters.
global $dont_cache_search_results;
$dont_cache_search_results = false;

// Only answer the request when this script is the one named in the URL; a
// script that merely includes this file gets the functions without output.
if (strstr($_SERVER['REQUEST_URI'], 'get-search-results.php')) {
    dispatch($gov, $govtype, $dep, $deptype,
        $relation, $collection, $statistics, $metadata, $phrases);
}
/**
 * Runs the requested search and echoes the result as JSON.
 *
 * Does nothing when the global $wordseer_instance is not set. Without a
 * relation it runs a plain word search (or, when $statistics is set, returns
 * empty 'statistics' and 'sentences' lists); with a relation it runs a
 * dependency search. When the 'timing' request parameter is set, elapsed
 * times in whole seconds are echoed around the JSON.
 *
 * @param string $gov        governor word(s) or word-set reference
 * @param string $govtype    'word' or 'word-set'
 * @param string $dep        dependent word(s) or word-set reference
 * @param string $deptype    'word' or 'word-set'
 * @param string $relation   grammatical relation; "" selects the plain search
 * @param mixed  $collection collection filter, passed through to the searches
 * @param mixed  $statistics truthy to request statistics instead of sentences
 * @param mixed  $metadata   metadata filters, passed through to the searches
 * @param mixed  $phrases    phrase filters, passed through to the searches
 */
function dispatch($gov, $govtype, $dep, $deptype, $relation,
        $collection, $statistics, $metadata, $phrases) {
    global $wordseer_instance;
    if (!$wordseer_instance) {
        return;
    }

    $show_timing = getGetParam('timing');
    $started = time();

    if ($relation != "") {
        $results = getDependencySearchResults($gov, $govtype, $dep,
            $deptype, $relation, $collection, $statistics, $metadata, $phrases);
    } else if ($statistics) {
        // Statistics are only computed for dependency searches.
        $results = array('statistics'=>array(), 'sentences'=>array());
    } else {
        $results = getSearchResults($gov, $govtype, $collection,
            $metadata, $phrases);
    }

    $searched = time();
    if ($show_timing) {
        echo "\nTotal: ".($searched - $started)."s<br><br>\n";
    }

    echo json_encode($results);

    $rendered = time();
    if ($show_timing) {
        echo '<br><br> Time to render JSON:'.($rendered-$searched).'s<br>';
    }
}
/**
 * Runs a plain (non-dependency) word search and assembles the response.
 *
 * Each matching row contributes its word position to the sentence's
 * 'gov_index' list. Unless the 'onlyMetadata' request parameter is "true",
 * the sentences are then filled in through populateSentenceInfo(); when it
 * is "true", a metadata tree is built from the matching sentence IDs instead.
 *
 * @param string $gov        search word(s) or word-set reference
 * @param string $govtype    'word' or 'word-set'
 * @param mixed  $collection collection filter
 * @param mixed  $metadata   metadata filters
 * @param mixed  $phrases    phrase filters
 * @return array with keys 'statistics' (always empty here), 'sentences',
 *               'total' (the FOUND_ROWS() count of the search query) and
 *               'metadata'
 */
function getSearchResults($gov, $govtype, $collection, $metadata, $phrases) {
    $all = true;
    // Get the ID's of the sentences that match the query, along with the match
    // positions.
    $results = getSentenceSearchResults($gov, $govtype,
        $collection, $metadata, $phrases);
    $only_metadata = (getGetParam("onlyMetadata") == "true");

    $sentenceIDs = array();
    $sentenceInfo = array();
    $sentences = array();
    $total = 0;

    if ($results !== 'all') {
        $all = false;
        // getSentenceSearchResults() returns nothing when results are only
        // being cached; there is then no result set to read from.
        if ($results) {
            // Must be read before any other query replaces the row count.
            $count_result = mysql_query('SELECT FOUND_ROWS() as total;');
            if ($count_result) {
                $count_row = mysql_fetch_assoc($count_result);
                if ($count_row) {
                    $total = $count_row['total'];
                }
            }
            while ($row = mysql_fetch_assoc($results)) {
                $sentence_id = $row['sentence_id'];
                $sentenceIDs[] = $sentence_id;
                if (!array_key_exists($sentence_id, $sentenceInfo)) {
                    $sentenceInfo[$sentence_id] = array(
                        'id'=> $sentence_id,
                        'document_id'=> $row['document_id'],
                        'sentence'=>array(
                            'words'=>array(),
                            'dep_index'=> -1,
                            'gov_index'=> array(),
                        )
                    );
                }
                $sentenceInfo[$sentence_id]['sentence']['gov_index'][] =
                    $row['position'];
            }
        }
        // Get the sentence details: words and metadata.
        if (!$only_metadata) {
            $sentence_id_string = join(", ", $sentenceIDs);
            $sentenceInfo = populateSentenceInfo($sentence_id_string, $sentenceInfo);
            // One entry per matching row, in result order (a sentence with
            // several matches appears several times, as before).
            foreach ($sentenceIDs as $sentence_id) {
                $sentences[] = $sentenceInfo[$sentence_id];
            }
        }
    }

    // From here on $metadata holds the metadata tree of the response, not the
    // filters that were passed in.
    $metadata = array();
    if ($only_metadata) {
        $metadata = getMetadataTreeFromSentenceIDs($sentenceIDs, $all);
    }

    return array(
        'statistics'=>array(),
        'sentences'=>$sentences,
        'total'=>$total,
        'metadata'=>$metadata
    );
}
/**
 * Applies the metadata/collection/phrase filters and the word or phrase
 * search, and returns the matching sentence rows.
 *
 * Matches are first recorded in a working table: 'cached_filtered_sent_ids'
 * (scoped by the global $query_id) when caching applies, otherwise
 * 'filtered_sent_ids'. The final SELECT then reads the matches back, limited
 * to 1000 rows, with SQL_CALC_FOUND_ROWS so the caller can ask for the total.
 *
 * Reads the globals $cache_results, $query_id, $dont_cache_search_results and
 * $searches, and increments $num_filter_conditions / $num_search_conditions.
 *
 * @param string $gov        search word(s), phrase, or word-set reference
 * @param string $govtype    'word' or 'word-set'
 * @param mixed  $collection collection filter
 * @param mixed  $metadata   metadata filters
 * @param mixed  $phrases    phrase filters
 * @return mixed the MySQL result of the final SELECT (columns sentence_id,
 *               document_id, position); the string "all" when neither a
 *               search nor a filter is active; null when $cache_results is set
 */
function getSentenceSearchResults($gov, $govtype, $collection, $metadata,
        $phrases) {
    global $num_filter_conditions;
    global $num_search_conditions;
    global $cache_results;
    global $query_id;
    global $dont_cache_search_results;
    global $searches;

    $timing = getGetParam('timing');
    if ($timing) {
        echo "<br>Starting search filter with $num_filter_conditions\nconditions <br>";
    }

    // Pick the working table and the column lists used to insert into it.
    if (($cache_results || $query_id) && !$dont_cache_search_results) {
        $working_table = 'cached_filtered_sent_ids';
        $insert_columns =
            '(id, document_id, query_id, num_matched, num_searches_matched)';
        $select_fields = "DISTINCT id, document_id, $query_id, 0, 1";
        $query_id_where = " AND query_id = $query_id ";
    } else {
        $working_table = 'filtered_sent_ids';
        $insert_columns = '(id, document_id, num_matched, num_searches_matched)';
        $select_fields = "DISTINCT id, document_id, 0, 1";
        $query_id_where = '';
    }

    // Apply metadata, collection and phrase filters, and get the list of
    // sentence ids that match those filters.
    $filtered_sentence_ids = getSentenceIDsForFilters($metadata,
        $collection, $phrases); // in get-metadata.php
    $filter_clause_active = ($filtered_sentence_ids != "all");
    if ($filter_clause_active && $dont_cache_search_results && $query_id) {
        $num_filter_conditions += 1;
        // transfer the permanent cached results into an in-memory
        // table before proceeding to add the filtered sentences.
        $sql = "INSERT INTO filtered_sent_ids (id, document_id, num_matched)
            SELECT id, document_id, $num_filter_conditions
            FROM cached_filtered_sent_ids
            WHERE query_id = $query_id";
        mysql_query($sql) or die (mysql_error()."
            On <br> $sql <br> get-search-result.php l.146");
    }

    $search_clause_active = ($gov || count($searches) > 0);
    $word_where_clause = "";
    if ($search_clause_active) {
        // A query with spaces or '+' but no ", " list separator is matched as
        // a phrase against the sentence text instead of word by word.
        $exact_phrase = (strstr($gov, " ") || strstr($gov, "+"))
            && !strstr($gov, ", ");

        // Get the word id's corresponding to the query terms.
        $word_id_string = "";
        switch ($govtype) {
            case 'word':
                $word_id_string = wordIDList($gov);
                break;
            case 'word-set':
                $word_id_string = getWordIDsFromWordSet($gov);
                break;
        }

        if ($exact_phrase) {
            // Quote the phrase unless the user already wrote quotes or '+'.
            $has_operators = strstr($gov, "\"") || strstr($gov, "+");
            $query = $has_operators ? $gov : "\"$gov\"";
            // NOTE(review): $query is interpolated into the SQL as-is; confirm
            // that getGetParam() escapes it upstream.
            $sentence_where_clause =
                " match(sentence) AGAINST('$query' IN BOOLEAN MODE) ";
        }

        // word_id -1 stands in for "no known word" so the IN list is never
        // empty.
        $word_where_clause = (strlen($word_id_string) > 0)
            ? " word_id in ($word_id_string) "
            : " word_id in (-1) ";

        $num_search_conditions += 1;

        if (!$query_id || $cache_results || $dont_cache_search_results) {
            // Filter the sentence ID's stored in the working table to keep
            // only the ID's that match this search.
            if ($exact_phrase) {
                $sql = "INSERT INTO $working_table $insert_columns
                    SELECT $select_fields
                    from sentence WHERE $sentence_where_clause
                    ON DUPLICATE KEY
                    UPDATE num_searches_matched = num_searches_matched +1 ;";
            } else {
                // sentence_xref_word names the sentence column sentence_id.
                $select_fields = str_replace(' id', ' sentence_id', $select_fields);
                $sql = "INSERT INTO $working_table $insert_columns
                    SELECT $select_fields
                    from sentence_xref_word WHERE
                    $word_where_clause
                    ON DUPLICATE KEY
                    UPDATE num_searches_matched = num_searches_matched + 1;";
            }
            if ($timing) {
                echo $sql."
                    <br> With $num_filter_conditions conditions <br>";
            }
            mysql_query($sql) or die (mysql_error()."
                On <br> $sql <br> get-search-result.php l.143");

            if ($dont_cache_search_results && $query_id) {
                updateTemporarySentenceFilterTable();
            } else {
                updateSentenceFilterTable();
            }
        }
    }

    // When results are only being cached nothing is read back.
    if ($cache_results) {
        return null;
    }
    if (!$search_clause_active && !$filter_clause_active) {
        return "all";
    }

    $t1 = time();
    if ($search_clause_active) {
        $sql = "SELECT
            SQL_CALC_FOUND_ROWS
            DISTINCT sentence_id, sentence_xref_word.document_id, position
            FROM sentence_xref_word, $working_table
            WHERE sentence_id = $working_table.id
            AND $word_where_clause
            $query_id_where
            LIMIT 1000;";
    } else {
        // Filters only: there is no match position to report.
        $sql = "SELECT SQL_CALC_FOUND_ROWS
            DISTINCT id as sentence_id, -1 as position, document_id
            FROM $working_table
            WHERE true
            $query_id_where
            LIMIT 1000;";
    }
    $results = mysql_query($sql) or die(mysql_error()."
        <br> On query
        <br> $sql
        <br> at get-search-results.php line 98");
    $t2 = time();
    if ($timing) {
        echo "Time to get search results: ".($t2 - $t1)."s<br><br>";
    }
    return $results;
}
/**
 * Runs a dependency search restricted, when filters are active, to the
 * sentences held in the working table, and returns the raw result.
 *
 * Reads the globals $searches, $cache_results, $query_id and
 * $dont_cache_search_results, and increments $num_filter_conditions when it
 * copies cached rows into the temporary table.
 *
 * @param string $g          governor word(s) or word-set reference
 * @param string $gt         'word' or 'word-set'
 * @param string $d          dependent word(s) or word-set reference
 * @param string $dt         'word' or 'word-set'
 * @param string $r          grammatical relation
 * @param mixed  $collection collection filter
 * @param mixed  $stats      unused here; kept for signature parity with
 *                           getDependencySearchResults()
 * @param mixed  $metadata   metadata filters
 * @param mixed  $phrases    phrase filters
 * @return mixed the getDependencyIDs() result rewound to its first row; the
 *               string 'all' when no search term is given; null when
 *               $cache_results is set
 */
function getDependencySentenceResults ($g, $gt, $d, $dt, $r,
        $collection, $stats, $metadata, $phrases) {
    global $searches;
    global $cache_results;
    global $num_filter_conditions;
    global $query_id;
    global $dont_cache_search_results;

    $relation = relationshipIDList($r);
    if (!($g != '' || $d != '' || strlen($relation) != 0 || count($searches) > 0)) {
        return 'all';
    }

    // Stay null for an unrecognised type instead of being left undefined.
    $gov = null;
    $dep = null;
    if ($gt == "word") {
        $gov = wordIDList($g);
    } else if ($gt == 'word-set') {
        $gov = getWordIDsFromWordSet($g);
    }
    if ($dt == "word") {
        $dep = wordIDList($d);
    } else if ($dt == 'word-set') {
        $dep = getWordIDsFromWordSet($d);
    }

    // Fix: parenthesised like the same test in getSentenceSearchResults() and
    // populateSentenceInfo(). Without the parentheses $cache_results overrode
    // $dont_cache_search_results, so the search looked in the cached table
    // while the rows were being copied into filtered_sent_ids below.
    $table_identifier = 'filtered_sent_ids';
    if (($cache_results || $query_id) && !$dont_cache_search_results) {
        $table_identifier = 'cached_filtered_sent_ids';
    }

    $withinSentence = false;
    $within = "";
    $filtered_sentence_ids = getSentenceIDsForFilters($metadata,
        $collection, $phrases);
    if ($filtered_sentence_ids != "all") {
        // "table" tells getDependencyIDs() that $within names a table of
        // sentence IDs - presumably as opposed to an ID list; verify against
        // getDependencyIDs().
        $withinSentence = "table";
        $within = $table_identifier;
        if ($dont_cache_search_results && $query_id) {
            // transfer the permanent cached results into an in-memory
            // table before proceeding to add the filtered sentences.
            $num_filter_conditions += 1;
            $sql = "INSERT INTO filtered_sent_ids
                (id, document_id, num_matched)
                SELECT id, document_id, $num_filter_conditions
                FROM cached_filtered_sent_ids
                WHERE query_id = $query_id";
            mysql_query($sql) or die (mysql_error()."
                On <br> $sql <br> get-search-result.php l.184");
        }
    }

    $withinDocument = false;
    // No paging is requested; these were previously undefined variables.
    $start = null;
    $limit = null;
    $dependency_id_result = getDependencyIDs($gov,
        $dep,
        $relation,
        $withinDocument,
        $withinSentence,
        $within,
        $start,
        $limit);

    // When results are only being cached nothing is handed back.
    if ($cache_results) {
        return null;
    }
    mysql_data_seek($dependency_id_result, 0);
    return $dependency_id_result;
}
/**
 * Runs a dependency (governor / relation / dependent) search and assembles
 * the response.
 *
 * With $stats set, only the counts from getStatistics() are produced and
 * 'total' is the overall governor count. Otherwise the matching sentences are
 * fetched, each carrying the governor and dependent positions, and 'total'
 * is the FOUND_ROWS() count of the dependency query.
 *
 * @param string $g          governor word(s) or word-set reference
 * @param string $gt         'word' or 'word-set'
 * @param string $d          dependent word(s) or word-set reference
 * @param string $dt         'word' or 'word-set'
 * @param string $r          grammatical relation
 * @param mixed  $collection collection filter
 * @param mixed  $stats      truthy to compute statistics instead of sentences
 * @param mixed  $metadata   metadata filters
 * @param mixed  $phrases    phrase filters
 * @return array with keys 'sentences', 'statistics', 'total' and 'metadata'
 */
function getDependencySearchResults($g, $gt, $d, $dt, $r, $collection, $stats,
        $metadata, $phrases){
    global $timing;

    // Stay null for an unrecognised type instead of being left undefined.
    $gov = null;
    $dep = null;
    if ($gt == "word") {
        $gov = wordIDList($g);
    } else if ($gt == 'word-set') {
        $gov = getWordIDsFromWordSet($g);
    }
    if ($dt == "word") {
        $dep = wordIDList($d);
    } else if ($dt == 'word-set') {
        $dep = getWordIDsFromWordSet($d);
    }
    if ($timing != 0) {
        echo "<br> dep: $d <br>";
    }
    $relation = relationshipIDList($r);

    // Restrict the search to the sentences that pass the filters, if any.
    $withinSentence = false;
    $within = "";
    $filtered_sentence_ids = getSentenceIDsForFilters($metadata,
        $collection, $phrases);
    if ($filtered_sentence_ids != "all") {
        // -2 is appended before joining - presumably so the ID list is never
        // empty; verify against getDependencyIDs().
        array_push($filtered_sentence_ids, "-2");
        $withinSentence = true;
        $within = join(", ", $filtered_sentence_ids);
    }
    $withinDocument = false;

    $statistics = array();
    $total = 0;
    $sentenceIDs = array();
    $sentenceInfo = array();
    $sentences = array();

    if ($stats) { // Used by the bar chart module to get counts.
        $statistics = getStatistics($gov,
            $dep,
            $relation,
            $withinDocument,
            $withinSentence,
            $within);
        $total = $statistics['gov']['value'];
    } else { // Used by the search modules.
        // No paging is requested; these were previously undefined variables.
        $start = null;
        $limit = null;
        $dependency_id_result = getDependencyIDs($gov,
            $dep,
            $relation,
            $withinDocument,
            $withinSentence,
            $within,
            $start,
            $limit);
        // Must be read before any other query replaces the row count.
        $count_result = mysql_query('SELECT FOUND_ROWS() as total;');
        if ($count_result) {
            $count_row = mysql_fetch_assoc($count_result);
            if ($count_row) {
                $total = $count_row['total'];
            }
        }
        while ($dependency_id_result
                && ($row = mysql_fetch_assoc($dependency_id_result))) {
            $sentence_id = $row['sentence_id'];
            $sentenceIDs[] = $sentence_id;
            if (!array_key_exists($sentence_id, $sentenceInfo)) {
                $sentenceInfo[$sentence_id] = array(
                    'id'=> $sentence_id,
                    'document_id'=> $row['document_id'],
                    'sentence'=>array(
                        'words'=>array(),
                        'dep_index'=> -1,
                        'gov_index'=> array(),
                    )
                );
            }
            // Every governor position is kept; only the last dependent
            // position seen for the sentence survives.
            $sentenceInfo[$sentence_id]['sentence']['gov_index'][] =
                $row['gov_index'];
            $sentenceInfo[$sentence_id]['sentence']['dep_index'] =
                $row['dep_index'];
        }
        $sentence_id_string = join(", ", $sentenceIDs);
        $sentenceInfo = populateSentenceInfo($sentence_id_string, $sentenceInfo);
        foreach ($sentenceIDs as $id) {
            $sentences[] = $sentenceInfo[$id];
        }
    }

    // From here on $metadata holds the metadata tree of the response, not the
    // filters that were passed in.
    $metadata = getMetadataTreeFromSentenceIDs($sentenceIDs, false);
    return array('sentences'=>$sentences,
        'statistics'=>$statistics,
        'total'=>$total,
        'metadata'=>$metadata);
}
/**
 * Fills in the words, text and metadata properties of the sentences held in
 * the working table.
 *
 * The sentences are selected by joining the working table
 * ('cached_filtered_sent_ids' scoped by the global $query_id when caching
 * applies, otherwise 'filtered_sent_ids') with the sentence table.
 *
 * @param string $sentence_id_string comma-separated sentence IDs; not used by
 *                                   this function, kept for the callers
 * @param array  $sentenceInfo       sentence records keyed by sentence ID
 * @return array $sentenceInfo with ['sentence']['words'] (the row's html),
 *               ['sentence']['text'] and one space-separated string per
 *               metadata property added
 */
function populateSentenceInfo($sentence_id_string, $sentenceInfo) {
    global $cache_results;
    global $query_id;
    global $dont_cache_search_results;

    $timing = getGetParam('timing');

    // Get the words in the sentences.
    $table_identifier = 'filtered_sent_ids';
    $query_id_where = '';
    if (($cache_results || $query_id) && !$dont_cache_search_results) {
        $table_identifier = 'cached_filtered_sent_ids';
        $query_id_where = " AND query_id = $query_id ";
    }
    $sql = "SELECT * FROM
        $table_identifier, sentence
        WHERE sentence.id = $table_identifier.id $query_id_where;";
    if ($timing) {
        echo "--- Final words query ---
            <br> $sql
            <br>";
    }
    $t2 = time();
    // Fail like the other queries in this file instead of handing `false`
    // to mysql_fetch_assoc().
    $result = mysql_query($sql) or die(mysql_error()."
        <br> On query
        <br> $sql
        <br> at get-search-results.php populateSentenceInfo");

    while ($row = mysql_fetch_assoc($result)) {
        $sentenceInfo[$row['id']]['sentence']['words'] = $row["html"];
        $sentenceInfo[$row['id']]['sentence']['text'] = $row['sentence'];
    }
    $t3 = time();
    if ($timing) {
        echo "
            <br><br>
            Time to assemble words in sentences: ".($t3-$t2)."s
            <br>";
    }

    // Get the metadata fields associated with the sentences.
    $info = getMetadataInformationForSentences(false);
    foreach ($info['sentences'] as $row) {
        $sentence_id = $row['sentence_id'];
        // Skip metadata for sentences we hold no record for (previously an
        // undefined-index read).
        if (empty($sentenceInfo[$sentence_id])) {
            continue;
        }
        foreach ($row as $identifier => $value) {
            if ($identifier == "document_id" || $identifier == "words") {
                continue;
            }
            // The property name is the part after the "__" separator. Keys
            // without one (such as sentence_id itself) carry no property and
            // used to be collected under an empty-string key.
            $components = explode("__", $identifier);
            if (count($components) < 2) {
                continue;
            }
            $property = $components[1];
            if (!array_key_exists($property, $sentenceInfo[$sentence_id])) {
                $sentenceInfo[$sentence_id][$property] = "";
            }
            // Append each distinct value once, space-separated.
            // NOTE(review): this is a substring test, so a value that is the
            // tail of an earlier one ("0" after "10") is treated as present.
            if (!strstr($sentenceInfo[$sentence_id][$property], $value." ")) {
                $sentenceInfo[$sentence_id][$property] =
                    $sentenceInfo[$sentence_id][$property].$value." ";
            }
        }
    }
    $t4 = time();
    if ($timing) {
        echo "
            <br><br>
            Time to get metadata details for sentences: ".($t4-$t3)."s
            <br>";
    }
    return $sentenceInfo;
}
/**
 * Counts dependency occurrences grouped by relation, governor and dependent.
 *
 * The same WHERE clause is run three times and folded by countStatistics()
 * into three trees, rooted at the relation, the governor and the dependent.
 *
 * When a relation is given, the governor and dependent IDs are matched in
 * their own roles. Without a relation they are matched in either role.
 *
 * @param string $gov            comma-separated governor word IDs, or ""
 * @param string $dep            comma-separated dependent word IDs, or ""
 * @param string $relation       comma-separated relation IDs, or ""
 * @param mixed  $withinDocument truthy if $within lists document IDs
 * @param mixed  $withinSentence truthy if $within lists sentence IDs (takes
 *                               precedence over $withinDocument)
 * @param string $within         comma-separated IDs restricting the search
 * @return array with keys "relationship", "gov" and "dep", each a tree as
 *               returned by countStatistics()
 */
function getStatistics($gov, $dep, $relation, $withinDocument, $withinSentence,
        $within){
    // Restriction to a set of sentences or documents, if any.
    $tablenames = "dependency_xref_sentence ";
    $scope_condition = "";
    if ($withinSentence && strlen($within) > 0) {
        $scope_condition = "sentence_id in (".$within.")";
    } else if ($withinDocument && strlen($within) > 0) {
        $tablenames = "dependency_xref_sentence, sentence ";
        $scope_condition = "sentence.id = sentence_id
            AND document_id in (".$within.")";
    }

    $r = strlen($relation) > 0;
    $g = strlen($gov) > 0;
    $d = strlen($dep) > 0;
    $rel_w = "relation_id IN (".$relation.")";
    $gov_w = "gov_id IN (".$gov.")";
    $dep_w = "dep_id IN (".$dep.")";

    // The search condition itself.
    $search_condition = "";
    if ($r) {
        $parts = array($rel_w);
        if ($g) {
            $parts[] = $gov_w;
        }
        if ($d) {
            $parts[] = $dep_w;
        }
        $search_condition = join(" AND ", $parts);
    } else if ($g && $d) {
        $search_condition = "((".$gov_w." AND ".$dep_w.") OR "
            ."(dep_id IN (".$gov.") AND gov_id IN (".$dep.")))";
    } else if ($g) {
        $search_condition = "(".$gov_w." OR dep_id IN (".$gov."))";
    } else if ($d) {
        $search_condition = "(".$dep_w." OR gov_id IN (".$dep."))";
    }

    // Fix: the WHERE keyword is only emitted when there is something to put
    // after it, and every variant selects `value`. The sentence-only case
    // used to produce "FROM ... AND sentence_id in (...)" without a WHERE or
    // a `value` column, and the no-condition case a dangling "WHERE".
    $conditions = array();
    if ($search_condition !== "") {
        $conditions[] = $search_condition;
    }
    if ($scope_condition !== "") {
        $conditions[] = $scope_condition;
    }
    $statistics_query = "SELECT *, COUNT(sentence_id) as value
        FROM ".$tablenames;
    if (count($conditions) > 0) {
        $statistics_query = $statistics_query." WHERE ".join(" AND ", $conditions);
    }

    $relationship_query = $statistics_query."
        GROUP BY relation_id, gov_id, dep_id ORDER BY value desc;";
    $gov_query = $statistics_query."
        GROUP BY gov_id, dep_id, relation_id ORDER BY value desc;";
    $dep_query = $statistics_query."
        GROUP BY dep_id, gov_id, relation_id ORDER BY value desc;";

    $rel_result = mysql_query($relationship_query) or die (
        "get-search-results.php 184
        <br> $relationship_query
        <br> mysql error <br>".mysql_error());
    $gov_result = mysql_query($gov_query) or die (
        "get-search-results.php 185
        <br> $gov_query
        </br> mysql error <br>".mysql_error());
    $dep_result = mysql_query($dep_query) or die (
        "get-search-results.php 186
        <br> $dep_query
        <br> mysql error
        <br>".mysql_error());

    $dep_statistics = countStatistics($dep_result, 'dep_id', 'gov_id',
        'relation_id');
    $rel_statistics = countStatistics($rel_result, 'relation_id', 'gov_id',
        'dep_id');
    $gov_statistics = countStatistics($gov_result, 'gov_id', 'dep_id',
        'relation_id');
    return array("relationship"=>$rel_statistics,
        "gov"=>$gov_statistics,
        "dep"=>$dep_statistics);
}
/**
 * Folds the rows of a statistics query into a three-level tree of counts.
 *
 * Each row's `value` is added to its category, to the sub-category below it
 * and recorded on the sub-sub-category leaf. Every level tracks the largest
 * value among its children in 'childMax'. Children are returned as lists
 * sorted by descending value. Names come from getWord() applied to the
 * row's ID columns.
 *
 * @param mixed  $statistics     MySQL result with a `value` column
 * @param string $category       column supplying the first level
 * @param string $subCategory    column supplying the second level
 * @param string $subSubCategory column supplying the third level
 * @return array with keys "value" (sum over all rows), "childMax" and
 *               "children"
 */
function countStatistics($statistics, $category, $subCategory, $subSubCategory){
    $tree = array();
    $maxVal = 0;
    $total = 0;

    while ($row = mysql_fetch_assoc($statistics)) {
        $cat = array_key_exists($category, $row)
            ? getWord($row[$category]) : null;
        $subCat = array_key_exists($subCategory, $row)
            ? getWord($row[$subCategory]) : null;
        $subSubCat = array_key_exists($subSubCategory, $row)
            ? getWord($row[$subSubCategory]) : null;
        $value = intval($row['value']);

        // The grand total counts every row, named or not.
        $total += $value;
        if ($cat == null) {
            continue;
        }

        if (!array_key_exists($cat, $tree)) {
            $tree[$cat] = array("children"=>array(), "childMax"=>0,"value"=>0, "name"=>$cat);
        }
        if ($subCat != null) {
            if (!array_key_exists($subCat, $tree[$cat]['children'])) {
                $tree[$cat]['children'][$subCat] = array("children"=>array(), "childMax"=>0, "value"=>0, "name"=>$subCat);
            }
            if ($subSubCat != null) {
                $tree[$cat]['children'][$subCat]['children'][$subSubCat] =
                    array("value"=>$value, "name"=>$subSubCat);
                $tree[$cat]['children'][$subCat]['childMax'] =
                    max($tree[$cat]['children'][$subCat]['childMax'], $value);
            }
            $tree[$cat]['children'][$subCat]['value'] += $value;
            $tree[$cat]['childMax'] = max($tree[$cat]['childMax'],
                $tree[$cat]['children'][$subCat]['value']);
        }
        $tree[$cat]['value'] += $value;
        $maxVal = max($maxVal, $tree[$cat]['value']);
    }

    // Sort every level by descending value and drop the name keys, so the
    // children become plain lists.
    uasort($tree, 'compareValues');
    $sortedData = array();
    foreach ($tree as $c) {
        uasort($c['children'], 'compareValues');
        $sortedCategory = array();
        foreach ($c['children'] as $sc) {
            uasort($sc['children'], 'compareValues');
            $sc['children'] = array_values($sc['children']);
            $sortedCategory[] = $sc;
        }
        $c['children'] = $sortedCategory;
        $sortedData[] = $c;
    }
    return array("value"=>$total, "childMax"=>$maxVal, "children"=>$sortedData);
}
/**
 * Sort callback ordering records by their 'value' entry, largest first.
 *
 * @param array $a record with a 'value' key
 * @param array $b record with a 'value' key
 * @return int -1 if $a's value is larger, 0 if equal, 1 otherwise
 */
function compareValues($a, $b){
    if ($a['value'] > $b['value']) {
        return -1;
    }
    return ($a['value'] == $b['value']) ? 0 : 1;
}
- ?>