/src/php/wordtree/get-tree.php
PHP | 472 lines | 441 code | 10 blank | 21 comment | 22 complexity | fed0e4123f5301212a02ebab308bd5b9 MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0, BSD-3-Clause
- <?php
- /* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */
/****************************************************************************
get-tree.php (this header previously named the file "getcontext.php")
Called by getContext() in heatmap.js in service of heatmap.php
Gets the concordance and grammatical context in which a heat map
query occurs, together with the left and right word sequences of each
match that are used to draw the word tree.
****************************************************************************/
- include_once '../util.php';
- include_once '../subsets/read.php';
- include_once '../document/get-metadata.php';
// ---------------------------------------------------------------------------
// Request entry point: reads the search parameters from the GET request,
// builds the concordance for them and echoes it as JSON together with the
// left/right word sequences used by the word tree.
// ---------------------------------------------------------------------------

// The instance name comes from the request and is spliced into an include
// path, so accept only a single plain directory name (no slashes, no leading
// dot). Anything else could make include_once load an arbitrary file.
$wordseer_instance = getGetParam('instance');
if (!is_scalar($wordseer_instance)
        || !preg_match('/\A[A-Za-z0-9_-][A-Za-z0-9_.-]*\z/', (string) $wordseer_instance)) {
    header('HTTP/1.1 400 Bad Request');
    die('Invalid instance name.');
}
$path = '../../../instances/'.$wordseer_instance.'/config.php';
include_once $path;

// Search parameters. Several of these are not read again in this file but are
// kept as globals because included code may look them up.
$gov = getGetParam('gov');
$govtype = getGetParam('govtype');
$dep = getGetParam('dep');
$deptype = getGetParam('deptype');
$relation = getGetParam('relation');
$collection = getGetParam('collection');
$statistics = getGetParam('statistics');
$start = getGetParam('start');
$limit = getGetParam('limit');
$metadata = decodeGetJson('metadata');
$phrases = decodeGetJson('phrases');
$timing = (getGetParam('timing') == 1);
$query = "";

// Build the response.
$context = array();
$matches = getConcordance($gov, $govtype, $dep, $deptype, $relation,
        $collection, $metadata, $phrases);
$context["concordance"] = array();
$context["concordance"]["num"] = $matches['numMatches'];
$context["concordance"]["docs"] = (array_key_exists('numDocuments', $matches))
        ? ($matches['numDocuments'])
        : (0);
$context["concordance"]["matches"] = $matches['matches'];
$context['concordance']['lefts'] = getSentences($context['concordance']['matches'], "left");
$context['concordance']['rights'] = getSentences($context['concordance']['matches'], "right");

// Human-readable form of the query. getConcordance() assigns the global
// $query_string for plain searches; the fallback below covers the cases in
// which it left the variable empty.
$g = $gov;
$d = $dep;
if ($govtype == 'word-set') {
    $g = getWordsFromWordSet($gov);
}
if ($deptype == 'word-set') {
    $d = getWordsFromWordSet($dep);
}
if (!$query_string) {
    $query_string = $g.' '.$d;
    if (!$gov && !$dep) {
        // No gov/dep at all: fall back to the first filter phrase, if there
        // is one. Stays null (as before) when no phrase was supplied, but
        // without raising an "undefined offset" notice into the JSON output.
        $query_string = isset($phrases[0]) ? $phrases[0] : null;
    }
}
$context["query"] = $query_string;
echo json_encode($context);
/**
 * Returns the concordance for the given search query and filters.
 *
 * Plain search (empty $relation): the centre of the word tree is, in order
 * of preference,
 *   1. the $gov query -- a word set, a multi-word phrase, a comma-separated
 *      or wildcard ("*") word list, or a single word;
 *   2. the first word of a sequence matching the first filter phrase;
 *   3. the most frequent non-stop-word content word (see
 *      getMostFrequentContentWordID()).
 * At most 1000 sentences are fetched in this mode.
 *
 * Grammatical search (non-empty $relation): the sentences are those holding
 * a dependency returned by getDependencyIDs() for the given governor,
 * dependent and relation.
 *
 * Side effects: assigns the globals $q and $query_string, echoes diagnostic
 * output when the global $timing is set, and die()s on a MySQL error.
 *
 * @param $gov        governor / plain search query (or word-set identifier)
 * @param $govtype    'word-set' when $gov identifies a word set
 * @param $dep        dependent (or word-set identifier)
 * @param $deptype    'word-set' when $dep identifies a word set
 * @param $relation   grammatical relation; empty for a plain search
 * @param $collection passed through to getSentenceIDsForFilters()
 * @param $metadata   passed through to getSentenceIDsForFilters()
 * @param $phrases    filter phrases; the first one may seed the query
 * @return An associative array with the fields:
 *   - numMatches -- number of sentences returned by the query
 *   - matches    -- list of associative arrays with the fields:
 *       - number   -- The sentence number
 *       - document -- The ID of the document containing the sentence
 *       - id       -- The ID of the sentence
 *       - left     -- The text to the left of the match
 *       - right    -- The text to the right of the match
 *       - match    -- The matched text with non-word characters removed
 */
function getConcordance($gov, $govtype, $dep, $deptype, $relation, $collection,
        $metadata, $phrases){
    global $q;
    global $query_string;
    global $timing;
    global $cache_results;
    global $query_id;
    global $PUNCTUATION;

    // Chosen before the filters are applied (statement order kept from the
    // original code).
    $table_identifier = 'filtered_sent_ids';
    $query_id_where = '';
    if ($cache_results || $query_id) {
        $table_identifier = 'cached_filtered_sent_ids';
        $query_id_where = " AND query_id = $query_id ";
    }

    // Apply the filters.
    $filtered = getSentenceIDsForFilters($metadata, $collection, $phrases);
    $filtersAreActive = ($filtered != 'all');

    $results = null;
    $pattern = null;
    $query = '';
    $t1 = time();

    // If just a regular word search or a phrase search.
    if (strlen(trim($relation)) == 0) {
        // If a search query has been specified, use it as the center
        // of the word tree.
        if ($gov) {
            $words = $gov;
            $q = $gov;
            $word_id_string = null;
            $is_phrase = false;

            if ($govtype == 'word-set') {
                $set_words = explode(" ", getWordsFromWordSet($gov));
                $word_id_string = getWordIDsFromWordSet($gov);
                $q = "(".join("|", $set_words).")";
                $query_string = getSetName($gov);
            } else if (strstr($words, " ") && !strstr($words, ",")
                    && !strstr($words, "*")) {
                // Multi-word phrase: searched through the full-text index.
                $is_phrase = true;
                $query_string = $words;
                if (!strstr($words, "\"")) {
                    $words = "\"$words\"";
                }
                $q = $words;
            } else if (strstr($words, ",") || strstr($words, "*")) {
                // Comma-separated list and/or wildcard words.
                $query_string = $words;
                $word_strings = array();
                $word_ids = array();
                foreach (explode(",", $words) as $word) {
                    $word = trim($word);
                    array_push($word_strings, $word);
                    foreach (getWordIDs($word) as $id) {
                        array_push($word_ids, $id);
                    }
                }
                $q = "(".join("|", $word_strings).")";
                $word_id_string = join(",", $word_ids);
            } else {
                // Single word.
                $query_string = $words;
                $word_id_string = getWordID($words);
                $q = $words;
            }

            // Every variant selects id, sentence, number and document_id,
            // because the row loop at the end of this function reads all of
            // them. (The filtered word-set and phrase variants used to omit
            // some of these columns.)
            if ($is_phrase) {
                // NOTE(review): $words is interpolated into the SQL text;
                // this is only safe if getGetParam() escapes its value --
                // confirm in util.php.
                $query = "SELECT id, sentence, number, sentence.document_id
                    FROM sentence
                    WHERE match sentence against('$words' IN BOOLEAN MODE) ";
                if ($filtersAreActive) {
                    $query = "SELECT sentence.id, sentence, number,
                        sentence.document_id
                    FROM sentence, $table_identifier
                    WHERE match sentence against('$words' IN BOOLEAN MODE)
                    AND sentence.id = $table_identifier.id $query_id_where ";
                }
            } else {
                $query = "SELECT sentence_id as id, sentence, number,
                        sentence.document_id
                    FROM sentence, sentence_xref_word
                    WHERE sentence.id = sentence_id
                    AND word_id in ($word_id_string) ";
                if ($filtersAreActive) {
                    $query = "SELECT sentence_id as id, sentence, number,
                        sentence.document_id
                    FROM sentence_xref_word, $table_identifier, sentence
                    WHERE sentence.id = sentence_id
                    AND sentence_id = $table_identifier.id $query_id_where
                    AND word_id in ($word_id_string) ";
                }
            }
        }
        // If no search query has been specified, but there are phrases
        // acting as filters, then use the first filter phrase as the
        // search query for the center of the word tree.
        else if (count($phrases) > 0) {
            $phraseIDs = getPhraseIDs($phrases[0]);
            $sql = "SELECT sequence from sequence
                WHERE id
                IN (".join(",", $phraseIDs).");";
            if ($timing) {
                echo $sql;
            }
            $result = mysql_query($sql) or die ("Error getting phrase with ID
                ".$phrases[0].";");
            $q = "";
            // The first word of the (last returned) sequence becomes the
            // center of the tree.
            while ($row = mysql_fetch_assoc($result)) {
                $sequence_words = explode(" ", $row['sequence']);
                $q = $sequence_words[0];
                $query_string = $q;
            }
            $query = "SELECT * FROM $table_identifier, sentence
                WHERE $table_identifier.id = sentence.id
                $query_id_where";
        }
        // If no search query has been specified get the most frequent
        // content word (not stop word) from the set of documents matching
        // the filters (if any) or the whole collection (if there are no
        // filters), and use that as the search query for the center of
        // the word tree.
        else {
            $word_id = getMostFrequentContentWordID($filtered);
            $query_string = getWord($word_id); //util.php
            if ($timing) {
                echo "<br> Most frequent phrase: $query_string <br>";
            }
            if (strlen($query_string) > 0) {
                $q = $query_string;
                // Called for its side effect on the sentence filter table;
                // it returns nothing.
                $cache_results = true;
                getSentenceIDsForWord($word_id);
                $cache_results = false;
                $query = "SELECT
                    sentence.id, sentence.document_id, number, sentence
                    FROM $table_identifier, sentence
                    WHERE sentence.id = $table_identifier.id
                    $query_id_where ";
            } else {
                // No search query.
                $q = "";
                $query_string = " ";
                $query = "SELECT * from sentence where FALSE ";
            }
        }

        // Get the sentences that match the query constructed above.
        $results = mysql_query($query." LIMIT 1000 ;") or die("<b>A fatal MySQL error occured</b>.
            <br/> Query: " . $query . "
            <br/> Error: (" . mysql_errno() . ") " . mysql_error());

        // Construct the pattern for the regular expression: drop quotes and
        // backslashes, and let "*" stand for any run of word characters.
        $query = remove_spaces_before_punctuation($q);
        $query = str_replace("*", "\w*",
            str_replace("\\", "",
                str_replace('"', "", $query)));
        $pattern = " ";
        if ($query) {
            $pattern = "/\b".$query."\b/i";
            // Punctuation has no word boundary around it.
            if (strstr($PUNCTUATION, $query)) {
                $pattern = "/$query/i";
            }
        }
    }
    // If it's a grammatical search, extract the gov, dep, and relation from
    // the GET parameters and issue a dependency relationship search.
    else {
        $govIDs = wordIDList($gov);
        if ($govtype == "word-set") {
            $govIDs = getWordIDsFromWordSet($gov);
        }
        $depIDs = wordIDList($dep);
        if ($deptype == 'word-set') {
            $depIDs = getWordIDsFromWordSet($dep);
        }
        $relations = relationshipIDList($relation);

        // Defined up front so that they are not undefined variables when no
        // filter is active; null is the value getDependencyIDs() received in
        // that case before.
        $withinSentence = null;
        $within = null;
        if ($filtersAreActive) {
            // To stave off syntax errors if its empty, put a -1 in the list.
            array_push($filtered, "-1");
            $withinSentence = true;
            $within = join(", ", $filtered);
        }

        $dependency_id_results = getDependencyIDs($govIDs,
            $depIDs,
            $relations,
            false,
            $withinSentence,
            $within,
            false,
            false);
        $sentence_ids = array();
        while ($row = mysql_fetch_array($dependency_id_results)) {
            array_push($sentence_ids, $row['sentence_id']);
        }

        if (count($sentence_ids) > 0) {
            $id_string = join(", ", $sentence_ids);
            $sql = "SELECT
                sentence.id, sentence.document_id, number, sentence
                FROM
                sentence
                WHERE id IN (".$id_string.");";
            $results = mysql_query($sql) or die("<b>Fatal MySQL error</b>.
                <br/> Query: " . $sql . "
                <br/> Error: (" . mysql_errno() . ") " . mysql_error());

            // Pattern: match the governor word(s) if given, otherwise the
            // dependent word(s).
            $govs = $gov;
            if ($govtype == "word-set") {
                $govs = getWordsFromWordSet($gov);
            }
            $deps = $dep;
            if ($deptype == 'word-set') {
                $deps = getWordsFromWordSet($dep);
            }
            if (strlen($govs) > 0) {
                $query = str_replace(" ", "|", $govs);
            } else if (strlen($deps) > 0) {
                $query = str_replace(" ", "|", $deps);
            }
            $pattern = "/\b(".$query.")\b/i";
            if (strstr($PUNCTUATION, $query)) {
                $pattern = "/$query/i";
            }
        }
    }

    $pattern = str_replace(".", "", $pattern);
    $matched = array();
    // $results stays null when a grammatical search matched no sentence;
    // report zero matches instead of calling mysql_num_rows() on null.
    $matched['numMatches'] = ($results) ? mysql_num_rows($results) : 0;
    $matched['matches'] = array();
    $t2 = time();
    if ($timing != 0) {
        echo "<br> Time to get sentences matching the query: ".($t2-$t1)."s<br>";
        echo "<br>$pattern<br><br>";
    }

    // Split every sentence into the text left of the match, the match and
    // the text right of the match.
    $t1 = time();
    if ($results) {
        while ($row = mysql_fetch_array($results)) {
            $matches = array();
            $sentence = preg_replace("/\s/", " ", $row['sentence']);
            $has_match = preg_match($pattern, $sentence, $matches,
                PREG_OFFSET_CAPTURE);
            if ($timing != 0 && !$has_match) {
                echo "<br>".$sentence;
            }
            $match = ($has_match) ? ($matches[0][0]) : ('');
            $matched_text = preg_replace("/\W/", "", $match);
            $match_length = strlen($matched_text);
            $left = ($has_match) ? (substr($sentence, 0, $matches[0][1])) : ('');
            // NOTE(review): the offset uses the length of the match with
            // non-word characters removed, plus one. That skips exactly the
            // character after a single-word match, but looks wrong for
            // matches containing more than one non-word character (e.g.
            // three-word phrases) -- confirm the intended behaviour.
            $right = ($has_match)
                ? (substr($sentence, $matches[0][1] + $match_length + 1))
                : ('');

            $split = array();
            $split['number'] = $row['number'];
            $split['document'] = $row['document_id'];
            $split['id'] = $row['id'];
            $split['left'] = utf8_encode($left);
            $split['right'] = utf8_encode($right);
            $split['match'] = utf8_encode(str_replace("|", "/", $matched_text));
            array_push($matched['matches'], $split);
        }
    }
    $t2 = time();
    if ($timing != 0) {
        echo "<br> Time to count match patterns for word tree: ".($t2-$t1)."s.
            <br>";
    }
    return $matched;
}
/**
 * Returns the ID of the most frequent content word that is not a stop word.
 *
 * Candidates are the 100 words with the highest sentence counts whose
 * part-of-speech tag starts with N, V or J (presumably nouns, verbs and
 * adjectives -- confirm against the tag set in use). When $filtered is "all"
 * the counts come from the `word` table; otherwise they are computed over the
 * sentences currently in the filter table.
 *
 * Echoes timing output when the global $timing is set and die()s on a MySQL
 * error.
 *
 * @param $filtered "all" when no filter is active; any other value selects
 *                  the filtered variant of the query
 * @return The word ID, or "" when every candidate is a stop word or there
 *         are no candidates.
 */
function getMostFrequentContentWordID($filtered) {
    global $timing;
    global $cache_results;
    global $query_id;
    global $STOPWORDS;

    $table_identifier = 'filtered_sent_ids';
    $query_id_where = '';
    if ($cache_results || $query_id) {
        $table_identifier = 'cached_filtered_sent_ids';
        $query_id_where = " AND query_id = $query_id ";
    }

    $t1 = time();
    if ($filtered == "all") {
        $sql = "SELECT word, id as word_id, sentence_count as count
            FROM word
            WHERE ((pos like 'N%') OR (pos like 'V%') OR (pos like 'J%'))
            ORDER BY count DESC
            LIMIT 100;";
    } else {
        $sql = "SELECT word, word_id, count(distinct sentence_id) as count
            FROM sentence_xref_word, $table_identifier, word
            WHERE word.id = word_id
            AND sentence_id = $table_identifier.id
            $query_id_where
            AND ((pos like 'N%') OR (pos like 'V%') OR (pos like 'J%'))
            GROUP BY word
            ORDER BY count DESC
            LIMIT 100;";
    }
    $result = mysql_query($sql) or die("Error getting sequences for
        sentence IDs:
        <br> ".mysql_error()."</br>
        <br> on query
        <br> $sql");
    $t2 = time();
    if ($timing != 0) {
        echo "<br> Time to get most frequent content phrase ID:
            ".($t2-$t1)."s<br>";
    }

    // Rows arrive most-frequent first; the first non-stop-word wins.
    while ($row = mysql_fetch_assoc($result)) {
        if (!in_array($row['word'], $STOPWORDS)) {
            return $row['word_id'];
        }
    }
    // Nothing usable. Previously the function fell off the end here and
    // returned null, while its only explicit fallback (in a branch that could
    // never run because of the die() above) returned "".
    return "";
}
/**
 * Adds every sentence containing the given word to the sentence filter table
 * and then calls updateSentenceFilterTable().
 *
 * Writes to the cached table (tagging rows with the global $query_id) when
 * the global $cache_results or $query_id is set, and to the plain filter
 * table otherwise. Increments the global $num_filter_conditions, echoes the
 * SQL when the global $timing is set and die()s on a MySQL error. Returns
 * nothing.
 *
 * @param $word_id ID of the word whose sentences should be added
 */
function getSentenceIDsForWord($word_id) {
    global $num_filter_conditions, $timing, $cache_results, $query_id;

    if ($cache_results || $query_id) {
        $target_table = 'cached_filtered_sent_ids';
        $target_columns = '(id, document_id, query_id)';
        $selected = "DISTINCT sentence_id, document_id, $query_id";
    } else {
        $target_table = 'filtered_sent_ids';
        $target_columns = '(id, document_id)';
        $selected = "DISTINCT sentence_id, document_id";
    }

    $num_filter_conditions += 1;

    // Sentences that are already in the table get their match counter bumped
    // instead of being inserted twice.
    $sql = "INSERT INTO $target_table $target_columns
        SELECT $selected FROM sentence_xref_word
        WHERE word_id = $word_id
        ON DUPLICATE KEY update num_matched = num_matched + 1;";
    if ($timing) {
        echo $sql;
    }
    if (!mysql_query($sql)) {
        die (mysql_error()." On: <br> $sql
        <br> while getting sentence_ids matching word id
        $word_id <br> at get-tree.php l.428");
    }
    updateSentenceFilterTable();
}
- //Part 2: word tree
/**
 * Splits one side of every concordance match into its words, for the word
 * tree.
 *
 * The text is split on single spaces and empty pieces are dropped. For the
 * "left" side the words are reversed, so that the word closest to the match
 * comes first. A side with no text yields a list holding one empty string.
 * Echoes timing output when the global $timing is set.
 *
 * @param $matches list of matches as produced by getConcordance(); each needs
 *                 the fields "id" and $which
 * @param $which   "left" or "right": the side of the match to split
 * @return A list of arrays with the fields:
 *   - id       -- The ID of the sentence
 *   - sentence -- The words on the requested side of the match
 */
function getSentences($matches, $which){
    global $timing;
    $sentences = array();
    $t1 = time();
    foreach ($matches as $match) {
        $words = null;
        if (strlen($match[$which]) > 0) {
            // A limit of -1 means "no limit"; passing null here is deprecated
            // as of PHP 8.1.
            $words = preg_split('/ /', $match[$which], -1, PREG_SPLIT_NO_EMPTY);
        }
        if (!is_array($words)) {
            // Empty side (or a failed split): a single empty word.
            $words = array(0 => '');
        }
        if ($which == 'left') {
            $words = array_reverse($words);
        }
        array_push($sentences, array("id" => $match['id'], "sentence" => $words));
    }
    $t2 = time();
    if ($timing != 0) {
        echo "<br> Time to split match patterns for word tree: ".($t2-$t1)."s.
            <br>";
    }
    return $sentences;
}
- ?>