/src/php/associated-words/get-associated-words.php
PHP | 211 lines | 152 code | 13 blank | 46 comment | 30 complexity | 57c5970d1180dc2cfa6d24cfc2f26a6e MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0, BSD-3-Clause
- <?php
- /* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */
- /*****************************************************************************
- get-associated-words.php
- Return the adjectives, nouns, and verbs that most often appear in the same
- sentences as this word (ranked by number of shared sentences), plus its synonyms
- *****************************************************************************/
// Shared helpers; getGetParam() and decodeGetJson() are used below and are
// not defined in this file.
include_once '../util.php';
$wordseer_instance = getGetParam('instance');
// The instance name comes from the request and is used to build an include
// path. Keep only its final path component (and refuse "." / "..") so that
// directory separators or parent-directory references cannot select a
// config file outside the instances/ directory.
$instance_dir = basename((string) $wordseer_instance);
if ($instance_dir === '.' || $instance_dir === '..') {
    $instance_dir = '';
}
$path = '../../../instances/'.$instance_dir.'/config.php';
include_once $path;
$N = 0;
//Query parameters
// Read again after the config include.
// NOTE(review): presumably this restores the request value in case
// config.php assigns $wordseer_instance -- confirm before removing.
$wordseer_instance = getGetParam('instance');
$gov = '';
$dep = '';
$relation = '';
$govtype = 'word';
$deptype = 'word';
$searches = decodeGetJson('search');
// The single-search parameters are only read when there is at most one
// search. A non-array value (e.g. a missing parameter) is treated like an
// empty list instead of being handed to count().
if (!is_array($searches) || count($searches) <= 1) {
    $gov = getGetParam('gov');
    $govtype = getGetParam('govtype');
    $dep = getGetParam('dep');
    $deptype = getGetParam('deptype');
    $relation = getGetParam('relation');
}
$collection = getGetParam('collection');
$statistics = getGetParam('statistics');
// NOTE(review): "$phrasess" looks like a typo for "$phrases"; the name is
// kept because other included code may read this global -- confirm.
$phrasess = decodeGetJson('phrases');
$metadata = decodeGetJson('metadata');
$timing = getGetParam('timing');
dispatch();
/**
 * Resolve the request into a list of word IDs and print the associated
 * words as JSON.
 *
 * The IDs come from one of:
 *  - the 'id' GET parameter (a single ID), or
 *  - the 'word' GET parameter, interpreted according to 'class':
 *      'word-set' -> the ", "-separated result of getWordIDsFromWordSet(),
 *      'word'     -> the result of getWordIDs(),
 *      'phrase'   -> the space-separated tokens of the parameter itself.
 *
 * When neither parameter is present, or 'class' is not one of the values
 * above, the ID list stays empty rather than being left undefined.
 */
function dispatch(){
    $ids = array();
    if (array_key_exists('id', $_GET)) {
        $ids = array(getGetParam('id'));
    } else if (array_key_exists('word', $_GET)) {
        switch (getGetParam('class')) {
            case 'word-set':
                $ids = explode(", ", getWordIDsFromWordSet(getGetParam('word')));
                break;
            case 'word':
                $ids = getWordIDs(getGetParam('word'));
                break;
            case 'phrase':
                $ids = explode(" ", getGetParam('word'));
                break;
        }
    }
    $id_string = join(",", $ids);
    $words = getAssociatedWords($id_string, $ids, getGetParam('word'));
    echo json_encode($words);
}
/**
 * Find words associated with the given word IDs.
 *
 * Two kinds of results are collected:
 *  - words tagged as nouns, verbs or adjectives that occur in the same
 *    sentences as the given words, scored by the number of distinct shared
 *    sentences (top 500) and grouped by the first letter of their POS tag;
 *  - "Synsets": rows of the synsets table whose word1_id is one of the given
 *    IDs, scored by their summed score.
 *
 * When the global $query_id is set, only sentences present in the filtered
 * sentence table are considered.
 *
 * @param string $idString Comma-separated word IDs. Only its runs of digits
 *                         are used, so nothing else can reach the SQL text.
 * @param array  $ids      The same IDs as a list (currently unused).
 * @param string $word     The requested word (currently unused).
 * @return array Map with the keys "Adverbs", "Adjectives", "Nouns", "Verbs",
 *               "" and "Synsets"; each value is a list of
 *               array("id", "word", "score"[, "word_set"]). All lists are
 *               empty when no usable ID is given.
 */
function getAssociatedWords($idString, $ids, $word){
    global $STOPWORDS; // util.php
    global $query_id;
    global $timing;
    global $dont_cache_search_results;

    $words = array(
        "Adverbs" => array(),
        "Adjectives" => array(),
        "Nouns" => array(),
        "Verbs" => array(),
        "" => array(),
        "Synsets" => array()
    );

    // The ID list originates from request parameters and is placed unquoted
    // inside IN (...), so rebuild it from its digit runs only.
    preg_match_all('/\d+/', (string) $idString, $id_matches);
    $idString = join(",", $id_matches[0]);
    if ($idString === "") {
        // "IN ()" is a MySQL syntax error; with no IDs there is nothing to find.
        return $words;
    }

    // Optional restriction to the sentences of a filtered search.
    // NOTE(review): $query_id is defined outside this file and is
    // interpolated unquoted -- confirm that it is always numeric.
    $table_identifier = "";
    $context_conditions = "";
    if ($query_id) {
        // Inside this branch $query_id is truthy, so the cached table is
        // used unless caching of search results is explicitly disabled.
        if (!$dont_cache_search_results) {
            $table = 'cached_filtered_sent_ids';
            $query_id_where = " AND query_id = $query_id ";
        } else {
            $table = 'filtered_sent_ids';
            $query_id_where = '';
        }
        $table_identifier = ", $table ";
        $context_conditions = "AND $table.id = sentence_id\n$query_id_where";
    }

    // Sentences that contain any of the given words.
    $sql = "SELECT sentence_id, sentence_number, document_id
        FROM sentence_xref_word $table_identifier
        WHERE word_id IN (".$idString.")
        $context_conditions;";
    if ($timing) {
        echo $sql."<br>\n";
    }
    $result = mysql_query($sql) or die("get-associated-words.php L39 <b>A fatal MySQL error occured</b>.\n<br/> Query: " . $sql . "\n<br/> Error: (" . mysql_errno() . ") " . mysql_error());
    $sentence_ids = array();
    while ($row = mysql_fetch_assoc($result)) {
        $sentence_ids[] = $row['sentence_id'];
    }

    // Defined outside this file; used below as a map from word ID to a list
    // of word-set names.
    $word_set_memberships = getWordSetMemberships();

    // Co-occurring content words. Skipped when no sentence matched, because
    // an empty IN () list would make the query fail.
    if (count($sentence_ids) > 0) {
        $sentence_id_string = join(",", $sentence_ids);
        // COUNT(word_id)*LOG(".$number_of_sentences."/COUNT(DISTINCT(sentence_id)))
        $sql = "SELECT count(distinct sentence_id) as score, word, word_id, pos
            FROM word, sentence_xref_word
            WHERE word.id = word_id
            AND sentence_id IN ($sentence_id_string)
            AND word_id NOT IN ($idString)
            AND ((pos like 'N%') OR (pos like 'V%') OR (pos like 'J%'))
            GROUP BY word
            ORDER BY score DESC
            LIMIT 500;";
        $result = mysql_query($sql) or die("get-associated-words.php L82 <b>A fatal MySQL error occured</b>.\n<br/> Query: " . $sql . "\n<br/> Error: (" . mysql_errno() . ") " . mysql_error());
        // First letter of the POS tag -> result group. The query above only
        // selects N/V/J tags, so "Adverbs" stays empty with the current SQL.
        $pos_groups = array(
            "V" => "Verbs",
            "J" => "Adjectives",
            "R" => "Adverbs",
            "N" => "Nouns"
        );
        while ($row = mysql_fetch_assoc($result)) {
            if ($row['score'] > 0 && !in_array($row['word'], $STOPWORDS)) {
                $initial = substr($row['pos'], 0, 1);
                $pos = isset($pos_groups[$initial]) ? $pos_groups[$initial] : "";
                $entry = array(
                    "id" => $row['word_id'],
                    "word" => $row['word'],
                    "score" => $row['score']);
                if (array_key_exists($row['word_id'], $word_set_memberships)) {
                    $entry['word_set'] = join(" ",
                        $word_set_memberships[$row['word_id']]);
                }
                $words[$pos][] = $entry;
            }
        }
    }

    // Get synonyms.
    $sql = "SELECT *, sum(score) as total from synsets
        WHERE word1_id IN ($idString)
        GROUP BY word2
        ORDER BY total DESC;";
    $result = mysql_query($sql) or die("get-associated-words.php l.157\n<b>A fatal MySQL error occured</b>.\n<br/> Query: " . $sql . "\n<br/> Error: (" . mysql_errno() . ") " . mysql_error());
    while ($row = mysql_fetch_assoc($result)) {
        $entry = array(
            "id" => $row['word2_id'],
            "word" => $row['word2'],
            "score" => $row['total']
        );
        if (array_key_exists($row['word2_id'], $word_set_memberships)) {
            $entry['word_set'] = join(" ",
                $word_set_memberships[$row['word2_id']]);
        }
        $words["Synsets"][] = $entry;
    }

    // A lookup of frequent phrases containing the word (a "Sequences" group
    // built from the sequence and sequence_xref_sentence tables) used to be
    // sketched here as commented-out code; it is not part of the result.
    return $words;
}
- ?>