/src/php/similarsentences/similarsentences.php
PHP | 607 lines | 410 code | 34 blank | 163 comment | 37 complexity | d34c93edb60f8f91158e42159aa48950 MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0, BSD-3-Clause
- <?php
- /* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */
- /*****************************************************************************
- similarsentences.php
- Calculates a result set of sentences in response to a query using relevance
- feedback, specifically the Rocchio algorithm.
- The top-level dispatch procedure works with either a string query,
- or with a vector query and two sets of sentences: those judged by the user to
- be relevant, and non-relevant. In return it always sends back a set of
- sentences.
- The Rocchio algorithm uses the vector-space model of information retrieval
- to refine a query based on relevance feedback.
- *****************************************************************************/
- include_once '../../../config.php';
- include_once '../dbsetup.php';
- include_once '../util.php';
- include_once 'sparsevector.php';
- include_once '../priorityqueue.php';
/** Algorithm parameters (weights used when refining the query) **/

// Weight with which the query is adjusted towards relevant sentences.
$ALPHA_plus = 0.1;
// Weight with which the query is adjusted away from irrelevant sentences.
$ALPHA_minus = 0.1 * $ALPHA_plus;

// Weight with which the query is adjusted towards relevant words
// (1, so that a relevant word is treated like a search term).
$ALPHA_w_plus = 1;
// Weight with which the query is adjusted away from irrelevant words.
$ALPHA_w_minus = 0.1 * $ALPHA_w_plus;

// Maximum number of sentences returned for one query.
$LIMIT = 500;
- /** dispatch procedure
- Sends back sentences based on the $_POST data sent by the client.
- Arguments:
- -- 'string_query': "true", or "false". "true" indicates that the query is
- a string query, and the query will be interpreted as
- such
- -- 'vector_query': "true" or "false". "true" indicates that the query is
- a vector query, and the query will be interpreted as
- a vector.
- -- 'query': a String (if string_query is "true"), or (if 'vector_query'
- is "true") an object containing
- features:
- a map from string feature ID's to floating-point
- values
- relevant: a list of relevant sentences represented
- irrelevant: a list of irrelevant sentences represented
-
- -- 'relevant': a JSON _list_ of integer sentence ID's judged by the user
- to be relevant.
- -- 'irrelevant': a JSON _list_ of integer sentence ID's judged by the user
- to be NOT relevant.
- -- 'relevant_words': a JSON _list_ of integer word ID's judged by the user
- to be relevant.
- -- 'irrelevant_words': a JSON _list_ of integer word ID's judged by the user
- to be NOT relevant.
- Return values
- A JSON response is sent to the client, containing the following data:
- -- sentences : a list of sentences. Where each sentence has an ID, a
- narrativeID, title, date, and a string.
- -- query : a feature-vector representing the current query, to be stored
- by the client and sent back with further query refinements.
- {sentences:
- [{
- sentenceID: integer
- sentence: string
- }
- ...,
- ....]
- vector_query: {featureID: floating-point value, ..., ...}
- relevant: [ the list of sentence ID's incorporated as relevant],
- irrelevant: [ the list of sentence ID's incorporated as irrelevant],
- relevant_words: [ the list of word ID's incorporated as relevant],
- irrelevant_words: [ the list of word ID's incorporated as irrelevant]
- }
- **/
- dispatch();
/**
 * Reads the request from $_POST, runs the matching query procedure and
 * echoes the result as JSON.
 *
 * - 'string_query' == "true": the text in 'query' is searched for and
 *   combined with the previous vector query sent in 'vector-query'.
 * - 'vector_query' == "true": 'query' holds the previous vector query and
 *   'relevant', 'irrelevant', 'relevant_words' and 'irrelevant_words' hold
 *   the user's feedback as JSON lists. When 'calculate_new_query_words' is
 *   present only the weighted words of the refined query are returned,
 *   otherwise new sentences are retrieved.
 *
 * Missing fields are treated as empty, so a request that sets neither flag
 * simply produces an empty JSON list.
 */
function dispatch(){
    $result = array();
    $is_string_query = isset($_POST['string_query']) && $_POST['string_query'] == "true";
    $is_vector_query = isset($_POST['vector_query']) && $_POST['vector_query'] == "true";

    if($is_string_query){
        $current_query = dispatch_post_json_as_array('vector-query');
        $raw_query = isset($_POST['query']) ? $_POST['query'] : "";
        $result = process_string_query(mysql_escape_string(strtolower($raw_query)), $current_query);
    }else if($is_vector_query){
        // unpack the feedback sent by the client
        $relevant_sentences = dispatch_post_json_list('relevant');
        $irrelevant_sentences = dispatch_post_json_list('irrelevant');
        $relevant_words = dispatch_post_json_list('relevant_words');
        $irrelevant_words = dispatch_post_json_list('irrelevant_words');
        $query = dispatch_post_json_as_array('query');

        if(array_key_exists('calculate_new_query_words', $_POST)){
            // only report what the refined query would look like, as words
            $new_query = calculate_new_vector_query($query, $relevant_sentences, $irrelevant_sentences, $relevant_words, $irrelevant_words);
            $result = convert_vector_query_to_relevance_words($new_query);
        }else{
            // perform relevance feedback and send back new sentences
            $result = process_relevance_feedback($query, $relevant_sentences, $irrelevant_sentences, $relevant_words, $irrelevant_words);
        }
    }
    echo json_encode($result);
}

/* Decodes the JSON in $_POST[$key] and casts it to an array; an absent field
   gives an empty array. */
function dispatch_post_json_as_array($key){
    if(!isset($_POST[$key])){
        return array();
    }
    return (array) json_decode($_POST[$key]);
}

/* Decodes the JSON list in $_POST[$key]; an absent field, invalid JSON or
   anything that is not a JSON list gives an empty array. */
function dispatch_post_json_list($key){
    if(!isset($_POST[$key])){
        return array();
    }
    $decoded = json_decode($_POST[$key]);
    return is_array($decoded) ? $decoded : array();
}
/** Searches for sentences that match a string query and returns them together
with the updated vector query (the string query translated into vector form
and added to the previous vector query).
Arguments:
-- query : the string query typed in by the user, escaped for MYSQL safety.
-- old_query : the previous query as an array with the optional keys
   'features', 'relevant', 'irrelevant', 'relevant_words' and
   'irrelevant_words'. Keys that are missing or null (e.g. on the very first
   search) are treated as empty.
Return:
A php array() with the following key-value pairs:
{query: {features: {featureID: floating-point value, ...},
         relevant: [...], irrelevant: [...],
         irrelevant_words: [...], relevant_words: [...]},
 sentences: [ the sentences returned by retrieve_sentences_from_vector_query ]
}
*/
function process_string_query($query, $old_query){
    $old_query = (array) $old_query;
    // Normalise the previous query so that every list is really an array;
    // array_merge() below cannot cope with null.
    $previous = array();
    foreach(array('features', 'relevant', 'irrelevant', 'relevant_words', 'irrelevant_words') as $key){
        $previous[$key] = isset($old_query[$key]) ? (array) $old_query[$key] : array();
    }

    $old_vector_query = new SparseVector($previous['features']);
    $vect_query = convert_query_to_sparse_vector($query);
    $new_query = $vect_query->vectorAdd($old_vector_query);
    $sentences = retrieve_sentences_from_vector_query($new_query);

    $result = array();
    $result['query'] = array();
    $result['sentences'] = $sentences;
    $result['query']['features'] = $new_query->features;
    $result['query']['relevant'] = $previous['relevant'];
    $result['query']['irrelevant'] = $previous['irrelevant'];
    $result['query']['irrelevant_words'] = $previous['irrelevant_words'];

    // every searched word also counts as a relevant word
    // NOTE(review): getWordID() is defined outside this file; its result is
    // treated here as a ", "-separated list of word IDs - confirm.
    $relevant_words = $previous['relevant_words'];
    foreach(explode(" ", $query) as $word){
        $wordIDs = explode(", ", getWordID($word));
        $relevant_words = array_merge($relevant_words, $wordIDs);
    }
    $result['query']['relevant_words'] = $relevant_words;
    return $result;
}
/* Convert a string query to a sparse vector. The query is split on single
   spaces and every word that is not a stop word, is unchanged by
   replaceWeirdCharacters() and is known to the database contributes one
   search-word feature of weight 1 per (word ID, part of speech) pair.
   Words that are empty or unknown are skipped. */
function convert_query_to_sparse_vector($query){
    global $STOPS; // stopwords, defined in dbsetup.php
    $vector = new SparseVector();
    foreach(explode(" ", $query) as $word){
        // repeated spaces produce empty words
        if($word === ""){
            continue;
        }
        // cheap filters first, so that rejected words cost no database query
        if(strstr($STOPS, strtolower($word))){
            continue;
        }
        if($word != replaceWeirdCharacters($word)){
            continue;
        }
        $found = getWordIDsAndPOS($word);
        // -1 means the word is not in the database; exploding it would
        // create a feature with an empty word ID
        if($found === -1){
            continue;
        }
        foreach(explode(", ", $found) as $wordID){
            // "id-pos": split on the first "-" only so that a part of speech
            // that itself contains "-" is kept whole
            $parts = explode("-", $wordID, 2);
            if(count($parts) < 2 || $parts[0] === ""){
                continue;
            }
            add_search_word_feature($parts[0], $parts[1], $word, 1, $vector);
        }
    }
    return $vector;
}
/* Get the ID's and parts of speech of a surface word.
   A "*" in the word is treated as a wildcard (translated to "%" in a LIKE
   match); otherwise the word must match exactly.
   Returns a string of the form "id-pos, id-pos, ...", or -1 if no word
   matches or the query fails. */
function getWordIDsAndPOS($word){
    $needle = trim($word);
    if(strpos($word, "*") !== false){
        $query = "SELECT pos, id FROM word WHERE word like '".mysql_escape_string(str_replace("*", "%", $needle))."';";
    }else{
        $query = "SELECT pos, id FROM word WHERE word ='".mysql_escape_string($needle)."';";
    }
    $result = mysql_query($query);
    // a failed query yields false, which must not be handed to mysql_num_rows()
    if(!$result || mysql_num_rows($result) <= 0){
        return -1;
    }
    $ids = array();
    while($row = mysql_fetch_array($result)){
        $ids[] = $row['id']."-".$row['pos'];
    }
    return join(", ", $ids);
}
/* Sets, on the given sparse vector, the weight of the word feature that is
   named after the word's ID, part of speech and surface form. */
function add_word_feature($wordID, $pos, $word, $weight, $vector){
    $feature_name = make_word_feature_name($wordID, $pos, $word);
    $vector->setFeatureValue($feature_name, $weight);
}
/* Sets, on the given sparse vector, the weight of the search-word feature
   that is named after the word's ID, part of speech and surface form. */
function add_search_word_feature($wordID, $pos, $word, $weight, $vector){
    $feature_name = make_search_word_feature_name($wordID, $pos, $word);
    $vector->setFeatureValue($feature_name, $weight);
}
/** Top-level wrapper for the relevance feedback computation.
Refines the current query with the user's feedback (see
calculate_new_vector_query) and fetches the sentences that match the refined
query.
Arguments:
-- query : the previous query sent by the client; its 'features' entry is
           the sparse vector {featureID:floating-point value, ....}
-- relevant: a list of sentenceID's marked relevant
-- irrelevant: a list of sentenceID's marked irrelevant
-- relevant_words: a list of wordID's marked relevant
-- irrelevant_words: a list of wordID's marked irrelevant

Return:
A php array() with the following key-value pairs.
{sentences: [ the matching sentences ],
 query: {features: {featureID:floating-point value, ...},
         relevant: [...], irrelevant: [...],
         relevant_words: [...], irrelevant_words: [...]}
}
*/
function process_relevance_feedback($query, $relevant, $irrelevant, $relevant_words, $irrelevant_words){
    $refined = calculate_new_vector_query($query, $relevant, $irrelevant, $relevant_words, $irrelevant_words);
    return array(
        'sentences' => retrieve_sentences_from_vector_query($refined),
        'query' => array(
            'features' => $refined->features,
            'relevant' => $relevant,
            'irrelevant' => $irrelevant,
            'relevant_words' => $relevant_words,
            'irrelevant_words' => $irrelevant_words
        )
    );
}
/* Computes the refined vector query: the normalized previous query plus the
   sentence-based and word-based adjustments, normalized again.
   Arguments:
   -- query: the previous query array; its 'features' entry holds the vector
   -- relevant / irrelevant: sentence ID's judged by the user
   -- relevant_words / irrelevant_words: word ID's judged by the user
   Return: the refined query as a SparseVector. */
function calculate_new_vector_query($query, $relevant, $irrelevant, $relevant_words, $irrelevant_words){
    $current = new SparseVector((array) $query['features']);
    $current->normalize();

    $from_sentences = calculate_sentence_adjustment($query, $current, $relevant, $irrelevant);
    $from_words = calculate_word_adjustment($query, $current, $relevant_words, $irrelevant_words);

    $refined = $current->vectorAdd($from_sentences->vectorAdd($from_words));
    $refined->normalize();
    return $refined;
}
/* Computes the change to apply to the query vector for the sentence feedback.
   Only the difference to the feedback already incorporated in the query
   ($query['relevant'] / $query['irrelevant']) is applied:
   -- newly relevant sentences are added with weight ALPHA_plus
   -- sentences no longer marked relevant are subtracted with ALPHA_plus
   -- newly irrelevant sentences are subtracted with weight ALPHA_minus
   -- sentences no longer marked irrelevant are added back with ALPHA_minus
   Each group of sentences is turned into one normalized vector first.
   Missing or null lists are treated as empty (a query produced by a first
   string search has no feedback lists yet).
   $vector_query is accepted for symmetry with calculate_word_adjustment and
   is not used.
   Return: the adjustment as a SparseVector. */
function calculate_sentence_adjustment($query, $vector_query, $relevant, $irrelevant){
    global $ALPHA_plus; // relevant sentences weight
    global $ALPHA_minus; // irrelevant sentences weight

    $already_relevant = isset($query['relevant']) ? (array) $query['relevant'] : array();
    $already_irrelevant = isset($query['irrelevant']) ? (array) $query['irrelevant'] : array();
    $relevant = is_array($relevant) ? $relevant : array();
    $irrelevant = is_array($irrelevant) ? $irrelevant : array();

    // array_subtract($a, $b) returns the items of $b that are not in $a
    $new_relevant = array_subtract($already_relevant, $relevant);
    $no_longer_relevant = array_subtract($relevant, $already_relevant);
    $new_irrelevant = array_subtract($already_irrelevant, $irrelevant);
    $no_longer_irrelevant = array_subtract($irrelevant, $already_irrelevant);

    $relevant_vect = convert_sentence_IDs_to_sparse_vector($new_relevant);
    $relevant_vect->normalize();
    $no_longer_relevant_vect = convert_sentence_IDs_to_sparse_vector($no_longer_relevant);
    $no_longer_relevant_vect->normalize();
    $irrelevant_vect = convert_sentence_IDs_to_sparse_vector($new_irrelevant);
    $irrelevant_vect->normalize();
    $no_longer_irrelevant_vect = convert_sentence_IDs_to_sparse_vector($no_longer_irrelevant);
    $no_longer_irrelevant_vect->normalize();

    $positive_adjustment = $relevant_vect->scalarMultiply($ALPHA_plus);
    $no_longer_positive_adjustment = $no_longer_relevant_vect->scalarMultiply(-1*$ALPHA_plus);
    $negative_adjustment = $irrelevant_vect->scalarMultiply(-1*$ALPHA_minus);
    $no_longer_negative_adjustment = $no_longer_irrelevant_vect->scalarMultiply($ALPHA_minus);

    $adjustment = $positive_adjustment->vectorAdd($negative_adjustment);
    $adjustment = $adjustment->vectorAdd($no_longer_positive_adjustment);
    $adjustment = $adjustment->vectorAdd($no_longer_negative_adjustment);
    return $adjustment;
}
/* Computes the change to apply to the query vector for the word feedback.
   The relevant words (weight ALPHA_w_plus) minus the irrelevant words
   (weight ALPHA_w_minus) are normalized, and then the negated values of all
   search-word features currently in $vector_query are added, so that the
   previous search-word weights are cancelled out when the adjustment is
   added to the query.
   $query is not used.
   Return: the adjustment as a SparseVector. */
function calculate_word_adjustment($query,$vector_query, $relevant, $irrelevant){
    global $ALPHA_w_plus, $ALPHA_w_minus; // relevant / irrelevant words weights

    // negate every search-word feature the query currently has
    $cancelled = array();
    foreach($vector_query->features as $name => $value){
        if(is_search_word_feature($name)){
            $cancelled[$name] = -1*$value;
        }
    }
    $cancellation = new SparseVector($cancelled);

    $relevant_vect = convert_word_IDs_to_sparse_vector($relevant);
    $irrelevant_vect = convert_word_IDs_to_sparse_vector($irrelevant);

    $towards = $relevant_vect->scalarMultiply($ALPHA_w_plus);
    $away = $irrelevant_vect->scalarMultiply(-1*$ALPHA_w_minus);

    // normalize the word feedback before the cancellation is added
    $adjustment = $towards->vectorAdd($away);
    $adjustment->normalize();
    return $adjustment->vectorAdd($cancellation);
}
/* Returns, as a freshly indexed list, the items of $subtract_from that do
   not occur in $to_subtract (loose comparison, original order kept). */
function array_subtract($to_subtract, $subtract_from){
    $kept = array();
    foreach($subtract_from as $candidate){
        if(in_array($candidate, $to_subtract)){
            continue;
        }
        $kept[] = $candidate;
    }
    return $kept;
}
/* Returns a freshly indexed list holding the values of $array1 followed by
   the values of $array2 (keys are discarded, duplicates are kept). */
function array_add($array1, $array2){
    $combined = array();
    foreach(array($array1, $array2) as $source){
        foreach($source as $element){
            $combined[] = $element;
        }
    }
    return $combined;
}
/* Builds one sparse vector from a list of sentence ID's: every word of the
   sentences that is not a stop word and is unchanged by
   replaceWeirdCharacters() becomes a word feature whose value is the sum of
   its tf_idf values over the given sentences.
   The ID's originate from the client, so they are forced to integers before
   they are placed in the SQL statement. An empty or non-array argument gives
   an empty vector.
   Return: a SparseVector. */
function convert_sentence_IDs_to_sparse_vector($sentence_ids){
    global $STOPS;
    $features = array();
    $safe_ids = is_array($sentence_ids) ? array_map('intval', $sentence_ids) : array();
    if(count($safe_ids) > 0){
        $sentence_id_string = join(", ", $safe_ids);
        // get all the words in these sentences and add them to the feature vector
        $sql = "SELECT * from sentence_word_tf_idf, word
            where sentence_id in (".$sentence_id_string.") and word_id = word.id;";
        $words_in_sentences = mysql_query($sql)
            or die("<b>A fatal MySQL error occured</b>.
<br/> Query: " . $sql . "
<br/> Error: (" . mysql_errno() . ") " . mysql_error());
        // add word features
        while($word_in_sentence = mysql_fetch_array($words_in_sentences)){
            // exclude stopwords and weird characters
            $word = $word_in_sentence['word'];
            if(!strstr($STOPS, strtolower($word)) && $word == replaceWeirdCharacters($word)){
                $word_feature = make_word_feature_name($word_in_sentence['word_id'], $word_in_sentence['pos'], $word);
                $weight = $word_in_sentence['tf_idf'];
                if(array_key_exists($word_feature, $features)){
                    $features[$word_feature] += $weight;
                }else{
                    $features[$word_feature] = $weight;
                }
            }
        }
        // add dependency features? synonym features? Maybe, if needed.
        // TODO
    }
    // create the vector
    return new SparseVector($features);
}
/* Builds one sparse vector from a list of word ID's: every word that is not
   a stop word and is unchanged by replaceWeirdCharacters() becomes a
   search-word feature with value 1.
   The ID's originate from the client, so they are forced to integers before
   they are placed in the SQL statement. An empty or non-array argument gives
   an empty vector.
   Return: a SparseVector. */
function convert_word_IDs_to_sparse_vector($word_ids){
    global $STOPS;
    $features = array();
    $safe_ids = is_array($word_ids) ? array_map('intval', $word_ids) : array();
    if(count($safe_ids) > 0){
        $word_id_string = join(", ", $safe_ids);
        // look the words up and add them to the feature vector
        $sql = "SELECT * from word
            where id in (".$word_id_string.");";
        $words = mysql_query($sql)
            or die("<b>A fatal MySQL error occured</b>.
<br/> Query: " . $sql . "
<br/> Error: (" . mysql_errno() . ") " . mysql_error());
        // add word features
        while($word = mysql_fetch_array($words)){
            // exclude stopwords and weird characters; both tests must be
            // separate operands of && (the stopword test takes the word, not
            // the result of the comparison, as its needle)
            $is_stopword = strstr($STOPS, strtolower($word['word']));
            if(!$is_stopword && $word['word'] == replaceWeirdCharacters($word['word'])){
                $word_feature = make_search_word_feature_name($word['id'], $word['pos'], $word['word']);
                $features[$word_feature] = 1;
            }
        }
    }
    // create the vector
    return new SparseVector($features);
}
/* Use the vector space model of information retrieval to return sentences
   that match a given vector query.
   Every sentence that contains at least one of the query's words is scored
   with the sum of the query weights of the words it contains; sentences with
   a score greater than 0 are returned, best match first.
   Feature names and weights may originate from the client, so only word and
   search-word features whose ID is a plain non-negative integer are used,
   and weights are forced to floats, before anything is placed in SQL.
   Arguments:
   -- query: a vector query object whose 'features' property is
             {featureID:floating-point value, ....}
   Return:
   The result of fetch_top_n_sentences() for at most $LIMIT sentence ID's.
*/
function retrieve_sentences_from_vector_query($query){
    global $LIMIT;
    $sentences = array();
    // collect the word-based features that are safe to use in SQL
    $word_ids = array();
    $scored_features = array();
    foreach($query->features as $featureID => $weight){
        if(!(is_word_feature($featureID) || is_search_word_feature($featureID))){
            continue;
        }
        $id = (string) get_id_from_feature_name($featureID);
        if(!preg_match('/\A[0-9]+\z/', $id)){
            continue;
        }
        $word_ids[] = $id;
        $weight = (float) $weight;
        $scored_features[$featureID] = is_finite($weight) ? $weight : 0.0;
    }
    if(count($word_ids) > 0){
        $string_word_ids = join(", ", $word_ids);
        $score_case = convert_to_case_expression($scored_features);
        // alternate score formula: "SUM(tf_idf*".$score_case.")/SUM(tf_idf) as score"
        $sql = "SELECT sentence_id,
            SUM(".$score_case.") as score
            from sentence_word_tf_idf
            WHERE word_id in (".$string_word_ids.")
            GROUP BY sentence_id ORDER BY score desc LIMIT ".$LIMIT.";";
        $words_in_sentences = mysql_query($sql)
            or die("<b>A fatal MySQL error occured</b>.
<br/> Query: " . $sql . "
<br/> Error: (" . mysql_errno() . ") " . mysql_error());
        while($scores = mysql_fetch_array($words_in_sentences)){
            if($scores['score'] > 0){
                $sentences[] = $scores['sentence_id'];
            }
        }
    }
    return fetch_top_n_sentences($sentences);
}
/* Translates a feature map into an SQL expression that evaluates to the
   total query weight of the row's word_id:
       (CASE WHEN word_id = <id> THEN <summed weight> ... ELSE 0 END)
   Features that share a word ID (e.g. the word and search-word feature of
   the same word) are summed.
   The word ID is the second "_"-separated component of the feature name.
   Because feature names and weights may originate from the client, features
   whose ID is not a plain non-negative integer are ignored and weights are
   forced to finite floats. With no usable feature the constant "(0)" is
   returned, since a CASE without any WHEN is not valid SQL.
   Arguments:
   -- features: {featureID:floating-point value, ...}
   Return: the SQL expression as a string. */
function convert_to_case_expression($features){
    $totals = array();
    foreach($features as $feature => $score){
        $components = explode("_", (string) $feature);
        if(!isset($components[1]) || !preg_match('/\A[0-9]+\z/', $components[1])){
            continue;
        }
        $word_id = $components[1];
        $score = (float) $score;
        if(!is_finite($score)){
            $score = 0.0;
        }
        if(!array_key_exists($word_id, $totals)){
            $totals[$word_id] = 0;
        }
        $totals[$word_id] += $score;
    }
    if(count($totals) == 0){
        return "(0)";
    }
    $sql = "(CASE";
    foreach($totals as $id => $score){
        $sql .= "
            WHEN word_id = ".$id."
            THEN ".$score;
    }
    return $sql." ELSE 0 END)";
}
/* Fetches the sentences for a list of sentence ID's, keeping the order of
   the list.
   Sentences for which no row is found (the query joins narrative and author
   tables) or for which getWordsInSentence() returns nothing are left out of
   the result rather than being reported as null entries.
   The ID's are forced to integers before they are placed in SQL.
   Arguments:
   -- top_n: the list of sentence id's to fetch, best match first
   Return:
   A list of sentences in the order of top_n
   [{id, words, narrative_id, title, date, author}, ...]
*/
function fetch_top_n_sentences($top_n){
    $top_n = is_array($top_n) ? array_map('intval', $top_n) : array();
    $sentences = array();
    if(count($top_n) > 0){
        $top_n_id_string = join(", ", $top_n);
        $sql = "SELECT
            sentence.id as id, sentence.narrative_id, sentence, title, date, full as author
            from sentence, narrative,
            author_xref_narrative as axn, author
            WHERE sentence.id in (".$top_n_id_string.")
            AND sentence.narrative_id = narrative.id
            AND axn.narrative_id = narrative.id
            AND axn.author_id = author.id;";
        $sentences_result = mysql_query($sql)
            or die("<b>A fatal MySQL error occured</b>.
<br/> Query: " . $sql . "
<br/> Error: (" . mysql_errno() . ") " . mysql_error());
        while($sentence_result = mysql_fetch_array($sentences_result)){
            $sentence = array();
            $sentence['id'] = $sentence_result['id'];
            $sentence['words'] = getWordsInSentence($sentence['id']);
            // ... and whatever other metadata here
            $sentence['narrative_id'] = $sentence_result['narrative_id'];
            $sentence['title'] = $sentence_result['title'];
            $sentence['date'] = $sentence_result['date'];
            $sentence['author'] = $sentence_result['author'];
            // store the information
            if(count($sentence['words']) > 0){
                $sentences[$sentence['id']] = $sentence;
            }
        }
    }
    // the SQL result is unordered; restore the ranking of $top_n
    $ordered = array();
    foreach($top_n as $id){
        if(isset($sentences[$id])){
            $ordered[] = $sentences[$id];
        }
    }
    return $ordered;
}
/* True when the feature name begins with "w", the prefix of word features. */
function is_word_feature($featureID){
    $prefix = "w";
    return substr($featureID, 0, strlen($prefix)) === $prefix;
}
/* True when the feature name begins with "s", the prefix of search-word
   features. */
function is_search_word_feature($featureID){
    $prefix = "s";
    return substr($featureID, 0, strlen($prefix)) === $prefix;
}
/* Returns the word ID of a feature name, i.e. its second "_"-separated
   component. */
function get_id_from_feature_name($featureID){
    list(, $id) = explode("_", $featureID);
    return $id;
}
/* Returns the part of speech of a feature name, i.e. its third
   "_"-separated component. */
function get_pos_from_feature_name($featureID){
    list(, , $pos) = explode("_", $featureID);
    return $pos;
}
/* Returns the surface word of a feature name of the form
   prefix_id_pos_word. The name is split into at most four parts so that a
   word which itself contains "_" is returned whole. Returns null when the
   name has no word component. */
function get_word_from_feature_name($featureID){
    $components = explode("_", $featureID, 4);
    return isset($components[3]) ? $components[3] : null;
}
/* True when the first strlen($needle) characters of $haystack are exactly
   $needle. */
function starts_with($haystack, $needle){
    $prefix = substr($haystack, 0, strlen($needle));
    return $prefix === $needle;
}
/* Builds the name of a word feature: "w", the word ID, the part of speech
   and the word, joined by "_". */
function make_word_feature_name($wordID, $pos, $word){
    return implode("_", array("w", $wordID, $pos, $word));
}
/* Builds the name of a search-word feature: "s", the word ID, the part of
   speech and the word, joined by "_". */
function make_search_word_feature_name($wordID, $pos, $word){
    return implode("_", array("s", $wordID, $pos, $word));
}
- /**************************************************************
- Relevance words
- ***************************************************************/
/* Summarises a vector query as weighted words.
   All word and search-word features are grouped by their surface word; for
   each word the individual (pos, id, weight) entries and the total weight
   are collected. The words are ordered with compare_words() and then split
   by the sign of their total.
   Arguments:
   -- query: a vector query object; its features are read from the
             'features' property, as everywhere else in this file
   Return:
   {relevant:   [{word, total, words:[{pos, id, weight}, ...]}, ...],
    irrelevant: [ same shape, for words with a negative total ]}
*/
function convert_vector_query_to_relevance_words($query){
    $words = array();
    $features = $query->features;
    foreach(array_keys($features) as $featureID){
        if(is_word_feature($featureID) || is_search_word_feature($featureID)){
            $id = get_id_from_feature_name($featureID);
            $pos = get_pos_from_feature_name($featureID);
            $word = get_word_from_feature_name($featureID);
            $weight = $features[$featureID];
            if(!array_key_exists($word, $words)){
                $words[$word] = array('total' => 0, 'words' => array());
            }
            $words[$word]['words'][] = array("pos"=>$pos, "id"=>$id, "weight"=>$weight);
            $words[$word]['total'] += $weight;
        }
    }
    uasort($words, 'compare_words');
    $relevance = array("relevant"=>array(), "irrelevant"=>array());
    foreach($words as $word => $info){
        // a total of exactly 0 counts as relevant
        $bucket = $info['total'] >= 0 ? 'relevant' : 'irrelevant';
        $relevance[$bucket][] = array("word"=>$word,
            "total"=>$info['total'],
            "words"=>$info['words']);
    }
    return $relevance;
}
/* Comparison callback for uasort(): orders word entries by their 'total'
   in descending order.
   Returns -1 when $word1 has the larger total (sorts first), 1 when it has
   the smaller total, and 0 when both totals are equal. */
function compare_words($word1, $word2){
    if($word1['total'] > $word2['total']){
        return -1;
    }
    if($word1['total'] < $word2['total']){
        return 1;
    }
    return 0;
}
- ?>