similarsentences.php

/src/php/similarsentences/similarsentences.php

https://bitbucket.org/silverasm/wordseer · PHP · 607 lines · 410 code · 34 blank · 163 comment · 37 complexity · d34c93edb60f8f91158e42159aa48950 MD5 · raw file

<?php
/* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */

/*****************************************************************************
similarsentences.php

Calculates a result set of sentences in response to a query using relevance 
feedback, specifically the Rocchio algorithm.

The top-level dispatch procedure works with either a string query,
or with a vector query and two sets of sentences: those judged by the user
to be relevant, and those judged non-relevant. In return it always sends
back a set of sentences.

The Rocchio algorithm uses the vector-space model of information retrieval
to refine a query based on relevance feedback.

*****************************************************************************/
include_once '../../../config.php';
include_once '../dbsetup.php';
include_once '../util.php';
include_once 'sparsevector.php';
include_once '../priorityqueue.php';

/** Algorithm parameters **/
// These are plain globals; the functions below read them via `global`.
// weight with which query should be adjusted towards relevant sentences
$ALPHA_plus = 0.1;
// weight with which query should be adjusted towards relevant words
$ALPHA_w_plus = 1; // treat like a search term
// weight with which query should be adjusted away from irrelevant sentences
// (one tenth of the corresponding positive weight)
$ALPHA_minus = $ALPHA_plus*0.1; 
// weight with which query should be adjusted away from irrelevant words
// (one tenth of the corresponding positive weight)
$ALPHA_w_minus = $ALPHA_w_plus*0.1;
// number of returned sentences
$LIMIT = 500;

/** dispatch procedure

Sends back sentences based on the $_POST data sent by the client.

Arguments:
	-- 'string_query': "true", or "false". "true" indicates that the query is
						a string query, and the query will be interpreted as             		
						such
	-- 'vector_query': "true" or "false". "true" indicates that the query is 
						a vector query, and the query will be interpreted as
						a vector.
	-- 'query':  a String (if string_query is "true"), or (if 'vector_query' 
				is "true") an object containing 
				features:
					a map from string feature IDs to floating-point
					values
				relevant: a list of relevant sentences represented 
				irrelevant: a list of irrelevant sentences represented
					
	-- 'relevant': a JSON _list_ of integer sentence ID's judged by the user 		
					to be relevant.
	-- 'irrelevant': a JSON _list_ of integer sentence ID's judged by the user 
					to be NOT relevant.
	-- 'relevant_words': a JSON _list_ of integer word ID's judged by the user 		
					to be relevant.
	-- 'irrelevant_words': a JSON _list_ of integer word ID's judged by the user 
					to be NOT relevant.

Return values

A JSON response is sent to the client, containing the following data:
	-- sentences : a list of sentences. Where each sentence has an ID, a 	
				  narrativeID, title, date, and a string.
	-- query : a feature-vector representing the current query, to be stored
				by the client and sent back with further query refinements.
{sentences:
	[{
	sentenceID: integer
	sentence: string
	}
	...,
	....]
 vector_query: {featureID: floating-point value, ..., ...}
 relevant: [ the list of sentence ID's incorporated as relevant],
 irrelevant: [ the list of sentence ID's incorporated as irrelevant],
 relevant_words: [ the list of word ID's incorporated as relevant],
 irrelevant_words: [ the list of word ID's incorporated as irrelevant]
}
**/
dispatch();

/* Request entry point: reads the posted query, runs either the string-query
path or the vector-query (relevance feedback) path, and echoes the result as
JSON.

Every $_POST key is checked before it is read, so a request that omits a
field produces an empty result instead of an undefined-index notice mixed
into the JSON output. Feedback lists that are absent or are not valid JSON
are treated as empty lists.
*/
function dispatch(){
	$result = array();
	$is_string_query = isset($_POST['string_query']) && $_POST['string_query'] === "true";
	$is_vector_query = isset($_POST['vector_query']) && $_POST['vector_query'] === "true";
	if($is_string_query){
		// the previous vector query (if any) travels under the key 'vector-query'
		$current_query = array();
		if(isset($_POST['vector-query'])){
			$current_query = (array) json_decode($_POST['vector-query']);
		}
		$raw_query = isset($_POST['query']) ? $_POST['query'] : '';
		$result = process_string_query(mysql_escape_string(strtolower($raw_query)), $current_query);
	}else if($is_vector_query){
		// unpack the sentence and word feedback lists, defaulting each to empty
		$feedback = array(
			'relevant' => array(),
			'irrelevant' => array(),
			'relevant_words' => array(),
			'irrelevant_words' => array(),
		);
		foreach(array_keys($feedback) as $key){
			if(!array_key_exists($key, $_POST)){
				continue;
			}
			$raw = $_POST[$key];
			$decoded = is_string($raw) ? json_decode($raw) : $raw;
			// json_decode yields null for malformed input; keep the empty default then
			if($decoded !== null){
				$feedback[$key] = (array) $decoded;
			}
		}
		$query = isset($_POST['query']) ? (array) json_decode($_POST['query']) : array();

		if(array_key_exists('calculate_new_query_words', $_POST)){
			// only report which words the refined query would contain,
			// given the marked sentences/words and the previous vector query
			$new_query = calculate_new_vector_query($query, $feedback['relevant'], $feedback['irrelevant'], $feedback['relevant_words'], $feedback['irrelevant_words']);
			$result = convert_vector_query_to_relevance_words($new_query);
		}else{
			// perform relevance feedback and send back new sentences
			// based on the marked sentences and words
			$result = process_relevance_feedback($query, $feedback['relevant'], $feedback['irrelevant'], $feedback['relevant_words'], $feedback['irrelevant_words']);
		}
	}
	echo json_encode($result);
}

/** Searches for sentences that match the given string query and returns them
together with the updated vector query (the string query translated to vector
form and added to the previous vector query).

Arguments:
	-- query : the string query typed in by the user, already escaped for
	           MySQL by the caller. Words are split on single spaces.
	-- old_query : the previous query state as an array; may be empty. Keys
	           read: 'features', 'relevant', 'irrelevant', 'relevant_words',
	           'irrelevant_words'. Missing keys are treated as empty.
Return:
A php array() with the following key-value pairs:
	{sentences:[ ...sentences as returned by
	             retrieve_sentences_from_vector_query()... ],
	 query:{features:{featureID:floating-point value, ...},
	        relevant:[...], irrelevant:[...],
	        relevant_words:[...], irrelevant_words:[...]}
	}
*/
function process_string_query($query, $old_query){
	$old_query = (array) $old_query;
	$old_features = isset($old_query['features']) ? (array) $old_query['features'] : array();
	$old_vector_query = new SparseVector($old_features);
	$vect_query = convert_query_to_sparse_vector($query);
	$new_query = $vect_query->vectorAdd($old_vector_query);
	$sentences = retrieve_sentences_from_vector_query($new_query);

	// Carry the earlier feedback lists over. On a first query none of them
	// exist yet, so default each to an empty list (array_merge below needs
	// a real array).
	$carried = array();
	foreach(array('relevant', 'irrelevant', 'relevant_words', 'irrelevant_words') as $key){
		$carried[$key] = isset($old_query[$key]) ? (array) $old_query[$key] : array();
	}

	// every typed word also counts as a relevant word
	$relevant_words = $carried['relevant_words'];
	foreach(explode(" ", $query) as $word){
		if($word === ''){
			// produced by consecutive spaces; nothing to look up
			continue;
		}
		// getWordID() is defined outside this file; its result is treated
		// as a ", "-separated list of IDs
		$wordIDs = explode(", ", getWordID($word));
		$relevant_words = array_merge($relevant_words, $wordIDs);
	}

	$result = array();
	$result['query'] = array();
	$result['sentences'] = $sentences;
	$result['query']['features'] = $new_query->features;
	$result['query']['relevant'] = $carried['relevant'];
	$result['query']['irrelevant'] = $carried['irrelevant'];
	$result['query']['irrelevant_words'] = $carried['irrelevant_words'];
	$result['query']['relevant_words'] = $relevant_words;
	return $result;
}

/* Convert a string query to a sparse vector by assigning weight 1 to a
search-word feature for every (word id, part of speech) pair matching each
word of the query. Words are determined by splitting on single spaces.

Skipped words:
	-- empty tokens (from consecutive spaces)
	-- words found as a substring of the global $STOPS string
	-- words that replaceWeirdCharacters() would alter
	-- words with no match in the word table (getWordIDsAndPOS() returns -1);
	   without this check the -1 sentinel was split into a feature with an
	   empty word id, which later produced invalid SQL.
*/
function convert_query_to_sparse_vector($query){
	global $STOPS;
	$vector = new SparseVector();
	foreach(explode(" ", $query) as $word){
		if($word === ''){
			continue;
		}
		if(strstr($STOPS, strtolower($word))){
			continue;
		}
		if($word != replaceWeirdCharacters($word)){
			continue;
		}
		$lookup = getWordIDsAndPOS($word);
		if($lookup === -1){
			// unknown word: nothing to add
			continue;
		}
		foreach(explode(", ", $lookup) as $wordID){
			// entries look like "<id>-<pos>"; split only on the first "-"
			// so a part-of-speech tag that itself contains "-" stays intact
			$parts = explode("-", $wordID, 2);
			$id = $parts[0];
			$pos = isset($parts[1]) ? $parts[1] : '';
			add_search_word_feature($id, $pos, $word, 1, $vector);
		}
	}
	return $vector;
}

/* Looks up the IDs and parts of speech recorded for a surface word.

A "*" in the word switches the lookup from exact match to LIKE, with each
"*" translated to "%".

Return: a string of "<id>-<pos>" entries joined by ", ", or the integer -1
when the lookup yields no rows.
*/
function getWordIDsAndPOS($word){
	if(strstr($word, "*")){
		$sql = "SELECT pos, id FROM word WHERE word like '".mysql_escape_string(trim(str_replace("*", "%", $word)))."';";
	}else{
		$sql = "SELECT pos, id FROM word WHERE word ='".mysql_escape_string(trim($word))."';";
	}
	$rows = mysql_query($sql);
	if(!(mysql_num_rows($rows) > 0)){
		return -1;
	}
	$pairs = array();
	while($row = mysql_fetch_array($rows)){
		$pairs[] = $row['id']."-".$row['pos'];
	}
	return implode(", ", $pairs);
}

/* Sets, on the given sparse vector, the word feature named after
(wordID, pos, word) to the given weight. */
function add_word_feature($wordID, $pos, $word, $weight, $vector){
	$feature_name = make_word_feature_name($wordID, $pos, $word);
	$vector->setFeatureValue($feature_name, $weight);
}

/* Sets, on the given sparse vector, the search-word feature named after
(wordID, pos, word) to the given weight. */
function add_search_word_feature($wordID, $pos, $word, $weight, $vector){
	$feature_name = make_search_word_feature_name($wordID, $pos, $word);
	$vector->setFeatureValue($feature_name, $weight);
}

/** Top-level wrapper for the relevance feedback computation.

Refines the current query with the user's feedback and fetches the sentences
that match the refined query:

- calculate_new_vector_query() produces the refined query
- retrieve_sentences_from_vector_query() fetches the matching sentences

Arguments:
	-- query : the query state sent by the client; its 'features' entry is
	           the sparse vector {featureID:floating-point value, ....}
	-- relevant: a list of sentenceID's marked relevant
	-- irrelevant: a list of sentenceID's marked irrelevant
	-- relevant_words: a list of word ID's marked relevant
	-- irrelevant_words: a list of word ID's marked irrelevant

Return:
	A php array() with the following key-value pairs.
	{sentences:[ ...matching sentences... ],
	 query:{features:{featureID:floating-point value, ...},
	        relevant:[...], irrelevant:[...],
	        relevant_words:[...], irrelevant_words:[...]}
	}
	The four feedback lists are echoed back exactly as they were passed in.
*/
function process_relevance_feedback($query, $relevant, $irrelevant, $relevant_words, $irrelevant_words){
	$refined = calculate_new_vector_query($query, $relevant, $irrelevant, $relevant_words, $irrelevant_words);
	return array(
		'sentences' => retrieve_sentences_from_vector_query($refined),
		'query' => array(
			'features' => $refined->features,
			'relevant' => $relevant,
			'irrelevant' => $irrelevant,
			'relevant_words' => $relevant_words,
			'irrelevant_words' => $irrelevant_words,
		),
	);
}

/* Builds the refined vector query from the previous query and the feedback.

The previous query's 'features' become a SparseVector, which is normalized;
the sentence-based and word-based adjustments are added to it, and the sum
is normalized again before being returned.

Return: the refined query as a SparseVector.
*/
function calculate_new_vector_query($query, $relevant, $irrelevant, $relevant_words, $irrelevant_words){
	$current = new SparseVector((array) $query['features']);
	$current->normalize();
	$from_sentences = calculate_sentence_adjustment($query, $current, $relevant, $irrelevant);
	$from_words = calculate_word_adjustment($query, $current, $relevant_words, $irrelevant_words);
	$refined = $current->vectorAdd($from_sentences->vectorAdd($from_words));
	$refined->normalize();
	return $refined;
}

/* Computes the sentence-based adjustment vector for relevance feedback.

The sentence lists stored in the previous query ($query['relevant'],
$query['irrelevant']) are compared with the lists passed in, giving four
groups. Each group is turned into a normalized sentence vector and scaled:

	newly relevant         -> +$ALPHA_plus
	newly irrelevant       -> -$ALPHA_minus
	no longer relevant     -> -$ALPHA_plus   (undoes an earlier boost)
	no longer irrelevant   -> +$ALPHA_minus  (undoes an earlier penalty)

Missing or null lists, in $query or in the arguments, count as empty.
$vector_query is accepted for signature compatibility and is not used.

Return: the sum of the four scaled vectors.
*/
function calculate_sentence_adjustment($query, $vector_query, $relevant, $irrelevant){
	global $ALPHA_plus; // relevant sentences weight
	global $ALPHA_minus; // irrelevant sentences weight
	$relevant = (array) $relevant;
	$irrelevant = (array) $irrelevant;
	$already_relevant = isset($query['relevant']) ? (array) $query['relevant'] : array();
	$already_irrelevant = isset($query['irrelevant']) ? (array) $query['irrelevant'] : array();

	// array_subtract($a, $b) keeps the items of $b that are not in $a.
	// The terms are listed in the order in which they are summed.
	$terms = array(
		array(array_subtract($already_relevant, $relevant), $ALPHA_plus),
		array(array_subtract($already_irrelevant, $irrelevant), -1*$ALPHA_minus),
		array(array_subtract($relevant, $already_relevant), -1*$ALPHA_plus),
		array(array_subtract($irrelevant, $already_irrelevant), $ALPHA_minus),
	);

	$adjustment = null;
	foreach($terms as $term){
		$vect = convert_sentence_IDs_to_sparse_vector($term[0]);
		$vect->normalize();
		$scaled = $vect->scalarMultiply($term[1]);
		$adjustment = ($adjustment === null) ? $scaled : $adjustment->vectorAdd($scaled);
	}
	return $adjustment;
}

/* Computes the word-based adjustment vector for relevance feedback.

The relevant and irrelevant word IDs become search-word vectors, scaled by
+$ALPHA_w_plus and -$ALPHA_w_minus; their sum is normalized. A cancellation
vector is then added that holds the negated value of every search-word
feature already in $vector_query, so adding the returned adjustment to
$vector_query replaces the old search-word weights rather than stacking on
top of them.

$query is accepted for signature compatibility and is not used.
*/
function calculate_word_adjustment($query,$vector_query, $relevant, $irrelevant){
	global $ALPHA_w_plus; // relevant words weight
	global $ALPHA_w_minus; // irrelevant words weight

	// negated copy of the search-word features currently in the query
	$negated = array();
	foreach($vector_query->features as $name => $value){
		if(is_search_word_feature($name)){
			$negated[$name] = -1*$value;
		}
	}
	$cancellation = new SparseVector($negated);

	$wanted = convert_word_IDs_to_sparse_vector($relevant);
	$unwanted = convert_word_IDs_to_sparse_vector($irrelevant);
	$boost = $wanted->scalarMultiply($ALPHA_w_plus);
	$penalty = $unwanted->scalarMultiply(-1*$ALPHA_w_minus);
	$combined = $boost->vectorAdd($penalty);
	$combined->normalize();
	return $combined->vectorAdd($cancellation);
}

/* Returns the items of $subtract_from that do not occur in $to_subtract,
in their original order, as a freshly indexed list. Membership uses
in_array()'s default (loose) comparison. */
function array_subtract($to_subtract, $subtract_from){
	$kept = array();
	foreach($subtract_from as $candidate){
		if(in_array($candidate, $to_subtract)){
			continue;
		}
		$kept[] = $candidate;
	}
	return $kept;
}

/* Returns a new list holding every value of $array1 followed by every value
of $array2. Keys are discarded; the result is indexed from 0. */
function array_add($array1, $array2){
	$combined = array();
	foreach(array($array1, $array2) as $source){
		foreach($source as $item){
			$combined[] = $item;
		}
	}
	return $combined;
}

/* Builds a sparse vector of word features from a list of sentence IDs.

Every row of sentence_word_tf_idf (joined with word) belonging to one of the
sentences contributes its tf_idf value to the word feature named after the
row's word_id, pos and word; values for the same feature are summed.

Words found as a substring of the global $STOPS string, and words that
replaceWeirdCharacters() would alter, are skipped.

The IDs originate from the client request, so each one is forced to an
integer before it is placed into the SQL text.

Return: a SparseVector over the collected features (empty when no IDs are
given).
*/
function convert_sentence_IDs_to_sparse_vector($sentence_ids){
	global $STOPS;
	$features = array();
	$safe_ids = array_map('intval', (array) $sentence_ids);
	if(count($safe_ids) > 0){
		$sentence_id_string = join(", ", $safe_ids);
		// get all the words in these sentences and add them to the feature vector
		$sql = "SELECT * from sentence_word_tf_idf, word
		where sentence_id in (".$sentence_id_string.") and word_id = word.id;";
		$words_in_sentences = mysql_query($sql)
			or die("<b>A fatal MySQL error occured</b>.
		<br/> Query: " . $sql . "
		<br/> Error: (" . mysql_errno() . ") " . mysql_error());
		// add word features
		while($word_in_sentence = mysql_fetch_array($words_in_sentences)){
			$word = $word_in_sentence['word'];
			// exclude stopwords and weird characters
			if(strstr($STOPS, strtolower($word)) || $word != replaceWeirdCharacters($word)){
				continue;
			}
			$word_feature = make_word_feature_name($word_in_sentence['word_id'], $word_in_sentence['pos'], $word);
			$weight = $word_in_sentence['tf_idf'];
			if(array_key_exists($word_feature, $features)){
				$features[$word_feature] += $weight;
			}else{
				$features[$word_feature] = $weight;
			}
		}
		// add dependency features? synonym features? Maybe, if needed.
		// TODO
	}
	// create the vector
	return new SparseVector($features);
}

/* Builds a sparse vector of search-word features from a list of word IDs.

Each matching row of the word table yields one search-word feature (named
after the row's id, pos and word) with value 1.

Words found as a substring of the global $STOPS string, and words that
replaceWeirdCharacters() would alter, are skipped. (The earlier condition
had a misplaced parenthesis that put the whole `&&` expression inside
strstr()'s needle argument, so this filter never took effect.)

The IDs originate from the client request, so each one is forced to an
integer before it is placed into the SQL text.

Return: a SparseVector over the collected features (empty when no IDs are
given).
*/
function convert_word_IDs_to_sparse_vector($word_ids){
	global $STOPS;
	$features = array();
	$safe_ids = array_map('intval', (array) $word_ids);
	if(count($safe_ids) > 0){
		$word_id_string = join(", ", $safe_ids);
		// fetch the word rows for these IDs
		$sql = "SELECT * from word
		where id in (".$word_id_string.");";
		$words = mysql_query($sql)
			or die("<b>A fatal MySQL error occured</b>.
			<br/> Query: " . $sql . "
			<br/> Error: (" . mysql_errno() . ") " . mysql_error());
		// add word features
		while($word = mysql_fetch_array($words)){
			// exclude stopwords and weird characters
			if(!strstr($STOPS, strtolower($word['word'])) && $word['word'] == replaceWeirdCharacters($word['word'])){
				$word_feature = make_search_word_feature_name($word['id'], $word['pos'], $word['word']);
				$features[$word_feature] = 1;
			}
		}
	}
	// create the vector
	return new SparseVector($features);
}

/* Use the vector space model of information retrieval to return sentences
that match a given vector query.

The word IDs of all word and search-word features in the query select the
candidate rows of sentence_word_tf_idf; each sentence is scored by summing
the per-word weights produced by convert_to_case_expression(), and only
sentences with a positive score are kept.

Feature names travel to the client and back, so the ID parsed from each
name is forced to an integer before it is placed into the SQL text. This
also keeps a feature name with an empty ID from producing invalid SQL.

Arguments:
	-- query: a SparseVector whose ->features is
	          {featureID:floating-point value, ....}

Return:
	At most $LIMIT sentences, best match first, as returned by
	fetch_top_n_sentences().
*/
function retrieve_sentences_from_vector_query($query){
	global $LIMIT;
	$sentences = array();
	// collect the word IDs behind the word-based features
	$word_ids = array();
	foreach(array_keys($query->features) as $featureID){
		if(is_word_feature($featureID) || is_search_word_feature($featureID)){
			$word_ids[] = intval(get_id_from_feature_name($featureID));
		}
	}
	if(count($word_ids) > 0){
		$string_word_ids = join(", ", array_unique($word_ids));
		$score_case = convert_to_case_expression($query->features);
		// alternate score formula: "SUM(tf_idf*".$score_case.")/SUM(tf_idf) as score"
		$sql = "SELECT sentence_id,
	SUM(".$score_case.") as score
	from sentence_word_tf_idf
	WHERE word_id in (".$string_word_ids.")
	GROUP BY sentence_id ORDER BY score desc LIMIT ".intval($LIMIT).";";
		$words_in_sentences = mysql_query($sql)
			or die("<b>A fatal MySQL error occured</b>.
			<br/> Query: " . $sql . "
			<br/> Error: (" . mysql_errno() . ") " . mysql_error());
		while($scores = mysql_fetch_array($words_in_sentences)){
			if($scores['score'] > 0){
				$sentences[] = $scores['sentence_id'];
			}
		}
	}
	return fetch_top_n_sentences($sentences);
}

/* Builds the SQL CASE expression that maps a word_id to its query weight.

The weights of all features that share a word ID are summed, and one
"WHEN word_id = <id> THEN <total>" branch is emitted per ID; any other
word_id evaluates to 0.

Feature names travel to the client and back, so the ID parsed from each
name is forced to an integer before it is placed into the SQL text. The
totals are results of numeric addition.

Arguments:
	-- features: {featureID:floating-point value, ...}

Return: the expression as a string, "(CASE ... ELSE 0 END)".
*/
function convert_to_case_expression($features){
	$totals = array();
	foreach($features as $feature => $score){
		$word_id = intval(get_id_from_feature_name($feature));
		if(!array_key_exists($word_id, $totals)){
			$totals[$word_id] = 0;
		}
		$totals[$word_id] += $score;
	}
	$sql = "(CASE";
	foreach($totals as $id => $total){
		$sql .= " 
		WHEN word_id = ".$id." 
			THEN ".$total;
	}
	$sql .= " ELSE 0 END)";
	return $sql;
}


/* Fetches the sentences for the given sentence ID's, preserving their order.

Arguments:
	-- top_n: the list of sentence id's to fetch, best match first

Return:
	A list of sentences in the same order as top_n:
	[{id, words, narrative_id, title, date, author}, ...]
	'words' comes from getWordsInSentence(). An ID is left out of the result
	when the query returns no row for it or when its 'words' value is empty,
	so the list never contains null placeholders.
*/
function fetch_top_n_sentences($top_n){
	$sentences = array();
	if(count($top_n) > 0){
		$top_n_id_string = join(", ", $top_n);
		$sql = "SELECT 
		sentence.id as id, sentence.narrative_id, sentence, title, date, full as author 
	 	from sentence, narrative, 
		author_xref_narrative as axn, author
		WHERE sentence.id in (".$top_n_id_string.")
		AND sentence.narrative_id = narrative.id
		AND axn.narrative_id = narrative.id
		AND axn.author_id = author.id;";
		$sentences_result = mysql_query($sql)
			or die("<b>A fatal MySQL error occured</b>.
			<br/> Query: " . $sql . "
			<br/> Error: (" . mysql_errno() . ") " . mysql_error());
		while($sentence_result = mysql_fetch_array($sentences_result)){
			$sentence = array();
			$sentence['id'] = $sentence_result['id'];
			$sentence['words'] = getWordsInSentence($sentence['id']);
			// ... and whatever other metadata here
			$sentence['narrative_id'] = $sentence_result['narrative_id'];
			$sentence['title'] = $sentence_result['title'];
			$sentence['date'] = $sentence_result['date'];
			$sentence['author'] = $sentence_result['author'];
			// store the information, keyed by id so it can be re-ordered below
			if(count($sentence['words']) > 0){
				$sentences[$sentence['id']] = $sentence;
			}
		}
	}
	// the SQL result is not ordered; restore the caller's ranking
	$ordered = array();
	foreach($top_n as $id){
		if(isset($sentences[$id])){
			$ordered[] = $sentences[$id];
		}
	}
	return $ordered;
}
/* True when the feature name begins with "w", the prefix written by
make_word_feature_name(). */
function is_word_feature($featureID){
	$has_word_prefix = starts_with($featureID, "w");
	return $has_word_prefix;
}

/* True when the feature name begins with "s", the prefix written by
make_search_word_feature_name(). */
function is_search_word_feature($featureID){
	$has_search_prefix = starts_with($featureID, "s");
	return $has_search_prefix;
}

/* Returns the second "_"-separated component of a feature name, which is
the word ID in names of the form "<prefix>_<id>_<pos>_<word>". */
function get_id_from_feature_name($featureID){
	list($prefix, $id) = explode("_", $featureID);
	return $id;
}

/* Returns the third "_"-separated component of a feature name, which is the
part of speech in names of the form "<prefix>_<id>_<pos>_<word>". */
function get_pos_from_feature_name($featureID){
	list($prefix, $id, $pos) = explode("_", $featureID);
	return $pos;
}

/* Returns the word part of a feature name of the form
"<prefix>_<id>_<pos>_<word>".

The name is split into at most four parts, so a word that itself contains
"_" is returned whole instead of being cut at its first underscore.
Returns null when the name has fewer than four components.
*/
function get_word_from_feature_name($featureID){
	$components = explode("_", $featureID, 4);
	return isset($components[3]) ? $components[3] : null;
}

/* True when the first strlen($needle) characters of $haystack are identical
to $needle. */
function starts_with($haystack, $needle){
    $prefix = substr($haystack, 0, strlen($needle));
    return $prefix === $needle;
}

/* Builds the name of a word feature: "w_<wordID>_<pos>_<word>". */
function make_word_feature_name($wordID, $pos, $word){
	return implode('_', array("w", $wordID, $pos, $word));
}

/* Builds the name of a search-word feature: "s_<wordID>_<pos>_<word>". */
function make_search_word_feature_name($wordID, $pos, $word){
	return implode('_', array("s", $wordID, $pos, $word));
}

/**************************************************************
Relevance words
***************************************************************/

/* Groups the word and search-word features of a vector query by surface
word and splits the words into relevant and irrelevant ones.

For every word the weights of all its features are summed into 'total';
the individual (pos, id, weight) triples are kept under 'words'. The words
are ordered with compare_words() and then placed into 'relevant'
(total >= 0) or 'irrelevant' (total < 0).

Arguments:
	-- query: a SparseVector. Its features are read from the ->features
	          property, as everywhere else in this file; a features()
	          method is used instead if the class provides one.

Return:
	{relevant:   [{word, total, words:[{pos, id, weight}, ...]}, ...],
	 irrelevant: [ ...same shape... ]}
*/
function convert_vector_query_to_relevance_words($query){
	$features = method_exists($query, 'features') ? $query->features() : $query->features;
	$words = array();
	foreach($features as $featureID => $weight){
		if(!(is_word_feature($featureID) || is_search_word_feature($featureID))){
			continue;
		}
		$word = get_word_from_feature_name($featureID);
		if(!array_key_exists($word, $words)){
			$words[$word] = array('total' => 0, 'words' => array());
		}
		$words[$word]['words'][] = array(
			"pos" => get_pos_from_feature_name($featureID),
			"id" => get_id_from_feature_name($featureID),
			"weight" => $weight,
		);
		$words[$word]['total'] += $weight;
	}
	uasort($words, 'compare_words');
	$relevance = array("relevant"=>array(), "irrelevant"=>array());
	foreach($words as $word => $entry){
		$bucket = ($entry['total'] >= 0) ? 'relevant' : 'irrelevant';
		$relevance[$bucket][] = array("word"=>$word,
								"total"=>$entry['total'],
								"words"=>$entry['words']);
	}
	return $relevance;
}

/* Sort callback ordering word entries by their 'total', largest first.

Returns -1 when $word1 has the larger total, 1 when $word2 has the larger
total, and 0 when the totals are equal. (The earlier version returned 0 for
"greater" and 1 for everything else, so it could never return -1.)
*/
function compare_words($word1, $word2){
	if($word1['total'] == $word2['total']){
		return 0;
	}
	return $word1['total'] > $word2['total'] ? -1 : 1;
}
?>
Alerts (29)

'$_POST[' Unbounded request input detected; limit size (e.g., check Content-Length or use max input vars) to prevent memory exhaustion
89 90 91 92 99 102 108 111 113
'query(' Potential SQL injection risk; use PDO or MySQLi with prepared statements
91
'global $' Use of global variables; prefer dependency injection or function parameters
172 272 273 299 300 344 380 419
'mysql_fetch_array(' Deprecated mysql_* functions; use PDO or MySQLi for modern database access
205 358 394 443 502
'die(' Abrupt termination detected; use try-catch or custom error handlers for better control
352 388 440 498
Complexity hotspot; line 361 (total complexity: 3)
361
Complexity hotspot; line 396 (total complexity: 3)
396