/www/includes/easyparliament/searchengine.php
PHP | 641 lines | 524 code | 38 blank | 79 comment | 130 complexity | a0dca896dadcf1c5b3e0aae3fa984d3e MD5 | raw file
- <?php
- # vim:sw=4:ts=4:et:nowrap
- /*
- SEARCHENGINE class 2004-05-26
- francis@flourish.org
- Example usage:
- include_once INCLUDESPATH."easyparliament/searchengine.php";
- $searchengine = new SEARCHENGINE($searchstring);
- $description = $searchengine->query_description_long();
- $short_description = $searchengine->query_description_short();
- $count = $searchengine->run_count();
- // $first_result begins at 0
- $searchengine->run_search($first_result, $results_per_page);
- $gids = $searchengine->get_gids();
- $relevances = $searchengine->get_relevances();
- $bestpos = $searchengine->position_of_first_word($body);
- $extract = $searchengine->highlight($extract);
- */
- include_once INCLUDESPATH . 'dbtypes.php';
- if (defined('XAPIANDB') && XAPIANDB) {
- if (file_exists('/usr/local/share/php5/xapian.php'))
- include_once '/usr/local/share/php5/xapian.php';
- if (file_exists('/usr/local/share/xapian-bindings/php5/xapian.php'))
- include_once '/usr/local/share/xapian-bindings/php5/xapian.php';
- if (file_exists('/usr/share/php5/xapian.php'))
- include_once '/usr/share/php5/xapian.php';
- if (file_exists('/usr/share/php/xapian.php'))
- include_once '/usr/share/php/xapian.php';
- }
- global $xapiandb;
- class SEARCHENGINE {
// Constructor: parses the raw search string into four buckets kept on the
// object for the other methods to use:
//   $this->words    - plain words (lower-cased)
//   $this->phrases  - quoted phrases, as an array of arrays of words
//   $this->prefixed - prefix:value items such as speaker:10043, as
//                     array(name, value) pairs
//   $this->excluded - words preceded by a minus sign
//
// $query is the search string exactly as typed by the user.
//
// When XAPIANDB is not defined (or false) nothing is parsed, but the buckets
// are still initialised to empty arrays so the other methods can be called
// without tripping over undefined properties.
//
// NOTE(review): this is a PHP 4 style constructor (method named after the
// class). PHP 7 deprecates that convention and PHP 8 no longer treats it as
// a constructor; rename to __construct() when the codebase moves.
function SEARCHENGINE ($query) {
    $this->query = $query;
    $this->enquire = null;
    $this->stemmer = null;

    // Any characters other than these are treated as, basically, white space
    // (apart from quotes and minuses, special-cased below).
    // The colon is in here for prefixes: speaker:10043 and so on.
    $this->wordchars = "A-Za-z0-9:";

    $this->words = array();
    $this->phrases = array();
    $this->prefixed = array();
    $this->excluded = array();
    // Stemmed words // doesn't work yet
    // $this->rough = array();

    if (!defined('XAPIANDB') || !XAPIANDB)
        return null;

    $this->stemmer = new XapianStem('english');

    // Aliases accepted after "section:", mapped to the major number they
    // are rewritten to.
    $section_aliases = array(
        'debates' => 1, 'debate' => 1, 'representatives' => 1,
        'whall' => 2, 'westminster' => 2, 'westminhall' => 2,
        'wrans' => 3, 'wran' => 3,
        'wms' => 4, 'statements' => 4, 'statement' => 4,
        'lordsdebates' => 101, 'lords' => 101, 'senate' => 101,
        'ni' => 5,
        'pbc' => 6, 'standing' => 6,
    );
    // Aliases accepted after "groupby:", mapped to their canonical value.
    $groupby_aliases = array(
        'date' => 'day', 'day' => 'day',
        'debates' => 'debate', 'debate' => 'debate', 'department' => 'debate',
        'departments' => 'debate', 'dept' => 'debate',
        'speech' => 'speech', 'speeches' => 'speech',
    );

    // Split the query up into individual tokens: either a double quote, or
    // a run of word characters optionally preceded by a minus sign. The
    // minus only counts at the start of a word (i.e. not preceded by a word
    // character, in which case it is probably a hyphenated-word).
    $wc = $this->wordchars;
    $all_words = array();
    preg_match_all('/("|(?:(?<![' . $wc . '])-)?[' . $wc . ']+)/', $query, $all_words);
    $all_words = isset($all_words[0]) ? $all_words[0] : array();

    $in_quote = false;
    foreach ($all_words as $word) {
        if ($word == '"') {
            // Each opening quote starts a new, initially empty, phrase
            $in_quote = !$in_quote;
            if ($in_quote) {
                $this->phrases[] = array();
            }
            continue;
        }
        if ($word == '') {
            continue;
        }

        $lower = strtolower($word);
        if (strpos($word, ':') !== false) {
            // prefix:value - everything after the first colon is the value.
            // explode() replaces split(), which was removed in PHP 7.
            $items = explode(':', $lower);
            $type = $items[0];
            $value = implode(':', array_slice($items, 1));
            if ($type == "section") {
                // section:xxx is stored as major:<number>; unrecognised
                // section names are passed through unchanged.
                if (isset($section_aliases[$value])) {
                    $value = $section_aliases[$value];
                }
                $type = "major";
            }
            if ($type == "groupby" && isset($groupby_aliases[$value])) {
                $value = $groupby_aliases[$value];
            }
            $this->prefixed[] = array($type, $value);
        } elseif (strpos($word, '-') !== false) {
            $this->excluded[] = str_replace("-", "", $lower);
        } elseif ($in_quote) {
            $this->phrases[count($this->phrases) - 1][] = $lower;
        } else {
            $this->words[] = $lower;
        }
    }

    twfy_debug("SEARCH", "words: " . var_export($this->words, true));
    twfy_debug("SEARCH", "phrases: " . var_export($this->phrases, true));
    twfy_debug("SEARCH", "prefixed: " . var_export($this->prefixed, true));
    twfy_debug("SEARCH", "excluded: " . var_export($this->excluded, true));
    // twfy_debug("SEARCH", "rough: " . var_export($this->rough, true));
}
// Turns one parsed phrase (an array of words) back into its quoted textual
// form: the words separated by single spaces, wrapped in double quotes.
function make_phrase($phrasearray) {
    $joined = implode(' ', $phrasearray);
    return "\"{$joined}\"";
}
// Builds the human-readable description of the parsed query.
//
// $long - when true, the description of the words/phrases is introduced
//         with "containing".
//
// Returns the trimmed description, or '' when XAPIANDB is not configured.
// Unknown prefixes, group-bys and major sections are reported through
// $PAGE->error_message() and otherwise ignored.
function query_description_internal($long) {
    global $PAGE, $hansardmajors;

    if (!(defined('XAPIANDB') && XAPIANDB)) {
        return '';
    }

    $description = "";

    // Plain words: 'a' / 'a' and 'b' / 'a', 'b', and 'c'
    $word_total = count($this->words);
    if ($word_total > 0) {
        if ($long) {
            $description .= " containing";
        }
        $description .= " the " . make_plural("word", $word_total) . " '";
        if ($word_total == 1) {
            $description .= $this->words[0];
        } elseif ($word_total == 2) {
            $description .= $this->words[0] . "' and '" . $this->words[1];
        } else {
            $all_but_last = array_slice($this->words, 0, -1);
            $description .= join("', '", $all_but_last) . "', and '" . $this->words[$word_total - 1];
        }
        $description .= "'";
    }

    // Quoted phrases
    $phrase_total = count($this->phrases);
    if ($phrase_total > 0) {
        if ($description != "") {
            $description .= " and";
        } elseif ($long) {
            $description .= " containing";
        }
        $quoted = array_map(array($this, "make_phrase"), $this->phrases);
        $description .= " the " . make_plural("phrase", $phrase_total) . " ";
        $description .= join(', ', $quoted);
    }

    // Excluded words
    $excluded_total = count($this->excluded);
    if ($excluded_total > 0) {
        $has_positive_terms = ($word_total > 0 or $phrase_total > 0);
        $description .= $has_positive_terms ? " but not" : " excluding";
        $description .= " the " . make_plural("word", $excluded_total);
        $description .= " '" . join(' ', $this->excluded) . "'";
    }

    // Prefixed items. Speakers and major sections are collected and appended
    // at the very end; the rest are described as they are encountered.
    $speaker = array();
    $major = array();
    foreach ($this->prefixed as $items) {
        $prefix = $items[0];
        $value = $items[1];
        switch ($prefix) {
            case 'speaker':
                $member = new MEMBER(array('person_id' => $value));
                $speaker[] = $member->full_name();
                break;

            case 'major':
                if (isset($hansardmajors[$value]['title'])) {
                    $major[] = $hansardmajors[$value]['title'];
                } else {
                    $PAGE->error_message("Unknown major section '" . $value . "' ignored");
                }
                break;

            case 'groupby':
                switch ($value) {
                    case 'day':
                        $description .= ' grouped by day';
                        break;
                    case 'debate':
                        $description .= ' grouped by debate/department';
                        break;
                    case 'speech':
                        $description .= ' showing all speeches';
                        break;
                    default:
                        $PAGE->error_message("Unknown group by '" . $value . "' ignored");
                        break;
                }
                break;

            case 'bias':
                list($weight, $halflife) = explode(":", $value);
                $description .= " bias by " . $weight . " halflife " . $halflife . " seconds";
                break;

            case 'date':
                $description .= ' spoken on ' . $value;
                break;

            case 'batch':
                # silently ignore, as description goes in email alerts
                break;

            default:
                $PAGE->error_message("Unknown search prefix '" . $prefix . "' ignored");
                break;
        }
    }

    if (sizeof($speaker)) {
        $description .= ' by ' . join(' or ', $speaker);
    }
    if (sizeof($major)) {
        $description .= ' in ' . join(' or ', $major);
    }

    return trim($description);
}
// Textual description of the search, short form (no leading "containing").
function query_description_short() {
    $long = false;
    return $this->query_description_internal($long);
}
// Textual description of the search, long form (words and phrases are
// introduced with "containing").
function query_description_long() {
    $long = true;
    return $this->query_description_internal($long);
}
// Lower-cases $word and returns whatever the Xapian stemmer created in the
// constructor gives back for it.
function stem($word) {
    $lowercased = strtolower($word);
    return $this->stemmer->stem_word($lowercased);
}
// Internal use mainly - you probably want one of the query_description
// methods. Rebuilds a full textual query from the parsed pieces, ready to
// be fed to Xapian's query parser:
//   - phrases and words are joined with AND;
//   - prefixed terms sharing a prefix are OR-ed together inside brackets
//     (groupby and bias are not search terms, so they are left out);
//   - excluded words are appended as a trailing NOT (...) clause.
function query_remade() {
    $terms = array();
    foreach ($this->phrases as $phrase) {
        $terms[] = $this->make_phrase($phrase);
    }
    if ($this->words) {
        $terms = array_merge($terms, $this->words);
    }

    // Bucket the prefixed terms by prefix name, in order of first appearance
    $by_prefix = array();
    foreach ($this->prefixed as $items) {
        $name = $items[0];
        if ($name == 'groupby' || $name == 'bias') {
            continue;
        }
        $by_prefix[$name][] = $name . ':' . $items[1];
    }
    foreach ($by_prefix as $alternatives) {
        $terms[] = '(' . join(' OR ', $alternatives) . ')';
    }

    $query = trim(join(' AND ', $terms));
    if ($this->excluded) {
        $query .= ' NOT (' . join(' AND ', $this->excluded) . ')';
    }
    return $query;
}
// Performs a partial query to get a count of the number of matches.
//
// Opens the shared Xapian database and this object's enquire object on
// first use, sets the query plus collapsing/bias on the enquire object,
// then fetches up to 500 matches.
//
// Returns the number of matches (an estimate once 500 or more were found),
// or null when XAPIANDB is not configured.
function run_count () {
    if (!(defined('XAPIANDB') && XAPIANDB)) {
        return null;
    }

    global $xapiandb, $PAGE;
    $start = getmicrotime();

    if (!$xapiandb) {
        $xapiandb = new XapianDatabase(XAPIANDB);
    }
    if (!$this->enquire) {
        $this->enquire = new XapianEnquire($xapiandb);
    }

    $queryparser = new XapianQueryParser();
    $queryparser->set_stemming_strategy(QueryParser_STEM_NONE);
    $queryparser->set_default_op(Query_OP_AND);
    foreach (array('speaker', 'major', 'date', 'batch') as $prefix_name) {
        $queryparser->add_prefix($prefix_name, $prefix_name . ':');
    }

    // We rebuild (with query_remade) our query and feed that text string to
    // the query parser. This is because the error handling in the query
    // parser is a bit knackered, and we want to be sure our highlighting
    // etc. exactly matches. XXX don't need to do this for more recent Xapians
    $remade = $this->query_remade();
    twfy_debug("SEARCH", "query remade -- ". $remade);
    $query = $queryparser->parse_query($remade);
    twfy_debug("SEARCH", "queryparser description -- " . $query->get_description());
    $this->enquire->set_query($query);

    // Set collapsing and sorting
    $collapsed = false;
    foreach ($this->prefixed as $items) {
        switch ($items[0]) {
            case 'groupby':
                $collapsed = true;
                switch ($items[1]) {
                    case 'day':
                        $this->enquire->set_collapse_key(2);
                        break;
                    case 'debate':
                        $this->enquire->set_collapse_key(3);
                        break;
                    case 'speech':
                        // no collapse key
                        break;
                    default:
                        $PAGE->error_message("Unknown group by '" . $items[1] . "' ignored");
                        break;
                }
                break;

            case 'bias':
                list($weight, $halflife) = explode(":", $items[1]);
                $this->enquire->set_bias($weight, intval($halflife));
                break;

            case 'speaker':
                # Don't do any collapsing if we're searching for a person's speeches
                $collapsed = true;
                break;
        }
    }
    // default to grouping by subdebate, i.e. by page
    if (!$collapsed) {
        $this->enquire->set_collapse_key(7);
    }

    // Take either: 1) the estimate, which is sometimes too large, or 2) the
    // size, which is sometimes too low (it is limited to the 500 asked for
    // here). We get the exact mset we need later, according to which page
    // we are on.
    $matches = $this->enquire->get_mset(0, 500);
    $fetched = $matches->size();
    $count = ($fetched < 500) ? $fetched : $matches->get_matches_estimated();

    $duration = getmicrotime() - $start;
    twfy_debug ("SEARCH", "Search count took $duration seconds.");
    return $count;
}
// Performs the full search and stores the results on the object; read them
// back with get_gids(), get_relevances() and get_createds().
//
// $first_result     - index of the first match wanted (0-based)
// $results_per_page - maximum number of matches to fetch
// $sort_order       - 'date' sorts by value slot 0, 'created' by value
//                     slot 6 (each then by relevance); anything else keeps
//                     the default relevance ordering
//
// The query itself is set up by run_count(). If that has not been called
// yet it is called here; if there is still no enquire object afterwards
// (e.g. XAPIANDB is not configured) the result arrays are left empty
// rather than failing on a null object.
function run_search ($first_result, $results_per_page, $sort_order='relevance') {
    $start = getmicrotime();

    $this->gids = array();
    $this->created = array();
    $this->relevances = array();

    if (empty($this->enquire)) {
        $this->run_count();
    }
    if (empty($this->enquire)) {
        return;
    }

    switch ($sort_order) {
        case 'date':
            $this->enquire->set_sort_by_value_then_relevance(0, true);
            break;
        case 'created':
            $this->enquire->set_sort_by_value_then_relevance(6, true);
            break;
        default:
            // do nothing, default ordering is by relevance
            break;
    }

    $matches = $this->enquire->get_mset($first_result, $results_per_page);
    $iter = $matches->begin();
    $end = $matches->end();
    while (!$iter->equals($end)) {
        $relevancy = $iter->get_percent();
        $weight = $iter->get_weight();
        $doc = $iter->get_document();
        $gid = $doc->get_data();
        if ($sort_order == 'created') {
            $this->created[] = $doc->get_value(6);
        }
        twfy_debug("SEARCH", "gid: $gid relevancy: $relevancy% weight: $weight");
        $this->gids[] = "uk.org.publicwhip/" . $gid;
        $this->relevances[] = $relevancy;
        $iter->next();
    }

    $duration = getmicrotime() - $start;
    twfy_debug ("SEARCH", "Run search took $duration seconds.");
}
// ... use these to get the results.
// GIDs of the matches found by the last run_search(), each prefixed with
// "uk.org.publicwhip/". Empty array if no search has been run yet.
function get_gids() {
    return isset($this->gids) ? $this->gids : array();
}
// Relevance percentages of the matches found by the last run_search(), in
// the same order as get_gids(). Empty array if no search has been run yet.
function get_relevances() {
    return isset($this->relevances) ? $this->relevances : array();
}
// Document value 6 of each match found by the last run_search(); only
// filled in when that search was sorted by 'created'. Empty array
// otherwise, or if no search has been run yet.
function get_createds() {
    return isset($this->created) ? $this->created : array();
}
// Puts HTML highlighting (<span class="hi">) round all the words and
// phrases of the query that occur in $body, and returns the result.
//
// Purely numeric words also match their thousands-separated form (1000
// matches "1,000"). Highlighting that lands inside the attributes of an
// <a> or <phrase> tag, or on the tag name "a" itself, is removed again, and
// adjacent highlights separated only by non-word characters are merged.
function highlight($body) {
    // Contents will be used in preg_replace() to highlight the search terms.
    $findwords = array();
    $replacewords = array();
    $replacement = "<span class=\"hi\">\\1</span>";

    foreach ($this->words as $word) {
        $quoted = preg_quote($word, '/');
        if (ctype_digit($word)) {
            $grouped = preg_quote(number_format($word), '/');
            $findwords[] = "/\b($quoted|$grouped)\b/";
        } else {
            $findwords[] = "/\b($quoted)\b/i";
        }
        $replacewords[] = $replacement;
    }

    $separator = '[^' . $this->wordchars . ']+';
    foreach ($this->phrases as $phrase) {
        // An empty phrase ("" in the query) would give the pattern
        // /\b()\b/i, which matches at every word boundary and litters the
        // text with empty highlight spans.
        if (!count($phrase)) {
            continue;
        }
        $quoted_words = array();
        foreach ($phrase as $phrase_word) {
            $quoted_words[] = preg_quote($phrase_word, '/');
        }
        // Glue first: the (pieces, glue) argument order is not accepted by
        // PHP 8.
        $phrasematch = join($separator, $quoted_words);
        $findwords[] = "/\b($phrasematch)\b/i";
        $replacewords[] = $replacement;
    }

    // Highlight search words.
    $hlbody = preg_replace($findwords, $replacewords, $body);
    // Remove any highlighting within HTML.
    $hlbody = preg_replace('#<(a|phrase)\s([^>]*?)<span class="hi">(.*?)</span>([^>]*?)">#', "<\\1 \\2\\3\\4\">", $hlbody);
    $hlbody = preg_replace('#<(/?)<span class="hi">a</span>([^>]*?)>#', "<\\1a\\2>", $hlbody); # XXX Horrible hack
    // Collapse duplicates
    $hlbody = preg_replace("#</span>(\W+)<span class=\"hi\">#", "\\1", $hlbody);
    return $hlbody;
}
// Finds the position in $body of the first occurrence of any of the search
// phrases or, only if none of the phrases occurs, of any of the search
// words. Matching is case-insensitive and on whole words.
//
// Returns the 0-based offset of the earliest match, or 0 when nothing
// matches.
function position_of_first_word($body) {
    // Pad with spaces so a term at the very start or end still has a
    // non-word character on both sides. Because of the leading space, the
    // offset of the separator in front of a term in $lcbody equals the
    // offset of the term itself in $body.
    $lcbody = ' ' . strtolower($body) . ' ';
    $nonword = '[^' . $this->wordchars . ']';

    $phrase_patterns = array();
    foreach ($this->phrases as $phrase) {
        // Skip empty phrases: their pattern would match any two consecutive
        // separator characters.
        if (!count($phrase)) {
            continue;
        }
        $quoted_words = array();
        foreach ($phrase as $phrase_word) {
            $quoted_words[] = preg_quote($phrase_word, '/');
        }
        // Glue first: the (pieces, glue) argument order is not accepted by
        // PHP 8.
        $phrase_patterns[] = join($nonword . '+', $quoted_words);
    }

    $word_patterns = array();
    foreach ($this->words as $word) {
        $quoted = preg_quote($word, '/');
        if (ctype_digit($word)) {
            // Numbers also match their thousands-separated form
            $quoted = '(?:' . $quoted . '|' . preg_quote(number_format($word), '/') . ')';
        }
        $word_patterns[] = $quoted;
    }

    // Phrases take priority; words are only consulted if no phrase matched.
    foreach (array($phrase_patterns, $word_patterns) as $patterns) {
        $pos = -1;
        foreach ($patterns as $pattern) {
            $regex = '/' . $nonword . $pattern . $nonword . '/';
            if (preg_match($regex, $lcbody, $matches, PREG_OFFSET_CAPTURE)) {
                // Offset 0 is a genuine match (term at the start of $body),
                // so "found" is tracked with -1 rather than truthiness.
                $wordpos = $matches[0][1];
                if ($pos == -1 || $wordpos < $pos) {
                    $pos = $wordpos;
                }
            }
        }
        if ($pos != -1) {
            return $pos;
        }
    }

    return 0;
}
- /*
- old stemming code (does syntax highlighting with stemming, but does it badly)
- $splitextract = preg_split("/([a-zA-Z]+)/", $extract, -1, PREG_SPLIT_DELIM_CAPTURE);
- $hlextract = "";
- foreach( $splitextract as $extractword) {
- $hl = false;
- foreach( $searchstring_stemwords as $word ) {
- if ($word == '') {
- continue;
- }
-
- $matchword = $searchengine->stem($extractword);
- #print "$extractword : $matchword : $word<br>";
- if ($matchword == $word) {
- $hl = true;
- break;
- }
- }
- if ($hl)
- $hlextract .= "<span class=\"hi\">$extractword</span>";
- else
- $hlextract .= $extractword;
- }
- $hlextract = preg_replace("#</span>\s+<span class=\"hi\">#", " ", $hlextract);
- */
- /* This doesn't work yet as PHP bindings are knackered - the idea is
- to do all parsing here and replace queryparser, so we can do stuff
- how we want more. e.g. sync highlighting with the queries better */
- // Instead we are now parsing in PHP, and rebuilding something to feed to
- // query parser. Yucky but works.
- /* $querydummy = new XapianQuery("dummy");
- $query1 = new XapianQuery("ethiopia");
- $query2 = new XapianQuery("economic");
- #$query = $querydummy->querycombine(Query_OP_AND, $query1, $query2);
- $query = new_QueryCombine(Query_OP_AND, $query1, $query2);
- #new_QueryCombine
- # $query = $query1->querycombine(Query_OP_OR, $query1, $query2);
- # foreach ($this->words as $word) {
- # $query = new XapianQuery(Query_OP_OR, $query, new XapianQuery($word));
- # }
- print "description:" . $query->get_description() . "<br>"; */
- }
global $SEARCHENGINE;
$SEARCHENGINE = null;

// Works out who has used a search term, and how much.
//
// Runs $search with one result per speech (at most 10000 results, sorted by
// date), looks up the speaker of every matching speech and tallies the
// matches per person and per party.
//
// $search - the search string
// $house  - if non-zero, only members of this house are included
//
// Returns an array with:
//   'pagetitle'     - short description of the search
//   'error'         - 'No results', when nothing usable was found
//   'limit_reached' - true when the 10000 result cap was hit
//   'speakers'      - person_id => details (name, party, house, left,
//                     office, count, pmindate, pmaxdate), most matches first
//   'party_count'   - party => number of matches, most matches first
function search_by_usage($search, $house = 0) {
    $data = array();
    // Only ever used as a number inside SQL below
    $house = intval($house);

    $searchengine = new SEARCHENGINE($search);
    $data['pagetitle'] = $searchengine->query_description_short();

    $searchengine = new SEARCHENGINE($search . ' groupby:speech');
    $count = $searchengine->run_count();
    if ($count <= 0) {
        $data['error'] = 'No results';
        return $data;
    }
    $searchengine->run_search(0, 10000, 'date');
    $gids = $searchengine->get_gids();
    if (count($gids) <= 0) {
        $data['error'] = 'No results';
        return $data;
    }
    if (count($gids) == 10000)
        $data['limit_reached'] = true;

    # Fetch all the speakers of the results, count them up and get min/max date usage
    $speaker_count = array();
    $mindate = array();
    $maxdate = array();
    // NOTE(review): the gids come out of our own search index; they are
    // backslash-escaped as a precaution, but ParlDB's own quoting helper
    // (if it has one) would be the better tool - confirm.
    $gid_list = join('","', array_map('addslashes', $gids));
    $db = new ParlDB;
    $q = $db->query('SELECT gid,speaker_id,hdate FROM hansard WHERE gid IN ("' . $gid_list . '")');
    for ($n=0; $n<$q->rows(); $n++) {
        $speaker_id = $q->field($n, 'speaker_id'); # This is member ID
        $hdate = $q->field($n, 'hdate');
        if (!isset($speaker_count[$speaker_id])) {
            $speaker_count[$speaker_id] = 0;
            $maxdate[$speaker_id] = '1001-01-01';
            $mindate[$speaker_id] = '9999-12-31';
        }
        $speaker_count[$speaker_id]++;
        if ($hdate < $mindate[$speaker_id]) $mindate[$speaker_id] = $hdate;
        if ($hdate > $maxdate[$speaker_id]) $maxdate[$speaker_id] = $hdate;
    }

    # Fetch details of all the speakers
    $pids = array();
    $speakers = array();
    if (count($speaker_count)) {
        // Force every id to an integer so an empty or odd speaker_id cannot
        // produce malformed SQL
        $speaker_ids = join(',', array_map('intval', array_keys($speaker_count)));
        $q = $db->query('SELECT member_id, person_id, title,first_name,last_name,constituency,house,party,
            moffice_id, dept, position, from_date, to_date, left_house
            FROM member LEFT JOIN moffice ON member.person_id = moffice.person
            WHERE member_id IN (' . $speaker_ids . ')
            ' . ($house ? " AND house=$house" : '') . '
            ORDER BY left_house DESC');
        for ($n=0; $n<$q->rows(); $n++) {
            $mid = $q->field($n, 'member_id');
            // Read the person id from every row. A member appears once per
            // office held, and those rows need not be adjacent, so a value
            // carried over from the previous row can belong to someone else.
            $pid = $q->field($n, 'person_id');
            $member_house = $q->field($n, 'house');

            if (!isset($pids[$mid])) {
                $pids[$mid] = $pid;
                $speakers[$pid]['house'] = $member_house;
                $speakers[$pid]['left'] = $q->field($n, 'left_house');
            }

            $dept = $q->field($n, 'dept');
            $posn = $q->field($n, 'position');
            $moffice_id = $q->field($n, 'moffice_id');
            // Only offices that are still open-ended count as current
            if ($dept && $q->field($n, 'to_date') == '9999-12-31')
                $speakers[$pid]['office'][$moffice_id] = prettify_office($posn, $dept);

            // Rows are ordered by left_house descending, so the first row
            // seen for a person supplies the name and party
            if (!isset($speakers[$pid]['name'])) {
                $full_name = ucfirst(member_full_name(
                    $member_house,
                    $q->field($n, 'title'),
                    $q->field($n, 'first_name'),
                    $q->field($n, 'last_name'),
                    $q->field($n, 'constituency')
                ));
                $speakers[$pid]['name'] = ($member_house==2?'Senator ':'') . $full_name . ($member_house==1?' MP':'');
                $speakers[$pid]['party'] = $q->field($n, 'party');
            }
        }
    }

    // Speaker id 0 collects everything that has no speaker
    $pids[0] = 0;
    $speakers[0] = array('party'=>'', 'name'=>'Headings, procedural text, etc.', 'house'=>0, 'count'=>0);

    # Roll the per-member counts and date ranges up to people and parties
    $party_count = array();
    $ok = 0;
    foreach ($speaker_count as $speaker_id => $count) {
        if (!isset($pids[$speaker_id])) continue;
        $pid = $pids[$speaker_id];
        if (!isset($speakers[$pid]['pmindate'])) {
            $speakers[$pid]['count'] = 0;
            $speakers[$pid]['pmaxdate'] = '1001-01-01';
            $speakers[$pid]['pmindate'] = '9999-12-31';
            $ok = 1;
        }
        $party = $speakers[$pid]['party'];
        if (!isset($party_count[$party]))
            $party_count[$party] = 0;
        $speakers[$pid]['count'] += $count;
        $party_count[$party] += $count;
        if ($mindate[$speaker_id] < $speakers[$pid]['pmindate']) $speakers[$pid]['pmindate'] = $mindate[$speaker_id];
        if ($maxdate[$speaker_id] > $speakers[$pid]['pmaxdate']) $speakers[$pid]['pmaxdate'] = $maxdate[$speaker_id];
    }

    // A function declared inside another function is defined globally the
    // first time the declaration runs; the guard stops a second call to
    // search_by_usage() dying with "cannot redeclare".
    if (!function_exists('sort_by_count')) {
        function sort_by_count($a, $b) {
            if ($a['count'] > $b['count']) return -1;
            if ($a['count'] < $b['count']) return 1;
            return 0;
        }
    }
    if ($speakers[0]['count']==0) unset($speakers[0]);
    uasort($speakers, 'sort_by_count');
    arsort($party_count);

    if (!$ok) {
        $data['error'] = 'No results';
        return $data;
    }
    $data['party_count'] = $party_count;
    $data['speakers'] = $speakers;
    return $data;
}