/microsync.php
PHP | 330 lines | 243 code | 42 blank | 45 comment | 74 complexity | 808b7a39490f067f0fdfa248983bdc1f MD5 | raw file
- #!/usr/bin/php
- <?PHP
- require_once ( 'public_html/php/common.php' ) ;
- error_reporting(E_ERROR|E_CORE_ERROR|E_ALL|E_COMPILE_ERROR);
- require_once ( 'opendb.inc' ) ; // $db = openMixNMatchDB() ;
- require_once ( '../listeria/Chris-G-botclasses/botclasses.php' );
// Cache of ext_id => ext_url for ONE catalog at a time, filled lazily by
// getLinkedExternalID(). Switching catalogs discards the previous catalog's data.
$ext_cache = array();

/**
 * Render an external ID as wikitext, linked to its source URL when one is known.
 *
 * On the first call for a catalog, every ext_id/ext_url pair of that catalog is
 * loaded from the `entry` table into the global $ext_cache (replacing whatever
 * catalog was cached before). Later calls for the same catalog hit the cache only.
 *
 * @param int|string $catalog Catalog ID (value of entry.catalog).
 * @param int|string $extid   External ID to render.
 * @return string "[URL ID]" when a non-empty URL is known (square brackets are
 *                stripped from the link text so they cannot break the wiki link),
 *                otherwise the italicised ID "''ID''".
 */
function getLinkedExternalID($catalog, $extid)
{
    global $db, $ext_cache;

    if (!isset($ext_cache[$catalog])) {
        // Keep only the current catalog in memory.
        $ext_cache = array();
        $ext_cache[$catalog] = array();
        $sql = "SELECT ext_id,ext_url FROM entry WHERE catalog=" . (int) $catalog;
        if (!$result = $db->query($sql)) {
            die('There was an error running the query X [' . $db->error . ']' . "\n$sql\n\n");
        }
        while ($row = $result->fetch_object()) {
            if (!isset($row->ext_url)) {
                continue;
            }
            $url = trim($row->ext_url);
            // An empty URL would render as the broken link "[ ID]"; treat it as unknown.
            if ($url === '') {
                continue;
            }
            $ext_cache[$catalog][$row->ext_id] = $url;
        }
    }

    $url = isset($ext_cache[$catalog][$extid]) ? $ext_cache[$catalog][$extid] : '';
    if ($url !== '') {
        $extid_nolink = preg_replace('/[\[\]]/', '', $extid);
        return "[" . $url . " $extid_nolink]";
    }

    return "''$extid''"; // Fallback
}
/**
 * Turn a raw entry name into a double-quoted label string.
 *
 * The name is trimmed; if it contains no ASCII lower-case letter at all
 * (e.g. an ALL-CAPS name) it is lower-cased and each word is capitalised;
 * runs of spaces are collapsed to a single space.
 *
 * @param string $label Raw label text.
 * @return string The normalised label wrapped in double quotes.
 */
function formatLabel($label)
{
    $text = trim($label);

    $hasLowercase = (bool) preg_match('/[a-z]/', $text);
    if (!$hasLowercase) {
        // No lower-case letters detected: normalise to "First Letter Uppercase"
        $text = ucwords(strtolower($text));
    }

    // Remove multiple spaces
    $collapsed = preg_replace('/ +/', ' ', $text);

    return sprintf('"%s"', $collapsed);
}
/**
 * Compare one Mix'n'match catalog with the Wikidata statements of its property.
 *
 * Steps, in order:
 *  1. Load all `entry` rows of the catalog, keyed by external ID.
 *  2. Fetch all (item, value) pairs for P$prop via SPARQL and drop items that are
 *     deleted or redirects according to the Wikidata `page` table.
 *  3. For each pair, confirm/overwrite the Mix'n'match match (stored with user=4)
 *     or record a problem in the global $logs.
 *  4. Log external IDs that are used on more than one item.
 *  5. For manually matched entries whose ID is not on Wikidata, append
 *     "add statement" commands to the global $qs, unless the item's history or
 *     page links show the property was already used there.
 *  6. Append CREATE commands to $qs for entries with q == -1.
 *
 * Globals: reads $catalogs; re-opens and overwrites $db; appends to $logs and $qs.
 * A failed Mix'n'match query or the Y3 query terminates the script (die); a failed
 * SPARQL request or Y1/Y2 query only abandons this catalog.
 *
 * @param int|string $catalog Catalog ID.
 * @param int|string $prop    Numeric part of the Wikidata property (used as "P$prop").
 * @return void
 */
function checkCatalog($catalog, $prop)
{
    global $db, $logs, $qs, $catalogs;

    print "Checking $catalog/$prop\n";
    $db = openMixNMatchDB();
    $lang = $catalogs[$catalog]->search_wp;

    // All entries of this catalog, keyed by external ID
    $extid2q = array();
    $sql = "SELECT * FROM entry WHERE catalog=$catalog";
    if (!$result = $db->query($sql)) {
        die('There was an error running the query A [' . $db->error . ']' . "\n$sql\n\n");
    }
    while ($o = $result->fetch_object()) {
        $extid2q[$o->ext_id] = $o;
    }

    $hadthat = array(); // external ID => list of numeric item IDs carrying it on Wikidata
    $j = getSPARQL("SELECT ?item ?value { ?item wdt:P$prop ?value } ORDER BY ?item");

    // Without a usable SPARQL result every matched entry would look "missing on
    // Wikidata" below and trigger bogus add/CREATE commands, so give up on this catalog.
    if (!isset($j->results->bindings)) {
        print "SPARQL query for P$prop returned no usable result, skipping catalog $catalog\n";
        return;
    }

    // Fix deleted/redirected items: collect candidate items, then keep only those
    // that exist as non-redirect pages in the main namespace.
    $items = array();
    foreach ($j->results->bindings as $d) {
        if (!preg_match('/\/Q(\d+)$/', $d->item->value, $m) || $d->item->type != 'uri' || $d->value->type != 'literal') {
            continue;
        }
        $items[] = 'Q' . $m[1];
    }

    $use_q = array();
    if (count($items) > 0) {
        $sql = "SELECT DISTINCT page_title FROM page WHERE page_namespace=0 AND page_is_redirect=0 AND page_title IN ('" . implode("','", $items) . "')";
        unset($items); // potentially large; no longer needed
        $dbwd = openDB('wikidata', 'wikidata', true);
        if (!$result = $dbwd->query($sql)) {
            die('There was an error running the query X [' . $dbwd->error . ']' . "\n$sql\n\n");
        }
        while ($o = $result->fetch_object()) {
            $use_q[$o->page_title] = $o->page_title;
        }
    }
    print count($use_q) . " left\n";

    // Fresh connection for the updates below (the previous one is simply replaced).
    $db = openMixNMatchDB();
    foreach ($j->results->bindings as $d) {
        if (!preg_match('/\/Q(\d+)$/', $d->item->value, $m) || $d->item->type != 'uri' || $d->value->type != 'literal') {
            print_r($d);
            continue;
        }
        $q = $m[1];
        $v = $d->value->value;
        if (!isset($use_q["Q$q"])) {
            continue;
        }
        $hadthat[$v][] = $q;

        if (!isset($extid2q[$v])) {
            $logs['Unknown external ID'][$catalog][] = "External ID [[Q$q|" . getLinkedExternalID($catalog, $v) . "]] is not in Mix'n'match";
            continue;
        }
        $entry = $extid2q[$v]; // object handle: changes are visible via $extid2q as well

        if ($entry->q == $q) { // All right!
            if ($entry->user == 0) { // Confirmed in Wikidata, set in mix'n'match
                $ts = date('YmdHis');
                $sql = "UPDATE entry SET user=4,timestamp='$ts' where id=" . $entry->id . " AND q=$q";
                if (!$result = $db->query($sql)) {
                    die('There was an error running the query B [' . $db->error . ']' . "\n$sql\n\n");
                }
            }
            continue;
        }

        if ($entry->q == -1) { // Set in Wikidata, but not in Mix'n'match.
            $ts = date('YmdHis');
            $sql = "UPDATE entry SET q=$q,user=4,timestamp='$ts' where id=" . $entry->id . " AND (q is null or q < 0)";
            if (!$result = $db->query($sql)) {
                die('There was an error running the query C [' . $db->error . ']' . "\n$sql\n\n");
            }
            $entry->q = $q; // To not create a new item additionally later
            continue;
        }

        // Mismatch: Wikidata and Mix'n'match point to different items
        if ($entry->q == null || $entry->q <= 0 || $entry->user == 0) { // Just the automatcher, overwrite
            $ts = date('YmdHis');
            $sql = "UPDATE entry SET q=$q,user=4,timestamp='$ts' where id=" . $entry->id;
            if (!$result = $db->query($sql)) {
                die('There was an error running the query D [' . $db->error . ']' . "\n$sql\n\n");
            }
        } else {
            $logs["Mismatch between Wikidata and Mix'n'match"][$catalog][] = "Wikidata says the external ID " . getLinkedExternalID($catalog, $v) . " belongs to {{Q|$q}}, but [https://tools.wmflabs.org/mix-n-match/#/entry/" . $entry->id . " mix'n'match] says {{Q|" . $entry->q . "}}";
        }
    }

    // Find multiple ID usage
    foreach ($hadthat as $v => $h) {
        if (count($h) == 1) {
            continue;
        }
        $logs['Multiple items with same external ID'][$catalog][] = "Multiple Wikidata items with external ID " . getLinkedExternalID($catalog, $v) . " : {{Q|" . implode('}}, {{Q|', $h) . "}}";
    }

    // Find missing usage: entries matched by a user (user != 0) to a real item,
    // whose external ID was not seen on Wikidata above. Maps "Q123" => external ID.
    $to_check = array();
    foreach ($extid2q as $extid => $o) {
        $q = $o->q;
        if (preg_match('/^fake_id/', $extid)) {
            continue;
        }
        if ($o->user == 0) {
            continue;
        }
        if (!isset($q) || $q == null || $q <= 0) {
            continue;
        }
        if (isset($hadthat[$extid])) {
            continue;
        }
        $to_check["Q$q"] = $extid;
    }

    if (count($to_check) > 0) {
        $to_check2 = "'" . implode("','", array_keys($to_check)) . "'";
        $skip = array();   // items that must not receive the statement
        $exists = array(); // items that exist as pages on Wikidata
        $dbwd = openDB('wikidata', 'wikidata', true);

        $sql = "SELECT page_title FROM page WHERE page_title IN ($to_check2) AND page_namespace=0";
        if (!$result = $dbwd->query($sql)) {
            print 'There was an error running the query Y1 [' . $dbwd->error . ']' . "\n$sql\n\n";
            return;
        }
        while ($o = $result->fetch_object()) {
            $exists[$o->page_title] = 1;
        }

        // Skip items whose edit history mentions this property together with this
        // very ID (it was there once; presumably removed on purpose).
        $sql = "select page_title,rev_comment from revision,page where page_title IN ($to_check2) and rev_page=page_id AND page_namespace=0 AND rev_comment LIKE '%[[Property:P$prop]]%'";
        $dbwd = openDB('wikidata', 'wikidata', true);
        if (!$result = $dbwd->query($sql)) {
            print 'There was an error running the query Y2 [' . $dbwd->error . ']' . "\n";
            return;
        }
        while ($o = $result->fetch_object()) {
            $page = $o->page_title;
            if (!isset($to_check[$page])) {
                continue;
            }
            $needle = ']]: ' . $to_check[$page];
            if (stripos($o->rev_comment, $needle) === false) {
                continue;
            }
            $skip[$page] = 1;
        }

        // Single value only, enforced for all properties here!
        // (any item already linking to the property page is skipped)
        $sql = "SELECT page_title FROM page,pagelinks WHERE page_title IN ($to_check2) AND page_namespace=0 AND pl_from=page_id AND pl_namespace=120 AND pl_title='P$prop'";
        if (!$result = $dbwd->query($sql)) {
            die('There was an error running the query Y3 [' . $dbwd->error . ']' . "\n");
        }
        while ($o = $result->fetch_object()) {
            $skip[$o->page_title] = 1;
        }

        foreach ($to_check as $item => $extid) {
            if (isset($skip[$item]) || !isset($exists[$item])) {
                continue;
            }
            $qs[] = "$item\tP$prop\t\"$extid\"";
        }
    }

    // Entries with q == -1 get a new item (NOTE(review): -1 presumably marks
    // "no Wikidata item exists" - confirm against the Mix'n'match schema).
    foreach ($extid2q as $o) {
        if ($o->q != -1) {
            continue;
        }
        if (preg_match('/^fake_id/', $o->ext_id)) {
            continue;
        }
        $qs[] = "CREATE";
        $qs[] = "LAST\tL$lang\t" . formatLabel($o->ext_name);
        if ($lang != 'en' && $o->type == 'person') {
            $qs[] = "LAST\tLen\t" . formatLabel($o->ext_name);
        }
        if ($o->type == 'person') {
            $qs[] = "LAST\tP31\tQ5";
        }
        $qs[] = "LAST\tP$prop\t\"{$o->ext_id}\"";
    }
}
// Output file for the generated QuickStatements commands; remove any stale copy first.
$qs_file = '/data/project/mix-n-match/public_html/qs.txt';
if (file_exists($qs_file)) {
    unlink($qs_file);
}

# Init
$sql_ignore_catalogs = " AND id NOT IN (506)"; // Hard ignore, because data problem in source
$db = openMixNMatchDB();
$logs = array();     // problem title => catalog ID => list of wikitext lines
$qs = array();       // QuickStatements command lines
$catalogs = array(); // catalog ID => catalog row (all active catalogs)
$todo = array();     // catalog ID => property number, for catalogs that can be checked

$sql = "SELECT * FROM catalog WHERE `active`=1";
$sql .= $sql_ignore_catalogs;
if (!$result = $db->query($sql)) {
    die('There was an error running the query 1 [' . $db->error . ']' . "\n$sql\n\n");
}
while ($o = $result->fetch_object()) {
    $catalogs[$o->id] = $o;
    # Check all catalogs with property (and without qualifier)
    if (isset($o->wd_prop) && !isset($o->wd_qual)) {
        $todo[$o->id] = $o->wd_prop;
    }
}

// Optional first CLI argument: a catalog ID, or "random" for one random catalog.
// Without an argument every catalog in $todo is processed.
$the_catalog = isset($argv[1]) ? $argv[1] : null;
$specific_catalog = isset($the_catalog);
if ($the_catalog === 'random' && count($todo) > 0) {
    // Pick among catalogs that can actually be checked; picking from all active
    // catalogs could select one without a property and the run would do nothing.
    $the_catalog = array_rand($todo);
}

foreach ($todo as $catalog => $prop) {
    if ($specific_catalog && $the_catalog != $catalog) {
        continue;
    }
    print "Running catalog #" . $catalog . ", P" . $prop . "\n";
    checkCatalog($catalog, $prop);
}
// Generate quick_statements file (UTF-8 BOM followed by one command per line).
// A write failure is reported but does not abort the run, so the Wikidata
// report below is still updated.
$qs_payload = "\xEF\xBB\xBF" . implode("\n", $qs); # UTF8 header
if (file_put_contents($qs_file, $qs_payload) === false) {
    print "Could not write QuickStatements file $qs_file\n";
}
// Update Wikidata report
$max_lines = 400;   // sections longer than this are summarised instead of listed
$ts = date('Ymd');  // used in the edit summary by updateCatalog()
$wiki_header = "A report for the [https://tools.wmflabs.org/mix-n-match/ Mix'n'match] tool. '''This page will be replaced regularly!'''\n";

/**
 * Build the wikitext of the per-catalog report pages from the collected logs.
 *
 * Every catalog page starts with the header, a notice and the catalog's name and
 * description, followed by one "== title ==" section per problem type that has
 * entries for that catalog.
 *
 * @param array  $logs        problem title => catalog ID => list of wikitext lines.
 * @param array  $catalogs    catalog ID => object with `name` and `desc` properties.
 * @param string $wiki_header Text placed at the top of every page.
 * @param int    $max_lines   Sections with more lines than this only show a count.
 * @return array catalog ID => wikitext; catalogs without log entries are absent.
 */
function buildCatalogReports($logs, $catalogs, $wiki_header, $max_lines)
{
    $wiki_texts = array();
    if (empty($logs)) {
        return $wiki_texts;
    }

    foreach ($logs as $title => $per_catalog) {
        foreach ($per_catalog as $catalog => $lines) {
            if (!isset($wiki_texts[$catalog])) {
                $wiki_texts[$catalog] = $wiki_header;
                $wiki_texts[$catalog] .= "''Please note: If you fix something from this list on Wikidata, please fix it on Mix'n'match as well, if applicable. Otherwise, the error might be re-introduced from there.''\n";
                $wiki_texts[$catalog] .= "==[https://tools.wmflabs.org/mix-n-match/#/catalog/$catalog " . $catalogs[$catalog]->name . "]==";
                $wiki_texts[$catalog] .= "\n" . $catalogs[$catalog]->desc;
            }

            // Each (title, catalog) pair is visited exactly once, so every catalog
            // page needs its own heading - not only the first catalog of a title.
            $wiki_texts[$catalog] .= "\n== $title ==\n";

            if (count($lines) > $max_lines) {
                $wiki_texts[$catalog] .= "* " . count($lines) . " entries for this, not showing\n";
            } else {
                foreach ($lines as $l) {
                    $wiki_texts[$catalog] .= "# $l\n";
                }
            }
        }
    }

    return $wiki_texts;
}

$wiki_texts = buildCatalogReports($logs, $catalogs, $wiki_header, $max_lines);
/**
 * Write a report page on Wikidata if its content changed.
 *
 * The target is "User:Magnus Manske/Mix'n'match report", with "/<catalog>"
 * appended when $catalog > 0. The bot credentials are read from bot.ini and the
 * login is performed once; the logged-in session is reused by later calls.
 * Uses the global $ts for the edit summary.
 *
 * @param string     $wiki    New wikitext of the page.
 * @param int|string $catalog Catalog ID, or 0 for the overview page.
 * @return void
 */
function updateCatalog($wiki, $catalog)
{
    global $ts;
    static $bot = null; // logged-in API client, created on first use

    if ($bot === null) {
        $ini = parse_ini_file('/data/project/mix-n-match/bot.ini');
        if ($ini === false || !isset($ini['user']) || !isset($ini['pass'])) {
            // Never attempt a login (or an edit) with missing credentials.
            print "Could not read bot credentials from /data/project/mix-n-match/bot.ini\n";
            return;
        }
        $wiki_user = $ini['user'];
        $wiki_pass = $ini['pass'];

        $client = new wikipedia;
        $client->quiet = true;
        $client->url = "https://www.wikidata.org/w/api.php";
        $client->setUserAgent('User-Agent: ' . $wiki_user . ' (http://www.wikidata.org/wiki/User:' . str_replace(' ', '_', $wiki_user) . ')');
        $client->login($wiki_user, $wiki_pass);
        $bot = $client;
    }

    $page = "User:Magnus Manske/Mix'n'match report";
    if ($catalog > 0) {
        $page .= '/' . $catalog;
    }
    $title = str_replace(' ', '_', $page);

    // Avoid a no-op edit when the page already holds the same text.
    $current = $bot->getpage($title);
    if (trim($current) === trim($wiki)) {
        print "Skipping $page\n";
        return;
    }

    $bot->edit($title, $wiki, "Update $ts");
}
// Build the overview page: one table row per catalog that has a property and no
// qualifier, and push each catalog's own report page along the way.
$wiki = $wiki_header;
$wiki .= "\n''red links indicate there never was an issue with that catalog''\n";
$wiki .= "\n{| class='wikitable'";
$wiki .= "\n!Catalog!!Report!!Wikidata Property";
foreach ($catalogs as $catalog => $v) {
    if (!isset($v->wd_prop) || isset($v->wd_qual)) {
        continue;
    }
    $wiki .= "\n|-";
    // Same URL scheme as the catalog links on the per-catalog report pages.
    $wiki .= "\n|[https://tools.wmflabs.org/mix-n-match/#/catalog/$catalog #$catalog]";
    $wiki .= "\n|[[/$catalog|{$v->name}]]";
    $wiki .= "\n|" . '{{' . "P|{$v->wd_prop}" . '}}';

    // Only catalogs with log entries from this run have a report to upload.
    if (!isset($wiki_texts[$catalog])) {
        continue;
    }
    updateCatalog($wiki_texts[$catalog], $catalog);
}
$wiki .= "\n|}\n";
updateCatalog($wiki, 0);
- //file_get_contents ( 'https://tools.wmflabs.org/mix-n-match/api.php?query=update_overview' ) ; // Update stats
- ?>