PageRenderTime 22ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/microsync.php

https://bitbucket.org/waldyrious/mixnmatch
PHP | 330 lines | 243 code | 42 blank | 45 comment | 74 complexity | 808b7a39490f067f0fdfa248983bdc1f MD5 | raw file
  1. #!/usr/bin/php
  2. <?PHP
  3. require_once ( 'public_html/php/common.php' ) ;
  4. error_reporting(E_ERROR|E_CORE_ERROR|E_ALL|E_COMPILE_ERROR);
  5. require_once ( 'opendb.inc' ) ; // $db = openMixNMatchDB() ;
  6. require_once ( '../listeria/Chris-G-botclasses/botclasses.php' );
  // Lookup table catalog => (ext_id => ext_url). It only ever holds the catalog
  // that was requested most recently (see getLinkedExternalID).
  $ext_cache = array();

  /**
   * Render an external ID as wikitext, linked to its source URL when one is known.
   *
   * On the first request for a catalog, all ext_id/ext_url pairs of that catalog
   * are read from the `entry` table in a single query and kept in $ext_cache;
   * the table of any previously requested catalog is discarded at that point.
   * The script is terminated via die() if that query fails.
   *
   * @param mixed  $catalog Catalog ID; interpolated as-is into the SQL statement.
   * @param string $extid   External ID to render.
   * @return string "[url id]" (square brackets stripped from the displayed id)
   *                when a URL is cached for the ID, otherwise the ID in
   *                wikitext italics.
   */
  function getLinkedExternalID($catalog, $extid)
  {
      global $db, $ext_cache;

      if (!isset($ext_cache[$catalog])) {
          // Different catalog than last time: start over with an empty table.
          $ext_cache = array($catalog => array());

          $sql = "SELECT ext_id,ext_url FROM entry WHERE catalog=$catalog";
          $result = $db->query($sql);
          if (!$result) {
              die('There was an error running the query X [' . $db->error . ']' . "\n$sql\n\n");
          }
          while ($row = $result->fetch_object()) {
              // Only rows that actually carry a URL are worth remembering.
              if (isset($row->ext_url) && $row->ext_url != null) {
                  $ext_cache[$catalog][$row->ext_id] = $row->ext_url;
              }
          }
      }

      if (!isset($ext_cache[$catalog][$extid])) {
          return "''$extid''"; // Fallback
      }

      // Square brackets inside the link text would terminate the external link early.
      $link_text = preg_replace('/[\[\]]/', '', $extid);
      return '[' . $ext_cache[$catalog][$extid] . " $link_text]";
  }
  /**
   * Normalise a label and wrap it in double quotes.
   *
   * - Surrounding whitespace is trimmed.
   * - A label without any ASCII lower-case letter is treated as all-caps and
   *   converted to "Capitalised Words".
   * - Every run of spaces, tabs, carriage returns and line feeds becomes one
   *   single space. The result is embedded in tab-separated, one-command-per-line
   *   output by the callers in this file, so a literal tab or newline inside
   *   the label would break that line apart.
   *
   * @param string $label Raw label.
   * @return string The cleaned label enclosed in double quotes.
   */
  function formatLabel($label)
  {
      $label = trim($label);

      if (!preg_match('/[a-z]/', $label)) {
          // No lower-case letters detected: capitalise the first letter of each word.
          $label = ucwords(strtolower($label));
      }

      // Explicit ASCII class instead of \s so that bytes of multi-byte
      // characters can never be matched, whatever the PCRE locale tables are.
      $label = preg_replace('/[ \t\r\n]+/', ' ', $label);

      return '"' . $label . '"';
  }
  /**
   * Compare one mix'n'match catalog with the Wikidata statements of its property.
   *
   * Steps, in order:
   *  1. Load all `entry` rows of the catalog, keyed by external ID.
   *  2. Fetch all (item, value) pairs for P$prop via getSPARQL() and keep only
   *     items that exist as non-redirect pages in namespace 0 of the wikidata
   *     replica.
   *  3. For every pair, either update the matching entry row (user=4 plus a
   *     timestamp) or append a report line to $logs.
   *  4. Report external IDs that are used on more than one item.
   *  5. For entries matched in mix'n'match but without the statement on
   *     Wikidata, append "item <TAB> property <TAB> value" commands to $qs,
   *     unless the item already links to the property or an edit summary shows
   *     the value was set before.
   *  6. Append CREATE commands to $qs for every entry whose q is -1.
   *
   * Side effects: reassigns the global $db, appends to the globals $logs and
   * $qs, prints progress to stdout, and calls die() on most query failures.
   *
   * @param mixed $catalog Catalog ID (interpolated as-is into SQL).
   * @param mixed $prop    Numeric part of the Wikidata property ID.
   * @return void
   */
  function checkCatalog($catalog, $prop)
  {
      global $db, $logs, $qs, $catalogs;

      print "Checking $catalog/$prop\n";
      $db = openMixNMatchDB();
      $lang = $catalogs[$catalog]->search_wp;

      // 1. All entries of this catalog, keyed by external ID.
      $extid2q = array();
      $sql = "SELECT * FROM entry WHERE catalog=$catalog";
      if (!$result = $db->query($sql)) {
          die('There was an error running the query A [' . $db->error . ']' . "\n$sql\n\n");
      }
      while ($row = $result->fetch_object()) {
          $extid2q[$row->ext_id] = $row;
      }

      // 2. Current Wikidata usage of the property.
      $hadthat = array();
      $j = getSPARQL("SELECT ?item ?value { ?item wdt:P$prop ?value } ORDER BY ?item");
      if (!isset($j->results->bindings)) {
          // Without a usable result every entry would look "missing on Wikidata"
          // and the steps below would propose statements and new items for all
          // of them, so stop here instead.
          print "No usable SPARQL result for P$prop, skipping catalog $catalog\n";
          return;
      }

      // Fix deleted/redirected items
      $items = array();
      foreach ($j->results->bindings as $d) {
          if (!preg_match('/\/Q(\d+)$/', $d->item->value, $m) || $d->item->type != 'uri' || $d->value->type != 'literal') {
              continue;
          }
          $items[] = "Q{$m[1]}";
      }

      $use_q = array();
      if (count($items) > 0) {
          // $items only holds "Q<digits>" strings produced by the regex above.
          $sql = "SELECT DISTINCT page_title FROM page WHERE page_namespace=0 AND page_is_redirect=0 AND page_title IN ('" . implode("','", $items) . "')";
          unset($items);
          $dbwd = openDB('wikidata', 'wikidata', true);
          if (!$result = $dbwd->query($sql)) {
              die('There was an error running the query X [' . $dbwd->error . ']' . "\n$sql\n\n");
          }
          while ($row = $result->fetch_object()) {
              $use_q[$row->page_title] = $row->page_title;
          }
      }
      print count($use_q) . " left\n";

      // Fresh connection before the update loop.
      // NOTE(review): presumably guards against the first connection having
      // timed out during the queries above - confirm.
      $db = openMixNMatchDB();

      // 3. Reconcile every Wikidata statement with mix'n'match.
      foreach ($j->results->bindings as $d) {
          if (!preg_match('/\/Q(\d+)$/', $d->item->value, $m) || $d->item->type != 'uri' || $d->value->type != 'literal') {
              print_r($d);
              continue;
          }
          $q = $m[1];
          $v = $d->value->value;
          if (!isset($use_q["Q$q"])) {
              continue;
          }
          $hadthat[$v][] = $q;

          if (!isset($extid2q[$v])) {
              $logs['Unknown external ID'][$catalog][] = "External ID [[Q$q|" . getLinkedExternalID($catalog, $v) . "]] is not in Mix'n'match";
              continue;
          }
          $entry = $extid2q[$v]; // object handle, writes below reach $extid2q[$v]

          if ($entry->q == $q) { // All right!
              if ($entry->user == 0) { // Confirmed in Wikidata, set in mix'n'match
                  $ts = date('YmdHis');
                  $sql = "UPDATE entry SET user=4,timestamp='$ts' where id=" . $entry->id . " AND q=$q";
                  if (!$result = $db->query($sql)) {
                      die('There was an error running the query B [' . $db->error . ']' . "\n$sql\n\n");
                  }
              }
              continue;
          }

          if ($entry->q == -1) { // Set in Wikidata, but not in Mix'n'match.
              $ts = date('YmdHis');
              $sql = "UPDATE entry SET q=$q,user=4,timestamp='$ts' where id=" . $entry->id . " AND (q is null or q < 0)";
              if (!$result = $db->query($sql)) {
                  die('There was an error running the query C [' . $db->error . ']' . "\n$sql\n\n");
              }
              $entry->q = $q; // To not create a new item additionally later
              continue;
          }

          // Mismatch (equality was handled above, so q differs here).
          if ($entry->q == null or $entry->q <= 0 or $entry->user == 0) { // Just the automatcher, overwrite
              $ts = date('YmdHis');
              $sql = "UPDATE entry SET q=$q,user=4,timestamp='$ts' where id=" . $entry->id;
              if (!$result = $db->query($sql)) {
                  die('There was an error running the query D [' . $db->error . ']' . "\n$sql\n\n");
              }
          } else {
              $logs["Mismatch between Wikidata and Mix'n'match"][$catalog][] = "Wikidata says the external ID " . getLinkedExternalID($catalog, $v) . " belongs to {{Q|$q}}, but [https://tools.wmflabs.org/mix-n-match/#/entry/" . $entry->id . " mix'n'match] says {{Q|" . $entry->q . "}}";
          }
      }

      // 4. Find multiple ID usage
      foreach ($hadthat as $v => $h) {
          if (count($h) == 1) {
              continue;
          }
          $logs['Multiple items with same external ID'][$catalog][] = "Multiple Wikidata items with external ID " . getLinkedExternalID($catalog, $v) . " : {{Q|" . implode('}}, {{Q|', $h) . "}}";
      }

      // 5. Find missing usage: matched here, statement absent on Wikidata.
      $to_check = array();
      foreach ($extid2q as $extid => $entry) {
          $q = $entry->q;
          if (preg_match('/^fake_id/', $extid)) continue;
          if ($entry->user == 0) continue;
          if (!isset($q) or $q == null or $q <= 0) continue;
          if (isset($hadthat[$extid])) continue;
          $to_check["Q$q"] = $extid;
      }

      if (count($to_check) > 0) {
          $to_check2 = "'" . implode("','", array_keys($to_check)) . "'";
          $skip = array();
          $exists = array();

          // Which of the candidate items exist at all?
          $dbwd = openDB('wikidata', 'wikidata', true);
          $sql = "SELECT page_title FROM page WHERE page_title IN ($to_check2) AND page_namespace=0";
          if (!$result = $dbwd->query($sql)) {
              print 'There was an error running the query Y1 [' . $dbwd->error . ']' . "\n$sql\n\n";
              return;
          }
          while ($row = $result->fetch_object()) {
              $exists[$row->page_title] = 1;
          }

          // Skip items whose edit summaries show this very value was set on the
          // property at some point.
          $sql = "select page_title,rev_comment from revision,page where page_title IN ($to_check2) and rev_page=page_id AND page_namespace=0 AND rev_comment LIKE '%[[Property:P$prop]]%'";
          // NOTE(review): the connection is opened a second time here; kept as
          // in the original - confirm whether this is needed.
          $dbwd = openDB('wikidata', 'wikidata', true);
          if (!$result = $dbwd->query($sql)) {
              print 'There was an error running the query Y2 [' . $dbwd->error . ']' . "\n";
              return;
          }
          while ($row = $result->fetch_object()) {
              $title = $row->page_title;
              if (!isset($to_check[$title])) continue;
              $needle = ']]: ' . $to_check[$title];
              if (FALSE === stristr($row->rev_comment, $needle)) continue;
              $skip[$title] = 1;
          }

          // Single value only, enforced for all properties here!
          $sql = "SELECT page_title FROM page,pagelinks WHERE page_title IN ($to_check2) AND page_namespace=0 AND pl_from=page_id AND pl_namespace=120 AND pl_title='P$prop'";
          if (!$result = $dbwd->query($sql)) {
              die('There was an error running the query Y3 [' . $dbwd->error . ']' . "\n");
          }
          while ($row = $result->fetch_object()) {
              $skip[$row->page_title] = 1;
          }

          foreach ($to_check as $title => $extid) {
              if (isset($skip[$title])) continue;
              if (!isset($exists[$title])) continue;
              $qs[] = "$title\tP$prop\t\"$extid\"";
          }
      }

      // 6. New items for entries with q = -1.
      foreach ($extid2q as $entry) {
          if ($entry->q != -1) continue;
          if (preg_match('/^fake_id/', $entry->ext_id)) continue;
          $qs[] = "CREATE";
          $qs[] = "LAST\tL$lang\t" . formatLabel($entry->ext_name);
          if ($lang != 'en' and $entry->type == 'person') $qs[] = "LAST\tLen\t" . formatLabel($entry->ext_name);
          if ($entry->type == 'person') $qs[] = "LAST\tP31\tQ5";
          $qs[] = "LAST\tP$prop\t\"{$entry->ext_id}\"";
      }
  }
  // Output file for the generated QuickStatements commands; any previous
  // version is removed first (best effort, a missing file is not an error).
  $qs_file = '/data/project/mix-n-match/public_html/qs.txt';
  @unlink($qs_file);

  # Init
  $sql_ignore_catalogs = " AND id NOT IN (506)"; // Hard ignore, because data problem in source
  $db = openMixNMatchDB();
  $logs = array();
  $qs = array();

  // All active catalogs, keyed by catalog ID.
  $catalogs = array();
  $sql = "SELECT * FROM catalog WHERE `active`=1";
  $sql .= $sql_ignore_catalogs;
  if (!$result = $db->query($sql)) {
      die('There was an error running the query 1 [' . $db->error . ']' . "\n$sql\n\n");
  }
  while ($o = $result->fetch_object()) {
      $catalogs[$o->id] = $o;
  }

  # Check all catalogs with property
  // catalog ID => property number, for active catalogs with a property and no qualifier.
  $todo = array();
  $sql = "SELECT * FROM catalog WHERE `active`=1 AND wd_prop is not null and wd_qual is null";
  $sql .= $sql_ignore_catalogs;
  if (!$result = $db->query($sql)) {
      die('There was an error running the query 2 [' . $db->error . ']' . "\n$sql\n\n");
  }
  while ($o = $result->fetch_object()) {
      $todo[$o->id] = $o->wd_prop;
  }

  // Optional first CLI argument: a catalog ID, or the word "random".
  // Without an argument every catalog in $todo is processed.
  $the_catalog = isset($argv[1]) ? $argv[1] : null;
  $specific_catalog = isset($the_catalog);
  if ($the_catalog === 'random' && count($todo) > 0) {
      // Draw from $todo, not from $catalogs: only catalogs in $todo are ever
      // passed to checkCatalog(), so any other pick would make the run a no-op.
      $the_catalog = array_rand($todo);
  }

  foreach ($todo as $catalog => $prop) {
      if ($specific_catalog and $the_catalog != $catalog) continue;
      print "Running catalog #" . $catalog . ", P" . $prop . "\n";
      checkCatalog($catalog, $prop);
  }

  // Generate quick_statements file (starts with a UTF-8 byte order mark)
  if (file_put_contents($qs_file, "\xEF\xBB\xBF" . implode("\n", $qs)) === false) {
      print "Could not write $qs_file\n";
  }

  // Update Wikidata report: build one wikitext page per catalog from $logs,
  // which is organised as issue title => catalog ID => list of lines.
  $max_lines = 400;
  $ts = date('Ymd');
  $wiki_texts = array();
  $wiki_header = "A report for the [https://tools.wmflabs.org/mix-n-match/ Mix'n'match] tool. '''This page will be replaced regularly!'''\n";
  foreach ($logs as $title => $v0) {
      foreach ($v0 as $catalog => $lines) {
          if (!isset($wiki_texts[$catalog])) {
              $wiki_texts[$catalog] = $wiki_header;
              $wiki_texts[$catalog] .= "''Please note: If you fix something from this list on Wikidata, please fix it on Mix'n'match as well, if applicable. Otherwise, the error might be re-introduced from there.''\n";
              $wiki_texts[$catalog] .= "==[https://tools.wmflabs.org/mix-n-match/#/catalog/$catalog " . $catalogs[$catalog]->name . "]==";
              $wiki_texts[$catalog] .= "\n" . $catalogs[$catalog]->desc;
          }

          // Each catalog page needs its own section heading for this issue type;
          // a catalog occurs only once per title, so the heading cannot repeat.
          $wiki_texts[$catalog] .= "\n== $title ==\n";

          if (count($lines) > $max_lines) {
              $wiki_texts[$catalog] .= "* " . count($lines) . " entries for this, not showing\n";
          } else {
              foreach ($lines as $l) {
                  $wiki_texts[$catalog] .= "# $l\n";
              }
          }
      }
  }
  /**
   * Write a report page on Wikidata unless its content is already up to date.
   *
   * Reads the bot credentials from bot.ini, logs in through the `wikipedia`
   * bot class, fetches the current page text and only edits when the trimmed
   * texts differ. The edit summary is "Update " followed by the global $ts.
   *
   * @param string $wiki    New wikitext for the page.
   * @param mixed  $catalog Catalog ID; a value > 0 selects the sub-page
   *                        "/<catalog>", anything else the main report page.
   * @return void
   */
  function updateCatalog($wiki, $catalog)
  {
      global $ts;

      $ini = parse_ini_file('/data/project/mix-n-match/bot.ini');
      if (!isset($ini['user'], $ini['pass'])) {
          // parse_ini_file() returns false on failure; going on would mean
          // logging in with empty credentials and still attempting the edit.
          print "Cannot read bot credentials, not updating report page\n";
          return;
      }
      $wiki_user = $ini['user'];
      $wiki_pass = $ini['pass'];

      $w = new wikipedia;
      $w->quiet = true;
      $w->url = "https://www.wikidata.org/w/api.php";
      $w->setUserAgent('User-Agent: ' . $wiki_user . ' (http://www.wikidata.org/wiki/User:' . str_replace(' ', '_', $wiki_user) . ')');
      $w->login($wiki_user, $wiki_pass);

      $page = "User:Magnus Manske/Mix'n'match report";
      if ($catalog > 0) {
          $page .= '/' . $catalog;
      }
      $title = str_replace(' ', '_', $page);

      $current = $w->getpage($title);
      if (trim($current) == trim($wiki)) {
          print "Skipping $page\n";
          return;
      }
      $w->edit($title, $wiki, "Update $ts");
  }
  // Overview page: one table row per catalog that has a property and no
  // qualifier. While building the table, each catalog that produced report
  // text gets its own sub-page written as well.
  $wiki = $wiki_header;
  $wiki .= "\n''red links indicate there never was an issue with that catalog''\n";
  $wiki .= "\n{| class='wikitable'";
  $wiki .= "\n!Catalog!!Report!!Wikidata Property";
  foreach ($catalogs as $catalog => $v) {
      if (!isset($v->wd_prop) or isset($v->wd_qual)) continue;

      $wiki .= "\n|-";
      // Same URL scheme as the catalog link in the per-catalog report header.
      $wiki .= "\n|[https://tools.wmflabs.org/mix-n-match/#/catalog/$catalog #$catalog]";
      $wiki .= "\n|[[/$catalog|{$v->name}]]";
      // The guard above ensures wd_prop is set at this point, so the property
      // cell can be emitted unconditionally.
      $wiki .= "\n|" . '{{' . "P|{$v->wd_prop}" . '}}';

      if (!isset($wiki_texts[$catalog])) continue;
      updateCatalog($wiki_texts[$catalog], $catalog);
  }
  $wiki .= "\n|}\n";
  updateCatalog($wiki, 0);
  //file_get_contents ( 'https://tools.wmflabs.org/mix-n-match/api.php?query=update_overview' ) ; // Update stats
  286. ?>