PageRenderTime 26ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/rake_tasks/fix_wikipedia.php

https://github.com/EOL/eol_php_code
PHP | 287 lines | 247 code | 26 blank | 14 comment | 69 complexity | ab78025cb9a0f18014c92e5431ad0abe MD5 | raw file
  1. <?php
  2. namespace php_active_record;
  3. include_once(dirname(__FILE__) . "/../config/environment.php");
  4. $mysqli = &$GLOBALS['db_connection'];
  5. exit;
  6. $latest_harvest_event_id = get_latest_harvest_event();
  7. if(!$latest_harvest_event_id) exit;
  8. define('LATEST_WIKIPEDIA_HARVEST_EVENT_ID', $latest_harvest_event_id);
  9. $data_object_guids = get_guids_of_wikipedia_objects();
  10. $guids_of_curated_objects = get_guids_of_curated_objects($data_object_guids);
  11. print_r($guids_of_curated_objects);
  12. // check_revision_history(array_keys($guids_of_curated_objects));
  13. // fix_previous_events();
  14. // turn_preview_objects_visible();
  15. // reindex_current_and_previous_events();
  16. index_old_objects();
  17. exit;
  18. function get_latest_harvest_event()
  19. {
  20. $result = $GLOBALS['mysqli_connection']->query("SELECT max(id) as max FROM harvest_events WHERE resource_id = 80");
  21. if($result && $row=$result->fetch_assoc())
  22. {
  23. return $row['max'];
  24. }
  25. }
  26. function get_guids_of_wikipedia_objects()
  27. {
  28. $data_object_guids = array();
  29. $wikipedia_resource = Resource::wikipedia();
  30. $current_harvest_id = $wikipedia_resource->most_recent_published_harvest_event_id();
  31. $i = 0;
  32. foreach($GLOBALS['mysqli_connection']->iterate_file("SELECT dohe.guid FROM harvest_events he JOIN data_objects_harvest_events dohe ON (he.id=dohe.harvest_event_id) WHERE he.resource_id=80 AND he.began_at>'2011-06-01' ORDER BY he.id DESC LIMIT 600000") as $row_num => $row)
  33. {
  34. if($i % 1000 == 0) echo "$i :: ". count($data_object_guids) ." :: ". memory_get_usage() ." :: ". time_elapsed() ."\n";
  35. $i++;
  36. $data_object_guids[$row[0]] = 1;
  37. }
  38. return $data_object_guids;
  39. }
  40. function check_revision_history($guids_of_curated_objects)
  41. {
  42. $GLOBALS['mysqli_connection']->begin_transaction();
  43. foreach($guids_of_curated_objects as $guid)
  44. {
  45. // check for curation actions
  46. $query = "SELECT do.guid, ta.name
  47. FROM data_objects do
  48. JOIN
  49. (".LOGGING_DB.".curator_activity_logs cal
  50. JOIN ".LOGGING_DB.".translated_activities ta ON (cal.activity_id=ta.activity_id AND ta.language_id=". Language::english()->id ." AND ta.name != 'choose_exemplar_article'))
  51. ON (do.id=cal.target_id AND (cal.changeable_object_type_id=1 OR cal.changeable_object_type_id=9))
  52. WHERE do.guid = '$guid'
  53. ORDER BY cal.created_at ASC";
  54. $actions_on_this_object = array();
  55. $result = $GLOBALS['mysqli_connection']->query($query);
  56. while($result && $row=$result->fetch_assoc())
  57. {
  58. if($row['name'] == 'trusted') unset($actions_on_this_object['untrusted']);
  59. if($row['name'] == 'untrusted') unset($actions_on_this_object['trusted']);
  60. if($row['name'] == 'show') unset($actions_on_this_object['hide']);
  61. if($row['name'] == 'hide') unset($actions_on_this_object['show']);
  62. if(isset($actions_on_this_object[$row['name']]))
  63. {
  64. $actions_on_this_object[$row['name']]++;
  65. }else $actions_on_this_object[$row['name']] = 1;
  66. }
  67. unset($actions_on_this_object['add_association']);
  68. unset($actions_on_this_object['remove_association']);
  69. if(count($actions_on_this_object) == 1 && @$actions_on_this_object['trusted'] >= 1)
  70. {
  71. set_vetted_status($guid, Vetted::trusted()->id);
  72. set_visibility($guid, Visibility::visible()->id);
  73. }elseif(count($actions_on_this_object) == 2 && @$actions_on_this_object['trusted'] >= 1 && @$actions_on_this_object['show'] >= 1)
  74. {
  75. set_vetted_status($guid, Vetted::trusted()->id);
  76. set_visibility($guid, Visibility::visible()->id);
  77. }elseif(count($actions_on_this_object) == 1 && @$actions_on_this_object['untrusted'] >= 1)
  78. {
  79. set_vetted_status($guid, Vetted::untrusted()->id);
  80. set_visibility($guid, Visibility::invisible()->id);
  81. }elseif(count($actions_on_this_object) == 2 && @$actions_on_this_object['untrusted'] >= 1 && @$actions_on_this_object['show'] >= 1)
  82. {
  83. set_vetted_status($guid, Vetted::untrusted()->id);
  84. set_visibility($guid, Visibility::invisible()->id);
  85. }elseif(count($actions_on_this_object) == 2 && @$actions_on_this_object['untrusted'] >= 1 && @$actions_on_this_object['hide'] >= 1)
  86. {
  87. set_vetted_status($guid, Vetted::untrusted()->id);
  88. set_visibility($guid, Visibility::invisible()->id);
  89. }elseif(count($actions_on_this_object) == 1 && @$actions_on_this_object['inappropriate'] >= 1)
  90. {
  91. set_vetted_status($guid, Vetted::untrusted()->id);
  92. set_visibility($guid, Visibility::invisible()->id);
  93. }elseif(count($actions_on_this_object) == 2 && @$actions_on_this_object['inappropriate'] >= 1 && @$actions_on_this_object['untrusted'] >= 1)
  94. {
  95. set_vetted_status($guid, Vetted::untrusted()->id);
  96. set_visibility($guid, Visibility::invisible()->id);
  97. }elseif(count($actions_on_this_object) == 1 && @$actions_on_this_object['hide'] >= 1)
  98. {
  99. set_visibility($guid, Visibility::invisible()->id);
  100. }elseif(count($actions_on_this_object) == 2 && @$actions_on_this_object['trusted'] >= 1 && @$actions_on_this_object['hide'] >= 1)
  101. {
  102. if($best_version_attributes = get_representative_version($guid))
  103. {
  104. set_vetted_status($guid, $best_version_attributes['vetted_id']);
  105. set_visibility($guid, $best_version_attributes['visibility_id']);
  106. }
  107. }elseif(count($actions_on_this_object) == 1 && @$actions_on_this_object['show'] >= 1)
  108. {
  109. // do nothing, it was just shown
  110. // echo "XXXXXXXXXA $guid\n";
  111. }else
  112. {
  113. if($best_version_attributes = get_representative_version($guid))
  114. {
  115. set_vetted_status($guid, $best_version_attributes['vetted_id']);
  116. set_visibility($guid, $best_version_attributes['visibility_id']);
  117. }
  118. }
  119. }
  120. $GLOBALS['mysqli_connection']->end_transaction();
  121. $count = count($guids_of_curated_objects);
  122. echo "Count: $count\n";
  123. }
  124. function get_guids_of_curated_objects($data_object_guids)
  125. {
  126. $guids_of_objects_with_activities = array();
  127. $guids_of_objects_with_different_values = array();
  128. $guids_of_curated_objects = array();
  129. $batches = array_chunk(array_keys($data_object_guids), 5000);
  130. $i = 0;
  131. foreach($batches as $batch)
  132. {
  133. echo "$i :: ". count($guids_of_curated_objects) ." :: ". memory_get_usage() ." :: ". time_elapsed() ."\n";
  134. $i++;
  135. // Objects which curator actions
  136. $query = "SELECT do.guid, do.id, ta.name, cal.created_at
  137. FROM data_objects do
  138. JOIN ".LOGGING_DB.".curator_activity_logs cal ON (do.id=cal.target_id AND cal.changeable_object_type_id=1)
  139. JOIN ".LOGGING_DB.".translated_activities ta ON (cal.activity_id=ta.activity_id AND ta.language_id=". Language::english()->id ." AND ta.name != 'choose_exemplar_article')
  140. WHERE do.guid IN ('". implode("','", $batch) ."')";
  141. $result = $GLOBALS['mysqli_connection']->query($query);
  142. while($result && $row=$result->fetch_assoc())
  143. {
  144. $guids_of_curated_objects[$row['guid']] = 1;
  145. $guids_of_objects_with_activities[$row['guid']] = 1;
  146. }
  147. // Objects whose vetted or visibility IDs have changed
  148. $query = "SELECT do.guid, do.id, dohe.*
  149. FROM data_objects do
  150. JOIN data_objects_hierarchy_entries dohe ON (do.id=dohe.data_object_id)
  151. WHERE do.guid IN ('". implode("','", $batch) ."')
  152. AND do.id > 2919703
  153. AND (dohe.vetted_id != ". Vetted::unknown()->id ." OR dohe.visibility_id != ". Visibility::visible()->id .")";
  154. $result = $GLOBALS['mysqli_connection']->query($query);
  155. while($result && $row=$result->fetch_assoc())
  156. {
  157. $guids_of_curated_objects[$row['guid']] = 1;
  158. $guids_of_objects_with_different_values[$row['guid']] = 1;
  159. }
  160. }
  161. return $guids_of_curated_objects;
  162. }
  163. function get_representative_version($guid)
  164. {
  165. // this object will look like it should be hidden, but it shouldn't
  166. if($guid == '09532fbf0b3e88d7d949aabf437a1670') return;
  167. $query = "SELECT do.guid, do.id, do.published, do.created_at, dohe.*
  168. FROM data_objects do
  169. JOIN data_objects_hierarchy_entries dohe ON (do.id=dohe.data_object_id)
  170. WHERE do.guid IN ('$guid')
  171. ORDER BY do.id DESC";
  172. $result = $GLOBALS['mysqli_connection']->query($query);
  173. while($result && $row=$result->fetch_assoc())
  174. {
  175. $id = $row['id'];
  176. $vetted_id = $row['vetted_id'];
  177. $visibility_id = $row['visibility_id'];
  178. $published = $row['published'];
  179. if($vetted_id != 0)
  180. {
  181. if($vetted_id == 4) $visibility_id = 0;
  182. elseif($vetted_id == 6) { $vetted_id = 4; $visibility_id = 0; }
  183. return array('data_object_id' => $id, 'vetted_id' => $vetted_id, 'visibility_id' => $visibility_id);
  184. }
  185. }
  186. $result->data_seek(0);
  187. while($result && $row=$result->fetch_assoc())
  188. {
  189. $id = $row['id'];
  190. $vetted_id = $row['vetted_id'];
  191. $visibility_id = $row['visibility_id'];
  192. $published = $row['published'];
  193. if($visibility_id != 2 && $visibility_id != 1)
  194. {
  195. if($vetted_id == 4) $visibility_id = 0;
  196. elseif($vetted_id == 6) { $vetted_id = 4; $visibility_id = 0; }
  197. return array('data_object_id' => $id, 'vetted_id' => $vetted_id, 'visibility_id' => $visibility_id);
  198. }
  199. }
  200. }
  201. function set_vetted_status($guid, $vetted_id)
  202. {
  203. $query = "UPDATE data_objects_harvest_events dohe JOIN data_objects_hierarchy_entries dohent ON (dohe.data_object_id=dohent.data_object_id) SET dohent.vetted_id=$vetted_id WHERE dohe.harvest_event_id=".LATEST_WIKIPEDIA_HARVEST_EVENT_ID." AND dohe.guid='$guid' AND dohent.vetted_id!=$vetted_id";
  204. $GLOBALS['mysqli_connection']->update($query);
  205. }
  206. function set_visibility($guid, $visibility_id)
  207. {
  208. $query = "UPDATE data_objects_harvest_events dohe JOIN data_objects_hierarchy_entries dohent ON (dohe.data_object_id=dohent.data_object_id) SET dohent.visibility_id=$visibility_id WHERE dohe.harvest_event_id=".LATEST_WIKIPEDIA_HARVEST_EVENT_ID." AND dohe.guid='$guid' AND dohent.visibility_id!=$visibility_id";
  209. $GLOBALS['mysqli_connection']->update($query);
  210. }
  211. function turn_preview_objects_visible()
  212. {
  213. // set objects in latest event which are preview, to visible
  214. $result = $GLOBALS['mysqli_connection']->update("UPDATE data_objects_harvest_events dohe JOIN data_objects_hierarchy_entries dohent ON (dohe.data_object_id=dohent.data_object_id) SET dohent.visibility_id=1 WHERE dohe.harvest_event_id=".LATEST_WIKIPEDIA_HARVEST_EVENT_ID." AND dohent.visibility_id=2");
  215. // make objects published
  216. $GLOBALS['db_connection']->update_where("data_objects", "id", "SELECT do.id FROM data_objects_harvest_events dohe JOIN data_objects do ON (dohe.data_object_id=do.id) WHERE dohe.harvest_event_id=".LATEST_WIKIPEDIA_HARVEST_EVENT_ID." and do.published!=1", "published=1");
  217. }
  218. function fix_previous_events()
  219. {
  220. $result = $GLOBALS['mysqli_connection']->query("SELECT id FROM harvest_events WHERE resource_id = 80 AND id!=".LATEST_WIKIPEDIA_HARVEST_EVENT_ID." ORDER BY id desc LIMIT 3");
  221. while($result && $row=$result->fetch_assoc())
  222. {
  223. $harvest_event_id = $row['id'];
  224. // set objects in previous event which are preview, to visible
  225. $GLOBALS['mysqli_connection']->update("UPDATE data_objects_harvest_events dohe JOIN data_objects_hierarchy_entries dohent ON (dohe.data_object_id=dohent.data_object_id) SET dohent.visibility_id=1 WHERE dohe.harvest_event_id=$harvest_event_id AND dohent.visibility_id=2");
  226. // // make objects unpublished
  227. $GLOBALS['db_connection']->update_where("data_objects", "id", "SELECT do.id FROM data_objects_harvest_events dohe JOIN data_objects do ON (dohe.data_object_id=do.id) WHERE dohe.harvest_event_id=$harvest_event_id and do.published=1", "published=0");
  228. }
  229. }
  230. function reindex_current_and_previous_events()
  231. {
  232. $latest_harvest_event = HarvestEvent::find(LATEST_WIKIPEDIA_HARVEST_EVENT_ID);
  233. $latest_harvest_event->index_for_search();
  234. $result = $GLOBALS['mysqli_connection']->query("SELECT id FROM harvest_events WHERE resource_id = 80 AND id!=".LATEST_WIKIPEDIA_HARVEST_EVENT_ID." ORDER BY id desc LIMIT 3");
  235. while($result && $row=$result->fetch_assoc())
  236. {
  237. $harvest_event_id = $row['id'];
  238. $latest_harvest_event = HarvestEvent::find($harvest_event_id);
  239. $latest_harvest_event->index_for_search(LATEST_WIKIPEDIA_HARVEST_EVENT_ID);
  240. }
  241. }
  242. function index_old_objects()
  243. {
  244. $data_object_ids = array();
  245. $result = $GLOBALS['mysqli_connection']->query("SELECT id FROM harvest_events WHERE resource_id = 80 AND id!=".LATEST_WIKIPEDIA_HARVEST_EVENT_ID." ORDER BY id desc LIMIT 2");
  246. while($result && $row=$result->fetch_assoc())
  247. {
  248. $harvest_event_id = $row['id'];
  249. $query = "SELECT dohe.data_object_id
  250. FROM data_objects_harvest_events dohe
  251. LEFT JOIN data_objects_harvest_events dohe2 ON (dohe.data_object_id = dohe2.data_object_id AND dohe2.harvest_event_id = ".LATEST_WIKIPEDIA_HARVEST_EVENT_ID.")
  252. WHERE dohe.harvest_event_id = $harvest_event_id AND dohe2.data_object_id IS NULL";
  253. foreach($GLOBALS['db_connection']->iterate_file($query) as $row_num => $row) $data_object_ids[$row[0]] = 1;
  254. }
  255. $data_object_ids = array_keys($data_object_ids);
  256. $object_indexer = new DataObjectAncestriesIndexer();
  257. $object_indexer->index_data_objects($data_object_ids);
  258. $search_indexer = new SiteSearchIndexer();
  259. if($data_object_ids) $search_indexer->index_type('DataObject', $data_object_ids);
  260. }
  261. ?>