PageRenderTime 55ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/extensions/MetavidWiki/maintenance/scrape_and_insert.inc.php

https://github.com/ChuguluGames/mediawiki-svn
PHP | 927 lines | 790 code | 24 blank | 113 comment | 95 complexity | f22bfb18f67fc647c8bb27e8ca21f580 MD5 | raw file
  1. <?php
  2. /*
  3. * scrape_and_insert.inc.php Created on Feb 14, 2008
  4. *
  5. * All Metavid Wiki code is Released under the GPL2
  6. * for more info visit http://metavid.org/wiki/Code
  7. *
  8. * @author Michael Dale
  9. * @email dale@ucsc.edu
  10. * @url http://metavid.org
  11. */
  12. require_once ( '../../../maintenance/commandLine.inc' );
  13. class MV_BillScraper extends MV_BaseScraper {
  14. var $base_url = 'http://www.c-spanarchives.org/congress/';
  15. var $base_query = '?q=node/69850';
  16. var $govTrack_bill_url = 'http://www.govtrack.us/congress/bill.xpd?bill=';
  17. var $mapLightBillSearch = 'http://maplight.org/map/us/bill/search/';
  18. var $mapLightBillInfo = 'http://maplight.org/map/us/bill/$1/default';
  19. var $mapLightInterestG = 'http://maplight.org/map/us/interest/$1/view/all';
  20. var $mapLightInterestGBills = 'http://maplight.org/map/us/interest/$1/bills';
  21. // flag to control maplight lookup
  22. var $bill_name_maplight_lookup = true;
  23. // switch on letter types (C-SPAN bill prefix => GovTrack bill type code):
  24. var $bill_types = array( 'H.J.RES.' => 'hj', 'H.R.' => 'h', 'H.RES.' => 'hr',
  25. 'S.CON.RES.' => 'sc', 'S.J.RES' => 'sj', 'S.RES.1' => 'sr', 'S.' => 's' );
  26. var $bill_titles = array();
  27. var $mapLight_cache = array();
  /**
   * Reads the stream selection from the command line and fills $this->streams.
   *
   * Uses the global $options / $args arrays. Either `stream_name` or `s`
   * must be present in $options; its value is used as the index into $args.
   * If that argument is the literal 'all', every row of `mv_streams` is
   * loaded; otherwise a single MV_Stream is created for the given name and
   * the script dies if doesStreamExist() reports it missing.
   */
  function procArguments() {
    global $options, $args;

    $hasLongOption = isset( $options['stream_name'] );
    if ( !$hasLongOption && !isset( $options['s'] ) ) {
      die( "error missing stream name\n" );
    }

    // the long option wins when both are given
    if ( $hasLongOption ) {
      $argKey = $options['stream_name'];
    } else {
      $argKey = $options['s'];
    }

    if ( $args[$argKey] != 'all' ) {
      // single named stream: it must already exist
      $requested = $args[$argKey];
      $this->streams[$requested] = new MV_Stream( array( 'name' => $requested ) );
      if ( !$this->streams[$requested]->doesStreamExist() ) {
        die( 'error: stream ' . $requested . ' does not exist' );
      }
      print "processing Stream: $requested \n";
      return;
    }

    // 'all': put every stream in the wiki into the stream list
    $db = wfGetDB( DB_SLAVE );
    print "do all streams\n";
    $res = $db->query( 'SELECT * FROM `mv_streams`' );
    while ( $streamRow = $db->fetchObject( $res ) ) {
      $this->streams[$streamRow->name] = new MV_Stream( $streamRow );
    }
  }
  /**
   * For every stream in $this->streams: scrape the C-SPAN listing for the
   * stream's date, align the listed speakers with the on-screen person data
   * from the `metavid` database, and write the resulting "Thomas_en" and
   * "Anno_en" wiki pages.
   *
   * Steps per stream:
   *  1. Work out the request date (falls back to the date embedded in the
   *     stream name when it disagrees with date_start_time).
   *  2. Fetch and parse the C-SPAN page into $cspan_person_ary.
   *  3. Load the on-screen person runs from the database into $db_person_ary.
   *  4. Walk both lists looking for prev/current/next speaker agreement and
   *     stamp wiki_start_time / wiki_end_time onto the C-SPAN entries.
   *  5. For every stamped entry fetch the speech page, detect bills, write
   *     and protect the transcript page, and write the annotation page.
   *
   * Streams without a start time are skipped with a message. The script
   * dies if a mismatching stream name carries no parsable date.
   */
  function doScrapeInsert() {
    global $wgRestrictionTypes;

    // The bill-mention patterns only depend on $this->bill_types, so they are
    // built once instead of once per speech.
    $bill_pattern_ary = array();
    foreach ( $this->bill_types as $cspanT => $govtrakT ) {
      // sometimes there are spaces before RES / CON in the text
      $cspanT = str_replace( 'RES', '[\s]?RES', $cspanT );
      $cspanT = str_replace( 'CON', '[\s]?CON', $cspanT );
      // replace . with \.[\s]?
      // NOTE(review): each entry is later used on its own as a pattern; the
      // surrounding "(" ")" then appear to act as the PCRE delimiters, which
      // would make those per-type patterns case sensitive - confirm.
      $bill_pattern_ary[] = '(' . str_replace( '.', '\\.[\s]?', $cspanT ) . '\s?[0-9]+)';
    }
    // one alternation over all bill types, case insensitive
    $bill_pattern = '/' . implode( '|', $bill_pattern_ary ) . '/i';

    foreach ( $this->streams as & $stream ) {
      if ( !isset( $stream->date_start_time ) ) {
        $stream->date_start_time = 0;
      }
      if ( $stream->date_start_time == 0 ) {
        print 'error stream ' . $stream->name . ' missing time info' . "\n";
        continue;
      }
      // house or senate, judged from the stream name
      $hors = ( strpos( $stream->name, 'house' ) !== false ) ? 'h' : 's';
      $date_req = date( 'Y-m-d', $stream->date_start_time );
      if ( strpos( $stream->name, date( 'm-d-y', $stream->date_start_time ) ) === false ) {
        // log the mismatch (every other access in this method uses ->name,
        // the original ->stream_name here looked like a typo)
        $dTitle = Title::newFromText( 'Archive:Stream_DateMissMatch' );
        append_to_wiki_page( $dTitle, 'DateMissMatch:[[Stream:' . $stream->name . ']]:' . date( 'm-d-y', $stream->date_start_time ) . "\n" );
        // use date from stream name, e.g. house_da_01-01-07_
        preg_match( '/[0-9]+\-[0-9]+\-[0-9][0-9]/U', $stream->name, $matches );
        if ( isset( $matches[0] ) ) {
          list( $month, $day, $year ) = explode( '-', $matches[0] );
          $date_req = '20' . $year . '-' . $month . '-' . $day;
        } else {
          die( 'could not find date in stream name' );
        }
      }

      $cspan_url = $this->base_url . $this->base_query . '&date=' . $date_req . '&hors=' . $hors;
      echo $cspan_url . "\n";
      $rawpage = $this->doRequest( $cspan_url );
      // get the title and href if present:
      $patern = '/overlib\(\'(.*)\((Length: ([^\)]*)).*CAPTION,\'<font size=2>(.*)<((.*href="([^"]*))|.*)>/';
      preg_match_all( $patern, $rawpage, $matches );

      // format matches into one entry per valid speaker:
      $cspan_person_ary = array();
      foreach ( $matches[0] as $k => $v ) {
        $href = '';
        if ( preg_match( '/href="(.*)"/', $matches[5][$k], $href_match ) ) {
          $href = $href_match[1];
        }
        $porg = str_replace( '<br />', ' ', $matches[4][$k] );
        $porg = preg_replace( '/[D|R|I]+\-\[.*\]/', '', $porg );
        // "Last, First" -> "First_Last"
        $pparts = explode( ',', $porg );
        if ( isset( $pparts[1] ) && isset( $pparts[0] ) ) {
          $pname = trim( $pparts[1] ) . '_' . trim( $pparts[0] );
          if ( mv_is_valid_person( $pname ) ) {
            $cspan_person_ary[] = array(
              'start_time' => strip_tags( $matches[1][$k] ),
              'length' => $matches[3][$k],
              'person_title' => str_replace( '<br />', ' ', $matches[4][$k] ),
              'Spoken_by' => $pname,
              'href' => $href
            );
          }
        }
      }

      // get people from metavid table
      $dbr = wfGetDB( DB_SLAVE );
      // the stream name is quoted by the database layer rather than pasted in raw
      $sql = 'SELECT (`people_time`.`time`-`streams`.`adj_start_time`) as `time`,
          `person_lookup`.`name_clean` as `Spoken_by`,
          `person_lookup`.`first` as `first`,
          `person_lookup`.`last` as `last`
        FROM `metavid`.`people_attr_stream_time` as `people_time`
        RIGHT JOIN `metavid`.`streams` as `streams` ON `streams`.`id`=`people_time`.`stream_fk`
        LEFT JOIN `metavid`.`people` as `person_lookup` ON `person_lookup`.`id` = `people_time`.`people_fk`
        WHERE `streams`.`name`=' . $dbr->addQuotes( $stream->name ) . '
        ORDER BY `people_time`.`time` ';
      $people_res = $dbr->query( $sql );

      // Collapse consecutive rows of the same person into one on-screen run.
      // Reset per stream: previously this array was never initialised, so it
      // was undefined for an empty result and carried over between streams.
      $db_person_ary = array();
      $cur_person = '';
      $curKey = 0;
      while ( $row = $dbr->fetchObject( $people_res ) ) {
        if ( !isset( $row->Spoken_by ) ) {
          continue;
        }
        $cur_row_person = $row->first . '_' . $row->last;
        if ( $cur_person != $cur_row_person ) {
          $db_person_ary[] = get_object_vars( $row );
          $curKey = count( $db_person_ary ) - 1;
          $db_person_ary[$curKey]['Spoken_by'] = $cur_row_person;
          $db_person_ary[$curKey]['start_time'] = $row->time;
          // not on screen a long time if only one hit:
          $db_person_ary[$curKey]['end_time'] = $row->time + 10;
          $cur_person = $cur_row_person;
        } else {
          // update the end time:
          $db_person_ary[$curKey]['end_time'] = $row->time;
        }
      }
      // list on screen times for everyone:
      foreach ( $db_person_ary as $row ) {
        print $row['Spoken_by'] . ' on screen for ' . ( $row['end_time'] - $row['start_time'] ) . "\n";
      }

      // Sync pass: find db runs whose previous/current/next speaker agree
      // with the C-SPAN listing.
      $cur_db_inx = 0;
      $cur_person = null;
      for ( $i = 0; $i < count( $cspan_person_ary ); $i++ ) {
        print "\tCSPAN: " . $cspan_person_ary[$i]['Spoken_by'] . ' on screen for ' . $cspan_person_ary[$i]['length'] . ' or:' . npt2seconds( $cspan_person_ary[$i]['length'] ) . "\n";
        $cur_person = $cspan_person_ary[$i]['Spoken_by'];
        // make sure next is not the same as current:
        // note: we don't group above since the same person can give two subsequent different speeches
        $next_person = $cur_person;
        $k_person_inx = 1;
        while ( $next_person == $cur_person ) {
          if ( isset( $cspan_person_ary[$i + $k_person_inx] ) ) {
            $potential_next_person = ( mv_is_valid_person( $cspan_person_ary[$i + $k_person_inx]['Spoken_by'] ) ) ?
              $cspan_person_ary[$i + $k_person_inx]['Spoken_by'] : null;
            if ( $potential_next_person == null && $k_person_inx == 1 ) {
              $next_person = null;
              break;
            } elseif ( $potential_next_person != null ) {
              $next_person = $potential_next_person;
            }
            $k_person_inx++;
          } else {
            // ran off the end of the listing
            $next_person = null;
          }
        }
        // no need to skip over repeats for prev (greedy look ahead below handles them)
        if ( isset( $cspan_person_ary[$i - 1] ) ) {
          $prev_person = ( mv_is_valid_person( $cspan_person_ary[$i - 1]['Spoken_by'] ) ) ?
            $cspan_person_ary[$i - 1]['Spoken_by'] : null;
        } else {
          $prev_person = null;
        }

        if ( !mv_is_valid_person( $cspan_person_ary[$i]['Spoken_by'] ) ) {
          continue;
        }
        if ( $prev_person == null && $next_person == null ) {
          print "error both prev and next are null skiping person\n";
          continue;
        }

        for ( $j = $cur_db_inx; $j < count( $db_person_ary ); $j++ ) {
          $prevMatch = $curMatch = $nextMatch = false;
          if ( $cur_db_inx == 0 || $prev_person == null ) {
            // no need to check prev in db_inx
            $prevMatch = true;
          } else {
            if ( $db_person_ary[$j - 1]['Spoken_by'] == $prev_person ) {
              $prevMatch = true;
            }
          }
          if ( isset( $db_person_ary[$j] ) ) {
            if ( isset( $cspan_person_ary[$i] ) ) {
              if ( $db_person_ary[$j]['Spoken_by'] == $cspan_person_ary[$i]['Spoken_by'] ) {
                $curMatch = true;
              }
            }
          }
          if ( $next_person == null ) {
            // no need to check next in db_inx
            $nextMatch = true;
          } else {
            if ( isset( $db_person_ary[$j + 1] ) ) {
              if ( $db_person_ary[$j + 1]['Spoken_by'] == $next_person ) {
                $nextMatch = true;
              }
            }
          }

          // if we have a match set do insert proc:
          if ( $prevMatch && $curMatch && $nextMatch ) {
            $cur_db_inx = $j;
            // push forward over all consecutive C-SPAN entries of the current
            // person, carving their lengths out of this db run
            $k = 0;
            $cur_start_time = $db_person_ary[$j]['start_time'];
            while ( $cur_person == $cspan_person_ary[$i + $k]['Spoken_by'] ) {
              $cspan_person_ary[$i + $k]['wiki_start_time'] = $cur_start_time;
              if ( npt2seconds( $cspan_person_ary[$i + $k]['length'] ) >
                $db_person_ary[$j]['end_time'] - $cur_start_time ) {
                // the speech is longer than what is left of the db run: clamp and stop
                $cspan_person_ary[$i + $k]['wiki_end_time'] = $db_person_ary[$j]['end_time'];
                print "a cspan insert sync " .
                  ' ' . $cspan_person_ary[$i + $k]['wiki_start_time'] . " to " .
                  $cspan_person_ary[$i + $k]['wiki_end_time'] . " of " .
                  $db_person_ary[$j]['end_time'] . " for: " .
                  $cspan_person_ary[$i]['Spoken_by'] . "\n";
                break;
              } else {
                $cspan_person_ary[$i + $k]['wiki_end_time'] = $cur_start_time +
                  npt2seconds( $cspan_person_ary[$i + $k]['length'] );
                $cur_start_time += npt2seconds( $cspan_person_ary[$i + $k]['length'] );
              }
              print "p cspan insert sync " .
                ' ' . $cspan_person_ary[$i + $k]['wiki_start_time'] . " to " .
                $cspan_person_ary[$i + $k]['wiki_end_time'] . " of " .
                $db_person_ary[$j]['end_time'] . " for: " .
                $cspan_person_ary[$i]['Spoken_by'] . "\n";
              $k++;
              if ( !isset( $cspan_person_ary[$i + $k] ) ) {
                break;
              }
            }
            $k--;
            // extend the last property if within 100 seconds
            if ( abs( $cspan_person_ary[$i + $k]['wiki_end_time'] - $db_person_ary[$j]['end_time'] ) < 100 ) {
              $cspan_person_ary[$i + $k]['wiki_end_time'] = $db_person_ary[$j]['end_time'];
              print "updated cspan insert for: " . $cspan_person_ary[$i]['Spoken_by'] .
                ' ' . $cspan_person_ary[$i + $k]['wiki_start_time'] . " to " .
                $cspan_person_ary[$i + $k]['wiki_end_time'] . " of " .
                $db_person_ary[$j]['end_time'] . "\n";
            }
            $k++;
            // move the index to the current:
            $i = $i + $k;
            // NOTE(review): this continues the inner $j loop (with the advanced
            // $i but the old prev/next persons), not the outer $i loop. Left
            // exactly as found; confirm whether "continue 2" was intended.
            continue;
          }
        }
      }

      print "Get Additonal C-SPAN Data For \"synced\" Data:\n";
      foreach ( $cspan_person_ary as $pData ) {
        // only entries that were aligned above carry wiki times
        if ( !isset( $pData['wiki_start_time'] ) ) {
          continue;
        }
        // init:
        $bill_categories = array();
        $annotate_body = '';
        $body = '';
        $bill_key = null;
        $rawpage = $this->doRequest( $this->base_url . $pData['href'] );
        preg_match( '/<\/td><th><center>([^<]*)<\/center><\/th><td>/U', $rawpage, $title_matches );
        preg_match( '/<table width="400">\n<tr><td>\n(.*)<\/tr><\/td>/', $rawpage, $page_matches );
        if ( isset( $title_matches[1] ) && isset( $page_matches[1] ) ) {
          $title = trim( $title_matches[1] );
          $body = $page_matches[1];
        } else {
          print "error can't find title or body\n";
          print "skip...";
          continue;
        }
        // do debate tag search:
        preg_match( '/<td colspan="2">Debate:\s*<[^>]*>([^<]*)/U', $rawpage, $debate_matches );
        if ( isset( $debate_matches[1] ) ) {
          $bill_key = trim( $debate_matches[1] );
          print "found debate: tag " . $bill_key . "\n";
          if ( $this->get_and_process_billid( $bill_key, $stream->date_start_time ) != null ) {
            $bill_categories[$bill_key] = $bill_key;
          }
        }
        // title fix hack for C-SPAN "motion to proceed" titles
        // @@todo add in the rest of the motions:
        if ( strpos( $title, 'MOTION TO PROCEED' ) !== false ) {
          $title = str_replace( 'MOTION TO PROCEED', '', $title );
        }
        // fix title case
        $title = ucwords( strtolower( $title ) );
        // don't capitalise a few of the words:
        $title = str_replace( array( ' And', ' Or', ' Of', ' A' ), array( ' and', ' or', ' of', ' a' ), $title );
        // replace '' with ``
        $body = str_replace( '\'\'', '``', $body );

        // look for bill names in the speech text
        preg_match_all( $bill_pattern, $body, $bill_matches );
        if ( isset( $bill_matches[1] ) ) {
          // NOTE(review): only the first entry of each capture group is
          // inspected, so later mentions in the same speech are not turned
          // into categories. Behaviour kept as found.
          foreach ( $bill_matches as $k => $bill_type_ary ) {
            if ( $k != 0 ) {
              if ( isset( $bill_type_ary[0] ) ) {
                $bill_name = $bill_type_ary[0];
              } elseif ( isset( $bill_type_ary[1] ) ) {
                $bill_name = $bill_type_ary[1];
              } else {
                continue;
              }
              if ( trim( $bill_name ) == '' ) {
                continue;
              }
              // if the first letter is lower case it is not likely a bill
              if ( islower( substr( $bill_name, 0, 1 ) ) ) {
                continue;
              }
              // conform white space and case:
              $bill_name = str_replace( array( 'S. ', 'Con. ', 'Res. ' ), array( 'S.', 'CON.', 'RES. ' ), $bill_name );
              // make sure it is not a false positive and load bill data from govTrack:
              if ( $this->get_and_process_billid( $bill_name, $stream->date_start_time ) ) {
                $bill_categories[$bill_name] = $bill_name;
              }
            }
          }
        }

        $speaker = str_replace( '_', ' ', $pData['Spoken_by'] );
        // add speech by attribute to annotation body:
        $annotate_body .= 'Speech By: [[Speech by:=' . $speaker . ']] ';
        // add speech by attribute to body as well
        $body .= "\n\n" . 'Speech By: [[Speech by:=' . $speaker . ']] ';
        // link any mentions of bills; the callback reads $this->bill_titles, so
        // it has to be bound to this instance rather than called via 'self'
        $body = preg_replace_callback( $bill_pattern_ary, array( $this, 'bill_pattern_cp' ), $body );
        // source the document:
        $body .= "\n\n" . 'Source: [[Data Source Name:=C-SPAN Congressional Chronicle]] [[Data Source URL:=' . $this->base_url . $pData['href'] . ']]';
        $body .= "\n";
        // add the title to the top of the page:
        $body = "===$title===\n" . $body;

        $cspan_title_str = $this->get_aligned_time_title( $pData, 'Thomas_en', $stream );
        if ( !$cspan_title_str ) {
          $cspan_title_str = 'Thomas_en:' . $stream->name . '/' .
            seconds2npt( $pData['wiki_start_time'] ) . '/' .
            seconds2npt( $pData['wiki_end_time'] );
        }
        $cspanTitle = Title::makeTitle( MV_NS_MVD, ucfirst( $cspan_title_str ) );
        do_update_wiki_page( $cspanTitle, $body );

        // protect editing of the official record (but allow moving for sync)
        $cspanTitle->loadRestrictions();
        $mRestrictions = array();
        foreach ( $wgRestrictionTypes as $action ) {
          // Fixme: this form currently requires individual selections,
          // but the db allows multiples separated by commas.
          $mRestrictions[$action] = implode( '', $cspanTitle->getRestrictions( $action ) );
        }
        $article = new Article( $cspanTitle );
        // NOTE(review): $mRestrictions['edit'] may be a string at this point;
        // writing a ['sysop'] key into it relies on old PHP string-to-array
        // conversion. Left as found - verify against updateRestrictions().
        $mRestrictions['edit']['sysop'] = true;
        $expiry = Block::infinity();
        $dbw = wfGetDB( DB_MASTER );
        $dbw->begin();
        $ok = $article->updateRestrictions( $mRestrictions, wfMsg( 'mv_source_material' ), false, $expiry );
        if ( $ok ) {
          print "updated permisions for " . $cspanTitle->getText() . "\n";
          $dbw->commit();
        } else {
          print "failed to update restrictions :(\n";
          // do not leave the transaction opened above dangling
          $dbw->rollback();
        }

        // process each bill into the annotation body
        $bill_lead_in = "\n\nBill ";
        foreach ( $bill_categories as $bill ) {
          if ( trim( $bill ) != '' ) {
            // Use the short title recorded for this bill when there is one;
            // fall back to the most recently processed bill's title (which is
            // what was previously used for every bill).
            $bill_short_title = isset( $this->bill_titles[$bill] )
              ? $this->bill_titles[$bill]
              : $this->cur_bill_short_title;
            // short title for the semantic link, bill key for the category
            $annotate_body .= $bill_lead_in . '[[Bill:=' . $bill_short_title . ']] ';
            $bill_lead_in = ' , ';
            $annotate_body .= "[[Category:$bill]] ";
          }
        }
        if ( trim( $title ) != '' ) {
          $annotate_body .= "[[Category:$title]]\n";
        }
        // see if we can align with an existing speech page:
        $anno_title_str = $this->get_aligned_time_title( $pData, 'Anno_en', $stream );
        if ( !$anno_title_str ) {
          $anno_title_str = 'Anno_en:' . $stream->name . '/' .
            seconds2npt( $pData['wiki_start_time'] ) . '/' .
            seconds2npt( $pData['wiki_end_time'] );
        }
        $annoTitle = Title::makeTitle( MV_NS_MVD, ucfirst( $anno_title_str ) );
        do_update_wiki_page( $annoTitle, $annotate_body );
      }
    }
    // break the reference left behind by the by-reference foreach
    unset( $stream );
  }
  /**
   * Looks for an existing 'Anno_en' MVD row near the given time span whose
   * Speech_by value equals $pData['Spoken_by'] and, if found, returns a page
   * title string built from that row's times.
   *
   * The search window is wiki_start_time - 120 to wiki_end_time + 120.
   * The lookup always queries the 'Anno_en' layer; $preFix only affects the
   * prefix of the returned title.
   *
   * @param $pData array entry with 'wiki_start_time', 'wiki_end_time' and 'Spoken_by'
   * @param $preFix string title prefix, e.g. 'Anno_en' or 'Thomas_en'
   * @param $stream object providing getStreamId() and ->name
   * @return string|false "$preFix:<stream name>/<start npt>/<end npt>" or false when nothing matches
   */
  function get_aligned_time_title( &$pData, $preFix = 'Anno_en', $stream ) {
    $mvd_rows = MV_Index::getMVDInRange(
      $stream->getStreamId(),
      $pData['wiki_start_time'] - 120,
      $pData['wiki_end_time'] + 120,
      'Anno_en',
      false,
      'Speech_by'
    );
    foreach ( $mvd_rows as $row ) {
      // rows without a speaker cannot be matched (also avoids an
      // undefined-property notice when the column is absent)
      if ( empty( $row->Speech_by ) ) {
        continue;
      }
      if ( $row->Speech_by == $pData['Spoken_by'] ) {
        print "match update existing: $row->Speech_by == " . $pData['Spoken_by'] . "\n";
        return $preFix . ':' . $stream->name . '/' .
          seconds2npt( $row->start_time ) . '/' .
          seconds2npt( $row->end_time );
      }
      print "\nno existing speech match:$row->Speech_by != " . $pData['Spoken_by'] . "\n\n";
    }
    return false;
  }
  /**
   * Regex callback: wraps a matched bill mention in a "Mentions Bill"
   * semantic link. When a title is recorded in $this->bill_titles for the
   * matched text, the link points at that title and shows the matched text.
   *
   * @param $matches array match array; only index 0 (the full match) is used
   * @return string wiki text replacing the match
   */
  function bill_pattern_cp( $matches ) {
    $mention = $matches[0];
    if ( !isset( $this->bill_titles[$mention] ) ) {
      return "[[Mentions Bill:={$mention}]]";
    }
    return '[[Mentions Bill:=' . $this->bill_titles[$mention] . '|' . $mention . ']]';
  }
  /**
   * Converts a C-SPAN bill key (e.g. "H.R. 1234") to a GovTrack bill id,
   * derives the OpenCongress id, looks up the MapLight id and hands all of
   * them to processBill().
   *
   * Side effects: sets $this->cur_session (only when $session is not passed
   * in) and $this->govTrackBillId (on success).
   *
   * @param $bill_key string C-SPAN style bill key
   * @param $stream_date int|string unix timestamp used to derive the congress session
   * @param $session string congress session number; derived from $stream_date when ''
   * @return string|null|false the GovTrack bill id, null when the bill type
   *   is not recognised, false when the bill key is empty
   */
  function get_and_process_billid( $bill_key, $stream_date = '', $session = '' ) {
    // add a space to the bill key after the bill type prefix
    foreach ( array_keys( $this->bill_types ) as $prefix ) {
      if ( strpos( $bill_key, $prefix ) !== false && strpos( $bill_key, $prefix . ' ' ) === false ) {
        $bill_key = str_replace( $prefix, $prefix . ' ', $bill_key );
      }
    }

    // first get the year to determine the congress session:
    if ( $session == '' ) {
      if ( is_numeric( $stream_date ) ) {
        // A new Congress starts every odd year, the 1st in 1789. This gives
        // the same values as the former 2001-2012 lookup table (107..112)
        // and keeps working for other years.
        $year = (int)date( 'Y', (int)$stream_date );
        $session = (string)( (int)floor( ( $year - 1789 ) / 2 ) + 1 );
      }
      $this->cur_session = $session;
    }

    $bill_key = trim( $bill_key );
    if ( $bill_key == '' ) {
      return false;
    }

    // the first matching prefix decides the bill type
    $govTrackBillId = null;
    $openCongBillId = false;
    foreach ( $this->bill_types as $cspanT => $govtrakT ) {
      $prefixLen = strlen( $cspanT );
      if ( substr( $bill_key, 0, $prefixLen ) === $cspanT ) {
        $billNumber = trim( substr( $bill_key, $prefixLen ) );
        $govTrackBillId = $govtrakT . $session . '-' . $billNumber;
        $openCongBillId = $session . '-' . strtolower( $govtrakT ) . $billNumber;
        break;
      }
    }
    if ( $govTrackBillId === null ) {
      // unknown bill type: previously the ids were simply undefined here;
      // bail out before spending a MapLight request on it
      print 'error in getting govTrack bill id on: ' . $bill_key . " (skipped)\n";
      return null;
    }

    // attempt to ascertain maplight bill id:
    $mapLightBillId = $this->getMAPLightBillId( $bill_key, $session );
    // fix strange govTrackBillID bug:
    $govTrackBillId = str_replace( 'Res.', '', $govTrackBillId );
    print "GOT bill id: $govTrackBillId from $bill_key\n";
    print "GOT openCon id: $openCongBillId from $bill_key\n";
    print "GOT mapLight id: $mapLightBillId from $bill_key\n";

    $this->processBill( $govTrackBillId, $bill_key, $openCongBillId, $mapLightBillId );
    $this->govTrackBillId = $govTrackBillId;
    return $this->govTrackBillId;
  }
  /**
   * Resolves the MapLight bill id for a bill key by querying the MapLight
   * bill search page.
   *
   * Two cases are recognised in the response: the search landed directly on
   * a bill page (an "active" Supporter tab link is present), or a result
   * list contains a link whose text is the bill key followed by
   * " (<session>".
   *
   * @param $bill_key string bill key, e.g. "H.R. 1234"
   * @param $session string congress session number
   * @return string|false the MapLight bill id, or false when none is found
   */
  function getMAPLightBillId( $bill_key, $session ) {
    if ( trim( $bill_key ) == '' ) {
      return false;
    }
    $search_url = $this->mapLightBillSearch . str_replace( ' ', '+', $bill_key );
    // doRequest() can return false; treat that like an empty page
    $raw_map_light = (string)$this->doRequest( $search_url );

    // check if we got redirected:
    $patern = '/<a href=\"\/map\/us\/bill\/([^\/]*)\/default" class="active">Supporter/';
    preg_match( $patern, $raw_map_light, $matches );
    if ( isset( $matches[1] ) ) {
      print "got redirected from search: " . $matches[1] . "\n";
      return $matches[1];
    }

    // Quote the key before embedding it: the dots in "H.R." would otherwise
    // match any character and other metacharacters could break the pattern.
    // Spaces are left alone by preg_quote and become optional whitespace.
    $key_pattern = str_replace( ' ', '\s?', preg_quote( $bill_key, '/' ) );
    $patern = '/<a href=\"\/map\/us\/bill\/([^"]*)">' . $key_pattern . '\s\(' . preg_quote( (string)$session, '/' ) . '/i';
    preg_match( $patern, $raw_map_light, $matches );
    if ( isset( $matches[1] ) ) {
      return $matches[1];
    }

    print "could not find bill id: $bill_key $session \n";
    print "at : " . $search_url . "\n";
    return false;
  }
  /**
   * Fetches the GovTrack page for a bill, extracts title, description,
   * Thomas id, introduction date, sponsor and cosponsors, adds MapLight
   * supporting/opposing interests, and writes the result as a {{Bill|...}}
   * template to the wiki page named after the bill's short title.
   *
   * Side effects: sets $this->cur_bill_short_title and
   * $this->bill_titles[$bill_key]. The script dies when the GovTrack page
   * has no <title> or no description meta tag.
   *
   * @param $govTrackBillId string GovTrack bill id
   * @param $bill_key string bill key as used on C-SPAN
   * @param $openCongBillId string|false OpenCongress bill id
   * @param $mapLightBillId string|false MapLight bill id
   * @param $forceUpdate bool currently unused
   * @param $doIntrestLookup bool when true, opposing interests are also
   *   passed to procMapLightInterest()
   * @return false|null false when the page could not be fetched, the title
   *   is empty or no wiki title could be built; nothing otherwise
   */
  function processBill( $govTrackBillId, $bill_key, $openCongBillId = false, $mapLightBillId = false, $forceUpdate = false, $doIntrestLookup = false ) {
    // get the bill title & its sponsor / co-sponsors:
    $bill_url = $this->govTrack_bill_url . $govTrackBillId;
    $rawGovTrackPage = $this->doRequest( $bill_url );
    if ( $rawGovTrackPage === false ) {
      return false;
    }

    /*****************************
     * Process Bill GovTrack info
     *****************************/
    print "gov_track id: " . $govTrackBillId . " from: " . $bill_url . "\n";
    // get title:
    $patern = '/<title>(.*)<\/title>/';
    if ( !preg_match( $patern, $rawGovTrackPage, $title_match ) ) {
      print $bill_url . "\n" . $patern . "\n" . $rawGovTrackPage;
      die( 'could not get title for bill: ' . $govTrackBillId );
    }
    // strip govtrack.us
    $page_title = str_replace( '(GovTrack.us)', '', $title_match[1] );
    if ( trim( $page_title ) == '' ) {
      print "empty title\n";
      return false;
    }
    $title_short = str_replace(
      array( '_', '...', ' [110th]', ' [109th]', ' [108th]', ' [107th]' ),
      array( ' ', '', '', '', '', '' ),
      $page_title
    );
    $this->cur_bill_short_title = $title_short;

    // the description is required; the old message printed an undefined
    // variable, so name the bill instead
    if ( !preg_match( '/<meta name="description" content="([^">]*)"/', $rawGovTrackPage, $desc_match ) ) {
      die( 'could not find title desc for bill: ' . $govTrackBillId );
    }
    $title_desc = $desc_match[1];
    $this->bill_titles[$bill_key] = $title_short;

    // get the thomas id
    preg_match( '/thomas\.loc\.gov\/cgi-bin\/bdquery\/z\?([^\"]*)/', $rawGovTrackPage, $thomas_match );
    // get introduced date
    preg_match( '/Introduced<\/nobr>[^>]*>[^>]*>[^>]*>([^<]*)/', $rawGovTrackPage, $date_intro_match );
    // get sponsor govtrack_id and look up the wiki name
    preg_match( '/usbill:sponsor[^<]*<a href="person.xpd\?id=([^"]*)/i', $rawGovTrackPage, $sponsor_match );
    $sponsor_name = '';
    if ( isset( $sponsor_match[1] ) ) {
      $sponsor_name = str_replace( '_', ' ', $this->get_wiki_name_from_govid( $sponsor_match[1] ) );
    }

    // Get the cosponsor chunk. Only cut it out when both markers are present
    // and in order; otherwise substr() was called with a false/negative range
    // and could pick up unrelated person links (including the sponsor).
    $cosponsor_ids = array();
    $scospon = strpos( $rawGovTrackPage, 'Cosponsors [as of' );
    $ecospon = strpos( $rawGovTrackPage, '<a href="/faq.xpd#cosponsors">' );
    if ( $scospon !== false && $ecospon !== false && $ecospon > $scospon ) {
      $cochunk = substr( $rawGovTrackPage, $scospon, $ecospon - $scospon );
      preg_match_all( '/person.xpd\?id=([^"]*)/', $cochunk, $cosponsor_match );
      $cosponsor_ids = $cosponsor_match[1];
    }

    // build the Bill template call
    $bp = "{{Bill|\n" .
      'GovTrackID=' . $govTrackBillId . "|\n";
    if ( isset( $thomas_match[1] ) ) {
      $bp .= 'ThomasID=' . $thomas_match[1] . "|\n";
    }
    if ( $openCongBillId ) {
      $bp .= 'OpenCongressBillID=' . $openCongBillId . "|\n";
    }
    if ( $mapLightBillId ) {
      $bp .= 'MapLightBillID=' . $mapLightBillId . "|\n";
    }
    if ( isset( $this->cur_session ) ) {
      $bp .= 'Session=' . $this->cur_session . "th session|\n";
    }
    $bp .= 'Bill Key=' . $bill_key . "|\n";
    if ( isset( $date_intro_match[1] ) ) {
      $bp .= 'Date Introduced=' . $date_intro_match[1] . "|\n";
    }
    if ( $title_desc ) {
      $bp .= 'Title Description=' . $title_desc . "|\n";
    }
    if ( $sponsor_name ) {
      $bp .= 'Sponsor=' . $sponsor_name . "|\n";
    }
    foreach ( $cosponsor_ids as $k => $govid ) {
      $cosponsor_name = $this->get_wiki_name_from_govid( $govid );
      if ( $cosponsor_name ) {
        // numbered by position in the page, so gaps are possible
        $bp .= 'Cosponsor ' . ( $k + 1 ) . '=' . $cosponsor_name . "|\n";
      }
    }

    /*****************************
     * Process MapLight Info
     *****************************/
    if ( $mapLightBillId ) {
      $bill_interest = $this->proccMapLightBillIntrests( $mapLightBillId );
      if ( $bill_interest ) {
        $i = 1;
        foreach ( $bill_interest['support'] as $interest ) {
          // Supporting interests are always processed. The original code also
          // repeated this call when $doIntrestLookup was set; that duplicate
          // is dropped.
          // NOTE(review): opposing interests are only processed when
          // $doIntrestLookup is set - confirm the asymmetry is intended.
          $this->procMapLightInterest( $interest );
          $bp .= 'Supporting Interest ' . $i . '=' . $interest['name'] . "|\n";
          $i++;
        }
        $i = 1;
        foreach ( $bill_interest['oppose'] as $interest ) {
          $bp .= 'Opposing Interest ' . $i . '=' . $interest['name'] . "|\n";
          $i++;
          if ( $doIntrestLookup ) {
            $this->procMapLightInterest( $interest );
          }
        }
      }
    }
    $bp .= "}}\n";

    // set up the base bill page:
    $wgBillTitle = Title::newFromText( $title_short );
    if ( !$wgBillTitle ) {
      // the short title could not be turned into a wiki title
      print 'could not build wiki title for bill: ' . $title_short . "\n";
      return false;
    }
    do_update_wiki_page( $wgBillTitle, $bp );
  }
  689. function procMapLightInterest( $interest ) {
  690. global $mvMaxContribPerInterest, $mvMaxForAgainstBills;
  691. if ( $this->bill_name_maplight_lookup ) {
  692. include_once( 'metavid2mvWiki.inc.php' );
  693. do_proc_interest( $interest['key'], $interest['name'] );
  694. }
  695. }
  696. // returns an array of interest in ['support'] & ['opposition'] .. also procces interest links
  697. function proccMapLightBillIntrests( $mapLightBillId ) {
  698. // print "map info: $this->mapLightBillInfo \n";
  699. print str_replace( '$1', $mapLightBillId, $this->mapLightBillInfo ) . "\n\n";
  700. $ret_ary = array( 'support' => array(), 'oppose' => array() );
  701. $bill_url = str_replace( '$1', $mapLightBillId, $this->mapLightBillInfo );
  702. $bill_page = $this->doRequest( $bill_url);
  703. // $bill_page = $this->doRequest('http://maplight.org/map/us/bill/10831/default');
  704. // print $bill_page;
  705. // ([^<]*)<\/a>)*
  706. // a href="\/map\/us\/interest\/([^"]*) class="interest"
  707. // class="organizations"\sid="for
  708. // preg_match_all('/class="organizations"\sid="for.*<ul class="industries list-clear">()*/',$bill_page, $matches);
  709. print "\n". $bill_url."\n";
  710. preg_match_all( '/href\=\"\/map\/us\/interest\/([^"]*)[^>]*>([^<]*)/', $bill_page, $matches, PREG_OFFSET_CAPTURE );
  711. $aginst_pos = strpos( $bill_page, 'id="against"' );
  712. // return empty arrays if we don't have info to give back:'
  713. if ( $aginst_pos === false )return $ret_ary;
  714. if ( !isset( $matches[1] ) )return $ret_ary;
  715. foreach ( $matches[1] as $inx => $intrest ) {
  716. if ( $intrest[1] < $aginst_pos ) {
  717. $ret_ary['support'][] = array( 'key' => $intrest[0], 'name' => htmlspecialchars_decode( $matches[2][$inx][0]) );
  718. } else {
  719. $ret_ary['oppose'][] = array( 'key' => $intrest[0], 'name' => htmlspecialchars_decode( $matches[2][$inx][0] ) );
  720. }
  721. }
  722. return $ret_ary;
  723. }
  724. function get_bill_name_from_mapLight_id( $mapBillId, $doLookup = true ) {
  725. global $mvForceUpdate;
  726. if ( !$mvForceUpdate ) {
  727. if ( !isset( $this->mapLight_bill_cache ) ) {
  728. $sql = 'SELECT * FROM `smw_attributes` WHERE `attribute_title` = \'MAPLight_Bill_ID\'';
  729. $dbr = wfGetDB( DB_SLAVE );
  730. $res = $dbr->query( $sql );
  731. while ( $row = $dbr->fetchObject( $res ) ) {
  732. $this->mapLight_bill_cache[$row->value_xsd] = $row->subject_title;
  733. }
  734. }
  735. }
  736. if ( !isset( $this->mapLight_bill_cache[$mapBillId] ) ) {
  737. if ( $doLookup ) {
  738. print "missing bill by mapId: $mapBillId retrieve it: \n";
  739. $raw_bill_page = $this->doRequest( 'http://www.maplight.org/map/us/bill/' . $mapBillId . '/default' );
  740. preg_match( '/title">([^-]*)-/', $raw_bill_page, $matches );
  741. if ( isset( $matches[1] ) )$bill_key = trim( $matches[1] );
  742. preg_match( '/map-bill-title">([^t]*)t/', $raw_bill_page, $matches );
  743. if ( isset( $matches[1] ) )$session_num = trim( $matches[1] );
  744. print " found bill key:$session_num $bill_key \n";
  745. // set a flag as to not get caught in infintate loop:
  746. $this->bill_name_maplight_lookup = false;
  747. $this->get_and_process_billid( $bill_key, '', $session_num );
  748. print " found bill title: " . $this->cur_bill_short_title . "\n";
  749. // should now have the bill name update the cache and return
  750. $this->mapLight_bill_cache[$mapBillId] = $this->cur_bill_short_title;
  751. } else {
  752. print "unable to find bill mapId: $mapBillId \n";
  753. return false;
  754. }
  755. }
  756. return $this->mapLight_bill_cache[$mapBillId];
  757. }
  758. function get_wiki_name_from_govid( $govID ) {
  759. if ( !isset( $this->govTrack_cache ) ) {
  760. $sql = 'SELECT * FROM `smw_attributes` WHERE `attribute_title` = \'GovTrack_Person_ID\'';
  761. $dbr = wfGetDB( DB_SLAVE );
  762. $res = $dbr->query( $sql );
  763. while ( $row = $dbr->fetchObject( $res ) ) {
  764. $this->govTrack_cache[$row->value_xsd] = $row->subject_title;
  765. }
  766. }
  767. if ( !isset( $this->govTrack_cache[$govID] ) ) {
  768. //$wgTitle = Title::newFromText( 'Archive:Missing_People' );
  769. print $govID . ' not found ' . "\n";
  770. //append_to_wiki_page( $wgTitle, "Missing GovTrack person: [[Missing GovTrackId:=$govID]][http://www.govtrack.us/congress/person.xpd?id=$govID] " );
  771. return false;
  772. }
  773. return str_replace( '_', ' ', $this->govTrack_cache[$govID] );
  774. }
  775. function get_wiki_name_from_maplightid( $mapID ) {
  776. if ( !isset( $this->mapLight_cache[$mapID] ) ) {
  777. //$sql = 'SELECT * FROM `smw_attributes` WHERE `attribute_title` = \'MAPLight_Person_ID\'';
  778. $query_string= "[[MAPLight Person ID::{$mapID}]]";
  779. $params=array('format' => 'broadtable',
  780. 'offset' => 0,
  781. 'limit' =>1);
  782. $results = array();
  783. $queryobj = SMWQueryProcessor::createQuery($query_string, $params, false, '', array());
  784. $queryobj->querymode = SMWQuery::MODE_INSTANCES;
  785. $res = smwfGetStore()->getQueryResult($queryobj);
  786. for($i=0;$i< $res->getCount();$i++){
  787. $v = $res->getNext();
  788. $v = current(current($v)->getContent());
  789. $this->mapLight_cache[$mapID] = $v->getXSDValue();
  790. }
  791. }
  792. if ( !isset( $this->mapLight_cache[$mapID] ) ) {
  793. $wgTitle = Title::newFromText( 'CongressVid:Missing_People' );
  794. print "{$query_string} No $mapID found\n";
  795. // append_to_wiki_page($wgTitle, "Missing MapLight person: [http://maplight.org/map/us/legislator/$mapID $mapID]");
  796. return false;
  797. }
  798. return str_replace( '_', ' ', $this->mapLight_cache[$mapID] );
  799. }
  800. }
  801. class MV_ArchiveOrgScrape extends MV_BaseScraper {
  802. function getFileList( $stream_name ) {
  803. //get the latest archive.org page:
  804. $raw_page = $this->doRequest( 'http://www.archive.org/details/mv_' . $stream_name, array(), $get_fresh=true );
  805. if($raw_page=='')
  806. return false;
  807. //print "Raw page: $raw_page";
  808. preg_match_all( '/href="(\/download\/mv_[^"]*)">([^<]*)<\/a>([^<]*)/', $raw_page, $matches );
  809. $files = array();
  810. if ( isset( $matches[1] ) ) {
  811. foreach ( $matches as $inx => $set ) {
  812. foreach ( $set as $k => $v ) {
  813. $files[$k][$inx] = trim( $v );
  814. }
  815. }
  816. //remove duplicates
  817. $orgFiles = $files;
  818. $files = array();
  819. $dupCheck=array();
  820. foreach( $orgFiles as $file ){
  821. if( !isset($dupCheck[ $file[1] ] )){
  822. $files[] = $file;
  823. $dupCheck[ $file[1] ] = true;
  824. }
  825. }
  826. } else {
  827. return false;
  828. }
  829. return $files;
  830. }
  831. }
  832. class MV_BaseScraper {
  833. var $number_of_tries = 3;
  834. /*
  835. * simple url cach using the mv_url_cache table
  836. *
  837. * @@todo handle post vars
  838. */
  839. function doRequest( $url, $post_vars = array(), $get_fresh=false, $try_count=1 ) {
  840. $dbr = wfGetDB( DB_SLAVE );
  841. $dbw = wfGetDB( DB_MASTER );
  842. // check the cache
  843. // $sql = "SELECT * FROM `metavid`.`cache_time_url_text` WHERE `url` LIKE '$url'";
  844. // select( $table, $vars, $conds='', $fname = 'Database::select', $options = array() )
  845. $res = $dbr->select( 'mv_url_cache', '*', array( 'url' => $url ), 'MV_BaseScraper::doRequest' );
  846. // @@todo check date for expiration
  847. if ( $res->numRows() == 0 || $get_fresh) {
  848. echo "do web request: " . $url . "\n";
  849. // get the content:
  850. $page = file_get_contents( $url );
  851. if ( $page === false ) {
  852. echo( "error getting url retrying (".$try_count." of $this->number_of_tries)" );
  853. sleep( 5 );
  854. if($try_count >= $this->number_of_tries){
  855. print "could not get url after $this->number_of_tries \n\n";
  856. return false;
  857. }
  858. $try_count++;
  859. return $this->doRequest( $url, $post_vars, $get_fresh, $try_count );
  860. }
  861. if ( $page != '' ) {
  862. // insert back into the db:
  863. // function insert( $table, $a, $fname = 'Database::insert', $options = array() )
  864. $dbw->insert( 'mv_url_cache', array( 'url' => $url, 'result' => $page, 'req_time' => time() ) );
  865. return $page;
  866. }
  867. } else {
  868. $row = $dbr->fetchObject( $res );
  869. return $row->result;
  870. }
  871. }
  872. }