PageRenderTime 1701ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/extensions/MetavidWiki/maintenance/metavid2mvWiki.inc.php

https://github.com/ChuguluGames/mediawiki-svn
PHP | 1424 lines | 1112 code | 60 blank | 252 comment | 244 complexity | 3ecf14ee362fbe56ae6150007bb2f585 MD5 | raw file
  1. <?php
  2. /*
  3. * metavid2mvWiki.inc.php Created on Jan 19, 2008
  4. *
  5. * All Metavid Wiki code is Released under the GPL2
  6. * for more info visit http://metavid.org/wiki/Code
  7. *
  8. * @author Michael Dale
  9. * @email dale@ucsc.edu
  10. * @url http://metavid.org
  11. */
  12. /*
  13. * Templates:
  14. */
  15. require_once ( '../../../maintenance/commandLine.inc' );
  16. // $i=0;
  /**
   * Copy start time and duration from a legacy metavid stream row onto the
   * matching wiki stream, save it, and print a one-line summary.
   *
   * @param object $old_stream legacy row; reads ->name, ->adj_start_time, ->adj_end_time
   */
  function do_stream_attr_check( $old_stream ) {
      global $i;

      $wikiStream = & mvGetMVStream( array( 'name' => $old_stream->name ) );

      // Values the wiki stream is expected to carry after the sync.
      $expectedStart = $old_stream->adj_start_time;
      $expectedDuration = ( $old_stream->adj_end_time - $old_stream->adj_start_time );

      // Only touch a property when it actually differs.
      if ( $wikiStream->date_start_time != $expectedStart ) {
          $wikiStream->date_start_time = $expectedStart;
      }
      if ( $wikiStream->duration != $expectedDuration ) {
          $wikiStream->duration = $expectedDuration;
      }

      $wikiStream->updateStreamDB();

      $summary = "$old_stream->name update: duration:" . seconds2npt( $wikiStream->duration );
      $summary .= ' startDay:' . date( 'm-d-y', $wikiStream->date_start_time );
      print $summary . "\n";
  }
  /**
   * Load every row of the mv_streams table, keyed by stream id.
   * Terminates the script when the table is empty.
   *
   * @return array stream id => row object
   */
  function get_all_mv_streams() {
      $db = wfGetDB( DB_READ );
      $res = $db->select( 'mv_streams', '*', '', __METHOD__ );

      if ( $db->numRows( $res ) == 0 ) {
          die( "do_stream_file_check: no streams found" );
      }

      $byId = array();
      for ( $row = $db->fetchObject( $res ); $row; $row = $db->fetchObject( $res ) ) {
          $byId[$row->id] = $row;
      }
      return $byId;
  }
  /**
   * Remove data left behind by streams that no longer exist in mv_streams.
   *
   * Collects every stream_id referenced from mv_stream_files and
   * mv_stream_images that has no row in mv_streams. For each one it
   * double-checks via MV_Stream::doesStreamExist(), deletes the stream's
   * image directory, and calls MV_Stream::deleteDB().
   */
  function do_remove_orphaned_streams() {
      $dbr = wfGetDB( DB_READ );
      $all_valid_streams = get_all_mv_streams();
      $orphaned_streams = array();

      // Both tables are scanned the same way (could be done with a join).
      foreach ( array( 'mv_stream_files', 'mv_stream_images' ) as $table ) {
          $result = $dbr->select( $table,
              'stream_id',
              '',
              __METHOD__,
              array( 'GROUP BY' => 'stream_id' )
          );
          while ( $row = $dbr->fetchObject( $result ) ) {
              if ( !isset( $all_valid_streams[ $row->stream_id ] ) ) {
                  $orphaned_streams[ $row->stream_id ] = 1;
              }
          }
      }

      foreach ( array_keys( $orphaned_streams ) as $stream_id ) {
          $mvStream = new MV_Stream( array( 'id' => $stream_id ) );
          // double check stream does not exist:
          if ( $mvStream->doesStreamExist() ) {
              continue;
          }
          print "stream id: {$stream_id} does not exist in stream table (remove)\n";

          // remove files in the stream directory:
          $filedir = '/video/metavid/mvprime_stream_images/' .
              MV_StreamImage::getRelativeImagePath( $stream_id );
          if ( is_dir( $filedir ) ) {
              // Quote the path so unexpected characters cannot change what
              // the recursive delete operates on.
              $cmd = 'rm -rf ' . escapeshellarg( $filedir );
              print "removing image run#: $cmd \n";
              shell_exec( $cmd );
          }

          $mvStream->deleteDB();
      }
      // TODO: streams present in mv_streams but lacking a wiki page are not handled here.
  }
  /**
   * Verify the stored start date of every stream and correct it if needed.
   *
   * The expected start time comes from the stream's .srt file (third line,
   * "starttime <unix ts>") when that file exists and holds a plausible value
   * that differs from the stored one. Otherwise it is 9am on the
   * month-day-year date embedded in the stream name.
   */
  function do_stream_date_check() {
      $streams = get_all_mv_streams();
      $dbw = wfGetDB( DB_WRITE );

      foreach ( $streams as $stream ) {
          if ( !preg_match( "/([0-9]+-[0-9]+-[0-9]+)/", $stream->name, $matches ) ) {
              print "no date found in {$stream->name}\n";
              continue;
          }
          $sdate = $force_update = false;

          // check for srt file:
          $srt_file = '/video/metavid/raw_mpeg2/' . $stream->name . '.srt';
          if ( is_file( $srt_file ) ) {
              $srt_ary = file( $srt_file );
              // A short or unreadable file has no third line to parse.
              if ( is_array( $srt_ary ) && isset( $srt_ary[2] ) ) {
                  $time = intval( trim( str_replace( 'starttime', '', $srt_ary[2] ) ) );
                  // ignore bad .srt values (two-digit year of 08 or earlier)
                  if ( intval( date( 'y', $time ) ) > 8 && $stream->date_start_time != $time ) {
                      $sdate = $time;
                      $force_update = true;
                      print "force srt update:: ";
                  }
              }
          }

          // no date from srt: make it start at 9am on the date from the name
          if ( !$sdate ) {
              $sd = explode( '-', $matches[1] );
              $sdate = mktime( 9, 0, 0, (int)$sd[0], (int)$sd[1], intval( '20' . $sd[2] ) );
          }

          // Compare month, day and year; comparing only day and year would
          // miss a stream stored under the wrong month.
          $stored_day = date( 'm-d-y', $stream->date_start_time );
          if ( $stored_day != date( 'm-d-y', $sdate ) || $force_update ) {
              $sql = "UPDATE `mv_streams` SET `date_start_time`= '" . intval( $sdate ) . "' " .
                  " WHERE `id`=" . intval( $stream->id ) . " LIMIT 1 ";
              $dbw->query( $sql );
              print "$stream->name date updated\n";
          } else {
              print "$stream->name date is ok\n";
          }
      }
  }
  /**
   * Rebuild the mv_stream_files rows for one stream, or for all streams.
   *
   * Probes each configured archive path for the low quality ogg, high
   * quality ogg and flash variants of the stream. If at least one is found,
   * the existing rows for those three variants are deleted and the found
   * ones re-inserted. Streams with no files at all are only reported.
   *
   * @param object|bool $old_stream stream row to check, or false for every stream
   */
  function do_stream_file_check( $old_stream=false ) {
      global $mvgIP, $mvVideoArchivePaths;

      $targets = ( $old_stream == false ) ? get_all_mv_streams() : Array( $old_stream );

      // message key => file suffix for the variants served directly
      $oggVariants = array(
          'mv_ogg_low_quality' => '.ogg',
          'mv_ogg_high_quality' => '.HQ.ogg',
      );
      // message key => warning prefix printed when the variant is absent
      $missingNotes = array(
          'mv_ogg_low_quality' => "Missing lowQ ogg for: ",
          'mv_ogg_high_quality' => "Missing highQ ogg for: ",
          'mv_flash_low_quality' => "Missing flash for: ",
      );

      foreach ( $targets as $stream ) {
          $mvStream = & mvGetMVStream( array(
              'name' => $stream->name,
              'duration' => $stream->duration
          ) );
          $mvStream->getFileList();

          // @@todo have multiple file locations for same file?
          $found = array();
          foreach ( $mvVideoArchivePaths as $archiveBase ) {
              foreach ( $oggVariants as $msgKey => $suffix ) {
                  $candidate = $archiveBase . $stream->name . $suffix;
                  if ( url_exists( $candidate ) ) {
                      $found[$msgKey] = $candidate;
                  }
              }
              if ( url_exists( $archiveBase . $stream->name . '.flv' ) ) {
                  // flash is served through mvFlvServer.php beside the media dir
                  $serverBase = str_replace('/media/','', $archiveBase);
                  $found['mv_flash_low_quality'] = $serverBase . '/mvFlvServer.php/'. $stream->name . '.flv';
              }
          }

          if ( count( $found ) == 0 ) {
              // no files present (remove stream)
              print 'no files present should remove: ' . $stream->name . "\n";
              continue;
          }

          $dbw = wfGetDB( DB_WRITE );
          $dbw->query(
              "DELETE FROM `mv_stream_files` WHERE `stream_id`=" . $mvStream->id . " AND " .
              "(`file_desc_msg`='mv_ogg_high_quality' " .
              " OR `file_desc_msg`='mv_ogg_low_quality' " .
              " OR `file_desc_msg`='mv_flash_low_quality')"
          );

          foreach ( $missingNotes as $msgKey => $note ) {
              if ( !isset( $found[$msgKey] ) ) {
                  print $note . $stream->name . "\n";
              }
          }

          // update files:
          foreach ( $found as $msgKey => $fileUrl ) {
              do_insert_stream_file( $mvStream, $fileUrl, $msgKey );
          }
      }
  }
  /**
   * Insert one row into mv_stream_files for the given stream and file.
   *
   * The stream's own duration is used when it is non-zero. Otherwise the
   * duration is read from the first line of "<path>.nfo", taking the text
   * after "n:", dropping one leading zero and any sub-second part, and
   * converting it with npt2seconds().
   *
   * @param object $mvStream stream object; uses ->getDuration() and ->id
   * @param string $path URL of the media file
   * @param string $quality_msg file description message key, e.g. 'mv_ogg_low_quality'
   */
  function do_insert_stream_file( $mvStream, $path, $quality_msg ) {
      $dbw = wfGetDB( DB_WRITE );
      $dur = $mvStream->getDuration();

      if ( $dur == 0 ) {
          // get file duration from nfo file:
          $nfo_url = $path . '.nfo';
          if ( url_exists( $nfo_url ) ) {
              $nfo_txt = @file( $nfo_url );
              if ( $nfo_txt === false ) {
                  echo "missing nfo file: $nfo_url \n";
              } elseif ( !isset( $nfo_txt[0] ) ) {
                  echo "empty nfo file: $nfo_url \n";
              } else {
                  $parts = explode( 'n:', $nfo_txt[0] );
                  $len = isset( $parts[1] ) ? trim( $parts[1] ) : '';
                  if ( $len === '' ) {
                      // First line lacks the "n:<time>" field; keep duration 0.
                      echo "malformed nfo file: $nfo_url \n";
                  } else {
                      // trim leading zero
                      if ( $len[0] == '0' ) {
                          $len = substr( $len, 1 );
                      }
                      // trim sub frame times:
                      $dot = strpos( $len, '.' );
                      if ( $dot !== false ) {
                          $len = substr( $len, 0, $dot );
                      }
                      $dur = npt2seconds( $len );
                  }
              }
          }
      }

      // Let the database layer quote the values rather than interpolating
      // the path and message key into raw SQL.
      $dbw->insert( 'mv_stream_files', array(
          'stream_id' => $mvStream->id,
          'file_desc_msg' => $quality_msg,
          'path' => $path,
          'duration' => $dur
      ), __METHOD__ );
  }
  243. // @@todo convert to MV_EditStream
  /**
   * Create a new 'metavid_file' stream in the wiki through the stream CRUD
   * special page, using the generated semantic description as page text.
   *
   * @param object $mvTitle title object; uses ->getStreamName()
   * @param object $stream legacy stream row, passed on to mv_semantic_stream_desc()
   */
  function do_add_stream( & $mvTitle, & $stream ) {
      $creator = new MV_SpecialCRUDStream( 'add' );

      $streamName = $mvTitle->getStreamName();
      $creator->stream_name = $streamName;
      $creator->stream_type = 'metavid_file';

      $description = mv_semantic_stream_desc( $mvTitle, $stream );
      $creator->stream_desc = $description;

      // add the stream:
      $creator->add_stream();
  }
  /**
   * Import streams from the legacy `metavid` database into the wiki.
   *
   * Modes:
   *  - 'all'                 every stream whose sync_status is 'in_sync'
   *  - 'files'               every stream whose `trascoded` column is not 'none'
   *  - 'all_in_wiki'         every stream already in the wiki's mv_streams table
   *  - 'all_sync_past_date'  in-sync streams starting after the m/d/Y date
   *                          read from $args[$options['date']]
   *  - anything else         streams whose name is LIKE $stream_name
   *
   * Each selected stream gets its wiki page created or updated. Unless the
   * matching skip option is set, images, files, transcript text and speech
   * annotations are then processed.
   *
   * @param string $mode selection mode (see above)
   * @param string $stream_name name pattern used by the fallback mode
   */
  function do_stream_insert( $mode, $stream_name = '' ) {
      global $MVStreams, $options, $args, $wgDBname;
      $dbr = wfGetDB( DB_SLAVE );

      if ( $mode == 'all' ) {
          $sql = "SELECT * FROM `metavid`.`streams` WHERE `sync_status`='in_sync'";
      } elseif ( $mode == 'files' ) {
          $sql = "SELECT * FROM `metavid`.`streams` WHERE `trascoded` != 'none'";
      } elseif ( $mode == 'all_in_wiki' ) {
          $sql = "SELECT `metavid`.`streams`.* FROM `$wgDBname`.`mv_streams` LEFT JOIN `metavid`.`streams` ON (`$wgDBname`.`mv_streams`.`name` = `metavid`.`streams`.`name`) ";
      } elseif ( $mode == 'all_sync_past_date' ) {
          print "doing all after: " . $args[$options['date']] . "\n";
          list( $month, $day, $year ) = explode( '/', $args[$options['date']] );
          $date_time = mktime( 0, 0, 0, $month, $day, $year );
          $sql = "SELECT * FROM `metavid`.`streams` WHERE `sync_status`= 'in_sync' AND `adj_start_time` > $date_time";
      } else {
          // addQuotes() escapes and quotes the command-line supplied name;
          // LIKE wildcards in the name keep working as before.
          $sql = "SELECT * FROM `metavid`.`streams` WHERE `name` LIKE " . $dbr->addQuotes( $stream_name );
      }

      $res = $dbr->query( $sql );
      if ( $dbr->numRows( $res ) == 0 ) {
          die( 'could not find stream: ' . $stream_name . "\n" );
      }

      // load all stream rows:
      $streams = array();
      while ( $row = $dbr->fetchObject( $res ) ) {
          $streams[] = $row;
      }
      print "working on " . count( $streams ) . ' streams' . "\n";

      // The force flag does not change between streams.
      $force = isset( $options['force'] );

      foreach ( $streams as $stream ) {
          print "on stream $stream->name \n";
          // init the stream
          $MVStreams[$stream->name] = new MV_Stream( $stream );

          // check if the stream has already been added to the wiki (if not add it)
          $mvTitle = new MV_Title( 'Stream:' . $stream->name );
          if ( !$mvTitle->doesStreamExist() ) {
              do_add_stream( $mvTitle, $stream );
              echo "stream " . $mvTitle->getStreamName() . " added \n";
          } else {
              do_update_wiki_page( $stream->name, mv_semantic_stream_desc( $mvTitle, $stream ), MV_NS_STREAM, $force );
          }

          if ( $mode != 'all_in_wiki' ) {
              // add duration and start_time attr
              do_stream_attr_check( $stream );
          }
          // do insert/copy all media images
          if ( !isset( $options['skipimage'] ) ) {
              do_process_images( $stream, $force );
              print "done with images\n";
          }
          if ( !isset( $options['skipfiles'] ) ) {
              // check for files (make sure they match the metavid db values)
              do_stream_file_check( $stream );
          }
          if ( !isset( $options['skiptext'] ) ) {
              // process all stream text:
              do_process_text( $stream, $force );
          }
          if ( !isset( $options['skipSpeechMeta'] ) ) {
              // add an annotation track for continuous speeches
              do_annotate_speeches( $stream, $force );
          }
      }
  }
  /**
   * Add "Speech by" annotation pages for continuous speech by one person.
   *
   * With $force set, existing 'Anno_en' pages whose last edit was made by
   * the bot user are deleted first. Then the 'Ht_en' transcript rows are
   * walked in order. Consecutive rows with the same Spoken_By value are
   * merged into a run, and when the speaker changes a run longer than 60
   * seconds gets an annotation page, unless the range already contains a
   * 'Speech_by' annotation.
   *
   * NOTE(review): the run of the final speaker is never written, because
   * annotations are only emitted on a speaker change — confirm this is intended.
   *
   * @param object $stream legacy stream row; uses ->name
   * @param bool $force remove bot-authored annotations before inserting
   */
  function do_annotate_speeches( $stream, $force ) {
      print "do annotations for $stream->name \n";

      if ( $force ) {
          global $botUserName;
          // get wiki stream id:
          $wikiStream = new MV_Stream( array( 'name' => $stream->name ) );
          // first remove all bot edited pages:
          $mvd_rows = MV_Index::getMVDInRange( $wikiStream->getStreamId(), null, null, 'Anno_en' );
          foreach ( $mvd_rows as $row ) {
              $title = Title::newFromText( $row->wiki_title, MV_NS_MVD );
              $current = $title ? Revision::newFromTitle( $title ) : null;
              if ( !$current ) {
                  // Index row without a readable page: nothing to delete.
                  print "skiped $row->wiki_title (no current revision found)\n";
                  continue;
              }
              if ( $current->getUserText() == $botUserName ) {
                  $article = new Article( $title );
                  $article->doDelete( 'mvbot removal' );
                  print "removed $row->wiki_title \n";
              } else {
                  print "skiped $row->wiki_title (last edit by: " . $current->getUserText() . ")\n";
              }
          }
      }

      $mvStream = MV_Stream::newStreamByName( $stream->name );
      if ( !$mvStream->doesStreamExist() ) {
          return;
      }

      // get all transcript pages in range (up to 10k)
      $mvd_rows = MV_Index::getMVDInRange( $mvStream->getStreamId(), null, null, 'Ht_en', false, 'Spoken_By', array( 'LIMIT' => 10000 ) );
      if ( count( $mvd_rows ) == 0 ) {
          print "no annotations added 0 mvd transcript pages found\n";
          return;
      }
      print "looking at " . count( $mvd_rows ) . " text rows\n";

      $prev_person = '';
      $prev_st = $prev_et = 0;
      foreach ( $mvd_rows as $mvd ) {
          if ( !$mvd->Spoken_By ) {
              continue;
          }
          if ( $prev_person == '' ) {
              // init case: first row that has a speaker
              $prev_person = $mvd->Spoken_By;
              $prev_st = $mvd->start_time;
              $prev_et = $mvd->end_time;
              continue;
          }
          if ( $prev_person == $mvd->Spoken_By ) {
              // same speaker: extend the current run
              $prev_et = $mvd->end_time;
              continue;
          }

          // different person: annotate the finished run if more than 1 min long
          if ( $prev_et - $prev_st > 60 ) {
              $doSpeechUpdate = true;
              print "insert annotation $prev_person: " . seconds2npt( $prev_st ) . " to " . seconds2npt( $prev_et ) . " \n";
              // check for an existing speech-by annotation in range; shrink the
              // range by 1s on each side so matches landing on the edges are ignored
              $mvd_anno_rows = MV_Index::getMVDInRange( $mvStream->getStreamId(), $prev_st + 1, $prev_et - 1, 'Anno_en', false, 'Speech_by' );
              foreach ( $mvd_anno_rows as $row ) {
                  if ( $row->Speech_by ) {
                      print ".. range already has: $row->Speech_by skip\n";
                      $doSpeechUpdate = false;
                      break;
                  }
              }
              if ( $doSpeechUpdate ) {
                  $page_txt = '[[Speech by:=' . str_replace( '_', ' ', $prev_person ) . ']]';
                  $annoTitle = Title::makeTitle( MV_NS_MVD, 'Anno_en:' . $mvStream->getStreamName() . '/' . seconds2npt( $prev_st ) . '/' . seconds2npt( $prev_et ) );
                  do_update_wiki_page( $annoTitle, $page_txt );
              }
          }

          // start a new run for the new speaker; the end time must be reset
          // too, or the run would keep the previous speaker's end time
          $prev_person = $mvd->Spoken_By;
          $prev_st = $mvd->start_time;
          $prev_et = $mvd->end_time;
      }
      print "\n\ndone with annotation inserts got to " . seconds2npt( $prev_et ) . ' of ' . seconds2npt( $mvStream->getDuration() ) . "\n";
  }
  /**
   * Import the legacy caption text of a stream as 'Ht_en' transcript pages.
   *
   * With $force set, existing 'Ht_en' pages whose last edit was made by the
   * bot user are deleted first. Each caption row then becomes one wiki page
   * titled "<stream>/<start>/<end>". The page runs until the next caption,
   * capped at 40 seconds, and is prefixed with a [[Spoken By::...]] property
   * when a person record lies close enough in time.
   *
   * @param object $stream legacy stream row; uses ->id, ->name, ->adj_start_time, ->adj_end_time
   * @param bool $force remove bot-authored transcript pages before inserting
   */
  function do_process_text( $stream, $force ) {
      $dbr = wfGetDB( DB_SLAVE );

      if ( $force ) {
          global $botUserName;
          // get wiki stream id:
          $wikiStream = new MV_Stream( array( 'name' => $stream->name ) );
          // first remove all bot edited pages:
          // NOTE(review): do_annotate_speeches() iterates the result of the same
          // call with foreach; confirm which return type getMVDInRange() has.
          $mvd_res = MV_Index::getMVDInRange( $wikiStream->getStreamId(), null, null, 'Ht_en' );
          while ( $row = $dbr->fetchObject( $mvd_res ) ) {
              $title = Title::newFromText( $row->wiki_title, MV_NS_MVD );
              $current = $title ? Revision::newFromTitle( $title ) : null;
              if ( !$current ) {
                  // Index row without a readable page: nothing to delete.
                  print "skiped $row->wiki_title (no current revision found)\n";
                  continue;
              }
              if ( $current->getUserText() == $botUserName ) {
                  $article = new Article( $title );
                  $article->doDelete( 'mvbot removal' );
                  print "removed $row->wiki_title \n";
              } else {
                  print "skiped $row->wiki_title (last edit by: " . $current->getUserText() . ")\n";
              }
          }
      }

      /* for now use the stream search table (in the future should put in our orphaned person data)
       * should be able to do quick checks against the index. */
      $sql = "SELECT (`time`+" . CC_OFFSET . ") as time, `value` " .
          "FROM `metavid`.`stream_attr_time_text`
          WHERE `stream_fk`=" . $stream->id . "
          AND `time` >= " . $stream->adj_start_time . "
          AND `time` <= " . $stream->adj_end_time . "
          ORDER BY `time` ASC ";
      $page_res = $dbr->query( $sql );
      if ( $dbr->numRows( $page_res ) == 0 ) {
          echo 'No pages for stream' . $stream->name . "\n";
      }
      $pages = array();
      while ( $page = $dbr->fetchObject( $page_res ) ) {
          $pages[] = $page;
      }
      $page_count = count( $pages );
      print "Checking " . $page_count . " text pages\n";

      foreach ( $pages as $inx => $page ) {
          // status update every 50 pages
          if ( $inx > 0 && $inx % 50 == 0 ) {
              print "on $inx of " . $page_count . "\n";
          }

          // Clamp on the numeric offset. Comparing the formatted NPT string
          // with 0 never matched, and a string start time would break the
          // arithmetic below.
          $start_time = $page->time - $stream->adj_start_time;
          if ( $start_time < 0 ) {
              $start_time = 0;
          }

          // the page ends where the next caption starts (or at stream end) ...
          if ( ( $inx + 1 ) == $page_count ) {
              $end_time = $stream->adj_end_time - $stream->adj_start_time;
          } else {
              $end_time = $pages[$inx + 1]->time - $stream->adj_start_time;
          }
          // ... but never lasts more than 40 seconds
          if ( ( $end_time - $start_time ) > 40 ) {
              $end_time = $start_time + 40;
          }
          // skip pages that end before the stream starts
          if ( $end_time < 0 ) {
              continue;
          }

          // now pull up the person for the given stream time
          $sql = "SELECT * , abs( `metavid`.`people_attr_stream_time`.`time` -{$page->time} ) AS `distance` " .
              "FROM `metavid`.`people_attr_stream_time` " .
              "LEFT JOIN `metavid`.`people` ON `metavid`.`people_attr_stream_time`.`people_fk` = `metavid`.`people`.`id` " .
              "WHERE `metavid`.`people_attr_stream_time`.`stream_fk` ={$stream->id} " .
              // have a negative threshold of 4 seconds
              "AND (`metavid`.`people_attr_stream_time`.`time`-{$page->time})>-4 " .
              // have a total distance threshold of 90 seconds
              "AND abs( `metavid`.`people_attr_stream_time`.`time` -{$page->time} )< 90 " .
              "ORDER BY `distance` ASC " .
              "LIMIT 1 ";
          $person_res = $dbr->query( $sql );

          $page_title = $stream->name . '/' . seconds2npt( $start_time ) . '/' . seconds2npt( $end_time );
          $page_body = '';
          if ( $dbr->numRows( $person_res ) != 0 ) {
              $person = $dbr->fetchObject( $person_res );
              $person_name = utf8_encode( $person->name_clean );
              $page_body .= "\n[[Spoken By::{$person_name}]] ";
          }
          $page_body .= trim( str_replace( "\n", ' ', strtolower( $page->value ) ) );
          do_update_wiki_page( 'Ht_en:' . $page_title, $page_body, MV_NS_MVD );
      }
  }
  /**
   * Copy every archived image of a stream into the local image directory
   * and register it in mv_stream_images.
   *
   * Images are keyed by their time relative to the stream's adjusted start.
   * An image with both a DB row and a local file is skipped. One with a DB
   * row but no file is only re-downloaded. One without a DB row is inserted
   * and then downloaded if the file is not already on disk.
   *
   * @param object $stream legacy stream row; uses ->id, ->name, ->adj_start_time
   * @param bool $force delete the stream's existing image files and rows first
   */
  function do_process_images( $stream, $force = false ) {
      global $MVStreams, $wgDBname;
      $dbr = wfGetDB( DB_SLAVE );
      $dbw = wfGetDB( DB_MASTER );

      // get all images for the current stream:
      $sql = "SELECT * FROM `metavid`.`image_archive`
          WHERE `stream_fk`= {$stream->id}";
      $image_res = $dbr->query( $sql );
      $img_count = $dbr->numRows( $image_res );
      print "Found " . $img_count . " images for stream " . $stream->name . "\n";

      // The stream id and image directory are the same for every image.
      $mv_stream_id = $MVStreams[$stream->name]->getStreamId();
      $local_img_dir = MV_StreamImage::getLocalImageDir( $mv_stream_id );

      // if force we can clear out existing images:
      if ( $force ) {
          print "force update flag (remove all existing images)\n";
          $res = $dbr->query( "SELECT * FROM `$wgDBname`.`mv_stream_images` WHERE `stream_id`={$mv_stream_id}" );
          while ( $row = $dbr->fetchObject( $res ) ) {
              // Same "<time>*.jpg" pattern as before, expanded in PHP instead
              // of being handed to a shell.
              $stale = glob( $local_img_dir . '/' . $row->time . '*.jpg' );
              if ( is_array( $stale ) ) {
                  foreach ( $stale as $stale_file ) {
                      if ( is_file( $stale_file ) ) {
                          unlink( $stale_file );
                      }
                  }
              }
          }
          // remove db entries:
          $dbw->query( "DELETE FROM `$wgDBname`.`mv_stream_images` WHERE `stream_id`={$mv_stream_id}" );
      }

      // grab from metavid and copy to local directory structure:
      $seen = 0;
      while ( $row = $dbr->fetchObject( $image_res ) ) {
          $relative_time = $row->time - $stream->adj_start_time;
          $metavid_img_url = 'http://metavid.org/image_media/' . $row->id . '.jpg';
          $local_img_file = $local_img_dir . '/' . $relative_time . '.jpg';

          // status update every 10 images; the URL is computed above so the
          // line reports the current image, not the previous one
          if ( $seen > 0 && $seen % 10 == 0 ) {
              print "On image $seen of $img_count time: " . seconds2npt( $relative_time ) . " $metavid_img_url\n";
          }
          $seen++;

          // check if the image already exists in the new table
          $sql = "SELECT * FROM `$wgDBname`.`mv_stream_images` " .
              "WHERE `stream_id`={$mv_stream_id} " .
              "AND `time`=$relative_time";
          $already_indexed = ( $dbr->numRows( $dbr->query( $sql ) ) != 0 );

          if ( $already_indexed && is_file( $local_img_file ) ) {
              // row and file both present: nothing to do
              continue;
          }
          if ( !$already_indexed ) {
              $dbw->insert( 'mv_stream_images', array(
                  'stream_id' => $mv_stream_id,
                  'time' => $relative_time
              ) );
          }
          if ( is_file( $local_img_file ) ) {
              echo "skipped $local_img_file \n";
              continue;
          }
          if ( !copy( $metavid_img_url, $local_img_file ) ) {
              echo "failed to copy $metavid_img_url to $local_img_file...\n";
          }
      }
  }
  /**
   * Build the wiki text of a stream page from the legacy stream data.
   *
   * The text has an "Official Record" link section (only for names that
   * contain 'house' or 'senate'), a "More Media Sources" section, the
   * stream_duration and original_date semantic properties, and a sync status
   * category. The category is added only when the wiki page does not exist yet.
   *
   * Side effect: when $stream->org_start_time is set it is copied into
   * $stream->date_start_time on the caller's object.
   *
   * @param object $mvTitle title object; uses ->getStreamName() and ->getDuration()
   * @param object $stream stream row; uses ->name, ->date_start_time, ->sync_status
   * @return string wiki text
   */
  function mv_semantic_stream_desc( & $mvTitle, & $stream ) {
      $out = '';
      // (if we have an old version of the stream, copy over its properties)
      if ( isset( $stream->org_start_time ) ) {
          $stream->date_start_time = $stream->org_start_time;
      }
      $start_time = $stream->date_start_time;

      // add links/generic text at the start
      $date = date( 'Ymd', $start_time );
      $cspan_date = date( 'Y-m-d', $start_time );

      // 'senate' is tested last, so it wins when a name contains both words
      $ch_type = '';
      if ( strpos( $mvTitle->getStreamName(), 'house' ) !== false ) {
          $ch_type = 'h';
      }
      if ( strpos( $mvTitle->getStreamName(), 'senate' ) !== false ) {
          $ch_type = 's';
      }

      if ( $ch_type != '' ) {
          $thomas_url = '*[http://thomas.loc.gov/cgi-bin/query/B?r110:@FIELD(FLD003+' . $ch_type . ')+@FIELD(DDATE+' . $date . ')';
          $out .= '==Official Record==' . "\n";
          $out .= '*[http://www.govtrack.us/congress/recordindex.xpd?date=' . $date .
              '&where=' . $ch_type .
              ' GovTrack Congressional Record]' . "\n\n";
          $out .= $thomas_url . ' THOMAS Congressional Record]' . "\n\n";
          // NOTE(review): same URL as the line above, only the label differs — confirm.
          $out .= $thomas_url . ' THOMAS Extension of Remarks]' . "\n\n";
      }

      $out .= '==More Media Sources==' . "\n";
      // all streams have a Congressional Chronicle entry:
      $out .= '*[http://www.c-spanarchives.org/congress/?q=node/69850&date=' . $cspan_date . '&hors=' . $ch_type .
          ' CSPAN\'s Congressional Chronicle]' . "\n";
      // just do a forced link to the archive.org details page
      $out .= '*[http://www.archive.org/details/mv_' . $stream->name .
          ' Archive.org hosted version]' . "\n";

      // more semantic properties
      $out .= "\n\n";
      $out .= '[[stream_duration::' . ( $mvTitle->getDuration() ) . '| ]]' . "\n";
      if ( $stream->date_start_time ) {
          $out .= '[[original_date::' . $stream->date_start_time . '| ]]';
      }

      // add stream category based on sync status
      // (only add if the wiki page does not exist)
      $wStreamTitle = Title::newFromText( $stream->name, MV_NS_STREAM );
      if ( !$wStreamTitle->exists() ) {
          $sync_categories = array(
              'not_checked' => '[[Category:Stream Unchecked]]',
              'impossible' => '[[Category:Stream Out of Sync]]',
              'in_sync' => '[[Category:Stream Basic Sync]]',
          );
          $status = $stream->sync_status;
          if ( is_string( $status ) && isset( $sync_categories[$status] ) ) {
              $out .= "\n\n" . $sync_categories[$status];
          }
      }
      return $out;
  }
  /**
   * Normalise the text of every MVD page of streams starting in 2009 or later.
   *
   * For each page the "Spoken By" and "Speech by" properties are rewritten
   * to the "::" form with underscores in the value replaced by spaces, or
   * removed when the value is 'Unknown'. Runs of whitespace are collapsed to
   * one space. Index rows whose page does not exist are reported as orphaned.
   */
  function mvd_consistancy_check() {
      $dbr = wfGetDB( DB_READ );
      // get all streams from 2009 on:
      $result = $dbr->select( 'mv_streams',
          '*',
          'date_start_time >= ' . mktime( 0, 0, 0, 1, 1, 2009 ),
          __METHOD__
      );
      if ( $dbr->numRows( $result ) == 0 ) {
          die( "no streams found" . "\n" . $dbr->lastQuery() . "\n" );
      }

      // property labels as they are written back into the page
      $properties = array( 'Spoken By', 'Speech by' );

      while ( $stream = $dbr->fetchObject( $result ) ) {
          // get all the mvds for this stream
          $mvd_res = $dbr->select( 'mv_mvd_index', '*', array( 'stream_id' => $stream->id ) );
          while ( $mvd = $dbr->fetchObject( $mvd_res ) ) {
              // make sure the article exists; newFromText() yields null for
              // an invalid title, which is treated like a missing page
              $mvdTitle = Title::newFromText( $mvd->wiki_title, MV_NS_MVD );
              if ( !$mvdTitle || !$mvdTitle->exists() ) {
                  print "orphaned mvd: {$mvd->wiki_title} (should remove) \n";
                  continue;
              }

              $mvdArticle = new Article( $mvdTitle );
              $text = $mvdArticle->getRawText();

              foreach ( $properties as $property ) {
                  // matches [[<property>:<any char><value>]], case-insensitively
                  $pattern = '/\[\[' . $property . '(\:.)([^\]]*)]]/i';
                  preg_match( $pattern, $text, $matches );
                  if ( isset( $matches[2] ) ) {
                      $replacement = ( $matches[2] == 'Unknown' ) ? '' :
                          '[[' . $property . '::' . str_replace( '_', ' ', $matches[2] ) . ']]';
                      $text = preg_replace( $pattern, $replacement, $text );
                  }
              }

              // collapse all whitespace runs to one space
              $text = preg_replace( '/[\s]+/', ' ', $text );
              do_update_wiki_page( $mvdTitle, trim( $text ), '', true );
          }
      }
  }
  /**
   * Download the govtrack bill index for the 111th congress and push every
   * bill through MV_BillScraper::processBill().
   *
   * @param string $bill_key only echoed in the progress message
   */
  function do_bill_insert( $bill_key ) {
      include_once( 'scrape_and_insert.inc.php' );
      $mvScrape = new MV_BaseScraper();
      $myBillScraper = new MV_BillScraper();
      $congressNum = 111;
      print "do_bill_insert:: $bill_key downloading fresh bills.index.xml....\n ";

      // grab bill list with categories from govtrack
      $indexUrl = 'http://www.govtrack.us/data/us/'.$congressNum.'/bills.index.xml';
      $indexXml = $mvScrape->doRequest( $indexUrl, array(), true );

      // pull the attribute string out of every <bill ...> tag
      preg_match_all( "/<bill\s([^>]*)\>/U", $indexXml, $nodes );
      print "found " . count($nodes[1]) . " bills \n";

      $wantedAttrs = array('type', 'number', 'title', 'official-title', 'status', 'last-action');
      $billAry = array();
      foreach ( $nodes[1] as $attrString ) {
          // parse key="value" pairs, keeping only the attributes of interest
          $bill = array();
          preg_match_all( '/([^=]*)="([^"]*)"/', $attrString, $pairs );
          foreach ( $pairs[1] as $pos => $rawName ) {
              if ( !in_array( trim($rawName), $wantedAttrs ) ) {
                  continue;
              }
              $bill[ trim($rawName) ] = $pairs[2][$pos];
          }

          // derive the ids used by the external sites:
          $bill['GovTrackID'] = $bill['type'] . $congressNum . '-' . $bill['number'];
          $bill['ThomasID'] = 'd'.$congressNum.':'.$bill['type'].$bill['number'].':';
          $bill['OpenCongressBillID'] =$congressNum.'-'.$bill['type'].$bill['number'];
          $bill['CongressSession'] = $congressNum;
          $titleParts = explode(':', $bill['title']);
          $bill['Bill Key'] = $titleParts[0];

          $maplightBillId = get_map_light_bill_id( $bill );
          if ( $maplightBillId !== false ) {
              $bill['MapLightBillID'] = $maplightBillId;
              // with a maplight key we can also get the interest info:
              $bill['interests'] = $myBillScraper->proccMapLightBillIntrests($maplightBillId);
          } else {
              print "Could not find maplight id for bill: " . $bill['type'] . '+' . $bill['number'] . "\n";
              $bill['MapLightBillID'] = false;
          }
          $billAry[] = $bill;

          // process the bill (insert into the wiki)
          print "ProccessBill::";
          $myBillScraper->processBill(
              $bill['GovTrackID'],
              $bill['Bill Key'],
              $bill['OpenCongressBillID'],
              $bill['MapLightBillID'],
              false,
              false
          );
      }
  }
  764. function get_map_light_bill_id($bObj){
  765. include_once( 'scrape_and_insert.inc.php' );
  766. $mvScrape = new MV_BaseScraper();
  767. $rawBillPage = $mvScrape->doRequest('http://maplight.org/map/us/bill/search/' . $bObj['type'] . '+' . $bObj['number'] ) ;
  768. //get the basic zone:
  769. $sb = strpos($rawBillPage, '<h3>Bills numbered');
  770. if($sb===false){
  771. return false;
  772. }
  773. $se = strpos($rawBillPage, '<h3>Bills matching', $sb);
  774. if($se === false){
  775. return false;
  776. }
  777. $target_search_area = substr($rawBillPage, $sb, $se-$sb );
  778. //get the matchign area
  779. preg_match_all('/href=\"([^"]*)"[^(]*\(([^t]*)/', $target_search_area, $matches);
  780. foreach($matches[2] as $inx => $val){
  781. if($val == $bObj['CongressSession'] ){
  782. //remove the unused parts of the url
  783. return str_replace('/map/us/bill/', '', $matches[1][$inx]);
  784. }
  785. }
  786. return false;
  787. }
  788. function do_people_insert( $doInterestLookup = false, $forcePerson = '', $force = false ) {
  789. global $valid_attributes, $states_ary;
  790. $dbr = wfGetDB( DB_SLAVE );
  791. include_once( 'scrape_and_insert.inc.php' );
  792. $mvScrape = new MV_BaseScraper();
  793. //get all people from govtrack db ( should not have to do this all the time)
  794. $govtrackDB = array();
  795. //avoid duplicating the $govtrackDB array:
  796. getGovTrackPeopleDB( $govtrackDB );
  797. //get all people from the congress people category
  798. $result = $dbr->select( 'categorylinks', 'cl_sortkey', array (
  799. 'cl_to' => 'Congress_Person'
  800. )
  801. );
  802. if ( $dbr->numRows( $result ) == 0 )
  803. die( 'could not find people: ' . "\n" );
  804. $out = '';
  805. $person_ary = array();
  806. while ( $person = $dbr->fetchObject( $result ) ) {
  807. $person_ary[] = $person;
  808. }
  809. foreach ( $person_ary as $person) {
  810. $person_name = $person->cl_sortkey;
  811. //get person data from wiki:
  812. $person_title = Title::newFromText( $person_name );
  813. $smwStore =& smwfGetStore();
  814. //check for govtrack key in page
  815. $propTitle = Title::newFromText('GovTrack Person ID', SMW_NS_PROPERTY );
  816. $smwProps = $smwStore->getPropertyValues( $person_title, $propTitle );
  817. if ( count( $smwProps ) != 0 ) {
  818. $v = current( $smwProps );
  819. $person->gov_track_id = $v->getXSDValue();
  820. }else{
  821. print "person: $person_name has no GovTrack Person ID make sure to include this on their page\n";
  822. }
  823. if( isset($person->gov_track_id) ){
  824. setGovTrackSpecifcAttr($person, $govtrackDB[ $person->gov_track_id ]);
  825. }else{
  826. //check for govtrack key in $govtrackDB:
  827. foreach( $govtrackDB as $gov_track_person){
  828. if( isset($gov_track_person['metavidid']) && $gov_track_person['metavidid'] == str_replace(' ', '_',$person_name) ){
  829. setGovTrackSpecifcAttr($person, $gov_track_person);
  830. }
  831. }
  832. reset($govtrackDB);
  833. //did not find metavid id try name test:
  834. if( !isset($person->govtrack_id )){
  835. foreach( $govtrackDB as $gov_track_person){
  836. if(isset($gov_track_person['middlename'])){
  837. $gov_name = $gov_track_person['firstname'] .' '.
  838. substr($gov_track_person['middlename'],0,1) . '. ' .
  839. $gov_track_person['lastname'];
  840. //first check for exact match:
  841. if( strtolower($gov_name) == strtolower($person_name) ){
  842. setGovTrackSpecifcAttr($person, $gov_track_person);
  843. break;
  844. }
  845. }
  846. //else first last check:
  847. $nparts = split(' ', $person_name);
  848. if( strtolower( $gov_track_person['firstname']) == strtolower($nparts[0]) &&
  849. strtolower( $gov_track_person['lastname']) == strtolower( $nparts[count($nparts)-1] ) ){
  850. setGovTrackSpecifcAttr($person, $gov_track_person);
  851. break;
  852. }
  853. }
  854. }
  855. if(!isset($person->gov_track_id)){
  856. die("\n could not find gov track id for $person_name please add manually or remove from Congress_Person category\n ");
  857. }
  858. }
  859. //set the maplight key (not in sunlight api)
  860. $propTitle = Title::newFromText( 'MAPLight Person ID', SMW_NS_PROPERTY );
  861. $smwProps = $smwStore->getPropertyValues( $person_title, $propTitle );
  862. if ( count( $smwProps ) != 0 ) {
  863. $v = current( $smwProps );
  864. $mapk = $v->getXSDValue();
  865. $person->maplight_id = $v->getXSDValue();
  866. }else{
  867. print "person: $person_name has no MAPLight Person ID could not lookup with sunlight api?\n";
  868. }
  869. //set $person->name_ocr
  870. $propTitle = Title::newFromText( 'Name OCR', SMW_NS_PROPERTY );
  871. $smwProps = $smwStore->getPropertyValues( $person_title, $propTitle );
  872. if ( count( $smwProps ) != 0 ) {
  873. $v = current( $smwProps );
  874. $person->name_ocr = $v->getXSDValue();
  875. }
  876. $page_body = '{{Congress Person|' . "\n";
  877. foreach ( $valid_attributes as $dbKey => $attr ) {
  878. list ( $name, $desc ) = $attr;
  879. if( $dbKey == 'gov_track_id'){
  880. //we key all to govtrack id make sure its there:
  881. $page_body.="GovTrack Person ID=".$person->gov_track_id . "|\n";
  882. }elseif ( $dbKey == 'total_received' ) {
  883. if ( !$mapk ) {
  884. print 'no mapkey for total_received' . "\n";
  885. } else {
  886. $raw_results = $mvScrape->doRequest( 'http://www.maplight.org/map/us/legislator/' . $mapk );
  887. preg_match( '/Contributions\sReceived\:\s\$([^<]*)/', $raw_results, $matches );
  888. if ( isset( $matches[1] ) ) {
  889. $page_body .= "{$name}=\$" . $matches[1] . "|\n";
  890. }
  891. }
  892. } elseif($dbKey == 'roles'){
  893. if ( $person->$dbKey ) {
  894. $i=1;
  895. foreach($person->$dbKey as $role){
  896. $page_body.="Role $i Type=" . ucfirst($role['type'])."|\n";
  897. $page_body.="Role $i Party=" . $role['party']. "|\n";
  898. $page_body.="Role $i State=" . $role['state']. "|\n";
  899. $page_body.="Role $i Start Date=" . $role['startdate']."|\n";
  900. $page_body.="Role $i End Date=" . $role['enddate'] . "|\n";
  901. $i++;
  902. }
  903. }
  904. } elseif($dbKey == 'committee'){
  905. if ( isset($person->$dbKey) ) {
  906. $i = 1;
  907. foreach($person->$dbKey as $committee){
  908. if(isset($committee ['committee']))
  909. $page_body.="Committee $i= ".$committee ['committee'] ."|\n";
  910. if(isset($committee['subcommittee']))
  911. $page_body.="Subcommittee $i= ".$committee ['subcommittee'] ."|\n";
  912. if(isset($committee['role']))
  913. $page_body.="Committee Role $i= ".$committee ['role'] ."|\n";
  914. $i++;
  915. }
  916. }
  917. } elseif ( $dbKey == 'contribution_date_range' ) {
  918. if ( !$mapk ) {
  919. print 'out of order attr process missing mapk' . "\n";
  920. } else {
  921. $raw_results = $mvScrape->doRequest( 'http://www.maplight.org/map/us/legislator/' . $mapk );
  922. preg_match( '/Showing\scontributions<\/dt><dd>([^<]*)</', $raw_results, $matches );
  923. if ( isset( $matches[1] ) ) {
  924. $page_body .= "{$name}=" . $matches[1] . "|\n";
  925. }
  926. }
  927. } elseif ( $dbKey == 'maplight_id' ) {
  928. if ( !$person->$dbKey ) {
  929. // print 'do_maplight_id'."\n";
  930. // try to grab the maplight id
  931. $person_lookup = $govtrackDB[ $person->gov_track_id ];
  932. $raw_results = $mvScrape->doRequest( 'http://maplight.org/map/us/legislator/search/' . $person_lookup->lastname . '+' . $person->firstname );
  933. preg_match_all( '/map\/us\/legislator\/([^"]*)">(.*)<\/a>.*<td>([^<]*)<.*<td>([^<]*)<.*<td>([^<]*)<.*<td>([^<]*)</U', $raw_results, $matches );
  934. // do point system for match
  935. $point = array();
  936. $title_lookup = array( 'Rep.' => 'House', 'Sen.' => 'Senate' );
  937. if ( isset( $matches['2'][0] ) ) {
  938. foreach ( $matches['2'] as $k => $name_html ) {
  939. if ( !isset( $point[$k] ) )$point[$k] = 0;
  940. list( $lname, $fname ) = explode( ',', trim( strip_tags( $name_html ) ) );
  941. if ( strtolower( $person->first ) == strtolower( $fname ) )$point[$k] += 2;
  942. if ( strtolower( $person->last ) == strtolower( $lname ) )$point[$k] += 2;
  943. if ( $person_lookup['state'] == $matches['3'][$k] )$point[$k]++;
  944. if ( $person_lookup['district'] == $matches['4'][$k] )$point[$k]++;
  945. if ( $person_lookup['party'] == $matches['5'][$k] )$point[$k]++;
  946. if(isset($person_lookup['title'])){
  947. if ( isset( $title_lookup[ $person['title'] ]) ) {
  948. if ( $title_lookup[ $person['title'] ] == $matches['6'] )$point[$k]++;
  949. }
  950. }
  951. }
  952. $max = 0;
  953. $mapk = null;
  954. //print_r($matches);
  955. //die;
  956. foreach ( $point as $k => $v ) {
  957. if ( $v > $max ) {
  958. $mapk = $matches[1][$k];
  959. $max = $v;
  960. }
  961. }
  962. }
  963. } else {
  964. $mapk = $person->$dbKey;
  965. }
  966. $page_body .= "{$name}=" . $mapk . "|\n";
  967. } else {
  968. //try the $sulightData array
  969. if(isset($sulightData[ $dbKey ])){
  970. $page_body.= $name . '=' . $sulightData[ $dbKey ]."| \n";
  971. }else{
  972. if( isset($person->$dbKey) ){
  973. if ( trim( $person->$dbKey ) != '' ) {
  974. if ( $dbKey == 'state' ) $person->state = $states_ary[$person->state];
  975. $page_body .= "{$name}={$person->$dbKey}| \n";
  976. }
  977. }
  978. }
  979. }
  980. }
  981. // if we have the maplight key add in all contributions and process contributers
  982. if ( !$mapk ) {
  983. print 'missing mapkey' . "\n";
  984. } else {
  985. $raw_results = $mvScrape->doRequest( 'http://www.maplight.org/map/us/legislator/' . $mapk );
  986. preg_match_all( '/\/map\/us\/interest\/([^"]*)">([^<]*)<.*\$([^\<]*)</U', $raw_results, $matches );
  987. if ( isset( $matches[1] ) ) {
  988. foreach ( $matches[1] as $k => $val ) {
  989. $hr_inx = $k + 1;
  990. $page_body .= "Funding Interest $hr_inx=" . html_entity_decode( $matches[2][$k] ) . "|\n";
  991. $page_body .= "Funding Amount $hr_inx=\$" . $matches[3][$k] . "|\n";
  992. if ( $doInterestLookup ) {
  993. // make sure the intrest has been processed:
  994. do_proc_interest( $matches[1][$k], html_entity_decode( $matches[2][$k] ) );
  995. }
  996. // do_proc_interest('G1100','Chambers of commerce');
  997. }
  998. }
  999. }
  1000. // add in the full name attribute:
  1001. /*$page_body .= "Full Name=" . $person->title . ' ' . $person->first .
  1002. ' ' . $person->middle . ' ' . $person->last . "| \n";*/
  1003. //close:
  1004. $page_body .= '}}';
  1005. // add in basic info to be overwitten by transclude (from
  1006. /*$full_name = $person->title . ' ' . $person->first .
  1007. ' ' . $person->middle . ' ' . $person->last;
  1008. if ( trim( $full_name ) == '' )
  1009. $full_name = $person->name_clean;
  1010. $page_body .= "\n" . 'Person page For <b>' . $full_name . "</b><br />\n";*/
  1011. // "Text Spoken By [[Special:MediaSearch/person/{$person->name_clean}|$full_name]] ";
  1012. do_update_wiki_page( $person_title, $page_body, '', $force );
  1013. //die('only run on first person'."\n");
  1014. }
  1015. foreach ( $person_ary as $person ) {
  1016. $person_lookup = $govtrackDB[ $person->gov_track_id ];
  1017. // download/upload all the photos:
  1018. $imgTitle = Title :: makeTitle( NS_IMAGE, $person->cl_sortkey . '.jpg' );
  1019. // if(!$imgTitle->exists()){
  1020. global $wgTmpDirectory;
  1021. $url = 'http://www.govtrack.us/data/photos/' . $person->gov_track_id . '-100px.jpeg';
  1022. //check if url exists:
  1023. if( !url_exists($url)){
  1024. print " no image found for: {$person->cl_sortkey}\n";
  1025. continue;
  1026. }
  1027. // print $wgTmpDirectory . "\n";
  1028. $local_file = tempnam( $wgTmpDirectory, 'WEBUPLOAD' );
  1029. // copy file:
  1030. # Check if already there existence
  1031. $image = wfLocalFile( $imgTitle );
  1032. if ( $image->exists() ) {
  1033. echo ( $imgTitle->getDBkey() . " already in the wiki\n" );
  1034. continue;
  1035. }
  1036. for ( $ct = 0; $ct < 10; $ct++ ) {
  1037. if ( !@ copy( $url, $local_file ) ) {
  1038. print ( "failed to copy $url to local_file (tring again) \n" );
  1039. } else {
  1040. print "copy success\n";
  1041. $ct = 10;
  1042. }
  1043. if ( $ct == 9 )
  1044. print 'complete failure' . "\n";
  1045. }
  1046. # Stash the file
  1047. echo ( "Saving " . $imgTitle->getDBkey() . "..." );
  1048. $image = wfLocalFile( $imgTitle );
  1049. $archive = $image->publish( $local_file );
  1050. if ( !$archive->isGood() ) {
  1051. echo ( "failed.\n" );
  1052. continue;
  1053. }
  1054. echo ( "importing..." );
  1055. $comment = 'Image file for [[' . $person->name_clean . ']]';
  1056. $license = '';
  1057. if ( $image->recordUpload( $archive, $comment, $license ) ) {
  1058. # We're done!
  1059. echo ( "done.\n" );
  1060. } else {
  1061. echo ( "failed.\n" );
  1062. }
  1063. }
  1064. }
  1065. function setGovTrackSpecifcAttr(&$person, &$gov_track_person){
  1066. $person->gov_track_id = $gov_track_person['id'];
  1067. //also set govtrack only properties:
  1068. if(isset($gov_track_person['birthday']))
  1069. $person->birthday = $gov_track_person['birthday'];
  1070. if(isset($gov_track_person['religion']))
  1071. $person->religion = $gov_track_person['religion'];
  1072. if(isset($gov_track_person['youtubeid']))
  1073. $person->youtubeid = $gov_track_person['youtubeid'];
  1074. if(isset($gov_track_person['roles']))
  1075. $person->roles = $gov_track_person['roles'];
  1076. if(isset($gov_track_person['committee']))
  1077. $person->committee = $gov_track_person['committee'];
  1078. }
  1079. //loads a big xml file
  1080. function getGovTrackPeopleDB( &$govTrackDb){
  1081. include_once( 'scrape_and_insert.inc.php' );
  1082. $mvScrape = new MV_BaseScraper();
  1083. //get the last few people.xml databases (starting with most recent)
  1084. $raw_govtrack_data = $mvScrape->doRequest('http://www.govtrack.us/data/us/111/repstats/people.xml');
  1085. govtrackXMLtoARRAY($govTrackDb, $raw_govtrack_data);
  1086. $oneElevenCount = count($govTrackDb);
  1087. print "govTrackDb: populated " . count($govTrackDb) . " from govTrack people.xml \n";
  1088. //should have a well populated $govTrackDb
  1089. }
  1090. function govtrackXMLtoARRAY(&$govTrackDb, & $xmlstring) {
  1091. //normal XML parsing is too slow: use preg match:
  1092. preg_match_all("/<person([^>]*)>(.*)<\/person>/sU",$xmlstring,$nodes);
  1093. print "found " . count($nodes[1]) . " person nodes \n";
  1094. $poKeys = array();
  1095. foreach($nodes[1] as $pokey => $persons_attr){
  1096. preg_match_all("/([a-z]*)=\'([^\']*)\'/",$persons_attr, $attr);
  1097. $cur_person = array();
  1098. foreach($attr[1] as $key=>$key_name){
  1099. $cur_person[$key_name] = $attr[2][$key];
  1100. }
  1101. if(!isset( $govTrackDb[ $cur_person['id'] ])){
  1102. $govTrackDb[ $cur_person['id'] ] =$cur_person;
  1103. }
  1104. //committee and roles:
  1105. if(isset($nodes[2][$pokey])){
  1106. $persons_child_xml = $nodes[2][$pokey];
  1107. preg_match_all("/<role([^>]*)>/", $persons_child_xml, $roles);
  1108. if( count($roles[1] != 0)){
  1109. $govTrackDb[ $cur_person['id'] ]['roles']=array();
  1110. foreach($roles[1] as $role_attr){
  1111. preg_match_all("/([a-z]*)=\'([^\']*)\'/",$role_attr, $rattr);
  1112. $cur_role = array();
  1113. foreach($rattr[1] as $key=>$key_name){
  1114. $cur_role[$key_name] = $rattr[2][$key];
  1115. }
  1116. $govTrackDb[ $cur_person['id'] ]['roles'][]=$cur_role;
  1117. }
  1118. }
  1119. preg_match_all("/<current-committee-assignment([^>]*)>/", $persons_child_xml, $committee);
  1120. if(count($committee[1])!=0){
  1121. $govTrackDb[ $cur_person['id'] ]['committee']=array();
  1122. foreach($committee[1] as $cur_committee){
  1123. preg_match_all("/([a-z]*)=\'([^\']*)\'/", $cur_committee, $cattr);
  1124. $cur_com=array();
  1125. foreach($cattr[1] as $key=>$key_name){
  1126. $cur_com[ $key_name ] = $cattr[2][$key];
  1127. }
  1128. $govTrackDb[ $cur_person['id'] ]['committee'][] = $cur_com;
  1129. }
  1130. }
  1131. }
  1132. }
  1133. }
  1134. function do_proc_interest( $intrestKey, $intrestName ) {
  1135. global $mvMaxContribPerInterest, $mvMaxForAgainstBills;
  1136. include_once( 'scrape_and_insert.inc.php' );
  1137. $mvScrape = new MV_BillScraper();
  1138. $raw_results = $mvScrape->doRequest( 'http://www.maplight.org/map/us/interest/' . $intrestKey . '/view/all' );
  1139. $page_body = '{{Interest Group|' . "\n";
  1140. $page_body .= 'MAPLight Interest ID=' . $intrestKey . "|\n";
  1141. // get all people contributions:
  1142. preg_match_all( '/\/map\/us\/legislator\/([^"]*)">.*\$([^<]*)</U', $raw_results, $matches );
  1143. if ( isset( $matches[2] ) ) {
  1144. $i = 0;
  1145. foreach ( $matches[1] as $i => $person_id ) {
  1146. $hr_inx = $i + 1;
  1147. // we have to lookup the name:
  1148. $personName = $mvScrape->get_wiki_name_from_maplightid( $person_id );
  1149. if ( $personName ) {
  1150. $page_body .= "Funded Name $hr_inx=" . $personName . "|\n";
  1151. $page_body .= "Funded Amount $hr_inx=" . str_replace( ',', '', $matches[2][$i] ) . "|\n";
  1152. }
  1153. if ( $hr_inx == $mvMaxContribPerInterest )break;
  1154. $i++;
  1155. }
  1156. }
  1157. $intrest_bills_url = 'http://maplight.org/map/us/interest/' . $intrestKey . '/bills';
  1158. $raw_results = $mvScrape->doRequest( $intrest_bills_url );
  1159. // get all bills supported or opposed
  1160. preg_match_all( '/\/map\/us\/bill\/([^"]*)".*\/map\/us\/legislator.*<td>([^<]*)</U', $raw_results, $matches );
  1161. print 'bill:'.$intrest_bills_url . "\n";
  1162. //die;
  1163. $sinx = $oinx = 1;
  1164. if ( isset( $matches[1][0] ) ) {
  1165. $support_count = $oppse_count = 0;
  1166. foreach ( $matches[1] as $i => $bill_id ) {
  1167. // skip if we are maxed out
  1168. if ( $support_count == $mvMaxForAgainstBills )continue;
  1169. if ( $oppse_count == $mvMaxForAgainstBills )continue;
  1170. $hr_inx = $i + 1;
  1171. $bill_name = $mvScrape->get_bill_name_from_mapLight_id( $bill_id );
  1172. if ( $matches[2][$i] == 'Support' ) {
  1173. $page_body .= "Supported Bill $sinx=" . str_replace( '_', ' ', $bill_name ) . "|\n";
  1174. $sinx++;
  1175. } elseif ( $matches[2][$i] == 'Oppose' ) {
  1176. $page_body .= "Opposed Bill $oinx=" . str_replace( '_', ' ', $bill_name ) . "|\n";
  1177. $oinx++;
  1178. }
  1179. }
  1180. }
  1181. $page_body .= '}}';
  1182. $wTitle = Title::makeTitle( NS_MAIN, $intrestName );
  1183. print "Interest: ";
  1184. do_update_wiki_page( $wTitle, $page_body );
  1185. print "\n";
  1186. }
  1187. function do_rm_congress_persons() {
  1188. $dbr = wfGetDB( DB_SLAVE );
  1189. $result = $dbr->query( " SELECT *
  1190. FROM `categorylinks`
  1191. WHERE `cl_to` LIKE 'Congress_Person' " );
  1192. while ( $row = $dbr->fetchObject( $result ) ) {
  1193. $pTitle = Title::makeTitle( NS_MAIN, $row->cl_sortkey );
  1194. $pArticle = new Article( $pTitle );
  1195. $pArticle->doDeleteArticle( 'removed reason' );
  1196. print "removed title: " . $pTitle->getText() . "\n";
  1197. }
  1198. }
  1199. function mv_process_attr( $table, $stream_id ) {
  1200. global $start_time, $end_time;
  1201. $dbr = wfGetDB( DB_SLAVE );
  1202. $sql = "SELECT * FROM `metavid`.`$table` WHERE `stream_fk`=$stream_id";
  1203. $res = $dbr->query( $sql );
  1204. $out = '';
  1205. while ( $var = $dbr->fetchObject( $res ) ) {
  1206. $type_title = getTypeTitle( $var->type );
  1207. if ( $var->type == 'adj_start_time' )
  1208. $start_time = $var->value;
  1209. if ( $var->type == 'adj_end_time' )
  1210. $end_time = $var->value;
  1211. if ( $type_title != '' ) {
  1212. $reltype = ( $type_title[0] == 'rel' ) ? '::' : ':=';
  1213. $out .= '[[' . $var->type . ':=' . $var->value . '| ]]' . "\n";
  1214. }
  1215. }
  1216. return $out;
  1217. }
  1218. function getTypeTitle( $type ) {
  1219. switch ( $type ) {
  1220. case 'cspan_type' :
  1221. return array (
  1222. 'rel',
  1223. 'Government Event'
  1224. );
  1225. break;
  1226. case 'cspan_title' :
  1227. return array (
  1228. 'atr',
  1229. 'C-SPAN Title'
  1230. );
  1231. break;
  1232. case 'cspan_desc' :
  1233. return array (
  1234. 'atr',
  1235. 'C-SPAN Description'
  1236. );
  1237. break;
  1238. case 'adj_start_time' :
  1239. return array (
  1240. 'atr',
  1241. 'Unix Start Time'
  1242. );
  1243. break;
  1244. case 'adj_end_time' :
  1245. return array (
  1246. 'atr',
  1247. 'Unix End Time'
  1248. );
  1249. break;
  1250. default :
  1251. return '';
  1252. break;
  1253. }
  1254. }
  1255. // valid attributes dbkey=>semantic name
  1256. $valid_attributes = array (
  1257. 'name_ocr' => array (
  1258. 'Name OCR',
  1259. 'The Name as it appears in on screen video text',
  1260. 'string'
  1261. ),
  1262. 'maplight_id' => array(
  1263. 'MAPLight Person ID',
  1264. 'MAPLight person id for linking into maplight data',
  1265. 'string'
  1266. ),
  1267. 'osid' => array (
  1268. 'CRP ID',
  1269. 'Congress Person\'s <a href="http://www.opensecrets.org/">Open Secrets</a> Id',
  1270. 'string'
  1271. ),
  1272. 'gov_track_id' => array (
  1273. 'GovTrack Person ID',
  1274. 'Congress Person\' <a href="www.govtrack.us">govtrack.us</a> person ID',
  1275. 'string'
  1276. ),
  1277. 'birthday'=>array(
  1278. 'Birthday',
  1279. 'Birthday',
  1280. 'date'
  1281. ),
  1282. 'religion'=>array(
  1283. 'Religion',
  1284. 'Religion',
  1285. 'page'
  1286. ),
  1287. 'roles'=>array(
  1288. 'Roles',
  1289. 'Roles date ranges of congress activity',
  1290. 'string'
  1291. ),
  1292. 'committee'=>array(
  1293. 'Committee',
  1294. 'committees and sub commities with roles',
  1295. 'string'
  1296. ),
  1297. 'youtubeid'=>array(
  1298. 'YouTube ID',
  1299. 'YouTube ID',
  1300. 'string'
  1301. ),
  1302. 'bioguide' => array (
  1303. 'Bioguide ID',
  1304. 'Congressional Biographical Directory id',
  1305. 'string'
  1306. ),
  1307. 'title' => array (
  1308. 'Title',
  1309. 'Title (Sen. or Rep.)',
  1310. 'string'
  1311. ),
  1312. 'state' => array (
  1313. 'State',
  1314. 'State',
  1315. 'page'
  1316. ), // do look up
  1317. 'party' => array (
  1318. 'Party',
  1319. 'The Cogress Persons Political party',
  1320. 'page'
  1321. ),
  1322. 'first' => array(
  1323. 'First Name',
  1324. '(first name)',
  1325. 'string'
  1326. ),
  1327. 'middle' => array(
  1328. 'Middle Name',
  1329. '(middle name)',
  1330. 'string'
  1331. ),
  1332. 'last' => array(
  1333. 'Last Name',
  1334. '(last name)',
  1335. 'string'
  1336. ),
  1337. 'name_suffix'=>array(
  1338. 'Name Suffix',
  1339. 'Legislator\'s suffix (Jr., III, etc.) ',
  1340. 'string'
  1341. ),
  1342. 'district' => array(
  1343. 'District',
  1344. 'The district # page ie: 3rd District',
  1345. 'page'
  1346. ),
  1347. 'url' => array(
  1348. 'Home Page',
  1349. 'The representatives home page',
  1350. 'URL'
  1351. ),
  1352. 'total_received' => array(
  1353. 'Total Received',
  1354. 'The Total Contributions Received',
  1355. 'number'
  1356. ),
  1357. 'contribution_date_range' => array(
  1358. 'Contributions Date Range',