PageRenderTime 41ms CodeModel.GetById 15ms RepoModel.GetById 1ms app.codeStats 0ms

/extensions/DataTransclusion/ImportMAB2.php

https://github.com/ChuguluGames/mediawiki-svn
PHP | 390 lines | 313 code | 74 blank | 3 comment | 74 complexity | a4dfddf7d5e1d1e59512d5b8a6fda37f MD5 | raw file
  1. <?php
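/**
 * Maintenance script that reads records from MAB files (a single file, a
 * directory of files, or stdin) and stores them in the blob and index
 * tables used by a database-backed DataTransclusion source.
 */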
  4. if ( getenv( 'MW_INSTALL_PATH' ) ) {
  5. $IP = getenv( 'MW_INSTALL_PATH' );
  6. } else {
  7. $IP = dirname( __FILE__ ) . '/../..';
  8. if ( !file_exists( "$IP/LocalSettings.php" ) ) {
  9. $IP = dirname( __FILE__ ) . '/../../phase3';
  10. }
  11. }
  12. require_once( "$IP/maintenance/Maintenance.php" );
  13. class ImportMAB2 extends Maintenance {
  14. public function __construct( ) {
  15. parent::__construct();
  16. $this->addArg( "name", "name of a transclusion data source, as specified in \$wgDataTransclusionSources", true );
  17. $this->addArg( "file/dir", "directory containing MAB files, or a single MAB file, or - for stdin", true );
  18. $this->addArg( "blob_table", "database table for data blobs, without prefix", true );
  19. $this->addArg( "index_table", "database table for index entries, without prefix", true );
  20. $this->addOption( "create", "create database tables if they do not exist", false, false );
  21. $this->addOption( "truncate", "truncate (empty) database tables", false, false );
  22. $this->addOption( "prefix", "database table prefix. May contain a period (\".\") to reference tables in another database. If not given, the wiki's table prefix will be used", false, true );
  23. $this->addOption( "recursive", "recurse into subdirectories while importing MAB files", false, false );
  24. $this->addOption( "noblob", "don't write blob data, import index fields only", false, false );
  25. $this->addOption( "limit", "max number of files to process", false, true );
  26. $this->addOption( "debug", "don't write to the database, dump to console instead", false, false );
  27. $this->addOption( "multi-record", "read multiple records from a single file. Records may be separated by special lines matching --record-separator; if --record-separator is not given, all records are expected to start with filed number 001.", false, false );
  28. $this->addOption( "record-separator", "regular expression for lines separating records in a multi-record file. Implies --multi-record", false, true );
  29. $this->addOption( "id-list-field", "id field to compare against the id list.", false, true );
  30. $this->addOption( "id-list-file", "list of ids to import.", false, true );
  31. }
  32. public function createTables( ) {
  33. $db = wfGetDB( DB_MASTER );
  34. $this->output( "creating blob table {$this->blob_table}\n" );
  35. $sql = "CREATE TABLE IF NOT EXISTS " . $this->blob_table . " ( ";
  36. $sql .= " id INT(12) NOT NULL AUTO_INCREMENT, ";
  37. $sql .= " data BLOB NOT NULL, ";
  38. $sql .= " PRIMARY KEY (id) ";
  39. $sql .= ") ";
  40. $db->query( $sql, __METHOD__ );
  41. $this->output( "creating index table {$this->index_table}\n" );
  42. $sql = "CREATE TABLE IF NOT EXISTS " . $this->index_table . " ( ";
  43. $sql .= " field VARCHAR(255) NOT NULL, "; #FIXME: varchar vs varbinary!
  44. $sql .= " value VARCHAR(255) NOT NULL, "; #FIXME: varchar vs varbinary!
  45. $sql .= " data_id INT(12) NOT NULL, ";
  46. $sql .= " PRIMARY KEY (field, value, data_id) "; #NOTE: we don't require (field,value) to be unique!
  47. $sql .= ") ";
  48. $db->query( $sql, __METHOD__ );
  49. }
  50. public function truncateTables( ) {
  51. $db = wfGetDB( DB_MASTER );
  52. $this->output( "truncating blob table {$this->blob_table}\n" );
  53. $sql = "TRUNCATE TABLE " . $this->blob_table;
  54. $db->query( $sql, __METHOD__ );
  55. $this->output( "truncating index table {$this->index_table}\n" );
  56. $sql = "TRUNCATE TABLE " . $this->index_table;
  57. $db->query( $sql, __METHOD__ );
  58. }
  59. public function execute() {
  60. global $wgDataTransclusionSources;
  61. $this->debug = $this->hasOption( 'debug' );
  62. $this->noblob = $this->hasOption( 'noblob' );
  63. $recursive = $this->hasOption( 'recursive' );
  64. $limit = (int)$this->getOption( 'limit' );
  65. $this->recordSeparator = $this->getOption( 'record-separator' );
  66. $this->multiRecord = $this->recordSeparator || $this->hasOption( 'multi-record' );
  67. $this->idListField = $this->getOption( 'id-list-field' );
  68. $this->idListFile = $this->getOption( 'id-list-file' );
  69. $this->idList = null;
  70. $src = $this->mArgs[0];
  71. $dir = $this->mArgs[1];
  72. if ( !isset( $wgDataTransclusionSources[ $src ] ) ) {
  73. throw new MWException( "unknown transclusion data source '$src', not found in \$wgDataTransclusionSources" );
  74. }
  75. $this->output( "using settings for data transclusion source \"$src\".\n" );
  76. $this->source = DataTransclusionHandler::getDataSource( $src );
  77. if ( !( $this->source instanceof DBDataTransclusionSource ) ) {
  78. throw new MWException( "bad data source '$src': not compatible with DBDataTransclusionSource" );
  79. }
  80. $this->blob_table = $this->mArgs[2];
  81. $this->index_table = $this->mArgs[3];
  82. if ( $this->hasOption( 'prefix' ) ) {
  83. $prefix = $this->getOption( "prefix" );
  84. $this->blob_table = $prefix . $this->blob_table;
  85. $this->index_table = $prefix . $this->index_table;
  86. } else {
  87. $db = wfGetDB( DB_MASTER ); # we'll need the master anyway later
  88. $this->blob_table = $db->tableName( $this->blob_table );
  89. $this->index_table = $db->tableName( $this->index_table );
  90. }
  91. if ( !$this->debug ) {
  92. $this->output( "using tables {$this->blob_table} and {$this->index_table}.\n" );
  93. if ( $this->hasOption('create') ) {
  94. $this->output( "creating tables if neccessary.\n" );
  95. $this->createTables( $this->blob_table, $this->index_table );
  96. }
  97. if ( $this->hasOption('truncate') ) {
  98. $this->output( "truncating tables.\n" );
  99. $this->truncateTables( $this->blob_table, $this->index_table );
  100. }
  101. }
  102. $this->id_map = array();
  103. foreach ( $this->source->keyFields as $key ) {
  104. $this->id_map[ $key ] = MAB2RecordTransformer::getMABFields( $key );
  105. if ( !$this->id_map[ $key ] ) {
  106. $this->error( "unknown key field '$key', no MAB fields mapped.\n" );
  107. }
  108. }
  109. if ( $this->idListFile ) {
  110. $this->output( "loading id list from {$this->idListFile}.\n" );
  111. $this->idList = $this->loadList( $this->idListFile, $this->idListField );
  112. if ( $this->idList === false ) {
  113. $this->error( "failed to load id list from {$this->idListFile}.\n" );
  114. return;
  115. }
  116. }
  117. if ( $this->idList && $this->idListField ) {
  118. $this->output( "filtering by {$this->idListField} from {$this->idListFile}.\n" );
  119. }
  120. $dir = "php://stdin";
  121. if ( is_dir( $dir ) ) {
  122. $this->importDir( $dir, $recursive, $limit );
  123. } else {
  124. $this->importMabFile( $dir );
  125. }
  126. }
  127. public function importDir( $dir, $recursive = false, $limit = 0 ) {
  128. $dir = "$dir/";
  129. $this->output( "scanning directory $dir\n" );
  130. $d = opendir( $dir );
  131. if ( !$d ) {
  132. $this->error( "unable to open directory $dir!\n" );
  133. return false;
  134. }
  135. while( ( $file = readdir( $d ) ) ) {
  136. if ( $file == "." or $file == ".." ) {
  137. continue;
  138. }
  139. if ( is_dir( $dir . $file ) && $recursive ) {
  140. $this->importDir( $dir . $file, $recursive, $limit );
  141. continue;
  142. } elseif ( !is_file( $dir . $file ) ) {
  143. $this->output( "not a file: $dir/$file\n" );
  144. continue;
  145. }
  146. $ok = $this->importMabFile( $dir . $file );
  147. if ( !$ok ) {
  148. $this->output( "error processing $file\n" );
  149. }
  150. if ( $limit > 0 ) {
  151. $limit -= 1;
  152. if ( $limit <= 0 ) break;
  153. }
  154. }
  155. closedir( $d );
  156. }
  157. public function getIds( $rec ) {
  158. $ids = array();
  159. foreach ( $this->id_map as $field => $items ) {
  160. if ( !$items ) continue;
  161. foreach ( $items as $item ) {
  162. if ( isset( $rec[ $item ] ) ) {
  163. if ( !isset( $ids[ $field ] ) ) {
  164. $ids[ $field ] = array();
  165. }
  166. if ( is_array( $rec[ $item ] ) ) {
  167. foreach( $rec[ $item ] as $k => $v ) {
  168. $v = $this->source->normalize( $field, $v );
  169. $v = $this->source->convert( $field, $v );
  170. $ids[ $field ][] = $v;
  171. }
  172. } else {
  173. $v = $rec[ $item ];
  174. $v = $this->source->normalize( $field, $v );
  175. $v = $this->source->convert( $field, $v );
  176. $ids[ $field ][] = $v;
  177. }
  178. }
  179. }
  180. }
  181. return $ids;
  182. }
  183. public function storeRecord( $rec, $ids ) {
  184. $db = wfGetDB( DB_MASTER );
  185. $insert = array( 'data' => serialize($rec) );
  186. if ( $this->noblob ) {
  187. $id = 0;
  188. } else {
  189. $db->insert( $this->blob_table, $insert, __METHOD__ );
  190. $id = $db->insertId();
  191. }
  192. $insert = array();
  193. foreach ( $ids as $field => $values ) {
  194. foreach ( $values as $v ) {
  195. $insert[] = array(
  196. 'field' => $field,
  197. 'value' => $v,
  198. 'data_id' => $id );
  199. }
  200. }
  201. $db->insert( $this->index_table, $insert, __METHOD__, array( 'IGNORE' ) );
  202. }
  203. public function loadList( $file, $field = null ) {
  204. $f = fopen( $file, 'r' );
  205. if ( !$f ) return false;
  206. $list = array();
  207. while ( true ) {
  208. $s = fgets( $f );
  209. if ( $s === "" || $s === false ) {
  210. break;
  211. }
  212. $s = trim( $s );
  213. if ( $field ) $s = $this->source->normalize( $field, $s );
  214. $list[] = $s;
  215. }
  216. return $list;
  217. }
  218. public function importMabFile( $file ) {
  219. $f = fopen( $file, 'r' );
  220. if ( !$f ) return false;
  221. if ( $this->debug ) {
  222. print "== $file =======================\n";
  223. } elseif ( $this->multiRecord ) {
  224. $this->output( "reading records from $file\n" );
  225. }
  226. $eof = false;
  227. $pushed = false;
  228. while( !$eof ) {
  229. $rec = array();
  230. while( !$eof ) {
  231. if ( $pushed ) {
  232. $s = $pushed;
  233. $pushed = false;
  234. } else {
  235. $s = fgets( $f );
  236. }
  237. if ( $s === "" || $s === false ) {
  238. $eof = true;
  239. break;
  240. }
  241. if ( $rec && $this->recordSeparator && preg_match( $this->recordSeparator, $s ) ) {
  242. break; // next record
  243. }
  244. if ( preg_match( '/^(\d+[a-z]?)\s*([a-z])?=(.*$)/', $s, $m ) ) {
  245. $k = $m[1];
  246. $t = $m[2];
  247. $v = $m[3];
  248. if ( $rec && ($this->multiRecord && !$this->recordSeparator) && $k === "001" ) {
  249. $pushed = $s;
  250. # we expect 0001 to be the first thing in every record!
  251. break; // next record
  252. }
  253. if ( isset( $rec[$k] ) ) {
  254. if ( !is_array( $rec[$k] ) ) {
  255. $rec[$k] = array( $rec[$k] );
  256. }
  257. $rec[$k][] = $v;
  258. } else {
  259. $rec[$k] = $v;
  260. }
  261. }
  262. }
  263. if ( $rec ) {
  264. $ids = $this->getIds($rec);
  265. if ( !$ids ) {
  266. $this->output( "skipping part of file $file\n" );
  267. if ( $this->debug ) {
  268. var_export( $rec );
  269. print "------------------------------------\n";
  270. }
  271. continue;
  272. }
  273. $id = false;
  274. foreach ( $this->source->keyFields as $idf ) {
  275. if ( !empty( $ids[ $idf ] ) ) {
  276. $id = "$idf:" . $ids[$idf][0];
  277. }
  278. }
  279. if ( $this->idList && $this->idListField ) {
  280. $found = false;
  281. if ( !empty( $ids[ $this->idListField ] ) ) {
  282. foreach ( $ids[ $this->idListField ] as $v ) {
  283. if ( in_array( $v, $this->idList ) ) {
  284. $found = true;
  285. break;
  286. }
  287. }
  288. }
  289. if ( !$found ) {
  290. $this->output( "ignoring record #$id\n" );
  291. continue;
  292. }
  293. }
  294. if ( $this->debug ) {
  295. var_export( $ids );
  296. if ( !$this->noblob ) var_export( $rec );
  297. print "------------------------------------\n";
  298. } else {
  299. $this->output( "importing record $id\n" );
  300. $this->storeRecord($rec, $ids);
  301. }
  302. }
  303. }
  304. fclose( $f );
  305. return $rec;
  306. }
  307. }
  308. $maintClass = "ImportMAB2";
  309. require_once( DO_MAINTENANCE );