PageRenderTime 45ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 1ms

/csv-to-dspace-xml.php

https://github.com/ottenhoff/dspace-utils
PHP | 419 lines | 316 code | 92 blank | 11 comment | 73 complexity | ab59487eea5495b33b150e46e640297b MD5 | raw file
  1. <?php
  2. set_time_limit(0);
  3. require_once 'XML/Serializer.php';
  4. if (floatval(phpversion()) < 5.0) {
  5. require_once 'PHP/Compat.php';
  6. require_once 'PHP/Compat/Function/file_put_contents.php';
  7. }
  8. $err = "";
  9. $badFiles = array();
  10. global $err, $badFiles;
  11. $commandsToRun = array();
  12. $out = array();
  13. if( count($argv) !== 4) {
  14. die("Bad number of arguments. Please supply a CSV filename to load, a directory to place the files, and the contents directory. \n\n");
  15. }
  16. $csvFile = trim($argv[1]);
  17. $dir = trim($argv[2]);
  18. $assets = trim($argv[3]);
  19. if( !file_exists($csvFile)) {
  20. die("Bad filename.");
  21. }
  22. if( !is_dir($dir)) {
  23. die("Bad directory.");
  24. }
  25. if( !is_dir($assets)) {
  26. die("Bad assets directory.");
  27. }
  28. $handle = fopen($csvFile, "r");
  29. $columns = array();
  30. $count = 0;
  31. while (($data = fgetcsv($handle, 10000, ",")) !== FALSE) {
  32. if($count == 0) {
  33. $columns = $data; // assign the column names
  34. }
  35. else {
  36. $arr = array();
  37. $acArr = array();
  38. $cmdArr = array();
  39. $fileNames = array();
  40. $cnt = count($data);
  41. $commands = array('collection id', 'collection', 'item owner', 'file location', 'file', 'filename', 'pdf', 'identifier');
  42. for($z = 0; $z < $cnt; $z++) {
  43. // clean up the user's column name
  44. $columns[$z] = strtolower(trim(str_replace("*","",$columns[$z])));
  45. if(empty($data[$z])) {
  46. }
  47. elseif(in_array(trim(strtolower($columns[$z])), $commands)) {
  48. switch($columns[$z]) {
  49. case "item owner":
  50. $cmdArr['owner'] = $data[$z];
  51. break;
  52. case "collection id":
  53. case "collection":
  54. if (!isset($cmdArr['coll'])) { // do not overwrite previous
  55. $tmp = $data[$z];
  56. if (strpos($tmp, "|") !== FALSE) {
  57. $cmdArr['coll'] = substr ($tmp, 0, strpos ($tmp, "|"));
  58. }
  59. else {
  60. $cmdArr['coll'] = $data[$z];
  61. }
  62. }
  63. break;
  64. case "file location":
  65. case "identifier":
  66. case "filename":
  67. case "file":
  68. case "pdf":
  69. $fileNames[] = trim($data[$z]);
  70. break;
  71. default:
  72. break;
  73. }
  74. }
  75. elseif (strpos($columns[$z], "ac.") === 0) {
  76. $ac = array();
  77. if(eregi('.', $columns[$z])) { // we have a special AC element
  78. $tmpArr = explode('.', $columns[$z]);
  79. if (trim($tmpArr[0]) == 'ac') {
  80. $ac['element'] = trim($tmpArr[1]);
  81. if(isset($tmpArr[2]) && trim($tmpArr[2]) != '') {
  82. $ac['qualifier'] = trim($tmpArr[2]);
  83. }
  84. else {
  85. $ac['qualifier'] = 'none';
  86. }
  87. }
  88. else {
  89. $ac['element'] = strtolower(trim($tmpArr[0]));
  90. if(isset($tmpArr[1]) && trim($tmpArr[1]) != '') {
  91. $ac['qualifier'] = strtolower(trim($tmpArr[1]));
  92. }
  93. else {
  94. $ac['qualifier'] = 'none';
  95. }
  96. }
  97. }
  98. $the_val = trim($data[$z]);
  99. $strlen = strlen($the_val);
  100. if ( substr($the_val, ($strlen-1),1) == ";") {
  101. $the_val = substr($the_val, 0, ($strlen-1));
  102. }
  103. // check if they are trying to do multiple values separated by pipes
  104. if (strpos($the_val, "||") !== FALSE) {
  105. $tmp = explode ("||", $the_val);
  106. foreach ($tmp AS $tt) {
  107. $acArr[] = array('_content' => $the_val, '_attributes' => $ac);
  108. }
  109. }
  110. else {
  111. $acArr[] = array('_content' => $the_val, '_attributes' => $ac);
  112. }
  113. }
  114. else {
  115. $dc = array();
  116. // we have a DC qualifier
  117. if(eregi('.', $columns[$z])) {
  118. $tmpArr = explode('.', $columns[$z]);
  119. if (trim($tmpArr[0]) == 'dc') {
  120. $dc['element'] = trim($tmpArr[1]);
  121. if(isset($tmpArr[2]) && trim($tmpArr[2]) != '') {
  122. $dc['qualifier'] = trim($tmpArr[2]);
  123. }
  124. else {
  125. $dc['qualifier'] = 'none';
  126. }
  127. }
  128. else {
  129. $dc['element'] = strtolower(trim($tmpArr[0]));
  130. if(isset($tmpArr[1]) && trim($tmpArr[1]) != '') {
  131. $dc['qualifier'] = strtolower(trim($tmpArr[1]));
  132. }
  133. else {
  134. $dc['qualifier'] = 'none';
  135. }
  136. }
  137. }
  138. $the_val = trim($data[$z]);
  139. $strlen = strlen($the_val);
  140. if ( substr($the_val, ($strlen-1),1) == ";") {
  141. $the_val = substr($the_val, 0, ($strlen-1));
  142. }
  143. // skip columns labeled ignore
  144. if (empty($dc['element']) || $dc['element'] == 'ignore') {
  145. continue;
  146. }
  147. // skip empty dates
  148. if ($the_val == '0000-00-00') {
  149. continue;
  150. }
  151. // split up on pipe character
  152. if (strpos($the_val, "||") !== FALSE) {
  153. $tmp = explode ("||", $the_val);
  154. foreach ($tmp AS $tt) {
  155. $arr[] = array('_content' => $tt, '_attributes' => $dc);
  156. }
  157. }
  158. else {
  159. $arr[] = array('_content' => $the_val, '_attributes' => $dc);
  160. }
  161. }
  162. }
  163. $dc_xml = serializeRow($arr);
  164. $ac_xml = serializeRow($acArr, "ac");
  165. $destination_dir = $dir;
  166. $command_dir = $dir;
  167. if (empty($cmdArr['coll'])) {
  168. echo "skipping row because no collection defined \n\n";
  169. continue;
  170. }
  171. if (strpos($cmdArr['coll'], "/") !== FALSE) {
  172. $tmp = explode("/", $cmdArr['coll']);
  173. $tmp2 = array_pop($tmp);
  174. $destination_dir .= "/" . $tmp2;
  175. $command_dir .= "/" . $tmp2;
  176. if (!is_dir($destination_dir)) {
  177. mkdir ($destination_dir);
  178. }
  179. }
  180. elseif (!empty($cmdArr['coll'])) {
  181. $destination_dir .= "/" . $cmdArr['coll'];
  182. $command_dir .= "/" . $cmdArr['coll'];
  183. if (!is_dir($destination_dir)) {
  184. mkdir ($destination_dir);
  185. }
  186. }
  187. $destination_dir .= "/" . $count;
  188. mkdir($destination_dir);
  189. file_put_contents($destination_dir . "/dublin_core.xml", $dc_xml);
  190. if (!empty($ac_xml)) {
  191. file_put_contents ($destination_dir . "/metadata_ac.xml", $ac_xml);
  192. }
  193. // moveFiles may modify the names of the files
  194. $fileNames = moveFilesToImportDir($fileNames, $assets, $destination_dir);
  195. // now can create the contents file
  196. $contents = createContentsFile($fileNames);
  197. file_put_contents($destination_dir . "/contents", $contents);
  198. $command = createCommand($cmdArr, $command_dir);
  199. $commandsToRun[md5($command)] = $command;
  200. }
  201. $count++;
  202. }
  203. file_put_contents("error.txt", $err);
  204. $c = "";
  205. foreach ($commandsToRun AS $key => $val) {
  206. $c .= $val . "\n";
  207. }
  208. file_put_contents("commands.sh", $c);
  209. if (count($badFiles) > 0) {
  210. file_put_contents("badfiles.txt", implode("\n", $badFiles));
  211. }
  212. function moveFilesToImportDir($files, $assetDir, $dir) {
  213. global $err, $badFiles;
  214. $filesToReturn = array();
  215. foreach($files AS $filename) {
  216. if(!file_exists($assetDir . "/" . $filename)) {
  217. $filename = str_replace('\\', '/', $filename);
  218. if (strpos($filename, '/') !== FALSE) {
  219. $tt = explode('/', $filename);
  220. $filename = array_pop($tt);
  221. }
  222. // try adding pdf to end of filename
  223. if(!file_exists($assetDir . "/" . $filename)) {
  224. $filename = str_replace (' ', ' ', $filename);
  225. }
  226. if(!file_exists($assetDir . "/" . $filename)) {
  227. $filename = str_replace ('version ', 'exam version ', $filename);
  228. }
  229. if(!file_exists($assetDir . "/" . $filename)) {
  230. $filename = str_replace ('memo.pdf', 'exam memo.pdf', $filename);
  231. }
  232. if(!file_exists($assetDir . "/" . $filename)) {
  233. $filename = str_replace ('commentary.pdf', 'exam commentary.pdf', $filename);
  234. }
  235. if(!file_exists($assetDir . "/" . $filename)) {
  236. $filename = str_replace ('Creditors', "Creditor's", $filename);
  237. }
  238. if(!file_exists($assetDir . "/" . $filename)) {
  239. $filename = str_replace ('- .pdf', '- exam.pdf', $filename);
  240. }
  241. if(!file_exists($assetDir . "/" . $filename)) {
  242. $filename = str_replace ('with answers', 'exam with answers', $filename);
  243. }
  244. if(!file_exists($assetDir . "/" . $filename)) {
  245. $filename = str_replace ('2011 Fall', '2011 Fall - Memo', $filename);
  246. }
  247. // try adding pdf to end of filename
  248. if(!file_exists($assetDir . "/" . $filename)) {
  249. $filename .= '.pdf';
  250. }
  251. // final strip of .pdf.pdf
  252. if(!file_exists($assetDir . "/" . $filename)) {
  253. $filename = str_replace ('.pdf.pdf', '.pdf', $filename);
  254. }
  255. if(!file_exists($assetDir . "/" . $filename)) {
  256. $err .= "Bad file: " . $assetDir . "/" . $filename . "\n";
  257. echo "Bad file: " . $assetDir . "/" . $filename . "\n";
  258. $badFiles[] = $filename;
  259. continue;
  260. }
  261. }
  262. $targetFile = $filename;
  263. if(eregi("/", $filename)) {
  264. $parts = explode("/", $filename);
  265. $tmp = "";
  266. foreach($parts AS $part) {
  267. $targetFile = $part;
  268. }
  269. }
  270. if(!copy($assetDir . "/" . $filename, $dir . "/" . $targetFile)) {
  271. die("bad copy: " . $assetDir . "/" . $filename . " to " . $dir . "/" . $targetFile);
  272. }
  273. $filesToReturn[] = $filename;
  274. }
  275. return $filesToReturn;
  276. }
  277. function createContentsFile($files) {
  278. $justNames = array();
  279. foreach($files AS $filename) {
  280. if(eregi("/", $filename)) {
  281. $parts = explode("/", $filename);
  282. $tmp = "";
  283. foreach($parts AS $part) {
  284. $tmp = $part;
  285. }
  286. $justNames[] = $tmp;
  287. }
  288. else {
  289. $justNames[] = $filename;
  290. }
  291. }
  292. return implode("\n", $justNames);
  293. }
  294. function createCommand($arr, $dir) {
  295. if (empty($arr['owner'])) {
  296. $arr['owner'] = 'user@example.edu';
  297. }
  298. return "bin/dspace import -a -e " . $arr['owner'] . " -c " . $arr['coll'] . " -s " . $dir . " -m " . $dir . "/import.map";
  299. }
  300. function serializeRow($arr, $schema="dc") {
  301. $serializer_options = array (
  302. 'addDecl' => TRUE,
  303. 'encoding' => 'utf-8',
  304. 'indent' => "\t",
  305. 'rootName' => 'dublin_core',
  306. 'rootAttributes' => array('schema' => $schema),
  307. 'defaultTagName' => 'dcvalue',
  308. 'scalarAsAttributes' => FALSE,
  309. 'attributesArray' => '_attributes',
  310. 'contentName' => '_content',
  311. );
  312. if (empty($arr) || count($arr) < 1) {
  313. return null;
  314. }
  315. $serializer = &new XML_Serializer($serializer_options);
  316. $serializer->setOption(XML_SERIALIZER_OPTION_CDATA_SECTIONS, true);
  317. $status = $serializer->serialize($arr);
  318. // Check whether serialization worked
  319. if (PEAR::isError($status)) {
  320. die($status->getMessage());
  321. }
  322. return $serializer->getSerializedData();
  323. }