PageRenderTime 40ms CodeModel.GetById 11ms RepoModel.GetById 1ms app.codeStats 0ms

/vivoRDFExport/vivoRDFExport.php

https://bitbucket.org/vsposato/vivo-tools
PHP | 457 lines | 204 code | 56 blank | 197 comment | 22 complexity | 746557963465b3e8857aa6566a1cbf5b MD5 | raw file
  1. #!/usr/bin/php
  2. <?php
  3. // Retrieve command line options from user
  4. $commandOptions = getopt("u:s:o:p:e");
  5. // If the user didn't send exactly 3 values show usage and exit
  6. if (count($commandOptions) < 4) {
  7. printUsage();
  8. exit;
  9. }
  10. // Initialize variables that will be used throughout the program
  11. // $outputFileName - the name of the CSV file that will be generated
  12. $outputFileName = '';
  13. $outputFileName = $commandOptions['o'];
  14. // $baseURL - this is the URL of the SPARQL endpoint including query parameter string
  15. $baseURL = '';
  16. $baseURL = $commandOptions['u'];
  17. // $uniqueDataOnly - this is to determine whether we clean duplicates out of the parameter array or not
  18. // Since it doesn't take a value - we just need to make sure if the 'e' key isset
  19. $uniqueDataOnly = false;
  20. $uniqueDataOnly = isset($commandOptions['e']) ? true : false;
  21. // $outputFormat - this is the output format that we will be receiving from the SPARQL endpoint
  22. // TODO - We currently only support XML - may consider other options
  23. $outputFormat = '&output=xml';
  24. // $baseURL - this is the URL of the SPARQL endpoint including query parameter string
  25. $parameterFile = '';
  26. $parameterFile = $commandOptions['p'];
  27. // $sparqlFileName - this is the name of the file that holds the SPARQL query
  28. $sparqlFileName = '';
  29. $sparqlFileName = $commandOptions['s'];
  30. // Retrieve the query from the provided SPARQL filename
  31. $query = readSparqlFile($sparqlFileName);
  32. // Parse the parameter file and make an array that will hold the parameters
  33. $parameterArray = readParameterFile($parameterFile);
  34. // Create a parameter data only array to hold the replacement values to be used
  35. $dataArray = removeHeaderRows($parameterArray, 2, $uniqueDataOnly);
  36. // Create a header array that will contain the variable, data type, and index
  37. $headerArray = createHeaderArray($parameterArray[0], $parameterArray[1]);
  38. // Return the results of the queries to a string to be output to a file
  39. $xmlOutput = createRDFfromQuery($headerArray, $dataArray, $query, $baseURL, $outputFormat);
  40. // Retrieve the current microtime so that we can calulate length of time it took
  41. $startTime = microtime_float();
  42. // Determine if the file exists
  43. if (file_exists($outputFileName)) {
  44. // If it does exist - attempt to rename it by adding the start timestamp to the end of the filename
  45. $newFile = $outputFileName . $startTime;
  46. if (rename($outputFileName, $newFile)) {
  47. // If the rename was successful - open the file
  48. $fileHandle = fopen($outputFileName,'x');
  49. } else {
  50. // If the rename failed - error out
  51. echo "Error - output file exists and it can't be renamed - {$outputFileName}! \n";
  52. }
  53. } elseif (! file_exists($outputFileName)) {
  54. // If the file doesn't already exist - create it and open it for writing
  55. $fileHandle = fopen($outputFileName,'x');
  56. }
  57. // Determine if the file was opened correctly
  58. if ($fileHandle) {
  59. // If the file was opened properly, then output the xml
  60. fputs($fileHandle, $xmlOutput);
  61. // Close the file handle
  62. fclose($fileHandle);
  63. } else {
  64. // If the file was not opened properly - exit out
  65. echo "ERROR - Your file could not be opened! \n";
  66. }
  67. // Capture the current time in Unix timestamp
  68. $endTime = microtime_float();
  69. // Calculate number of seconds for execution and output it
  70. $timeToComplete = $endTime - $startTime;
  71. echo "It took " . $timeToComplete . " seconds to complete this operation! \n";
  72. /**
  73. * printUsage function.
  74. *
  75. * @access public
  76. * @return void
  77. */
  78. function printUsage() {
  79. // This function is only called if the user attempts to execute without correct parameters
  80. echo "\n";
  81. echo "vivoRDFExport tool usage: \n";
  82. echo "-u The URL to your SPARQL endpoint to include the query designtation - ie 'http://sparql.vivo.ufl.edu:3030/VIVO/query?query=' \n";
  83. echo "-s The path to the file containing your SPARQL query \n";
  84. echo "-o The path to the file you want to output to \n";
  85. echo "-p The path to the parameter CSV file you want to use to fill out your query \n";
  86. echo "-e A boolean as to whether you want to remove duplicate data elements from the parameter file input \n";
  87. echo "\n";
  88. }
  89. /**
  90. * readSparqlFile function.
  91. *
  92. * @access public
  93. * @param mixed $sparqlFile
  94. * @return void
  95. */
  96. function readSparqlFile($sparqlFile) {
  97. // First check to see that the file exists
  98. if (! file_exists($sparqlFile)) {
  99. // If it doesn't exist then error out
  100. echo "SPARQL query file does not exists - $sparqlFile";
  101. return false;
  102. }
  103. try {
  104. // Open the SPARQL query file
  105. $sparqlFileHandle = fopen($sparqlFile, 'r');
  106. // Initialize a blanke SPARQL query string
  107. $sparqlQuery = '';
  108. while (! feof($sparqlFileHandle) ) {
  109. // Get a line of data from the file and append it it to the SPARQL query string
  110. $sparqlQuery .= fgets($sparqlFileHandle);
  111. }
  112. // Return the SPARQL query back to the calling function
  113. return $sparqlQuery;
  114. } catch (Exception $e) {
  115. // Something happened and we couldn't complete the SPARQL query string so display the exception and exit
  116. echo "Exception in readSparqlFile function - $e";
  117. print_r($e);
  118. exit;
  119. }
  120. }
  121. /**
  122. * readParameterFile function.
  123. *
  124. * @access public
  125. * @param mixed $parameterFile
  126. * @return void
  127. */
  128. function readParameterFile($parameterFile) {
  129. // First check to see if the file exists
  130. if (! file_exists($parameterFile)) {
  131. // If it doesn't exist then error out
  132. echo "Parameter file does not exists - $$parameterFile";
  133. return false;
  134. }
  135. try {
  136. // Open the Parameter file
  137. $parameterFileHandle = fopen($parameterFile, 'r');
  138. // Initialize a blanke SPARQL query string
  139. $parameterArray = array();
  140. while (! feof($parameterFileHandle) ) {
  141. // Get a row of data from the CSV parameter file and add it to the numeric indexed array
  142. $parameterArray[] = fgetcsv($parameterFileHandle);
  143. }
  144. // Return the SPARQL query back to the calling function
  145. return $parameterArray;
  146. } catch (Exception $e) {
  147. // Something happened and we couldn't complete the read of the parameter CSV file
  148. echo "Exception in readParameterFile function - $e";
  149. print_r($e);
  150. exit;
  151. }
  152. }
  153. /**
  154. * createRDFfromQuery function.
  155. *
  156. * @access public
  157. * @param array $parameterArray
  158. * @param string $query
  159. * @param string $baseURL
  160. * @param string $outputFormat
  161. * @return string
  162. */
  163. function createRDFfromQuery($headerArray, $dataOnly, $query, $baseURL, $outputFormat) {
  164. // Create the master XML string
  165. $resultRDF = new DOMDocument('1.0');
  166. // Initialize a counter variable
  167. $rowCounter = 0;
  168. // Loop through each of the rows of data provided
  169. foreach ($dataOnly as $row) {
  170. // Replace parameters within query with values from data array
  171. $tempQuery = parameterizeQuery($headerArray, $query, $row);
  172. // Temporary variable to hold the RDF response from SPARQL
  173. $tempRDF = performSPARQLQuery(createFullURL($baseURL, $tempQuery, $outputFormat));
  174. if ($rowCounter == 0) {
  175. // If this is the first time through (or first result) then we need to keep the entire document as it has important namespace information
  176. try {
  177. // Attempt to run the processRDF and get results back
  178. $returnedXML = processRDF($tempRDF, false);
  179. if (is_string($returnedXML)) {
  180. // If it returns a String then this is good so process away
  181. $resultRDF->loadXML($returnedXML);
  182. // Increment the rowCounter so we can keep track of how many times through
  183. $rowCounter++;
  184. }
  185. } catch (Exception $e) {
  186. // Catch any exceptions that may come through - although we need to probably make this a little more robust
  187. echo "Returned a null result from processRDF - probably didn't find a match - $e";
  188. continue;
  189. }
  190. } elseif ($rowCounter >= 1) {
  191. // If this is not our first time throught (or first result) then we need only the child node of the results
  192. try {
  193. // Attempt to run the processRDF and get results back
  194. $returnedXML = processRDF($tempRDF, true);
  195. if (is_object($returnedXML)) {
  196. // If the processRDF returned an object - then we are good to continue processing - append this child to overall document
  197. $resultRDF->documentElement->appendChild($returnedXML);
  198. // Increment the rowCounter so we can keep track of how many times through
  199. $rowCounter++;
  200. }
  201. } catch (Exception $e) {
  202. // Catch any exceptions that may come through - although we need to probably make this a little more robust
  203. echo "Returned a null result from processRDF - probably didn't find a match - $e";
  204. continue;
  205. }
  206. }
  207. }
  208. // Return the resultant DOMDocument as XML so we can save it to a file
  209. return $resultRDF->saveXML();
  210. }
  211. /**
  212. * processRDF function.
  213. *
  214. * @access public
  215. * @param xml $inputRDF
  216. * @param boolean $stripHeaders
  217. * @return DOMNode or string
  218. */
  219. function processRDF($inputRDF, $stripHeaders) {
  220. // Instantiate the XML reader
  221. $rdfResult = new XMLReader();
  222. // Load the RDF passed in to the XML reader
  223. $rdfResult->XML($inputRDF, "UTF-8");
  224. // Check to see if we are supposed to remove the header information
  225. if ($stripHeaders) {
  226. // We are removing all the beginning header information
  227. // Loop through the XML tree to find what we are looking for
  228. while ($rdfResult->read()) {
  229. // Check to see if this is an ELEMENT node as opposed to a TEXT or attribute
  230. if ($rdfResult->nodeType == XMLReader::ELEMENT) {
  231. // Check to see if it is a description node - as this will hold the guts of the data
  232. if ($rdfResult->localName === "Description") {
  233. // Return the node back to the calling function
  234. return $rdfResult->expand();
  235. }
  236. }
  237. }
  238. } elseif (! $stripHeaders) {
  239. // We are not supposed to remove headers so send the entire XML document back
  240. $rdfResult->read();
  241. return $rdfResult->readOuterXML();
  242. }
  243. }
  244. /**
  245. * createFullURL function.
  246. *
  247. * @access public
  248. * @param string $baseURL
  249. * @param string $query
  250. * @param string $outputFormat
  251. * @return string URL
  252. */
  253. function createFullURL($baseURL, $query, $outputFormat) {
  254. // URL encode the query so that we can pass it as part of the URL
  255. $query = urlencode($query);
  256. // Return the full url to the calling function
  257. return ($baseURL . $query . $outputFormat);
  258. }
  259. /**
  260. * performSPARQLQuery function.
  261. *
  262. * @access public
  263. * @param string $fullURL
  264. * @param string $query
  265. * @return xml $curlReturn
  266. */
  267. function performSPARQLQuery($fullURL) {
  268. /*
  269. * This function will take a full URL and a query to execute and
  270. * it will perform the query using CURL. It will return the output
  271. * as a string.
  272. *
  273. */
  274. // Iniitialize CURL for communication with SPARQL endpoint
  275. $curlInit = curl_init();
  276. // Set options for CURL
  277. // Set the URL that CURL will talk with to the $fullURL built earlier
  278. curl_setopt($curlInit, CURLOPT_URL, $fullURL);
  279. // Set CURL to 'return' the value to the variable so that it can be processed
  280. curl_setopt($curlInit, CURLOPT_RETURNTRANSFER, true);
  281. // Execute the CURL and pass response back to $curlReturn
  282. $curlReturn = curl_exec($curlInit);
  283. // Close out the CURL
  284. curl_close($curlInit);
  285. /* echo "<pre>";
  286. print_r($$curlReturn);
  287. echo "</pre>"; */
  288. return $curlReturn;
  289. }
  290. /**
  291. * createHeaderArray function.
  292. *
  293. * @access public
  294. * @param array $headerRow
  295. * @param array $typeRow
  296. * @return array $headerArray
  297. */
  298. function createHeaderArray($headerRow, $typeRow) {
  299. // Create an array to hold the completed header array
  300. $headerArray = array();
  301. foreach ($headerRow as $index=>$value) {
  302. // Initialize a blank array to perform our work on
  303. $tempArray = array();
  304. // The index of this column in the header will be carried over to the index key
  305. $tempArray['index'] = $index;
  306. // Get the value type from the corresponding index of the typeRow
  307. $tempArray['valueType'] = $typeRow[$index];
  308. // Get the variable value from the header row value
  309. $tempArray['key'] = $value;
  310. // Place this in the next numeric index available in the array
  311. $headerArray[] = $tempArray;
  312. }
  313. /* echo "<pre>";
  314. print_r($headerArray);
  315. echo "</pre>";*/
  316. // Return the array to the calling function
  317. return $headerArray;
  318. }
  319. /**
  320. * parameterizeQuery function.
  321. *
  322. * @access public
  323. * @param array $headerRow
  324. * @param string $query
  325. * @param array $dataRow
  326. * @return string $parameterizedQuery
  327. */
  328. function parameterizeQuery($headerRow, $query, $dataRow) {
  329. // Initialize a query that can search and replaced
  330. $parameterizedQuery = $query;
  331. // Loop through the headerRow to determine which index is which variable
  332. foreach ($headerRow as $key=>$value) {
  333. // Create the needle that will be searched - this is the parameter we set in the query
  334. $needle = "[" . $value['key'] . "]";
  335. $index = $value['index'];
  336. // Check to see what type of parameter this is supposed to be so we handle the value correctly
  337. switch ($value['valueType']) {
  338. case "string":
  339. // If this is a string value then we need to wrap it in double quotes
  340. $replacement = "\"$dataRow[$index]\"";
  341. break;
  342. case "numeric":
  343. // If this is a numeric value then we don't need to do anything special
  344. $replacement = $dataRow[$index];
  345. break;
  346. default:
  347. // If nothing else treat it as a string
  348. $replacement = "\"$dataRow[$index]\"";
  349. break;
  350. }
  351. // Run the string replace using the needle and replacement created
  352. $parameterizedQuery = str_replace($needle, $replacement, $parameterizedQuery);
  353. }
  354. /*echo "<pre>";
  355. print_r($parameterizedQuery);
  356. echo "</pre>";*/
  357. // Return the query to the calling function
  358. return $parameterizedQuery;
  359. }
  360. /**
  361. * removeHeaderRows function.
  362. *
  363. * @access public
  364. * @param array $parameterArray
  365. * @param int $numOfHeaderRows
  366. * @return array $dataOnly
  367. */
  368. function removeHeaderRows($parameterArray, $numOfHeaderRows, $uniqueDataOnly) {
  369. // Set up a new array to hold the cleaned data
  370. $dataOnly = array();
  371. // Get the count of the elements in the array -
  372. $arrayElementCount = (count($parameterArray) - 1);
  373. // Loop through the elements starting with 2 to get rid of the header rows
  374. for ($i = $numOfHeaderRows; $i < $arrayElementCount; $i++) {
  375. // Add the row to the new data only array
  376. $dataOnly[] = $parameterArray[$i];
  377. }
  378. if ($uniqueDataOnly) {
  379. // Return a unique value only array
  380. return $dataOnly;
  381. } elseif (! $uniqueDataOnly) {
  382. // Return the new data only array to the calling function
  383. return $dataOnly;
  384. }
  385. }
  386. /**
  387. * removeDuplicatesFromDataParameters function.
  388. *
  389. * @access public
  390. * @param array $dataArray
  391. * @return array
  392. */
  393. function removeDuplicatesFromDataParameters($dataArray) {
  394. // Clean out duplicate values from our array
  395. //return array_unique($dataArray, SORT_STRING);
  396. }
  397. function microtime_float(){
  398. list($usec, $sec) = explode(" ", microtime());
  399. return ((float)$usec + (float)$sec);
  400. }
  401. ?>