PageRenderTime 45ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 0ms

/commands/databases/PopulateDatabases.php

http://xerxes-portal.googlecode.com/
PHP | 407 lines | 227 code | 120 blank | 60 comment | 19 complexity | 7912eadf8f568625c2aea0b18313fdb6 MD5 | raw file
  1. <?php
  2. /**
  3. * Download and cache database information, type, and categories data from Metalib KB
  4. *
  5. * @author David Walker
  6. * @copyright 2008 California State University
  7. * @link http://xerxes.calstate.edu
  8. * @license http://www.gnu.org/licenses/
  9. * @version $Id: PopulateDatabases.php 1515 2010-11-25 13:55:26Z helix84@centrum.sk $
  10. * @package Xerxes
  11. * @uses Xerxes_Framework_Parser
  12. * @uses lib/xslt/marc-to-database.xsl
  13. */
  class Xerxes_Command_PopulateDatabases extends Xerxes_Command_Databases
  {
      private $configInstitute = ""; // config entry METALIB_INSTITUTE
      private $configPortal = ""; // config entry METALIB_PORTAL (defaults to the institute)
      private $configLanguages = ""; // config entry LANGUAGES
      private $configChunk = false; // config entry CHUNK_KB_PULL
      private $objSearch = null; // metasearch object
      private $category_count = 1; // to keep track of category id's across all languages

      /**
       * Pull the knowledgebase down from Metalib and store it locally
       *
       * Prunes the cache table, then, inside a single transaction, flushes the
       * KB tables and re-adds types, databases and (per configured language)
       * categories, synchs user saved databases and commits. Progress is
       * echoed as plain text.
       *
       * @return int 1 on completion
       * @throws Exception if the cache could not be pruned, or if any of
       *                   the fetch steps below fail
       */
      public function doExecute()
      {
          // in case this is being called from the web, plaintext

          if ( $this->request->isCommandLine() == false )
          {
              header("Content-type: text/plain");
          }

          // set a higher than normal memory limit to account for
          // pulling down large knowledgebases

          $configMemory = $this->registry->getConfig("HARVEST_MEMORY_LIMIT", false, "500M");
          ini_set("memory_limit", $configMemory);

          echo "\n\nMETALIB KNOWLEDGEBASE PULL \n\n";

          // get configuration settings

          $this->configInstitute = $this->registry->getConfig("METALIB_INSTITUTE", true);
          $this->configPortal = $this->registry->getConfig("METALIB_PORTAL", false, $this->configInstitute);
          $this->configLanguages = $this->registry->getConfig("LANGUAGES", false);
          $this->configChunk = $this->registry->getConfig("CHUNK_KB_PULL", false, false);

          $configMetalibAddress = $this->registry->getConfig("METALIB_ADDRESS", true);
          $configMetalibUsername = $this->registry->getConfig("METALIB_USERNAME", true);
          $configMetalibPassword = $this->registry->getConfig("METALIB_PASSWORD", true);

          // metalib search object

          $this->objSearch = new Xerxes_MetaSearch($configMetalibAddress, $configMetalibUsername, $configMetalibPassword);

          // data map

          $objData = new Xerxes_DataMap();

          // clear the cache, while we're at it

          echo " Pruning cache table . . . ";

          $status = $objData->pruneCache();

          if ( $status != 1 )
          {
              throw new Exception("could not prune cache");
          }

          echo "done\n";

          // now the real kb stuff

          $objData->beginTransaction();

          echo " Flushing KB tables . . . ";
          $objData->clearKB();
          echo "done\n";

          echo " Fetching types . . . ";

          foreach ( $this->types() as $objType )
          {
              $objData->addType($objType);
          }

          echo "done\n";

          echo " Fetching databases . . . ";

          // array of database value objects, keyed by metalib id; kept
          // around because subjects() validates category assignments against it

          $arrDatabases = $this->databases();

          foreach ( $arrDatabases as $objDatabase )
          {
              $objData->addDatabase($objDatabase);
          }

          echo "done\n";

          echo " Fetching categories and assigning databases . . . ";

          // fall back to a single english entry when no languages are configured

          $languages = array(array("code" => "eng", "locale" => "C"));

          if ( $this->configLanguages != null )
          {
              $languages = $this->configLanguages->language;
          }

          foreach ( $languages as $language )
          {
              $locale = (string) $language["locale"];
              $code = (string) $language["code"];

              // temporarily switch LC_CTYPE for this language, and put the
              // previous value back once its categories have been added

              $oldlocale = setlocale( LC_CTYPE, 0 );
              setlocale( LC_CTYPE, $locale ); // this influences the iconv() call with 'ASCII//TRANSLIT' target

              foreach ( $this->subjects($arrDatabases, $code) as $objCategory )
              {
                  $objData->addCategory($objCategory);
              }

              setlocale( LC_CTYPE, $oldlocale );
          }

          echo "done\n";

          echo " Synching user saved databases . . . ";
          $objData->synchUserDatabases();
          echo "done\n";

          echo " Committing changes . . . ";
          $objData->commit();
          echo "done\n";

          return 1;
      }

      /**
       * Fetch category and subcategory information from metalib
       *
       * @param array $arrDatabases   database value objects keyed by metalib id, as returned by databases()
       * @param string $lang          language code; upper-cased before being sent to metalib
       * @return array                Xerxes_Data_Category objects, each holding its subcategories
       * @throws Exception            if no categories come back, or a subcategory references
       *                              a database that is not in $arrDatabases
       */
      private function subjects($arrDatabases, $lang)
      {
          $arrSubjects = array();

          // metalib is sent the upper-case form of the language code

          $lang_metalib = strtoupper($lang);

          // fetch the categories from metalib

          $objXml = $this->objSearch->categories(
              $this->configInstitute,
              $this->configPortal,
              $lang_metalib
          );

          $objXPath = new DOMXPath($objXml);
          $objCategories = $objXPath->query("//category_info");

          if ( $objCategories->length < 1 )
          {
              throw new Exception("Could not find any categories in the Metalib KB");
          }

          // GET EACH CATEGORY

          foreach ( $objCategories as $objCategory )
          {
              $objDataCategory = new Xerxes_Data_Category();

              // extract category data and assign to object

              $objName = $objCategory->getElementsByTagName("category_name")->item(0);
              $strName = ( $objName != null ) ? $objName->nodeValue : "";

              // we'll use incrementer to uniquely identify the categories since metalib has no id for them

              $objDataCategory->id = $this->category_count;
              $objDataCategory->name = $strName;
              $objDataCategory->normalized = $this->normalize($strName);
              $objDataCategory->old = $this->normalizeOld($strName);
              $objDataCategory->lang = $lang;

              // GET EACH SUBCATEGORY

              $objSubCategories = $objCategory->getElementsByTagName("subcategory_info");

              // version 3 fix! fall back to the hyphenated element name

              if ( $objSubCategories->length == 0 )
              {
                  $objSubCategories = $objCategory->getElementsByTagName("subcategory-info");
              }

              foreach ( $objSubCategories as $objSubCategory )
              {
                  $objDataSubCategory = new Xerxes_Data_Subcategory();

                  $objSubName = $objSubCategory->getElementsByTagName("subcategory_name")->item(0);
                  $objSequence = $objSubCategory->getElementsByTagName("sequence")->item(0);

                  $strSubName = ( $objSubName != null ) ? $objSubName->nodeValue : "";
                  $strID = ( $objSequence != null ) ? $objSequence->nodeValue : "";

                  $objDataSubCategory->metalib_id = $strID;
                  $objDataSubCategory->name = $strSubName;

                  // get the databases associated with this subcategory from metalib

                  $objDatabasesXml = $this->objSearch->databasesSubCategory($strID, false);

                  // extract just the database id

                  $objSubXPath = new DOMXPath($objDatabasesXml);
                  $objDatabases = $objSubXPath->query("//source_001");

                  // GET EACH DATABASE ASSIGNED TO SUBCATEGORY

                  foreach ( $objDatabases as $objDatabase )
                  {
                      $objAssigned = new Xerxes_Data_Database();
                      $objAssigned->metalib_id = $objDatabase->nodeValue;

                      // add it to the subcategory object only if the database already
                      // exists in the KB, if not the case, then we've got mismatched
                      // categories and databases from different institutes

                      if ( ! array_key_exists($objAssigned->metalib_id, $arrDatabases) )
                      {
                          throw new Exception("Could not find database (" . $objAssigned->metalib_id .
                              ") assigned to category; make sure config entry ip_address is part " .
                              "of the IP range associated with this Metalib instance");
                      }

                      array_push($objDataSubCategory->databases, $objAssigned);
                  }

                  // add subcategory to the category object

                  array_push($objDataCategory->subcategories, $objDataSubCategory);
              }

              $this->category_count++;

              // add category to master array

              array_push($arrSubjects, $objDataCategory);
          }

          return $arrSubjects;
      }

      /**
       * Pulls down type categories from Metalib
       *
       * @return array    Xerxes_Data_Type objects, with ids numbered from 1 in document order
       */
      private function types()
      {
          $arrTypes = array();

          // get types from metalib

          $objXml = $this->objSearch->types($this->configInstitute);

          // extract just the type names

          $objXPath = new DOMXPath($objXml);
          $objTypes = $objXPath->query("//resource_type/@name");

          $x = 1;

          // cycle thru and add them to array of objects

          foreach ( $objTypes as $objType )
          {
              $objDataType = new Xerxes_Data_Type();
              $objDataType->id = $x;
              $objDataType->name = $objType->nodeValue;
              $objDataType->normalized = $this->normalize($objType->nodeValue);

              $arrTypes[] = $objDataType;
              $x++;
          }

          return $arrTypes;
      }

      /**
       * Pulls down a compiled list of all databases from Metalib and converts
       * them to local value objects
       *
       * When the request carries a "test" property, the raw metalib response and
       * the transformed xml are also written to metalib.xml and xerxes.xml.
       *
       * @return array        Xerxes_Data_Database objects keyed by metalib id
       * @throws Exception    if the transformed xml contains no database entries
       */
      private function databases()
      {
          $arrDatabases = array();

          // get all databases and convert to local format; the transform is
          // done once and the result used for both the test dump and parsing

          $objXml = $this->objSearch->allDatabases($this->configInstitute, true, $this->configChunk);
          $strXml = Xerxes_Framework_Parser::transform($objXml, "xsl/utility/marc-to-database.xsl");

          if ( $this->request->getProperty("test") )
          {
              $objXml->save("metalib.xml");
              file_put_contents("xerxes.xml", $strXml);
          }

          // get just the database info

          $objSimple = new SimpleXMLElement($strXml);
          $arrDBs = $objSimple->xpath("//database");

          if ( count($arrDBs) < 1 )
          {
              throw new Exception("Could not find any databases in the Metalib KB. " .
                  $this->objSearch->getWarnings(true) );
          }

          foreach ( $arrDBs as $objDatabase )
          {
              // populate data object with properties

              $objData = new Xerxes_Data_Database();
              $objData->metalib_id = (string) $objDatabase->metalib_id;
              $objData->title_display = (string) $objDatabase->title_display;
              $objData->type = (string) $objDatabase->type;
              $objData->data = $objDatabase->asXML();

              $arrDatabases[$objData->metalib_id] = $objData;
          }

          return $arrDatabases;
      }

      /**
       * Converts a string to its normalized form by delegating to
       * Xerxes_Data_Category::normalize()
       *
       * @param string $strSubject    original string
       * @return string               normalized string
       */
      private function normalize($strSubject)
      {
          return Xerxes_Data_Category::normalize($strSubject);
      }

      /**
       * Produces the value stored in a category's "old" property
       *
       * NOTE(review): this currently delegates to the same routine as
       * normalize(), so both properties get the same value -- confirm
       * whether a separate legacy normalization was intended.
       *
       * @param string $strSubject    original string
       * @return string               normalized string
       */
      private function normalizeOld($strSubject)
      {
          return Xerxes_Data_Category::normalize($strSubject);
      }

      /**
       * Split a delimited string into a <list> element of <item> children
       *
       * We need to register a straight function for the XSL to call with php:function. Sorry.
       *
       * Each non-empty piece becomes an <item> whose text content and "value"
       * attribute are both the piece itself. The text is added as a text node
       * because the value argument of DOMDocument::createElement() is not
       * escaped, so a piece containing "&" would not survive intact. Only
       * genuinely empty pieces are skipped, so a piece of "0" is kept.
       *
       * @param string $strList      delimited list
       * @param string $separator    delimiter, comma by default
       * @return DOMElement          the <list> document element
       */
      public static function splitToNodeset($strList, $separator = ",")
      {
          $dom = new DOMDocument();
          $dom->loadXML("<list />");
          $docEl = $dom->documentElement;

          foreach ( explode($separator, $strList) as $item )
          {
              if ( $item === "" )
              {
                  continue;
              }

              $element = $dom->createElement("item");
              $element->appendChild($dom->createTextNode($item));
              $element->setAttribute("value", $item);

              $docEl->appendChild($element);
          }

          return $docEl;
      }
  }
  287. ?>