PageRenderTime 26ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/classes/search/PaperSearchIndex.inc.php

https://github.com/lib-uoguelph-ca/ocs
PHP | 286 lines | 170 code | 36 blank | 80 comment | 21 complexity | 1f003bab9e38bc86ae12a4a64f39462c MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /**
  3. * @file PaperSearchIndex.inc.php
  4. *
  5. * Copyright (c) 2000-2012 John Willinsky
  6. * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
  7. *
  8. * @class PaperSearchIndex
  9. * @ingroup search
  10. * @see PaperSearch
  11. *
  12. * @brief Class to add content to the paper search index.
  13. */
  14. //$Id$
  15. import('search.SearchFileParser');
  16. import('search.SearchHTMLParser');
  17. import('search.SearchHelperParser');
  18. import('search.PaperSearch');
  19. // Maximum length of an indexed keyword: filterKeywords() truncates longer words to this length via String::substr().
  20. define('SEARCH_KEYWORD_MAX_LENGTH', 40);
  21. class PaperSearchIndex {
  /**
   * Filter a block of text into keywords and store them for a search object.
   * The position counter is advanced once for every keyword whose insert
   * returns a non-null result, so the same variable can be passed again to
   * index further text against the same object.
   * @param $objectId int ID of the search object the keywords belong to
   * @param $text string text (or array of strings) handed to filterKeywords()
   * @param $position int running keyword position, updated by reference
   */
  function indexObjectKeywords($objectId, $text, &$position) {
    $searchDao =& DAORegistry::getDAO('PaperSearchDAO');
    $keywords =& PaperSearchIndex::filterKeywords($text);
    foreach ($keywords as $keyword) {
      $inserted = $searchDao->insertObjectKeyword($objectId, $keyword, $position);
      if ($inserted === null) {
        // Nothing was stored for this keyword; keep the position unchanged.
        continue;
      }
      $position = $position + 1;
    }
  }
  /**
   * Index a block of text for a paper.
   * Registers a search object through PaperSearchDAO::insertObject() and then
   * indexes the text against it, numbering keyword positions from zero.
   * @param $paperId int
   * @param $type int search object type (one of the PAPER_SEARCH_* constants used by callers)
   * @param $text string text, or array of strings, to index
   * @param $assocId int optional associated ID passed through to insertObject()
   */
  function updateTextIndex($paperId, $type, $text, $assocId = null) {
    $dao =& DAORegistry::getDAO('PaperSearchDAO');
    $searchObjectId = $dao->insertObject($paperId, $type, $assocId);

    // indexObjectKeywords() takes the position by reference, so it needs a variable.
    $keywordPosition = 0;
    PaperSearchIndex::indexObjectKeywords($searchObjectId, $text, $keywordPosition);
  }
  /**
   * Index the contents of a paper file.
   * Silently does nothing when the file cannot be retrieved, when no parser
   * is available for it, or when the parser fails to open.
   * @param $paperId int
   * @param $type int search object type to register the file under
   * @param $fileId int ID of the file; also used as the search object's assoc ID
   */
  function updateFileIndex($paperId, $type, $fileId) {
    import('file.PaperFileManager');
    $paperFileManager = new PaperFileManager($paperId);
    $paperFile =& $paperFileManager->getFile($fileId);
    if (!isset($paperFile)) {
      return;
    }

    $fileParser =& SearchFileParser::fromFile($paperFile);
    if (!isset($fileParser) || !$fileParser->open()) {
      return;
    }

    $searchDao =& DAORegistry::getDAO('PaperSearchDAO');
    $objectId = $searchDao->insertObject($paperId, $type, $fileId);

    // The position carries over between chunks so keywords are numbered
    // continuously across the whole file.
    $position = 0;
    for ($chunk = $fileParser->read(); $chunk !== false; $chunk = $fileParser->read()) {
      PaperSearchIndex::indexObjectKeywords($objectId, $chunk, $position);
    }
    $fileParser->close();
  }
  /**
   * Remove a paper's keywords from the search index.
   * Thin wrapper around PaperSearchDAO::deletePaperKeywords().
   * @param $paperId int
   * @param $type int optional search object type to restrict the deletion to
   * @param $assocId int optional associated ID to restrict the deletion to
   * @return mixed whatever deletePaperKeywords() returns
   */
  function deleteTextIndex($paperId, $type = null, $assocId = null) {
    $dao =& DAORegistry::getDAO('PaperSearchDAO');
    $result = $dao->deletePaperKeywords($paperId, $type, $assocId);
    return $result;
  }
  /**
   * Turn raw text into a list of normalized keywords.
   * The text is cleaned, stripped of punctuation, lower-cased and split on
   * whitespace; stopwords, words shorter than the configured minimum length
   * and numeric words are dropped, and the rest are truncated to
   * SEARCH_KEYWORD_MAX_LENGTH.
   * @param $text string text to split; an array of strings is joined with newlines first
   * @param $allowWildcards boolean when true, "*" is turned into "%" instead of a space
   * @return array of keywords
   */
  function &filterKeywords($text, $allowWildcards = false) {
    $minLength = Config::getVar('search', 'min_word_length');
    $stopwords =& PaperSearchIndex::loadStopwords();

    // Collapse multi-line input into one string
    if (is_array($text)) {
      $text = implode("\n", $text);
    }
    $cleanText = Core::cleanVar($text);

    // Punctuation: the first group is deleted outright, the second group
    // acts as a word separator, and "*" depends on the wildcard setting.
    $wildcardReplacement = $allowWildcards ? '%' : ' ';
    $cleanText = String::regexp_replace('/[!"\#\$%\'\(\)\.\?@\[\]\^`\{\}~]/', '', $cleanText);
    $cleanText = String::regexp_replace('/[\+,:;&\/<=>\|\\\]/', ' ', $cleanText);
    $cleanText = String::regexp_replace('/[\*]/', $wildcardReplacement, $cleanText);
    $cleanText = String::strtolower($cleanText);

    // FIXME Do not perform further filtering for some fields, e.g., author names?
    $keywords = array();
    foreach (preg_split('/\s+/', $cleanText) as $word) {
      if (isset($stopwords[$word])) {
        continue; // stopword (the empty string is registered as one too)
      }
      if (String::strlen($word) < $minLength) {
        continue; // too short
      }
      if (is_numeric($word)) {
        continue; // plain numbers are not indexed
      }
      $keywords[] = String::substr($word, 0, SEARCH_KEYWORD_MAX_LENGTH);
    }

    return $keywords;
  }
  /**
   * Return the list of stopwords.
   * The list is read from "stopwords.txt" in the configured registry
   * directory the first time it is needed and kept in a static variable for
   * the rest of the request. Each line is trimmed; blank lines and lines
   * starting with "#" are ignored. The empty string is always registered as
   * a stopword so that empty tokens are filtered out by callers.
   * If the file cannot be read, only the empty-string entry is returned.
   * FIXME Should this be locale-specific?
   * @return array with stopwords as keys (values are occurrence counts)
   */
  function &loadStopwords() {
    static $searchStopwords;

    if (!isset($searchStopwords)) {
      // Load stopwords only once per request (FIXME Cache?)
      $searchStopwords = array();

      // file() yields false on failure and keeps each line's trailing
      // newline, so guard the result and trim every line explicitly instead
      // of relying on by-reference mutation inside an array_filter callback.
      $lines = file(Config::getVar('general', 'registry_dir') . '/stopwords.txt');
      if (is_array($lines)) {
        foreach ($lines as $line) {
          $word = trim($line);
          if (empty($word) || $word[0] == '#') {
            continue; // blank line or comment
          }
          if (isset($searchStopwords[$word])) {
            $searchStopwords[$word]++;
          } else {
            $searchStopwords[$word] = 1;
          }
        }
      }

      $searchStopwords[''] = 1;
    }

    return $searchStopwords;
  }
  /**
   * Index paper metadata.
   * Adds one search object per metadata group: authors (names, affiliation
   * and tag-stripped biographies), title, abstract, discipline, subject
   * (subject class plus subject), type, and coverage (geographical,
   * chronological and sample). Localized fields are requested with a null
   * locale and indexed as returned.
   * @param $paper Paper
   */
  function indexPaperMetadata(&$paper) {
    // Build author keywords
    $authorText = array();
    $authors = $paper->getAuthors();
    if (is_array($authors)) {
      // foreach does not depend on the author list having sequential keys
      foreach ($authors as $author) {
        $authorText[] = $author->getFirstName();
        $authorText[] = $author->getMiddleName();
        $authorText[] = $author->getLastName();
        $authorText[] = $author->getAffiliation();

        $bios = $author->getBiography(null);
        if (is_array($bios)) {
          // Localized: one biography per locale
          foreach ($bios as $bio) {
            $authorText[] = strip_tags($bio);
          }
        }
      }
    }

    // Update search index
    $paperId = $paper->getId();
    PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_AUTHOR, $authorText);
    PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_TITLE, $paper->getTitle(null));
    PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_ABSTRACT, $paper->getAbstract(null));
    PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_DISCIPLINE, $paper->getDiscipline(null));
    PaperSearchIndex::updateTextIndex(
      $paperId,
      PAPER_SEARCH_SUBJECT,
      array_merge(
        array_values((array) $paper->getSubjectClass(null)),
        array_values((array) $paper->getSubject(null))
      )
    );
    PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_TYPE, $paper->getType(null));
    PaperSearchIndex::updateTextIndex(
      $paperId,
      PAPER_SEARCH_COVERAGE,
      array_merge(
        array_values((array) $paper->getCoverageGeo(null)),
        array_values((array) $paper->getCoverageChron(null)),
        array_values((array) $paper->getCoverageSample(null))
      )
    );
    // FIXME Index sponsors too?
  }
  /**
   * Index the metadata of a supplementary file.
   * Title, creator, subject, "type other", description and source are
   * flattened into a single list of strings and indexed as one
   * PAPER_SEARCH_SUPPLEMENTARY_FILE object associated with the file ID.
   * @param $suppFile object supplementary file to index
   */
  function indexSuppFileMetadata(&$suppFile) {
    $paperId = $suppFile->getPaperId();

    // Each getter is called with a null locale; its result is cast to an
    // array so that a single value and a per-locale array are handled alike.
    $localizedFields = array(
      $suppFile->getTitle(null),
      $suppFile->getCreator(null),
      $suppFile->getSubject(null),
      $suppFile->getTypeOther(null),
      $suppFile->getDescription(null),
      $suppFile->getSource(null)
    );

    $metadata = array();
    foreach ($localizedFields as $fieldValues) {
      foreach ((array) $fieldValues as $value) {
        $metadata[] = $value;
      }
    }

    // Update search index
    $fileId = $suppFile->getFileId();
    PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_SUPPLEMENTARY_FILE, $metadata, $fileId);
  }
  /**
   * Index every file attached to a paper.
   * Supplementary files have their contents indexed (when they have a file
   * ID) and their metadata indexed in all cases; galleys only have their
   * file contents indexed.
   * @param $paper Paper
   */
  function indexPaperFiles(&$paper) {
    $paperId = $paper->getId();

    // Supplementary files: contents first, then metadata
    $suppFileDao =& DAORegistry::getDAO('SuppFileDAO');
    $suppFiles =& $suppFileDao->getSuppFilesByPaper($paperId);
    foreach ($suppFiles as $suppFile) {
      $suppFileId = $suppFile->getFileId();
      if ($suppFileId) {
        PaperSearchIndex::updateFileIndex($paperId, PAPER_SEARCH_SUPPLEMENTARY_FILE, $suppFileId);
      }
      PaperSearchIndex::indexSuppFileMetadata($suppFile);
    }
    unset($suppFiles);

    // Galley files: contents only
    $galleyDao =& DAORegistry::getDAO('PaperGalleyDAO');
    $galleys =& $galleyDao->getGalleysByPaper($paperId);
    foreach ($galleys as $galley) {
      $galleyFileId = $galley->getFileId();
      if ($galleyFileId) {
        PaperSearchIndex::updateFileIndex($paperId, PAPER_SEARCH_GALLEY_FILE, $galleyFileId);
      }
    }
  }
  /**
   * Rebuild the search index for all scheduled conferences.
   * Empties the three search index tables, flushes the DAO's query cache,
   * then re-indexes the metadata and files of every paper that has a
   * submission date.
   * @param $log boolean when true, progress messages are echoed
   */
  function rebuildIndex($log = false) {
    // Clear index
    if ($log) {
      echo 'Clearing index ... ';
    }
    $searchDao =& DAORegistry::getDAO('PaperSearchDAO');
    // FIXME Abstract into PaperSearchDAO?
    $searchDao->update('DELETE FROM paper_search_object_keywords');
    $searchDao->update('DELETE FROM paper_search_objects');
    $searchDao->update('DELETE FROM paper_search_keyword_list');
    $searchDao->setCacheDir(Config::getVar('files', 'files_dir') . '/_db');
    $searchDao->_dataSource->CacheFlush();
    if ($log) {
      echo "done\n";
    }

    // Build index
    $schedConfDao =& DAORegistry::getDAO('SchedConfDAO');
    $paperDao =& DAORegistry::getDAO('PaperDAO');

    $schedConfIterator =& $schedConfDao->getSchedConfs();
    while (!$schedConfIterator->eof()) {
      $schedConf =& $schedConfIterator->next();
      $indexedCount = 0;

      if ($log) {
        echo "Indexing \"", $schedConf->getFullTitle(), "\" ... ";
      }

      $paperIterator =& $paperDao->getPapersBySchedConfId($schedConf->getId());
      while (!$paperIterator->eof()) {
        $paper =& $paperIterator->next();
        if (!$paper->getDateSubmitted()) {
          // Papers without a submission date are not indexed.
          unset($paper);
          continue;
        }
        PaperSearchIndex::indexPaperMetadata($paper);
        PaperSearchIndex::indexPaperFiles($paper);
        $indexedCount++;
        // Break the reference before the next iteration rebinds it.
        unset($paper);
      }

      if ($log) {
        echo $indexedCount, " papers indexed\n";
      }
      unset($schedConf);
    }
  }
  249. }
  250. ?>