PageRenderTime 39ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/pkp/classes/citation/NlmCitationDemultiplexerFilter.inc.php

https://github.com/lib-uoguelph-ca/ocs
PHP | 365 lines | 166 code | 47 blank | 152 comment | 21 complexity | b1fa79095d7c89b16e6586540fcce44c MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /**
  3. * @file classes/citation/NlmCitationDemultiplexerFilter.inc.php
  4. *
  5. * Copyright (c) 2000-2012 John Willinsky
  6. * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
  7. *
  8. * @class NlmCitationDemultiplexerFilter
  9. * @ingroup citation
  10. *
  11. * @brief Filter that takes a list of NLM citation descriptions and joins
  12. * them into a single "best" citation.
  13. */
  14. // $Id$
  15. import('filter.Filter');
  class NlmCitationDemultiplexerFilter extends Filter {
      /**
       * @var Citation The original unfiltered citation. It is passed to
       *  _filterConfidenceScore() together with every filtered description.
       */
      var $_originalCitation;

      /**
       * Constructor
       *
       * NB: PHP 4 style constructor, kept so that the call chain to
       * the parent's Filter() constructor stays unchanged.
       */
      function NlmCitationDemultiplexerFilter() {
          parent::Filter();
      }

      //
      // Setters and Getters
      //
      /**
       * Set the original citation description
       * @param $originalCitation Citation stored by reference
       */
      function setOriginalCitation(&$originalCitation) {
          $this->_originalCitation =& $originalCitation;
      }

      /**
       * Get the original citation description
       * @return Citation the citation set via setOriginalCitation()
       */
      function &getOriginalCitation() {
          return $this->_originalCitation;
      }

      //
      // Implementing abstract template methods from Filter
      //
      /**
       * Score every incoming citation description, guess a missing
       * publication type where possible and merge all descriptions
       * into one citation with a single "best" set of values.
       *
       * @see Filter::process()
       * @param $input array incoming MetadataDescriptions, null entries
       *  are skipped
       * @return Citation
       */
      function &process(&$input) {
          // Two-dimensional array: confidence score => list of the
          // citation descriptions that received this score.
          $scoredCitations = array();

          // Iterate over the incoming NLM citation descriptions
          foreach ($input as $filteredCitation) {
              // FIXME: We should provide feedback to the end-user
              // about filters that caused an error.
              if (is_null($filteredCitation)) continue;

              // If the publication type is not set, take a guess
              if (!$filteredCitation->hasStatement('[@publication-type]')) {
                  $guessedPublicationType = $this->_guessPublicationType($filteredCitation);
                  if (!is_null($guessedPublicationType)) {
                      $filteredCitation->addStatement('[@publication-type]', $guessedPublicationType);
                  }
              }

              // Calculate the score for this filtered citation. The score
              // is used as an array key. PHP array keys cannot be floats,
              // a fractional score would be truncated implicitly (which is
              // deprecated as of PHP 8.1), so we truncate explicitly.
              $confidenceScore = (int)$this->_filterConfidenceScore($filteredCitation, $this->_originalCitation);

              // Save the filtered result hashed by its confidence score.
              // We save them as a sub-array in case several citations
              // receive the same confidence score.
              if (!isset($scoredCitations[$confidenceScore])) {
                  $scoredCitations[$confidenceScore] = array();
              }
              $scoredCitations[$confidenceScore][] =& $filteredCitation;

              // Break the reference so that the next loop iteration
              // does not overwrite the element we just stored.
              unset($filteredCitation);
          }

          // Get a single set of "best" values for the citation description
          // and set them in a new citation object.
          $citation =& $this->_guessValues($scoredCitations);
          return $citation;
      }

      /**
       * Accepts a non-empty array whose non-null entries are all
       * MetadataDescriptions based on an NlmCitationSchema. At least
       * one entry must be non-null. The output must be null (not yet
       * known) or a Citation.
       *
       * @see Filter::supports()
       * @param $input mixed
       * @param $output mixed
       * @return boolean
       */
      function supports(&$input, &$output) {
          // Check input type and the number of the input objects
          if (!is_array($input) || count($input) == 0) return false;

          // Iterate over the input objects and check their type.
          $inputFound = false;
          foreach ($input as $metadataDescription) {
              // Null entries are tolerated but do not count as input.
              if (is_null($metadataDescription)) continue;

              // We need at least one non-null value
              $inputFound = true;

              if (!is_a($metadataDescription, 'MetadataDescription')) return false;
              $metadataSchema = $metadataDescription->getMetadataSchema();
              if (!is_a($metadataSchema, 'NlmCitationSchema')) return false;
          }
          if (!$inputFound) return false;

          // Check output type
          if (is_null($output)) return true;
          return is_a($output, 'Citation');
      }

      //
      // Private helper methods
      //
      /**
       * Try to guess a citation's publication type based on detected elements.
       *
       * Every property in the lookup table below that is present in the
       * description counts as one hit for its publication type. The type
       * with the highest number of hits wins.
       *
       * @param $metadataDescription MetadataDescription
       * @return integer one of NLM_PUBLICATION_TYPE_* or null if the
       *  description is too incomplete, nothing was detected or several
       *  types share the highest hit count.
       */
      function _guessPublicationType(&$metadataDescription) {
          // If we already have a publication type, why should we guess one?
          assert(!$metadataDescription->hasStatement('[@publication-type]'));

          // Avoid deducing from a description that has only very few properties set
          // and may therefore be of low quality.
          $descriptionCompletenessIndicators = array(
              'person-group[@person-group-type="editor"]', 'article-title', 'date'
          );
          foreach ($descriptionCompletenessIndicators as $descriptionCompletenessIndicator) {
              if (!$metadataDescription->hasStatement($descriptionCompletenessIndicator)) return null;
          }

          // The following property names help us to guess the most probable publication type
          $typicalPropertyNames = array(
              'volume' => NLM_PUBLICATION_TYPE_JOURNAL,
              'issue' => NLM_PUBLICATION_TYPE_JOURNAL,
              'season' => NLM_PUBLICATION_TYPE_JOURNAL,
              'issn[@pub-type="ppub"]' => NLM_PUBLICATION_TYPE_JOURNAL,
              'issn[@pub-type="epub"]' => NLM_PUBLICATION_TYPE_JOURNAL,
              'pub-id[@pub-id-type="pmid"]' => NLM_PUBLICATION_TYPE_JOURNAL,
              'person-group[@person-group-type="editor"]' => NLM_PUBLICATION_TYPE_BOOK,
              'edition' => NLM_PUBLICATION_TYPE_BOOK,
              'chapter-title' => NLM_PUBLICATION_TYPE_BOOK,
              'isbn' => NLM_PUBLICATION_TYPE_BOOK,
              'publisher-name' => NLM_PUBLICATION_TYPE_BOOK,
              'publisher-loc' => NLM_PUBLICATION_TYPE_BOOK,
              'conf-date' => NLM_PUBLICATION_TYPE_CONFPROC,
              'conf-loc' => NLM_PUBLICATION_TYPE_CONFPROC,
              'conf-name' => NLM_PUBLICATION_TYPE_CONFPROC,
              'conf-sponsor' => NLM_PUBLICATION_TYPE_CONFPROC
          );

          $hitCounters = array(
              NLM_PUBLICATION_TYPE_JOURNAL => 0,
              NLM_PUBLICATION_TYPE_BOOK => 0,
              NLM_PUBLICATION_TYPE_CONFPROC => 0
          );

          // Record a hit for every typical property that is present.
          // NB: The statement to look up is the property name (the array
          // key), not the publication type it points to.
          foreach ($typicalPropertyNames as $typicalPropertyName => $currentProbablePublicationType) {
              if ($metadataDescription->hasStatement($typicalPropertyName)) {
                  $hitCounters[$currentProbablePublicationType]++;
              }
          }

          // No typical property found, so there is nothing to deduce from.
          $highestCounterValue = max($hitCounters);
          if ($highestCounterValue == 0) return null;

          // Return the publication type with the highest hit counter. If
          // several counters share the highest value then there is no
          // unique result.
          $probablePublicationTypes = array_keys($hitCounters, $highestCounterValue);
          if (count($probablePublicationTypes) != 1) return null;
          return $probablePublicationTypes[0];
      }

      /**
       * Derive a confidence score calculated as the number of statements for a group
       * of expected properties.
       * @param $metadataDescription MetadataDescription
       * @param $originalCitation Citation currently not used by the
       *  calculation (see the FIXME below)
       * @return float the percentage (0 to 100) of expected properties
       *  that are set in the description
       */
      function _filterConfidenceScore(&$metadataDescription, &$originalCitation) {
          // FIXME: Amend this algorithm by calculating the similarity between the edited
          // citation string and the citation description:
          // 1) For expected fields: See whether a similar text exists in the original
          //    citation.
          // 2) Add up the number of characters that are similar and compare them to the
          //    number of characters in the original text.

          // Find out how many of the expected properties were identified by the filter.
          $expectedProperties = array(
              'person-group[@person-group-type="author"]', 'article-title', 'source',
              'date', 'fpage', '[@publication-type]'
          );
          $setProperties = array_intersect($expectedProperties, $metadataDescription->getSetPropertyNames());

          // Share of expected properties found, as a percentage capped at 100.
          $foundRatio = count($setProperties) / count($expectedProperties);
          $filterConfidenceScore = min($foundRatio * 100, 100);
          return $filterConfidenceScore;
      }

      /**
       * Take an array of citation parse/lookup results and derive a citation
       * with one "best" set of values.
       *
       * We determine the best values within the citations that have a score above
       * the given threshold. Citations with a score below the threshold will be
       * ignored.
       *
       * For these citations we count the frequency of values per meta-data property.
       * The most frequent value will be chosen as "best" value.
       *
       * If two values have the same frequency then decide based on the score. If
       * this is still ambivalent then return the first of the remaining values.
       *
       * This method will also calculate the overall parsing score for the target
       * citation: the mean of the highest score and the average score. The
       * average is taken over all scored citations, including those below
       * the threshold. Without any scored citation the parse score is 0.
       *
       * @param $scoredCitations array citation descriptions grouped by score
       *  (score => array of descriptions); will be sorted in place, highest
       *  score first.
       * @param $scoreThreshold integer a number between 0 (=no threshold) and 100,
       *  default: no threshold
       * @return Citation one citation with the "best" values set
       */
      function &_guessValues(&$scoredCitations, $scoreThreshold = 0) {
          assert($scoreThreshold >= 0 && $scoreThreshold <= 100);

          // Create the target citation description.
          $metadataSchema = new NlmCitationSchema();
          $targetDescription = new MetadataDescription($metadataSchema, ASSOC_TYPE_CITATION);

          // Step 1: List all values and max scores that have been identified for a given element
          //         but only include values from results above a given scoring threshold

          // Initialize variables for the first step.
          $valuesByPropertyName = array();
          $maxScoresByPropertyNameAndValue = array();

          // Sort the scored citations by score with the highest score first.
          krsort($scoredCitations);

          foreach ($scoredCitations as $currentScore => $citationsForCurrentScore) {
              // Check whether the current score is below the threshold, if so
              // stop the loop. We've sorted our citations by score so the remaining
              // citations all have scores below the threshold and we can forget
              // about them.
              if ($currentScore < $scoreThreshold) {
                  break;
              }

              foreach ($citationsForCurrentScore as $citationForCurrentScore) {
                  $statements = $citationForCurrentScore->getStatements();

                  // Add the property values and scores of this citation
                  // to the overall property lists
                  foreach ($statements as $propertyName => $value) {
                      // Values are serialized so that they can be counted
                      // and used as array keys whatever their type.
                      $serializedValue = serialize($value);

                      // Initialize sub-arrays if necessary
                      if (!isset($valuesByPropertyName[$propertyName])) {
                          $valuesByPropertyName[$propertyName] = array();
                      }
                      if (!isset($maxScoresByPropertyNameAndValue[$propertyName])) {
                          $maxScoresByPropertyNameAndValue[$propertyName] = array();
                      }

                      // Add the value for the given property, as we want to count
                      // value frequencies later, we explicitly allow duplicates.
                      $valuesByPropertyName[$propertyName][] = $serializedValue;

                      // As we have ordered our citations descending by score, the
                      // first score found for a value is also the maximum score.
                      if (!isset($maxScoresByPropertyNameAndValue[$propertyName][$serializedValue])) {
                          $maxScoresByPropertyNameAndValue[$propertyName][$serializedValue] = $currentScore;
                      }
                  }
              }
          }

          // Step 2: Find out the values that occur most frequently for each element
          //         and order these by score.
          foreach ($valuesByPropertyName as $propertyName => $values) {
              // Count the occurrences of each value within the given element
              $valueFrequencies = array_count_values($values);

              // Order the most frequent values to the beginning of the array
              arsort($valueFrequencies);

              // Get the most frequent values (may be several if there are more than one
              // with the same frequency).
              $scoresOfMostFrequentValues = array();
              $previousValueFrequency = 0;
              foreach ($valueFrequencies as $value => $valueFrequency) {
                  // Only extract the most frequent values, jump out of the
                  // loop when less frequent values start.
                  if ($previousValueFrequency > $valueFrequency) break;
                  $previousValueFrequency = $valueFrequency;

                  $scoresOfMostFrequentValues[$value] =
                          $maxScoresByPropertyNameAndValue[$propertyName][$value];
              }

              // Now we can order the most frequent values by score, starting
              // with the highest score.
              arsort($scoresOfMostFrequentValues);

              // Now get the first key which represents the value with the
              // highest frequency and the highest score.
              reset($scoresOfMostFrequentValues);
              $bestValue = unserialize(key($scoresOfMostFrequentValues));

              // Set the found "best" element value in the result citation.
              $statements = array($propertyName => $bestValue);
              $success = $targetDescription->setStatements($statements);
              assert($success);
          }

          // Calculate the average of all scores
          $overallScoreSum = 0;
          $overallScoreCount = 0;
          foreach ($scoredCitations as $currentScore => $citationsForCurrentScore) {
              $countCitationsForCurrentScore = count($citationsForCurrentScore);
              $overallScoreSum += $countCitationsForCurrentScore * $currentScore;
              $overallScoreCount += $countCitationsForCurrentScore;
          }

          if ($overallScoreCount > 0) {
              $averageScore = $overallScoreSum / $overallScoreCount;

              // Get the max score (= the first key from scoredCitations
              // as these are sorted by score).
              reset($scoredCitations);
              $maxScore = key($scoredCitations);
          } else {
              // Nothing was scored: avoid a division by zero and a
              // null max score.
              $averageScore = 0;
              $maxScore = 0;
          }

          // Calculate the overall parse score as by weighing
          // the max score and the average score 50% each.
          // FIXME: This algorithm seems a bit arbitrary.
          $parseScore = ($maxScore + $averageScore) / 2;

          // Instantiate the target citation
          $targetCitation = new Citation();
          $targetCitation->injectMetadata($targetDescription);
          $targetCitation->setParseScore($parseScore);
          return $targetCitation;
      }
  }
  318. ?>