ocs /lib/pkp/classes/citation/NlmCitationDemultiplexerFilter.inc.php

Language PHP Lines 365
MD5 Hash b1fa79095d7c89b16e6586540fcce44c
Repository https://github.com/lib-uoguelph-ca/ocs.git View Raw File
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
<?php
/**
 * @file classes/citation/NlmCitationDemultiplexerFilter.inc.php
 *
 * Copyright (c) 2000-2012 John Willinsky
 * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
 *
 * @class NlmCitationDemultiplexerFilter
 * @ingroup citation
 *
 * @brief Filter that takes a list of NLM citation descriptions and joins
 *  them into a single "best" citation.
 */

// $Id$

import('filter.Filter');

class NlmCitationDemultiplexerFilter extends Filter {
	/**
	 * @var Citation The original unfiltered citation required
	 *  to calculate the filter result confidence score.
	 */
	var $_originalCitation;

	/**
	 * Constructor
	 */
	function NlmCitationDemultiplexerFilter() {
		parent::Filter();
	}

	//
	// Setters and Getters
	//
	/**
	 * Set the original citation description
	 * @param $originalCitation Citation
	 */
	function setOriginalCitation(&$originalCitation) {
		$this->_originalCitation =& $originalCitation;
	}

	/**
	 * Get the original citation description
	 * @return Citation
	 */
	function &getOriginalCitation() {
		return $this->_originalCitation;
	}


	//
	// Implementing abstract template methods from Filter
	//
	/**
	 * Score every incoming NLM citation description and merge them
	 * into one citation carrying the "best" value per property.
	 *
	 * Null entries in the input are skipped. Descriptions without a
	 * publication type get a guessed one (if a unique guess is possible)
	 * before they are scored.
	 *
	 * @see Filter::process()
	 * @param $input array incoming MetadataDescriptions
	 * @return Citation
	 */
	function &process(&$input) {
		// Initialize the array that will contain citations by confidence score.
		// This is a two-dimensional array with the (integer) score as key and
		// the list of citations that received this score as value.
		$scoredCitations = array();

		// Iterate over the incoming NLM citation descriptions
		foreach ($input as $filteredCitation) {
			// FIXME: We should provide feedback to the end-user
			// about filters that caused an error.
			if (is_null($filteredCitation)) continue;

			// If the publication type is not set, take a guess
			if (!$filteredCitation->hasStatement('[@publication-type]')) {
				$guessedPublicationType = $this->_guessPublicationType($filteredCitation);
				if (!is_null($guessedPublicationType)) {
					$filteredCitation->addStatement('[@publication-type]', $guessedPublicationType);
				}
			}

			// Calculate the score for this filtered citation
			$confidenceScore = $this->_filterConfidenceScore($filteredCitation, $this->_originalCitation);

			// Save the filtered result hashed by its confidence score.
			// We save them as a sub-array in case several citations
			// receive the same confidence score.
			if (!isset($scoredCitations[$confidenceScore])) {
				$scoredCitations[$confidenceScore] = array();
			}
			$scoredCitations[$confidenceScore][] =& $filteredCitation;

			// Break the reference so that the next iteration does not
			// overwrite the entry we just stored.
			unset($filteredCitation);
		}

		// Get a single set of "best" values for the citation description
		// and set them in a new citation object.
		$citation =& $this->_guessValues($scoredCitations);
		return $citation;
	}

	/**
	 * Check whether this filter can handle the given input/output.
	 *
	 * The input must be a non-empty array whose non-null entries are all
	 * MetadataDescription objects based on an NlmCitationSchema; at least
	 * one entry must be non-null. The output, if given, must be a Citation.
	 *
	 * @see Filter::supports()
	 * @param $input mixed
	 * @param $output mixed
	 * @return boolean
	 */
	function supports(&$input, &$output) {
		// Check input type
		// Check the number of the input objects
		if (!(is_array($input) && count($input))) return false;

		// Iterate over the input objects and check their type.
		$inputFound = false;
		foreach($input as $metadataDescription) {
			if (!is_null($metadataDescription)) {
				// We need at least one non-null value
				$inputFound = true;

				if (!is_a($metadataDescription, 'MetadataDescription')) return false;
				$metadataSchema = $metadataDescription->getMetadataSchema();
				if (!is_a($metadataSchema, 'NlmCitationSchema')) return false;
			}
		}
		if (!$inputFound) return false;

		// Check output type
		if (is_null($output)) return true;
		return is_a($output, 'Citation');
	}


	//
	// Private helper methods
	//
	/**
	 * Try to guess a citation's publication type based on detected elements.
	 *
	 * Every property that is typical for a publication type and that is
	 * present in the description counts as one hit for that type. The type
	 * with the most hits wins.
	 *
	 * @param $metadataDescription MetadataDescription
	 * @return mixed one of NLM_PUBLICATION_TYPE_* or null if the description
	 *  is too incomplete or if no unique best guess exists.
	 */
	function _guessPublicationType(&$metadataDescription) {
		// If we already have a publication type, why should we guess one?
		assert(!$metadataDescription->hasStatement('[@publication-type]'));

		// Avoid deducing from a description that has only very few properties set
		// and may therefore be of low quality.
		// NOTE(review): the confidence score in _filterConfidenceScore() expects an
		// "author" person group while this check requires an "editor" - confirm
		// that "editor" is really intended here.
		$descriptionCompletenessIndicators = array(
			'person-group[@person-group-type="editor"]', 'article-title', 'date'
		);
		foreach($descriptionCompletenessIndicators as $descriptionCompletenessIndicator) {
			if (!$metadataDescription->hasStatement($descriptionCompletenessIndicator)) return null;
		}

		// The following property names help us to guess the most probable publication type
		$typicalPropertyNames = array(
			'volume' => NLM_PUBLICATION_TYPE_JOURNAL,
			'issue' => NLM_PUBLICATION_TYPE_JOURNAL,
			'season' => NLM_PUBLICATION_TYPE_JOURNAL,
			'issn[@pub-type="ppub"]' => NLM_PUBLICATION_TYPE_JOURNAL,
			'issn[@pub-type="epub"]' => NLM_PUBLICATION_TYPE_JOURNAL,
			'pub-id[@pub-id-type="pmid"]' => NLM_PUBLICATION_TYPE_JOURNAL,
			'person-group[@person-group-type="editor"]' => NLM_PUBLICATION_TYPE_BOOK,
			'edition' => NLM_PUBLICATION_TYPE_BOOK,
			'chapter-title' => NLM_PUBLICATION_TYPE_BOOK,
			'isbn' => NLM_PUBLICATION_TYPE_BOOK,
			'publisher-name' => NLM_PUBLICATION_TYPE_BOOK,
			'publisher-loc' => NLM_PUBLICATION_TYPE_BOOK,
			'conf-date' => NLM_PUBLICATION_TYPE_CONFPROC,
			'conf-loc' => NLM_PUBLICATION_TYPE_CONFPROC,
			'conf-name' => NLM_PUBLICATION_TYPE_CONFPROC,
			'conf-sponsor' => NLM_PUBLICATION_TYPE_CONFPROC
		);

		$hitCounters = array(
			NLM_PUBLICATION_TYPE_JOURNAL => 0,
			NLM_PUBLICATION_TYPE_BOOK => 0,
			NLM_PUBLICATION_TYPE_CONFPROC => 0
		);
		$highestCounterValue = 0;
		$probablePublicationType = null;
		foreach($typicalPropertyNames as $typicalPropertyName => $currentProbablePublicationType) {
			// We have to test for the presence of the typical property (the
			// array key), not for the publication type it points to.
			if (!$metadataDescription->hasStatement($typicalPropertyName)) continue;

			// Record the hit
			$hitCounters[$currentProbablePublicationType]++;
			$currentCounterValue = $hitCounters[$currentProbablePublicationType];

			if ($currentCounterValue > $highestCounterValue) {
				// This is the highest value so far
				$highestCounterValue = $currentCounterValue;
				$probablePublicationType = $currentProbablePublicationType;
			} elseif ($currentCounterValue == $highestCounterValue) {
				// There are two counters with the same value, so no unique result
				$probablePublicationType = null;
			}
		}

		// Return the publication type with the highest hit counter.
		return $probablePublicationType;
	}

	/**
	 * Derive a confidence score calculated as the percentage of expected
	 * properties that are set in the given description.
	 *
	 * The score is returned as an integer (fractions are truncated) because
	 * it is used as an array key by process(). PHP truncates float keys
	 * anyway, so doing it explicitly keeps the resulting keys unchanged
	 * while avoiding the implicit (and, in recent PHP versions, deprecated)
	 * float-to-int key conversion.
	 *
	 * @param $metadataDescription MetadataDescription
	 * @param $originalCitation Citation currently not used, see FIXME
	 * @return integer filter confidence score between 0 and 100
	 */
	function _filterConfidenceScore(&$metadataDescription, &$originalCitation) {
		// FIXME: Amend this algorithm by calculating the similarity between the edited
		// citation string and the citation description:
		// 1) For expected fields: See whether a similar text exists in the original
		//    citation.
		// 2) Add up the number of characters that are similar and compare them to the
		//    number of characters in the original text.

		// Find out how many of the expected properties were identified by the filter.
		$expectedProperties = array(
			'person-group[@person-group-type="author"]', 'article-title', 'source',
			'date', 'fpage', '[@publication-type]'
		);
		$setProperties = array_intersect($expectedProperties, $metadataDescription->getSetPropertyNames());
		$percentage = (count($setProperties) * 100) / count($expectedProperties);
		return (int) floor(min($percentage, 100));
	}

	/**
	 * Take an array of citation parse/lookup results and derive a citation
	 * with one "best" set of values.
	 *
	 * We determine the best values within the citations that have a score above
	 * the given threshold. Citations with a score below the threshold will be
	 * ignored.
	 *
	 * For these citations we count the frequency of values per meta-data property.
	 * The most frequent value will be chosen as "best" value.
	 *
	 * If two values have the same frequency then decide based on the score. If
	 * this is still ambivalent then return the first of the remaining values.
	 *
	 * This method will also calculate the overall parsing score for the target
	 * citation as the mean of the maximum score and the average score of all
	 * scored citations. If there are no scored citations at all then the
	 * parsing score is 0.
	 *
	 * @param $scoredCitations array lists of citation descriptions keyed by
	 *  their score; will be sorted in place by descending score
	 * @param $scoreThreshold integer a number between 0 (=no threshold) and 100,
	 *  default: no threshold
	 * @return Citation one citation with the "best" values set
	 */
	function &_guessValues(&$scoredCitations, $scoreThreshold = 0) {
		assert($scoreThreshold >= 0 && $scoreThreshold <= 100);

		// Create the target citation description.
		$metadataSchema = new NlmCitationSchema();
		$targetDescription = new MetadataDescription($metadataSchema, ASSOC_TYPE_CITATION);

		// Step 1: List all values and max scores that have been identified for a given element
		//         but only include values from results above a given scoring threshold

		// Initialize variables for the first step.
		$valuesByPropertyName = array();
		$maxScoresByPropertyNameAndValue = array();

		// Sort the scored citations by score with the highest score first.
		krsort($scoredCitations);
		foreach ($scoredCitations as $currentScore => $citationsForCurrentScore) {
			// Check whether the current score is below the threshold, if so
			// stop the loop. We've sorted our citations by score so the remaining
			// citations all have scores below the threshold and we can forget
			// about them.
			if ($currentScore < $scoreThreshold) {
				break;
			}

			foreach($citationsForCurrentScore as $citationForCurrentScore) {
				$statements = $citationForCurrentScore->getStatements();

				// Add the property values and scores of this citation
				// to the overall property lists
				foreach($statements as $propertyName => $value) {
					// Initialize sub-arrays if necessary
					if (!isset($valuesByPropertyName[$propertyName])) {
						$valuesByPropertyName[$propertyName] = array();
					}
					if (!isset($maxScoresByPropertyNameAndValue[$propertyName])) {
						$maxScoresByPropertyNameAndValue[$propertyName] = array();
					}

					// Values may be arrays or objects, so we use their serialized
					// form to be able to count and index them.
					$serializedValue = serialize($value);

					// Add the value for the given property, as we want to count
					// value frequencies later, we explicitly allow duplicates.
					$valuesByPropertyName[$propertyName][] = $serializedValue;

					// As we have ordered our citations descending by score, the
					// first score found for a value is also the maximum score.
					if (!isset($maxScoresByPropertyNameAndValue[$propertyName][$serializedValue])) {
						$maxScoresByPropertyNameAndValue[$propertyName][$serializedValue] = $currentScore;
					}
				}
			}
		}

		// Step 2: Find out the values that occur most frequently for each element
		//         and order these by score.

		foreach($valuesByPropertyName as $propertyName => $values) {
			// Count the occurrences of each value within the given element
			$valueFrequencies = array_count_values($values);

			// Order the most frequent values to the beginning of the array
			arsort($valueFrequencies);

			// Get the most frequent values (may be several if there are more than one
			// with the same frequency).
			$scoresOfMostFrequentValues = array();
			$previousValueFrequency = 0;
			foreach($valueFrequencies as $value => $valueFrequency) {
				// Only extract the most frequent values, jump out of the
				// loop when less frequent values start.
				if ($previousValueFrequency > $valueFrequency) break;
				$previousValueFrequency = $valueFrequency;

				$scoresOfMostFrequentValues[$value] =
						$maxScoresByPropertyNameAndValue[$propertyName][$value];
			}

			// Now we can order the most frequent values by score, starting
			// with the highest score.
			arsort($scoresOfMostFrequentValues);

			// Now get the first key which represents the value with the
			// highest frequency and the highest score.
			reset($scoresOfMostFrequentValues);
			$bestValue = unserialize(key($scoresOfMostFrequentValues));

			// Set the found "best" element value in the result citation.
			$statements = array($propertyName => $bestValue);
			$success = $targetDescription->setStatements($statements);
			assert($success);
		}

		// Calculate the average of all scores
		$overallScoreSum = 0;
		$overallScoreCount = 0;
		foreach ($scoredCitations as $currentScore => $citationsForCurrentScore) {
			$countCitationsForCurrentScore = count($citationsForCurrentScore);
			$overallScoreSum += $countCitationsForCurrentScore * $currentScore;
			$overallScoreCount += $countCitationsForCurrentScore;
		}
		// Without any scored citation there is nothing to average; avoid
		// a division by zero and fall back to the lowest possible score.
		$averageScore = ($overallScoreCount > 0 ? $overallScoreSum / $overallScoreCount : 0);

		// Get the max score (= the first key from scoredCitations
		// as these are sorted by score).
		reset($scoredCitations);
		$maxScore = key($scoredCitations);
		// key() returns null for an empty array.
		if (is_null($maxScore)) $maxScore = 0;

		// Calculate the overall parse score by weighing
		// the max score and the average score 50% each.
		// FIXME: This algorithm seems a bit arbitrary.
		$parseScore = ($maxScore + $averageScore) / 2;

		// Instantiate the target citation
		$targetCitation = new Citation();
		$targetCitation->injectMetadata($targetDescription);
		$targetCitation->setParseScore($parseScore);
		return $targetCitation;
	}
}
?>
Back to Top