PageRenderTime 165ms CodeModel.GetById 80ms app.highlight 16ms RepoModel.GetById 65ms app.codeStats 0ms

/lib/pkp/classes/citation/lookup/pubmed/PubmedNlmCitationSchemaFilter.inc.php

https://github.com/lib-uoguelph-ca/ocs
PHP | 360 lines | 201 code | 57 blank | 102 comment | 43 complexity | db351f12817ed9e2e8999ad406d9de9a MD5 | raw file
  1<?php
  2
  3/**
  4 * @defgroup citation_lookup_pubmed
  5 */
  6
  7/**
  8 * @file classes/citation/lookup/pubmed/PubmedNlmCitationSchemaFilter.inc.php
  9 *
 10 * Copyright (c) 2000-2012 John Willinsky
 11 * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
 12 *
 13 * @class PubmedNlmCitationSchemaFilter
 14 * @ingroup citation_lookup_pubmed
 15 *
 16 * @brief Filter that uses the Pubmed web
 17 *  service to identify a PMID and corresponding
 18 *  meta-data for a given NLM citation.
 19 */
 20
 21// $Id$
 22
 23import('citation.NlmCitationSchemaFilter');
 24
 25define('PUBMED_WEBSERVICE_ESEARCH', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi');
 26define('PUBMED_WEBSERVICE_EFETCH', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi');
 27define('PUBMED_WEBSERVICE_ELINK', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi');
 28
 29class PubmedNlmCitationSchemaFilter extends NlmCitationSchemaFilter {
 30	/** @var string */
 31	var $_email;
 32
 33	/**
 34	 * Constructor
 35	 * @param $email string FIXME: This could be PKP's technical
 36	 *  contact email as it is only used to report technical problems
 37	 *  with the query.
 38	 */
 39	function PubmedNlmCitationSchemaFilter($email = null) {
 40		assert(is_null($email) || is_string($email));
 41		$this->_email = $email;
 42
 43		parent::NlmCitationSchemaFilter(
 44			array(
 45				NLM_PUBLICATION_TYPE_JOURNAL,
 46				NLM_PUBLICATION_TYPE_CONFPROC
 47			)
 48		);
 49	}
 50
 51	//
 52	// Getters and Setters
 53	//
 54	/**
 55	 * Get the email
 56	 * @return string
 57	 */
 58	function getEmail() {
 59		return $this->_email;
 60	}
 61
 62	//
 63	// Implement template methods from Filter
 64	//
 65	/**
 66	 * @see Filter::process()
 67	 * @param $citationDescription MetadataDescription
 68	 * @return MetadataDescription
 69	 */
 70	function &process(&$citationDescription) {
 71		$pmid = $citationDescription->getStatement('pub-id[@pub-id-type="pmid"]');
 72
 73		// If the citation does not have a PMID, try to get one from eSearch
 74		// otherwise skip directly to eFetch.
 75		if (empty($pmid)) {
 76			// Initialize search result arrays.
 77			$pmidArrayFromAuthorsSearch = $pmidArrayFromTitleSearch = $pmidArrayFromStrictSearch = array();
 78
 79			// 1) Try a "loose" search based on the author list.
 80			//    (This works surprisingly well for pubmed.)
 81			$authors =& $citationDescription->getStatement('person-group[@person-group-type="author"]');
 82			import('metadata.nlm.NlmNameSchemaPersonStringFilter');
 83			$personNameFilter = new NlmNameSchemaPersonStringFilter(PERSON_STRING_FILTER_MULTIPLE, '%firstname%%initials%%prefix% %surname%%suffix%', ', ');
 84			$authorsString = (string)$personNameFilter->execute($authors);
 85			if (!empty($authorsString)) {
 86				$pmidArrayFromAuthorsSearch =& $this->_search($authorsString);
 87			}
 88
 89			// 2) Try a "loose" search based on the article title
 90			$articleTitle = (string)$citationDescription->getStatement('article-title');
 91			if (!empty($articleTitle)) {
 92				$pmidArrayFromTitleSearch =& $this->_search($articleTitle);
 93			}
 94
 95			// 3) Try a "strict" search based on as much information as possible
 96			$searchProperties = array(
 97				'article-title' => '',
 98				'person-group[@person-group-type="author"]' => '[Auth]',
 99				'source' => '[Jour]',
100				'date' => '[DP]',
101				'volume' => '[VI]',
102				'issue' => '[IP]',
103				'fpage' => '[PG]'
104			);
105			$searchTerms = '';
106			$statements = $citationDescription->getStatements();
107			foreach($searchProperties as $nlmProperty => $pubmedProperty) {
108				if (isset($statements[$nlmProperty])) {
109					if (!empty($searchTerms)) $searchTerms .= ' AND ';
110
111					// Special treatment for authors
112					if ($nlmProperty == 'person-group[@person-group-type="author"]') {
113						assert(isset($statements['person-group[@person-group-type="author"]'][0]));
114						$firstAuthor =& $statements['person-group[@person-group-type="author"]'][0];
115
116						// Add surname
117						$searchTerms .= (string)$firstAuthor->getStatement('surname');
118
119						// Add initial of the first given name
120						$givenNames = $firstAuthor->getStatement('given-names');
121						if (is_array($givenNames)) $searchTerms .= ' '.String::substr($givenNames[0], 0, 1);
122					} else {
123						$searchTerms .= $citationDescription->getStatement($nlmProperty);
124					}
125
126					$searchTerms .= $pubmedProperty;
127				}
128			}
129
130			$pmidArrayFromStrictSearch =& $this->_search($searchTerms);
131
132			// TODO: add another search like strict, but without article title
133			// e.g.  ...term=Baumgart+Dc[Auth]+AND+Lancet[Jour]+AND+2005[DP]+AND+366[VI]+AND+9492[IP]+AND+1210[PG]
134
135			// Compare the arrays to try to narrow it down to one PMID
136
137			switch (true) {
138				// strict search has a single result
139				case (count($pmidArrayFromStrictSearch) == 1):
140					$pmid = $pmidArrayFromStrictSearch[0];
141					break;
142
143				// 3-way union
144				case (count($intersect = array_intersect($pmidArrayFromTitleSearch, $pmidArrayFromAuthorsSearch, $pmidArrayFromStrictSearch)) == 1):
145					$pmid = current($intersect);
146					break;
147
148				// 2-way union: title / strict
149				case (count($pmid_2way1 = array_intersect($pmidArrayFromTitleSearch, $pmidArrayFromStrictSearch)) == 1):
150					$pmid = current($pmid_2way1);
151					break;
152
153				// 2-way union: authors / strict
154				case (count($pmid_2way2 = array_intersect($pmidArrayFromAuthorsSearch, $pmidArrayFromStrictSearch)) == 1):
155					$pmid = current($pmid_2way2);
156					break;
157
158				// 2-way union: authors / title
159				case (count($pmid_2way3 = array_intersect($pmidArrayFromAuthorsSearch, $pmidArrayFromTitleSearch)) == 1):
160					$pmid = current($pmid_2way3);
161					break;
162
163				// we only have one result for title
164				case (count($pmidArrayFromTitleSearch) == 1):
165					$pmid = $pmidArrayFromTitleSearch[0];
166					break;
167
168				// we only have one result for authors
169				case (count($pmidArrayFromAuthorsSearch) == 1):
170					$pmid = $pmidArrayFromAuthorsSearch[0];
171					break;
172
173				// we were unable to find a PMID
174				default:
175					$pmid = '';
176			}
177		}
178
179		// If we have a PMID, get a metadata array for it
180		if (!empty($pmid)) {
181			$citationDescription =& $this->_lookup($pmid, $citationDescription);
182			return $citationDescription;
183		}
184
185		// Nothing found
186		$nullVar = null;
187		return $nullVar;
188	}
189
190	//
191	// Private methods
192	//
193	/**
194	 * Searches the given search terms with the pubmed
195	 * eSearch and returns the found PMIDs as an array.
196	 * @param $searchTerms
197	 * @return array an array with PMIDs
198	 */
199	function &_search($searchTerms) {
200		$searchParams = array(
201			'db' => 'pubmed',
202			'tool' => 'pkp-wal',
203			'term' => $searchTerms
204		);
205		if (!is_null($this->getEmail())) $searchParams['email'] = $this->getEmail();
206
207		// Call the eSearch web service and get an XML result
208		if (is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_ESEARCH, $searchParams))) {
209			$emptyArray = array();
210			return $emptyArray;
211		}
212
213		// Loop through any results we have and add them to a PMID array
214		$pmidArray = array();
215		foreach ($resultDOM->getElementsByTagName('Id') as $idNode) {
216			$pmidArray[] = $idNode->textContent;
217		}
218
219		return $pmidArray;
220	}
221
222	/**
223	 * Fills the given citation object with
224	 * meta-data retrieved from PubMed.
225	 * @param $pmid string
226	 * @param $citationDescription MetadataDescription
227	 * @return MetadataDescription
228	 */
229	function &_lookup($pmid, &$citationDescription) {
230		$nullVar = null;
231
232		// Use eFetch to get XML metadata for the given PMID
233		$lookupParams = array(
234			'db' => 'pubmed',
235			'mode' => 'xml',
236			'tool' => 'pkp-wal',
237			'id' => $pmid
238		);
239		if (!is_null($this->getEmail())) $lookupParams['email'] = $this->getEmail();
240
241		// Call the eFetch URL and get an XML result
242		if (is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_EFETCH, $lookupParams))) return $nullVar;
243
244		$metadata = array(
245			'pub-id[@pub-id-type="pmid"]' => $pmid,
246			'article-title' => $resultDOM->getElementsByTagName("ArticleTitle")->item(0)->textContent,
247			'source' => $resultDOM->getElementsByTagName("MedlineTA")->item(0)->textContent,
248		);
249
250		if ($resultDOM->getElementsByTagName("Volume")->length > 0)
251			$metadata['volume'] = $resultDOM->getElementsByTagName("Volume")->item(0)->textContent;
252		if ($resultDOM->getElementsByTagName("Issue")->length > 0)
253			$metadata['issue'] = $resultDOM->getElementsByTagName("Issue")->item(0)->textContent;
254
255		// get list of author full names
256		$nlmNameSchema = new NlmNameSchema();
257		foreach ($resultDOM->getElementsByTagName("Author") as $authorNode) {
258			if (!isset($metadata['person-group[@person-group-type="author"]']))
259				$metadata['person-group[@person-group-type="author"]'] = array();
260
261			// Instantiate an NLM name description
262			$authorDescription = new MetadataDescription($nlmNameSchema, ASSOC_TYPE_AUTHOR);
263
264			// Surname
265			$authorDescription->addStatement('surname', $authorNode->getElementsByTagName("LastName")->item(0)->textContent);
266
267			// Given names
268			$givenNamesString = '';
269			if ($authorNode->getElementsByTagName("FirstName")->length > 0) {
270				$givenNamesString = $authorNode->getElementsByTagName("FirstName")->item(0)->textContent;
271			} elseif ($authorNode->getElementsByTagName("ForeName")->length > 0) {
272				$givenNamesString = $authorNode->getElementsByTagName("ForeName")->item(0)->textContent;
273			}
274			if (!empty($givenNamesString)) {
275				foreach(explode(' ', $givenNamesString) as $givenName) $authorDescription->addStatement('given-names', String::trimPunctuation($givenName));
276			}
277
278			// Suffix
279			if ($authorNode->getElementsByTagName("Suffix")->length > 0)
280				$authorDescription->addStatement('suffix', $authorNode->getElementsByTagName("Suffix")->item(0)->textContent);
281
282			// Include collective names
283			/*if ($resultDOM->getElementsByTagName("CollectiveName")->length > 0 && $authorNode->getElementsByTagName("CollectiveName")->item(0)->textContent != '') {
284				// FIXME: This corresponds to an NLM-citation <collab> tag and should be part of the Metadata implementation
285			}*/
286
287			$metadata['person-group[@person-group-type="author"]'][] =& $authorDescription;
288			unset($authorDescription);
289		}
290
291		// Extract pagination
292		if (String::regexp_match_get("/^[:p\.\s]*(?P<fpage>[Ee]?\d+)(-(?P<lpage>\d+))?/", $resultDOM->getElementsByTagName("MedlinePgn")->item(0)->textContent, $pages)) {
293			$fPage = (integer)$pages['fpage'];
294			$metadata['fpage'] = $fPage;
295			if (!empty($pages['lpage'])) {
296				$lPage = (integer)$pages['lpage'];
297
298				// Deal with shortcuts like '382-7'
299				if ($lPage < $fPage) {
300					$lPage = (integer)(String::substr($pages['fpage'], 0, -String::strlen($pages['lpage'])).$pages['lpage']);
301				}
302
303				$metadata['lpage'] = $lPage;
304			}
305		}
306
307		// Get publication date
308		// TODO: The publication date could be in multiple places
309		if ($resultDOM->getElementsByTagName("ArticleDate")->length > 0) {
310			$publicationDate = $resultDOM->getElementsByTagName("ArticleDate")->item(0)->getElementsByTagName("Year")->item(0)->textContent.
311			                   '-'.str_pad($resultDOM->getElementsByTagName("ArticleDate")->item(0)->getElementsByTagName("Month")->item(0)->textContent, 2, '0', STR_PAD_LEFT).
312			                   '-'.str_pad($resultDOM->getElementsByTagName("ArticleDate")->item(0)->getElementsByTagName("Day")->item(0)->textContent, 2, '0', STR_PAD_LEFT);
313			$metadata['date'] = $publicationDate;
314		}
315
316		// Get publication type
317		if ($resultDOM->getElementsByTagName("PublicationType")->length > 0) {
318			foreach($resultDOM->getElementsByTagName("PublicationType") as $publicationType) {
319				// The vast majority of items on PubMed are articles so catch these...
320				if (String::strpos(String::strtolower($publicationType->textContent), 'article') !== false) {
321					$metadata['[@publication-type]'] = NLM_PUBLICATION_TYPE_JOURNAL;
322					break;
323				}
324			}
325		}
326
327		// Get DOI if it exists
328		foreach ($resultDOM->getElementsByTagName("ArticleId") as $idNode) {
329			if ($idNode->getAttribute('IdType') == 'doi')
330				$metadata['pub-id[@pub-id-type="doi"]'] = $idNode->textContent;
331		}
332
333		// Use eLink utility to find fulltext links
334		$lookupParams = array(
335			'dbfrom' => 'pubmed',
336			'cmd' => 'llinks',
337			'tool' => 'pkp-wal',
338			'id' => $pmid
339		);
340		if(!is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_ELINK, $lookupParams))) {
341			// Get a list of possible links
342			foreach ($resultDOM->getElementsByTagName("ObjUrl") as $linkOut) {
343				$attributes = '';
344				foreach ($linkOut->getElementsByTagName("Attribute") as $attribute) $attributes .= String::strtolower($attribute->textContent).' / ';
345
346				// Only add links to open access resources
347				if (String::strpos($attributes, "subscription") === false && String::strpos($attributes, "membership") === false &&
348						String::strpos($attributes, "fee") === false && $attributes != "") {
349					$links[] = $linkOut->getElementsByTagName("Url")->item(0)->textContent;
350				}
351			}
352
353			// Take the first link if we have any left (presumably pubmed returns them in preferential order)
354			if (isset($links[0])) $metadata['uri'] = $links[0];
355		}
356
357		return $this->addMetadataArrayToNlmCitationDescription($metadata, $citationDescription);
358	}
359}
360?>