PubmedNlmCitationSchemaFilter.inc.php

/lib/pkp/classes/citation/lookup/pubmed/PubmedNlmCitationSchemaFilter.inc.php

https://github.com/lib-uoguelph-ca/ocs · PHP · 360 lines · 201 code · 57 blank · 102 comment · 43 complexity · db351f12817ed9e2e8999ad406d9de9a MD5 · raw file

<?php

/**
 * @defgroup citation_lookup_pubmed
 */

/**
 * @file classes/citation/lookup/pubmed/PubmedNlmCitationSchemaFilter.inc.php
 *
 * Copyright (c) 2000-2012 John Willinsky
 * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
 *
 * @class PubmedNlmCitationSchemaFilter
 * @ingroup citation_lookup_pubmed
 *
 * @brief Filter that uses the Pubmed web
 *  service to identify a PMID and corresponding
 *  meta-data for a given NLM citation.
 */

// $Id$

import('citation.NlmCitationSchemaFilter');

define('PUBMED_WEBSERVICE_ESEARCH', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi');
define('PUBMED_WEBSERVICE_EFETCH', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi');
define('PUBMED_WEBSERVICE_ELINK', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi');

class PubmedNlmCitationSchemaFilter extends NlmCitationSchemaFilter {
	/** @var string */
	var $_email;

	/**
	 * Constructor
	 * @param $email string FIXME: This could be PKP's technical
	 *  contact email as it is only used to report technical problems
	 *  with the query.
	 */
	function PubmedNlmCitationSchemaFilter($email = null) {
		assert(is_null($email) || is_string($email));
		$this->_email = $email;

		parent::NlmCitationSchemaFilter(
			array(
				NLM_PUBLICATION_TYPE_JOURNAL,
				NLM_PUBLICATION_TYPE_CONFPROC
			)
		);
	}

	//
	// Getters and Setters
	//
	/**
	 * Get the email
	 * @return string
	 */
	function getEmail() {
		return $this->_email;
	}

	//
	// Implement template methods from Filter
	//
	/**
	 * @see Filter::process()
	 * @param $citationDescription MetadataDescription
	 * @return MetadataDescription
	 */
	function &process(&$citationDescription) {
		$pmid = $citationDescription->getStatement('pub-id[@pub-id-type="pmid"]');

		// If the citation does not have a PMID, try to get one from eSearch
		// otherwise skip directly to eFetch.
		if (empty($pmid)) {
			// Initialize search result arrays.
			$pmidArrayFromAuthorsSearch = $pmidArrayFromTitleSearch = $pmidArrayFromStrictSearch = array();

			// 1) Try a "loose" search based on the author list.
			//    (This works surprisingly well for pubmed.)
			$authors =& $citationDescription->getStatement('person-group[@person-group-type="author"]');
			import('metadata.nlm.NlmNameSchemaPersonStringFilter');
			$personNameFilter = new NlmNameSchemaPersonStringFilter(PERSON_STRING_FILTER_MULTIPLE, '%firstname%%initials%%prefix% %surname%%suffix%', ', ');
			$authorsString = (string)$personNameFilter->execute($authors);
			if (!empty($authorsString)) {
				$pmidArrayFromAuthorsSearch =& $this->_search($authorsString);
			}

			// 2) Try a "loose" search based on the article title
			$articleTitle = (string)$citationDescription->getStatement('article-title');
			if (!empty($articleTitle)) {
				$pmidArrayFromTitleSearch =& $this->_search($articleTitle);
			}

			// 3) Try a "strict" search based on as much information as possible
			$searchProperties = array(
				'article-title' => '',
				'person-group[@person-group-type="author"]' => '[Auth]',
				'source' => '[Jour]',
				'date' => '[DP]',
				'volume' => '[VI]',
				'issue' => '[IP]',
				'fpage' => '[PG]'
			);
			$searchTerms = '';
			$statements = $citationDescription->getStatements();
			foreach($searchProperties as $nlmProperty => $pubmedProperty) {
				if (isset($statements[$nlmProperty])) {
					if (!empty($searchTerms)) $searchTerms .= ' AND ';

					// Special treatment for authors
					if ($nlmProperty == 'person-group[@person-group-type="author"]') {
						assert(isset($statements['person-group[@person-group-type="author"]'][0]));
						$firstAuthor =& $statements['person-group[@person-group-type="author"]'][0];

						// Add surname
						$searchTerms .= (string)$firstAuthor->getStatement('surname');

						// Add initial of the first given name
						$givenNames = $firstAuthor->getStatement('given-names');
						if (is_array($givenNames)) $searchTerms .= ' '.String::substr($givenNames[0], 0, 1);
					} else {
						$searchTerms .= $citationDescription->getStatement($nlmProperty);
					}

					$searchTerms .= $pubmedProperty;
				}
			}

			$pmidArrayFromStrictSearch =& $this->_search($searchTerms);

			// TODO: add another search like strict, but without article title
			// e.g.  ...term=Baumgart+Dc[Auth]+AND+Lancet[Jour]+AND+2005[DP]+AND+366[VI]+AND+9492[IP]+AND+1210[PG]

			// Compare the arrays to try to narrow it down to one PMID

			switch (true) {
				// strict search has a single result
				case (count($pmidArrayFromStrictSearch) == 1):
					$pmid = $pmidArrayFromStrictSearch[0];
					break;

				// 3-way union
				case (count($intersect = array_intersect($pmidArrayFromTitleSearch, $pmidArrayFromAuthorsSearch, $pmidArrayFromStrictSearch)) == 1):
					$pmid = current($intersect);
					break;

				// 2-way union: title / strict
				case (count($pmid_2way1 = array_intersect($pmidArrayFromTitleSearch, $pmidArrayFromStrictSearch)) == 1):
					$pmid = current($pmid_2way1);
					break;

				// 2-way union: authors / strict
				case (count($pmid_2way2 = array_intersect($pmidArrayFromAuthorsSearch, $pmidArrayFromStrictSearch)) == 1):
					$pmid = current($pmid_2way2);
					break;

				// 2-way union: authors / title
				case (count($pmid_2way3 = array_intersect($pmidArrayFromAuthorsSearch, $pmidArrayFromTitleSearch)) == 1):
					$pmid = current($pmid_2way3);
					break;

				// we only have one result for title
				case (count($pmidArrayFromTitleSearch) == 1):
					$pmid = $pmidArrayFromTitleSearch[0];
					break;

				// we only have one result for authors
				case (count($pmidArrayFromAuthorsSearch) == 1):
					$pmid = $pmidArrayFromAuthorsSearch[0];
					break;

				// we were unable to find a PMID
				default:
					$pmid = '';
			}
		}

		// If we have a PMID, get a metadata array for it
		if (!empty($pmid)) {
			$citationDescription =& $this->_lookup($pmid, $citationDescription);
			return $citationDescription;
		}

		// Nothing found
		$nullVar = null;
		return $nullVar;
	}

	//
	// Private methods
	//
	/**
	 * Searches the given search terms with the pubmed
	 * eSearch and returns the found PMIDs as an array.
	 * @param $searchTerms
	 * @return array an array with PMIDs
	 */
	function &_search($searchTerms) {
		$searchParams = array(
			'db' => 'pubmed',
			'tool' => 'pkp-wal',
			'term' => $searchTerms
		);
		if (!is_null($this->getEmail())) $searchParams['email'] = $this->getEmail();

		// Call the eSearch web service and get an XML result
		if (is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_ESEARCH, $searchParams))) {
			$emptyArray = array();
			return $emptyArray;
		}

		// Loop through any results we have and add them to a PMID array
		$pmidArray = array();
		foreach ($resultDOM->getElementsByTagName('Id') as $idNode) {
			$pmidArray[] = $idNode->textContent;
		}

		return $pmidArray;
	}

	/**
	 * Fills the given citation object with
	 * meta-data retrieved from PubMed.
	 * @param $pmid string
	 * @param $citationDescription MetadataDescription
	 * @return MetadataDescription
	 */
	function &_lookup($pmid, &$citationDescription) {
		$nullVar = null;

		// Use eFetch to get XML metadata for the given PMID
		$lookupParams = array(
			'db' => 'pubmed',
			'mode' => 'xml',
			'tool' => 'pkp-wal',
			'id' => $pmid
		);
		if (!is_null($this->getEmail())) $lookupParams['email'] = $this->getEmail();

		// Call the eFetch URL and get an XML result
		if (is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_EFETCH, $lookupParams))) return $nullVar;

		$metadata = array(
			'pub-id[@pub-id-type="pmid"]' => $pmid,
			'article-title' => $resultDOM->getElementsByTagName("ArticleTitle")->item(0)->textContent,
			'source' => $resultDOM->getElementsByTagName("MedlineTA")->item(0)->textContent,
		);

		if ($resultDOM->getElementsByTagName("Volume")->length > 0)
			$metadata['volume'] = $resultDOM->getElementsByTagName("Volume")->item(0)->textContent;
		if ($resultDOM->getElementsByTagName("Issue")->length > 0)
			$metadata['issue'] = $resultDOM->getElementsByTagName("Issue")->item(0)->textContent;

		// get list of author full names
		$nlmNameSchema = new NlmNameSchema();
		foreach ($resultDOM->getElementsByTagName("Author") as $authorNode) {
			if (!isset($metadata['person-group[@person-group-type="author"]']))
				$metadata['person-group[@person-group-type="author"]'] = array();

			// Instantiate an NLM name description
			$authorDescription = new MetadataDescription($nlmNameSchema, ASSOC_TYPE_AUTHOR);

			// Surname
			$authorDescription->addStatement('surname', $authorNode->getElementsByTagName("LastName")->item(0)->textContent);

			// Given names
			$givenNamesString = '';
			if ($authorNode->getElementsByTagName("FirstName")->length > 0) {
				$givenNamesString = $authorNode->getElementsByTagName("FirstName")->item(0)->textContent;
			} elseif ($authorNode->getElementsByTagName("ForeName")->length > 0) {
				$givenNamesString = $authorNode->getElementsByTagName("ForeName")->item(0)->textContent;
			}
			if (!empty($givenNamesString)) {
				foreach(explode(' ', $givenNamesString) as $givenName) $authorDescription->addStatement('given-names', String::trimPunctuation($givenName));
			}

			// Suffix
			if ($authorNode->getElementsByTagName("Suffix")->length > 0)
				$authorDescription->addStatement('suffix', $authorNode->getElementsByTagName("Suffix")->item(0)->textContent);

			// Include collective names
			/*if ($resultDOM->getElementsByTagName("CollectiveName")->length > 0 && $authorNode->getElementsByTagName("CollectiveName")->item(0)->textContent != '') {
				// FIXME: This corresponds to an NLM-citation <collab> tag and should be part of the Metadata implementation
			}*/

			$metadata['person-group[@person-group-type="author"]'][] =& $authorDescription;
			unset($authorDescription);
		}

		// Extract pagination
		if (String::regexp_match_get("/^[:p\.\s]*(?P<fpage>[Ee]?\d+)(-(?P<lpage>\d+))?/", $resultDOM->getElementsByTagName("MedlinePgn")->item(0)->textContent, $pages)) {
			$fPage = (integer)$pages['fpage'];
			$metadata['fpage'] = $fPage;
			if (!empty($pages['lpage'])) {
				$lPage = (integer)$pages['lpage'];

				// Deal with shortcuts like '382-7'
				if ($lPage < $fPage) {
					$lPage = (integer)(String::substr($pages['fpage'], 0, -String::strlen($pages['lpage'])).$pages['lpage']);
				}

				$metadata['lpage'] = $lPage;
			}
		}

		// Get publication date
		// TODO: The publication date could be in multiple places
		if ($resultDOM->getElementsByTagName("ArticleDate")->length > 0) {
			$publicationDate = $resultDOM->getElementsByTagName("ArticleDate")->item(0)->getElementsByTagName("Year")->item(0)->textContent.
			                   '-'.str_pad($resultDOM->getElementsByTagName("ArticleDate")->item(0)->getElementsByTagName("Month")->item(0)->textContent, 2, '0', STR_PAD_LEFT).
			                   '-'.str_pad($resultDOM->getElementsByTagName("ArticleDate")->item(0)->getElementsByTagName("Day")->item(0)->textContent, 2, '0', STR_PAD_LEFT);
			$metadata['date'] = $publicationDate;
		}

		// Get publication type
		if ($resultDOM->getElementsByTagName("PublicationType")->length > 0) {
			foreach($resultDOM->getElementsByTagName("PublicationType") as $publicationType) {
				// The vast majority of items on PubMed are articles so catch these...
				if (String::strpos(String::strtolower($publicationType->textContent), 'article') !== false) {
					$metadata['[@publication-type]'] = NLM_PUBLICATION_TYPE_JOURNAL;
					break;
				}
			}
		}

		// Get DOI if it exists
		foreach ($resultDOM->getElementsByTagName("ArticleId") as $idNode) {
			if ($idNode->getAttribute('IdType') == 'doi')
				$metadata['pub-id[@pub-id-type="doi"]'] = $idNode->textContent;
		}

		// Use eLink utility to find fulltext links
		$lookupParams = array(
			'dbfrom' => 'pubmed',
			'cmd' => 'llinks',
			'tool' => 'pkp-wal',
			'id' => $pmid
		);
		if(!is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_ELINK, $lookupParams))) {
			// Get a list of possible links
			foreach ($resultDOM->getElementsByTagName("ObjUrl") as $linkOut) {
				$attributes = '';
				foreach ($linkOut->getElementsByTagName("Attribute") as $attribute) $attributes .= String::strtolower($attribute->textContent).' / ';

				// Only add links to open access resources
				if (String::strpos($attributes, "subscription") === false && String::strpos($attributes, "membership") === false &&
						String::strpos($attributes, "fee") === false && $attributes != "") {
					$links[] = $linkOut->getElementsByTagName("Url")->item(0)->textContent;
				}
			}

			// Take the first link if we have any left (presumably pubmed returns them in preferential order)
			if (isset($links[0])) $metadata['uri'] = $links[0];
		}

		return $this->addMetadataArrayToNlmCitationDescription($metadata, $citationDescription);
	}
}
?>
Alerts (3)

'var' Legacy var keyword detected; use public/private/protected for class properties
31
Complexity hotspot; lines 347 to 348 (total complexity: 4)
347 348