PageRenderTime 53ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/pkp/classes/citation/lookup/pubmed/PubmedNlmCitationSchemaFilter.inc.php

https://github.com/lib-uoguelph-ca/ocs
PHP | 360 lines | 201 code | 57 blank | 102 comment | 43 complexity | db351f12817ed9e2e8999ad406d9de9a MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /**
  3. * @defgroup citation_lookup_pubmed
  4. */
  5. /**
  6. * @file classes/citation/lookup/pubmed/PubmedNlmCitationSchemaFilter.inc.php
  7. *
  8. * Copyright (c) 2000-2012 John Willinsky
  9. * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
  10. *
  11. * @class PubmedNlmCitationSchemaFilter
  12. * @ingroup citation_lookup_pubmed
  13. *
  14. * @brief Filter that uses the Pubmed web
  15. * service to identify a PMID and corresponding
  16. * meta-data for a given NLM citation.
  17. */
  18. // $Id$
  19. import('citation.NlmCitationSchemaFilter');
  20. define('PUBMED_WEBSERVICE_ESEARCH', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi');
  21. define('PUBMED_WEBSERVICE_EFETCH', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi');
  22. define('PUBMED_WEBSERVICE_ELINK', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi');
  23. class PubmedNlmCitationSchemaFilter extends NlmCitationSchemaFilter {
  24. /** @var string */
  25. var $_email;
  26. /**
  27. * Constructor
  28. * @param $email string FIXME: This could be PKP's technical
  29. * contact email as it is only used to report technical problems
  30. * with the query.
  31. */
  32. function PubmedNlmCitationSchemaFilter($email = null) {
  33. assert(is_null($email) || is_string($email));
  34. $this->_email = $email;
  35. parent::NlmCitationSchemaFilter(
  36. array(
  37. NLM_PUBLICATION_TYPE_JOURNAL,
  38. NLM_PUBLICATION_TYPE_CONFPROC
  39. )
  40. );
  41. }
  42. //
  43. // Getters and Setters
  44. //
  45. /**
  46. * Get the email
  47. * @return string
  48. */
  49. function getEmail() {
  50. return $this->_email;
  51. }
  52. //
  53. // Implement template methods from Filter
  54. //
  55. /**
  56. * @see Filter::process()
  57. * @param $citationDescription MetadataDescription
  58. * @return MetadataDescription
  59. */
  60. function &process(&$citationDescription) {
  61. $pmid = $citationDescription->getStatement('pub-id[@pub-id-type="pmid"]');
  62. // If the citation does not have a PMID, try to get one from eSearch
  63. // otherwise skip directly to eFetch.
  64. if (empty($pmid)) {
  65. // Initialize search result arrays.
  66. $pmidArrayFromAuthorsSearch = $pmidArrayFromTitleSearch = $pmidArrayFromStrictSearch = array();
  67. // 1) Try a "loose" search based on the author list.
  68. // (This works surprisingly well for pubmed.)
  69. $authors =& $citationDescription->getStatement('person-group[@person-group-type="author"]');
  70. import('metadata.nlm.NlmNameSchemaPersonStringFilter');
  71. $personNameFilter = new NlmNameSchemaPersonStringFilter(PERSON_STRING_FILTER_MULTIPLE, '%firstname%%initials%%prefix% %surname%%suffix%', ', ');
  72. $authorsString = (string)$personNameFilter->execute($authors);
  73. if (!empty($authorsString)) {
  74. $pmidArrayFromAuthorsSearch =& $this->_search($authorsString);
  75. }
  76. // 2) Try a "loose" search based on the article title
  77. $articleTitle = (string)$citationDescription->getStatement('article-title');
  78. if (!empty($articleTitle)) {
  79. $pmidArrayFromTitleSearch =& $this->_search($articleTitle);
  80. }
  81. // 3) Try a "strict" search based on as much information as possible
  82. $searchProperties = array(
  83. 'article-title' => '',
  84. 'person-group[@person-group-type="author"]' => '[Auth]',
  85. 'source' => '[Jour]',
  86. 'date' => '[DP]',
  87. 'volume' => '[VI]',
  88. 'issue' => '[IP]',
  89. 'fpage' => '[PG]'
  90. );
  91. $searchTerms = '';
  92. $statements = $citationDescription->getStatements();
  93. foreach($searchProperties as $nlmProperty => $pubmedProperty) {
  94. if (isset($statements[$nlmProperty])) {
  95. if (!empty($searchTerms)) $searchTerms .= ' AND ';
  96. // Special treatment for authors
  97. if ($nlmProperty == 'person-group[@person-group-type="author"]') {
  98. assert(isset($statements['person-group[@person-group-type="author"]'][0]));
  99. $firstAuthor =& $statements['person-group[@person-group-type="author"]'][0];
  100. // Add surname
  101. $searchTerms .= (string)$firstAuthor->getStatement('surname');
  102. // Add initial of the first given name
  103. $givenNames = $firstAuthor->getStatement('given-names');
  104. if (is_array($givenNames)) $searchTerms .= ' '.String::substr($givenNames[0], 0, 1);
  105. } else {
  106. $searchTerms .= $citationDescription->getStatement($nlmProperty);
  107. }
  108. $searchTerms .= $pubmedProperty;
  109. }
  110. }
  111. $pmidArrayFromStrictSearch =& $this->_search($searchTerms);
  112. // TODO: add another search like strict, but without article title
  113. // e.g. ...term=Baumgart+Dc[Auth]+AND+Lancet[Jour]+AND+2005[DP]+AND+366[VI]+AND+9492[IP]+AND+1210[PG]
  114. // Compare the arrays to try to narrow it down to one PMID
  115. switch (true) {
  116. // strict search has a single result
  117. case (count($pmidArrayFromStrictSearch) == 1):
  118. $pmid = $pmidArrayFromStrictSearch[0];
  119. break;
  120. // 3-way union
  121. case (count($intersect = array_intersect($pmidArrayFromTitleSearch, $pmidArrayFromAuthorsSearch, $pmidArrayFromStrictSearch)) == 1):
  122. $pmid = current($intersect);
  123. break;
  124. // 2-way union: title / strict
  125. case (count($pmid_2way1 = array_intersect($pmidArrayFromTitleSearch, $pmidArrayFromStrictSearch)) == 1):
  126. $pmid = current($pmid_2way1);
  127. break;
  128. // 2-way union: authors / strict
  129. case (count($pmid_2way2 = array_intersect($pmidArrayFromAuthorsSearch, $pmidArrayFromStrictSearch)) == 1):
  130. $pmid = current($pmid_2way2);
  131. break;
  132. // 2-way union: authors / title
  133. case (count($pmid_2way3 = array_intersect($pmidArrayFromAuthorsSearch, $pmidArrayFromTitleSearch)) == 1):
  134. $pmid = current($pmid_2way3);
  135. break;
  136. // we only have one result for title
  137. case (count($pmidArrayFromTitleSearch) == 1):
  138. $pmid = $pmidArrayFromTitleSearch[0];
  139. break;
  140. // we only have one result for authors
  141. case (count($pmidArrayFromAuthorsSearch) == 1):
  142. $pmid = $pmidArrayFromAuthorsSearch[0];
  143. break;
  144. // we were unable to find a PMID
  145. default:
  146. $pmid = '';
  147. }
  148. }
  149. // If we have a PMID, get a metadata array for it
  150. if (!empty($pmid)) {
  151. $citationDescription =& $this->_lookup($pmid, $citationDescription);
  152. return $citationDescription;
  153. }
  154. // Nothing found
  155. $nullVar = null;
  156. return $nullVar;
  157. }
  158. //
  159. // Private methods
  160. //
  161. /**
  162. * Searches the given search terms with the pubmed
  163. * eSearch and returns the found PMIDs as an array.
  164. * @param $searchTerms
  165. * @return array an array with PMIDs
  166. */
  167. function &_search($searchTerms) {
  168. $searchParams = array(
  169. 'db' => 'pubmed',
  170. 'tool' => 'pkp-wal',
  171. 'term' => $searchTerms
  172. );
  173. if (!is_null($this->getEmail())) $searchParams['email'] = $this->getEmail();
  174. // Call the eSearch web service and get an XML result
  175. if (is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_ESEARCH, $searchParams))) {
  176. $emptyArray = array();
  177. return $emptyArray;
  178. }
  179. // Loop through any results we have and add them to a PMID array
  180. $pmidArray = array();
  181. foreach ($resultDOM->getElementsByTagName('Id') as $idNode) {
  182. $pmidArray[] = $idNode->textContent;
  183. }
  184. return $pmidArray;
  185. }
  186. /**
  187. * Fills the given citation object with
  188. * meta-data retrieved from PubMed.
  189. * @param $pmid string
  190. * @param $citationDescription MetadataDescription
  191. * @return MetadataDescription
  192. */
  193. function &_lookup($pmid, &$citationDescription) {
  194. $nullVar = null;
  195. // Use eFetch to get XML metadata for the given PMID
  196. $lookupParams = array(
  197. 'db' => 'pubmed',
  198. 'mode' => 'xml',
  199. 'tool' => 'pkp-wal',
  200. 'id' => $pmid
  201. );
  202. if (!is_null($this->getEmail())) $lookupParams['email'] = $this->getEmail();
  203. // Call the eFetch URL and get an XML result
  204. if (is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_EFETCH, $lookupParams))) return $nullVar;
  205. $metadata = array(
  206. 'pub-id[@pub-id-type="pmid"]' => $pmid,
  207. 'article-title' => $resultDOM->getElementsByTagName("ArticleTitle")->item(0)->textContent,
  208. 'source' => $resultDOM->getElementsByTagName("MedlineTA")->item(0)->textContent,
  209. );
  210. if ($resultDOM->getElementsByTagName("Volume")->length > 0)
  211. $metadata['volume'] = $resultDOM->getElementsByTagName("Volume")->item(0)->textContent;
  212. if ($resultDOM->getElementsByTagName("Issue")->length > 0)
  213. $metadata['issue'] = $resultDOM->getElementsByTagName("Issue")->item(0)->textContent;
  214. // get list of author full names
  215. $nlmNameSchema = new NlmNameSchema();
  216. foreach ($resultDOM->getElementsByTagName("Author") as $authorNode) {
  217. if (!isset($metadata['person-group[@person-group-type="author"]']))
  218. $metadata['person-group[@person-group-type="author"]'] = array();
  219. // Instantiate an NLM name description
  220. $authorDescription = new MetadataDescription($nlmNameSchema, ASSOC_TYPE_AUTHOR);
  221. // Surname
  222. $authorDescription->addStatement('surname', $authorNode->getElementsByTagName("LastName")->item(0)->textContent);
  223. // Given names
  224. $givenNamesString = '';
  225. if ($authorNode->getElementsByTagName("FirstName")->length > 0) {
  226. $givenNamesString = $authorNode->getElementsByTagName("FirstName")->item(0)->textContent;
  227. } elseif ($authorNode->getElementsByTagName("ForeName")->length > 0) {
  228. $givenNamesString = $authorNode->getElementsByTagName("ForeName")->item(0)->textContent;
  229. }
  230. if (!empty($givenNamesString)) {
  231. foreach(explode(' ', $givenNamesString) as $givenName) $authorDescription->addStatement('given-names', String::trimPunctuation($givenName));
  232. }
  233. // Suffix
  234. if ($authorNode->getElementsByTagName("Suffix")->length > 0)
  235. $authorDescription->addStatement('suffix', $authorNode->getElementsByTagName("Suffix")->item(0)->textContent);
  236. // Include collective names
  237. /*if ($resultDOM->getElementsByTagName("CollectiveName")->length > 0 && $authorNode->getElementsByTagName("CollectiveName")->item(0)->textContent != '') {
  238. // FIXME: This corresponds to an NLM-citation <collab> tag and should be part of the Metadata implementation
  239. }*/
  240. $metadata['person-group[@person-group-type="author"]'][] =& $authorDescription;
  241. unset($authorDescription);
  242. }
  243. // Extract pagination
  244. if (String::regexp_match_get("/^[:p\.\s]*(?P<fpage>[Ee]?\d+)(-(?P<lpage>\d+))?/", $resultDOM->getElementsByTagName("MedlinePgn")->item(0)->textContent, $pages)) {
  245. $fPage = (integer)$pages['fpage'];
  246. $metadata['fpage'] = $fPage;
  247. if (!empty($pages['lpage'])) {
  248. $lPage = (integer)$pages['lpage'];
  249. // Deal with shortcuts like '382-7'
  250. if ($lPage < $fPage) {
  251. $lPage = (integer)(String::substr($pages['fpage'], 0, -String::strlen($pages['lpage'])).$pages['lpage']);
  252. }
  253. $metadata['lpage'] = $lPage;
  254. }
  255. }
  256. // Get publication date
  257. // TODO: The publication date could be in multiple places
  258. if ($resultDOM->getElementsByTagName("ArticleDate")->length > 0) {
  259. $publicationDate = $resultDOM->getElementsByTagName("ArticleDate")->item(0)->getElementsByTagName("Year")->item(0)->textContent.
  260. '-'.str_pad($resultDOM->getElementsByTagName("ArticleDate")->item(0)->getElementsByTagName("Month")->item(0)->textContent, 2, '0', STR_PAD_LEFT).
  261. '-'.str_pad($resultDOM->getElementsByTagName("ArticleDate")->item(0)->getElementsByTagName("Day")->item(0)->textContent, 2, '0', STR_PAD_LEFT);
  262. $metadata['date'] = $publicationDate;
  263. }
  264. // Get publication type
  265. if ($resultDOM->getElementsByTagName("PublicationType")->length > 0) {
  266. foreach($resultDOM->getElementsByTagName("PublicationType") as $publicationType) {
  267. // The vast majority of items on PubMed are articles so catch these...
  268. if (String::strpos(String::strtolower($publicationType->textContent), 'article') !== false) {
  269. $metadata['[@publication-type]'] = NLM_PUBLICATION_TYPE_JOURNAL;
  270. break;
  271. }
  272. }
  273. }
  274. // Get DOI if it exists
  275. foreach ($resultDOM->getElementsByTagName("ArticleId") as $idNode) {
  276. if ($idNode->getAttribute('IdType') == 'doi')
  277. $metadata['pub-id[@pub-id-type="doi"]'] = $idNode->textContent;
  278. }
  279. // Use eLink utility to find fulltext links
  280. $lookupParams = array(
  281. 'dbfrom' => 'pubmed',
  282. 'cmd' => 'llinks',
  283. 'tool' => 'pkp-wal',
  284. 'id' => $pmid
  285. );
  286. if(!is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_ELINK, $lookupParams))) {
  287. // Get a list of possible links
  288. foreach ($resultDOM->getElementsByTagName("ObjUrl") as $linkOut) {
  289. $attributes = '';
  290. foreach ($linkOut->getElementsByTagName("Attribute") as $attribute) $attributes .= String::strtolower($attribute->textContent).' / ';
  291. // Only add links to open access resources
  292. if (String::strpos($attributes, "subscription") === false && String::strpos($attributes, "membership") === false &&
  293. String::strpos($attributes, "fee") === false && $attributes != "") {
  294. $links[] = $linkOut->getElementsByTagName("Url")->item(0)->textContent;
  295. }
  296. }
  297. // Take the first link if we have any left (presumably pubmed returns them in preferential order)
  298. if (isset($links[0])) $metadata['uri'] = $links[0];
  299. }
  300. return $this->addMetadataArrayToNlmCitationDescription($metadata, $citationDescription);
  301. }
  302. }
  303. ?>