PageRenderTime 41ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/pkp/classes/metadata/nlm/PersonStringNlmNameSchemaFilter.inc.php

https://github.com/lib-uoguelph-ca/ocs
PHP | 301 lines | 137 code | 47 blank | 117 comment | 19 complexity | 4896e86f7a2f14f454174319dbff9668 MD5 | raw file
Possible License(s): GPL-2.0
  1. <?php
  2. /**
  3. * @file classes/metadata/nlm/PersonStringNlmNameSchemaFilter.inc.php
  4. *
  5. * Copyright (c) 2000-2012 John Willinsky
  6. * Distributed under the GNU GPL v2. For full terms see the file docs/COPYING.
  7. *
  8. * @class PersonStringNlmNameSchemaFilter
  9. * @ingroup metadata_nlm
  10. * @see NlmNameSchema
  11. *
  12. * @brief Filter that converts from a string
  13. * to an (array of) NLM name description(s).
  14. */
  15. // $Id$
  16. import('metadata.nlm.NlmPersonStringFilter');
  17. class PersonStringNlmNameSchemaFilter extends NlmPersonStringFilter {
  18. /** @var integer */
  19. var $_assocType;
  20. /** @var boolean */
  21. var $_filterTitle;
  22. /** @var boolean */
  23. var $_filterDegrees;
  24. /**
  25. * Constructor
  26. */
  27. function PersonStringNlmNameSchemaFilter($assocType, $filterMode = PERSON_STRING_FILTER_SINGLE, $filterTitle = false, $filterDegrees = false) {
  28. assert(in_array($assocType, array(ASSOC_TYPE_AUTHOR, ASSOC_TYPE_EDITOR)));
  29. $this->_assocType = $assocType;
  30. $this->_filterTitle = $filterTitle;
  31. $this->_filterDegrees = $filterDegrees;
  32. parent::NlmPersonStringFilter($filterMode);
  33. }
  34. //
  35. // Setters and Getters
  36. //
  37. /**
  38. * Get the association type
  39. * @return integer
  40. */
  41. function &getAssocType() {
  42. return $this->_assocType;
  43. }
  44. /**
  45. * Do we parse for a title?
  46. * @return boolean
  47. */
  48. function getFilterTitle() {
  49. return $this->_filterTitle;
  50. }
  51. /**
  52. * Set whether we parse for a title
  53. * @param $filterTitle boolean
  54. */
  55. function setFilterTitle($filterTitle) {
  56. $this->_filterTitle = (boolean)$filterTitle;
  57. }
  58. /**
  59. * Do we parse for degrees?
  60. * @return boolean
  61. */
  62. function getFilterDegrees() {
  63. return $this->_filterDegrees;
  64. }
  65. /**
  66. * Set whether we parse for degrees
  67. * @param $filterDegrees boolean
  68. */
  69. function setFilterDegrees($filterDegrees) {
  70. $this->_filterDegrees = (boolean)$filterDegrees;
  71. }
  72. //
  73. // Implement template methods from Filter
  74. //
  75. /**
  76. * @see Filter::supports()
  77. * @param $input mixed
  78. * @param $output mixed
  79. * @return boolean
  80. */
  81. function supports(&$input, &$output) {
  82. // Check input type
  83. if (!is_string($input)) return false;
  84. // Check output type
  85. if (is_null($output)) return true;
  86. return $this->isValidPersonDescription($output);
  87. }
  88. /**
  89. * Transform a person string to an (array of) NLM name description(s).
  90. * @see Filter::process()
  91. * @param $input string
  92. * @return mixed Either a MetadataDescription or an array of MetadataDescriptions
  93. */
  94. function &process(&$input) {
  95. switch ($this->getFilterMode()) {
  96. case PERSON_STRING_FILTER_MULTIPLE:
  97. return $this->_parsePersonsString($input, $this->_filterTitle, $this->_filterDegrees);
  98. case PERSON_STRING_FILTER_SINGLE:
  99. return $this->_parsePersonString($input, $this->_filterTitle, $this->_filterDegrees);
  100. default:
  101. assert(false);
  102. }
  103. }
  104. //
  105. // Private helper methods
  106. //
  107. /**
  108. * Converts a string with multiple persons
  109. * to an array of NLM name descriptions.
  110. *
  111. * @param $personsString string
  112. * @param $title boolean true to parse for title
  113. * @param $degrees boolean true to parse for degrees
  114. * @return array an array of NLM name descriptions or null
  115. * if the string could not be converted
  116. */
  117. function &_parsePersonsString($personsString, $title, $degrees) {
  118. // Remove "et al"
  119. $personsString = String::regexp_replace('/et ?al$/', '', $personsString);
  120. // Remove punctuation
  121. $personsString = trim($personsString, ':;,');
  122. // Cut the authors string into pieces
  123. $personStrings = String::iterativeExplode(array(':', ';'), $personsString);
  124. // Only try to cut by comma if the pieces contain more
  125. // than one word to avoid splitting between last name and
  126. // first name.
  127. if (count($personStrings) == 1) {
  128. if (String::regexp_match('/^((\w+\s+)+\w+\s*,)+\s*((\w+\s+)+\w+)$/i', $personStrings[0])) {
  129. $personStrings = explode(',', $personStrings[0]);
  130. }
  131. }
  132. // Parse persons
  133. $persons = array();
  134. foreach ($personStrings as $personString) {
  135. $persons[] =& $this->_parsePersonString($personString, $title, $degrees);
  136. }
  137. return $persons;
  138. }
  139. /**
  140. * Converts a string with a single person
  141. * to an NLM name description.
  142. *
  143. * TODO: add initials from all given names to initials
  144. * element
  145. *
  146. * @param $personString string
  147. * @param $title boolean true to parse for title
  148. * @param $degrees boolean true to parse for degrees
  149. * @return MetadataDescription an NLM name description or null
  150. * if the string could not be converted
  151. */
  152. function &_parsePersonString($personString, $title, $degrees) {
  153. // Expressions to parse person strings, ported from CiteULike person
  154. // plugin, see http://svn.citeulike.org/svn/plugins/person.tcl
  155. static $personRegex = array(
  156. 'title' => '(?:His (?:Excellency|Honou?r)\s+|Her (?:Excellency|Honou?r)\s+|The Right Honou?rable\s+|The Honou?rable\s+|Right Honou?rable\s+|The Rt\.? Hon\.?\s+|The Hon\.?\s+|Rt\.? Hon\.?\s+|Mr\.?\s+|Ms\.?\s+|M\/s\.?\s+|Mrs\.?\s+|Miss\.?\s+|Dr\.?\s+|Sir\s+|Dame\s+|Prof\.?\s+|Professor\s+|Doctor\s+|Mister\s+|Mme\.?\s+|Mast(?:\.|er)?\s+|Lord\s+|Lady\s+|Madam(?:e)?\s+|Priv\.-Doz\.\s+)+',
  157. 'degrees' => '(,\s+(?:[A-Z\.]+))+',
  158. 'initials' => '(?:(?:[A-Z]\.){1,4})|(?:(?:[A-Z]\.\s){1,3}[A-Z])|(?:[A-Z]{1,4})|(?:(?:[A-Z]\.-?){1,4})|(?:(?:[A-Z]\.-?){1,3}[A-Z])|(?:(?:[A-Z]-){1,3}[A-Z])|(?:(?:[A-Z]\s){1,3}[A-Z])|(?:(?:[A-Z] ){1,3}[A-Z]\.)|(?:[A-Z]-(?:[A-Z]\.){1,3})',
  159. 'prefix' => 'Dell(?:[a|e])?(?:\s|$)|Dalle(?:\s|$)|D[a|e]ll\'(?:\s|$)|Dela(?:\s|$)|Del(?:\s|$)|[Dd]e(?:\s|$)(?:La(?:\s|$)|Los(?:\s|$))?|[Dd]e(?:\s|$)|[Dd][a|i|u](?:\s|$)|L[a|e|o](?:\s|$)|[D|L|O]\'|St\.?(?:\s|$)|San(?:\s|$)|[Dd]en(?:\s|$)|[Vv]on(?:\s|$)(?:[Dd]er(?:\s|$))?|(?:[Ll][ea](?:\s|$))?[Vv]an(?:\s|$)(?:[Dd]e(?:n|r)?(?:\s|$))?',
  160. 'givenName' => '(?:[^ \t\n\r\f\v,.;()]{2,}|[^ \t\n\r\f\v,.;()]{2,}\-[^ \t\n\r\f\v,.;()]{2,})'
  161. );
  162. // The expressions for given name, suffix and surname are the same
  163. $personRegex['surname'] = $personRegex['suffix'] = $personRegex['givenName'];
  164. // Shortcut for prefixed surname
  165. $personRegexPrefixedSurname = "(?P<prefix>(?:".$personRegex['prefix'].")?)(?P<surname>".$personRegex['surname'].")";
  166. // Instantiate the target person description
  167. $metadataSchema = new NlmNameSchema();
  168. $personDescription = new MetadataDescription($metadataSchema, $this->_assocType);
  169. // Clean the person string
  170. $personString = trim($personString);
  171. // 1. Extract title and degree from the person string and use this as suffix
  172. $suffixString = '';
  173. $results = array();
  174. if ($title && String::regexp_match_get('/^('.$personRegex['title'].')/i', $personString, $results)) {
  175. $suffixString = trim($results[1], ',:; ');
  176. $personString = String::regexp_replace('/^('.$personRegex['title'].')/i', '', $personString);
  177. }
  178. if ($degrees && String::regexp_match_get('/('.$personRegex['degrees'].')$/i', $personString, $results)) {
  179. $degreesArray = explode(',', trim($results[1], ','));
  180. foreach($degreesArray as $key => $degree) {
  181. $degreesArray[$key] = String::trimPunctuation($degree);
  182. }
  183. $suffixString .= ' - '.implode('; ', $degreesArray);
  184. $personString = String::regexp_replace('/('.$personRegex['degrees'].')$/i', '', $personString);
  185. }
  186. if (!empty($suffixString)) $personDescription->addStatement('suffix', $suffixString);
  187. // Space initials when followed by a given name or last name.
  188. $personString = String::regexp_replace('/([A-Z])\.([A-Z][a-z])/', '\1. \2', $personString);
  189. // 2. Extract names and initials from the person string
  190. // The parser expressions are ordered by specificity. The most specific expressions
  191. // come first. Only if these specific expressions don't work will we turn to less
  192. // specific ones. This avoids parsing errors. It also explains why we don't use the
  193. // ?-quantifier for optional elements like initials or middle name where they could
  194. // be misinterpreted.
  195. $personExpressions = array(
  196. // All upper surname
  197. '/^'.$personRegexPrefixedSurname.'$/i',
  198. // Several permutations of name elements, ordered by specificity
  199. '/^(?P<initials>'.$personRegex['initials'].')\s'.$personRegexPrefixedSurname.'$/',
  200. '/^'.$personRegexPrefixedSurname.',?\s(?P<initials>'.$personRegex['initials'].')$/',
  201. '/^'.$personRegexPrefixedSurname.',\s(?P<givenName>'.$personRegex['givenName'].')\s(?P<initials>'.$personRegex['initials'].')$/',
  202. '/^(?P<givenName>'.$personRegex['givenName'].')\s(?P<initials>'.$personRegex['initials'].')\s'.$personRegexPrefixedSurname.'$/',
  203. '/^'.$personRegexPrefixedSurname.',\s(?P<givenName>(?:'.$personRegex['givenName'].'\s)+)(?P<initials>'.$personRegex['initials'].')$/',
  204. '/^(?P<givenName>(?:'.$personRegex['givenName'].'\s)+)(?P<initials>'.$personRegex['initials'].')\s'.$personRegexPrefixedSurname.'$/',
  205. '/^'.$personRegexPrefixedSurname.',(?P<givenName>(?:\s'.$personRegex['givenName'].')+)$/',
  206. '/^(?P<givenName>(?:'.$personRegex['givenName'].'\s)+)'.$personRegexPrefixedSurname.'$/',
  207. // DRIVER guidelines 2.0 name syntax
  208. '/^\s*(?P<surname>'.$personRegex['surname'].')(?P<suffix>(?:\s+'.$personRegex['suffix'].')?)\s*,\s*(?P<initials>(?:'.$personRegex['initials'].')?)\s*\((?P<givenName>(?:\s*'.$personRegex['givenName'].')+)\s*\)\s*(?P<prefix>(?:'.$personRegex['prefix'].')?)$/',
  209. // Catch-all expression
  210. '/^(?P<surname>.*)$/'
  211. );
  212. $results = array();
  213. foreach ($personExpressions as $expressionId => $personExpression) {
  214. if ($nameFound = String::regexp_match_get($personExpression, $personString, $results)) {
  215. // Given names
  216. if (!empty($results['givenName'])) {
  217. // Split given names
  218. $givenNames = explode(' ', trim($results['givenName']));
  219. foreach($givenNames as $givenName) {
  220. $personDescription->addStatement('given-names', $givenName);
  221. unset($givenName);
  222. }
  223. }
  224. // Initials (will also be saved as given names)
  225. if (!empty($results['initials'])) {
  226. $results['initials'] = str_replace(array('.', '-', ' '), array('', '', ''), $results['initials']);
  227. for ($initialNum = 0; $initialNum < String::strlen($results['initials']); $initialNum++) {
  228. $initial = $results['initials'][$initialNum];
  229. $personDescription->addStatement('given-names', $initial);
  230. unset($initial);
  231. }
  232. }
  233. // Surname
  234. if (!empty($results['surname'])) {
  235. // Correct all-upper surname
  236. if (strtoupper($results['surname']) == $results['surname']) {
  237. $results['surname'] = ucwords(strtolower($results['surname']));
  238. }
  239. $personDescription->addStatement('surname', $results['surname']);
  240. }
  241. // Prefix/Suffix
  242. foreach(array('prefix', 'suffix') as $propertyName) {
  243. if (!empty($results[$propertyName])) {
  244. $results[$propertyName] = trim($results[$propertyName]);
  245. $personDescription->addStatement($propertyName, $results[$propertyName]);
  246. }
  247. }
  248. break;
  249. }
  250. }
  251. return $personDescription;
  252. }
  253. }
  254. ?>