PageRenderTime 24ms CodeModel.GetById 36ms RepoModel.GetById 0ms app.codeStats 0ms

/plack/lib/Paperpile/Formats/XMP.pm

https://github.com/jondo/paperpile
Perl | 453 lines | 391 code | 38 blank | 24 comment | 206 complexity | cd6d46a1fdfe05c495b06ad4d89c0a50 MD5 | raw file
  1. # Copyright 2009-2011 Paperpile
  2. #
  3. # This file is part of Paperpile
  4. #
  5. # Paperpile is free software: you can redistribute it and/or modify it
  6. # under the terms of the GNU Affero General Public License as
  7. # published by the Free Software Foundation, either version 3 of the
  8. # License, or (at your option) any later version.
  9. # Paperpile is distributed in the hope that it will be useful, but
  10. # WITHOUT ANY WARRANTY; without even the implied warranty of
  11. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. # Affero General Public License for more details. You should have
  13. # received a copy of the GNU Affero General Public License along with
  14. # Paperpile. If not, see http://www.gnu.org/licenses.
  15. package Paperpile::Formats::XMP;
  16. use Mouse;
  17. use XML::Simple;
  18. use Data::Dumper;
  19. use Paperpile::Library::Publication;
  20. use Paperpile::Library::Author;
  21. extends 'Paperpile::Formats';
  # Raw XMP packet text.  read() fills it from the file when it is still empty.
  has 'content' => (
      is      => 'rw',
      isa     => 'Str',
      default => '',
  );
  # Constructor hook: tags the object with the 'XMP' format name and flags it
  # as readable but not writable.
  sub BUILD {
      my ($self) = @_;

      $self->format('XMP');
      $self->readable(1);
      $self->writable(0);
  }
  # Lower-cased XMP property names whose value is a plain string, mapped to the
  # publication field they fill.  Covers PRISM and the BibTeXmp schema used by
  # JabRef; properties needing special treatment are handled in
  # _xmp_collect_fields().
  my %XMP_PLAIN_FIELD_FOR = (
      'prism:number'       => 'issue',
      'prism:volume'       => 'volume',
      'prism:startingpage' => 'start_page',
      'prism:endingpage'   => 'end_page',
      'prism:doi'          => 'doi',
      'prism:issn'         => 'issn',
      'bibtex:journal'     => 'journal',
      'bibtex:volume'      => 'volume',
      'bibtex:number'      => 'issue',
      'bibtex:pages'       => 'pages',
      'bibtex:abstract'    => 'abstract',
      'bibtex:pmid'        => 'pmid',
      'bibtex:year'        => 'year',
      'bibtex:month'       => 'month',
      'bibtex:keywords'    => 'keywords',
      'bibtex:url'         => 'url',
  );

  # Patterns recognising a DOI written as "doi:...", as a doi.org URL, or bare.
  my @XMP_DOI_PATTERNS = ( qr/^doi:(.*)/, qr/^.*doi\.org\/(10\..*)/, qr/^(10\..*)/ );

  # Publication accessors filled by read(), in the order they are set.
  my @XMP_PUB_FIELDS = qw(
      journal volume issue year month pages abstract title
      doi issn pmid authors publisher keywords url
  );

  # Parses the XMP packet held in 'content' (loading it from 'file' first when
  # 'content' is still empty) and returns a Paperpile::Library::Publication of
  # pubtype ARTICLE.  Whenever the packet is missing, malformed or has no
  # rdf:Description, the publication is returned without any field set.
  #
  # Properties are visited in case-insensitive alphabetical order, so when
  # several schemas supply the same field the outcome no longer depends on
  # hash ordering: bibtex:* is seen first, then dc:*, then prism:* (which
  # overrides).
  sub read {
      my $self = shift;

      if ( $self->file and !$self->content ) {
          $self->content( _xmp_extract_packet( $self->file ) );
      }

      my $xmp = $self->content;
      my $pub = Paperpile::Library::Publication->new( pubtype => 'ARTICLE' );

      return $pub if !$xmp;
      return $pub if $xmp !~ m/^<\?xpacket begin=/;

      # A parse failure leaves $data undefined; that is treated as "no metadata".
      my $data = eval { XML::Simple->new->XMLin( $xmp, ForceArray => 1 ) };
      return $pub if ref($data) ne 'HASH';

      # parse as seen for NPG PDFs
      my $rdf          = $data->{'rdf:RDF'};
      my $root         = ref($rdf) eq 'ARRAY' ? $rdf->[0] : undef;
      my $descriptions = ref($root) eq 'HASH' ? $root->{'rdf:Description'} : undef;
      return $pub if ref($descriptions) ne 'ARRAY';

      my $field = _xmp_collect_fields($descriptions);
      _xmp_clean_fields($field);

      foreach my $name (@XMP_PUB_FIELDS) {
          $pub->$name( $field->{$name} ) if $field->{$name};
      }

      return $pub;
  }

  # Scans the file at $path for the first XMP packet and returns it, from
  # '<?xpacket begin=' through the '>' closing '<?xpacket end=...'.  Returns ''
  # when the file cannot be opened or holds no begin marker; when the end
  # marker is missing, everything from the begin marker to end of file is
  # returned.
  sub _xmp_extract_packet {
      my ($path) = @_;

      my $begin_marker = '<?xpacket begin=';
      my $end_marker   = '<?xpacket end=';
      my $chunk_size   = 65536;

      my $fh;
      if ( !open( $fh, '<', $path ) ) {
          warn "Cannot open $path: $!\n";
          return '';
      }
      binmode($fh);

      my $packet  = '';
      my $started = 0;
      my $end_at  = -1;
      my $chunk;

      # CORE:: because this package defines its own read().
      while ( CORE::read( $fh, $chunk, $chunk_size ) ) {
          my $old_length = length($packet);
          $packet .= $chunk;

          if ( !$started ) {
              my $begin_at = index( $packet, $begin_marker );
              if ( $begin_at < 0 ) {

                  # Keep only a tail long enough to hold a marker that is
                  # split across two chunks.
                  my $keep = length($begin_marker) - 1;
                  $packet = substr( $packet, -$keep ) if length($packet) > $keep;
                  next;
              }
              $packet     = substr( $packet, $begin_at );
              $started    = 1;
              $old_length = 0;
          }

          if ( $end_at < 0 ) {

              # Step back far enough to catch an end marker split across chunks.
              my $from = $old_length - length($end_marker) + 1;
              $from = 0 if $from < 0;
              $end_at = index( $packet, $end_marker, $from );
          }
          next if $end_at < 0;

          my $close_at = index( $packet, '>', $end_at );
          next if $close_at < 0;

          close($fh);
          return substr( $packet, 0, $close_at + 1 );
      }

      close($fh);
      return $started ? $packet : '';
  }

  # Returns the text of a property that XML::Simple delivers either as a plain
  # string or as an array whose first element is a plain string; undef for
  # anything else.
  sub _xmp_plain_value {
      my ($node) = @_;

      $node = $node->[0] if ref($node) eq 'ARRAY';
      return if !defined $node or ref($node);
      return $node;
  }

  # Returns the array reference of rdf:li items stored under
  # $node->[0]{$container}[0]{'rdf:li'}, or undef when the structure has any
  # other shape.
  sub _xmp_rdf_items {
      my ( $node, $container ) = @_;

      return if ref($node) ne 'ARRAY' or ref( $node->[0] ) ne 'HASH';

      my $box = $node->[0]->{$container};
      return if ref($box) ne 'ARRAY' or ref( $box->[0] ) ne 'HASH';

      my $items = $box->[0]->{'rdf:li'};
      return if ref($items) ne 'ARRAY';
      return $items;
  }

  # Returns the text of one rdf:li item: the 'content' entry when the item is
  # a hash, the item itself when it is a plain string, undef otherwise.
  sub _xmp_li_text {
      my ($item) = @_;

      return $item->{'content'} if ref($item) eq 'HASH';
      return if ref($item);
      return $item;
  }

  # Returns the text of the first rdf:li inside the given container, or undef.
  sub _xmp_first_item_text {
      my ( $node, $container ) = @_;

      my $items = _xmp_rdf_items( $node, $container );
      return if !$items;
      return _xmp_li_text( $items->[0] );
  }

  # Returns the four-digit year of a date that is either "YYYY" or contains
  # "YYYY-MM-DD"; undef otherwise.
  sub _xmp_year_from {
      my ($date) = @_;

      return if !defined $date;
      return $1 if $date =~ m/(\d{4})-\d\d-\d\d/;
      return $1 if $date =~ m/^(\d{4})$/;
      return;
  }

  # Returns the DOI captured by the first matching DOI pattern (possibly an
  # empty string), or undef when $text does not look like a DOI.
  sub _xmp_doi_from {
      my ($text) = @_;

      foreach my $pattern (@XMP_DOI_PATTERNS) {
          return $1 if $text =~ $pattern;
      }
      return;
  }

  # Joins the dc:creator items into a BibTeX-style " and " separated string.
  # Items that are references or contain no letter are skipped; a list
  # containing an "et al." entry is treated as incomplete and yields ''.
  sub _xmp_authors_from {
      my ($items) = @_;

      my @creators;
      foreach my $creator ( @{$items} ) {
          next if !defined $creator or ref($creator);
          next if $creator !~ m/[a-z]/i;
          return '' if $creator =~ m/et\sal\.$/;
          push @creators, Paperpile::Library::Author->new()->parse_freestyle($creator)->bibtex();
      }
      return join( " and ", @creators );
  }

  # Walks all rdf:Description entries and returns a hash reference of the raw
  # field values found (keys as in %XMP_PLAIN_FIELD_FOR plus title, authors,
  # publisher and description).
  sub _xmp_collect_fields {
      my ($descriptions) = @_;

      my %field;

      foreach my $entry ( @{$descriptions} ) {
          next if ref($entry) ne 'HASH';

          foreach my $key ( sort { lc($a) cmp lc($b) } keys %{$entry} ) {
              my $name  = lc($key);
              my $value = $entry->{$key};

              if ( my $target = $XMP_PLAIN_FIELD_FOR{$name} ) {
                  my $text = _xmp_plain_value($value);
                  $field{$target} = $text if defined $text;
              } elsif ( $name eq 'bibtex:doi' ) {
                  my $text = _xmp_plain_value($value);
                  $field{doi} = $text if defined $text and !$field{doi};
              } elsif ( $name eq 'prism:publicationdate' ) {
                  my $year = _xmp_year_from( scalar _xmp_first_item_text( $value, 'rdf:Bag' ) );
                  $field{year} = $year if defined $year;
              } elsif ( $name eq 'dc:date' ) {
                  my $year = _xmp_year_from( scalar _xmp_first_item_text( $value, 'rdf:Seq' ) );
                  $field{year} = $year if defined $year and !$field{year};
              } elsif ( $name eq 'dc:creator' ) {
                  my $items = _xmp_rdf_items( $value, 'rdf:Seq' );
                  $field{authors} = _xmp_authors_from($items) if $items;
              } elsif ( $name eq 'dc:title' ) {
                  my $text = _xmp_first_item_text( $value, 'rdf:Alt' );
                  $field{title} = $text if $text;
              } elsif ( $name eq 'dc:description' ) {
                  my $text = _xmp_first_item_text( $value, 'rdf:Alt' );
                  $field{description} = $text if $text;
              } elsif ( $name eq 'dc:publisher' ) {
                  my $text = _xmp_first_item_text( $value, 'rdf:Bag' );
                  $field{publisher} = $text if $text;
              } elsif ( $name eq 'dc:identifier' ) {
                  my $text = _xmp_plain_value($value);
                  next if !defined $text;
                  my $found = _xmp_doi_from($text);
                  $field{doi} = $found if defined $found and !$field{doi};
              }
          }
      }

      return \%field;
  }

  # Returns true when $title looks like a file name or tool-generated
  # placeholder rather than a real title.
  sub _xmp_is_junk_title {
      my ($title) = @_;

      return 1 if $title =~ m/(\.doc|\.tex|\.dvi|\.ps|\.pdf|\.rtf|\.qxd|\.fm|\.fm\)|\.eps)$/;
      return 1 if $title =~ m/^\s*$/;
      return 1 if $title =~ m/^Microsoft/;
      return 1 if $title =~ m/^gk[a-z]\d+/i;
      return 1 if $title =~ m/^Title/i;
      return 1 if $title =~ m/\.\.\.$/;
      return 1 if $title =~ m/^LNCS/;

      # At most one space in the whole title.
      return 1 if ( $title =~ tr/ // ) <= 1;
      return 0;
  }

  # Normalises the collected fields in place: builds the page range, drops
  # all-digit volume/issue values below 1, moves a DOI found in the title to
  # the doi field, discards junk titles and author strings, and derives the
  # journal from the description.
  sub _xmp_clean_fields {
      my ($f) = @_;

      if ( $f->{start_page} ) {
          $f->{pages} =
              $f->{end_page}
            ? $f->{start_page} . '-' . $f->{end_page}
            : "$f->{start_page}";
      }
      $f->{pages} =~ s/-+/-/g if $f->{pages};

      foreach my $name (qw(volume issue)) {
          my $number = $f->{$name};
          next if !$number or $number !~ m/^\d+$/;
          $f->{$name} = undef if $number < 1;
      }

      # title filtering: a title that is really a DOI is not a title; it only
      # supplies the DOI when no other source did.
      if ( $f->{title} ) {
          my $found = _xmp_doi_from( $f->{title} );
          if ( defined $found ) {
              $f->{doi} = $found if !$f->{doi};
              $f->{title} = undef;
          }
      }
      if ( $f->{title} ) {
          $f->{title} =~ s/\s+/ /g;
          $f->{title} = undef if _xmp_is_junk_title( $f->{title} );
      }

      if ( $f->{authors} ) {
          my $junk = 0;
          $junk = 1 if $f->{authors} =~ m/^\d/;
          $junk = 1 if $f->{authors} =~ m/^Author/;
          $junk = 1 if $f->{authors} =~ m/.*,\s*$/;
          $f->{authors} = undef if $junk;
      }

      # parse journal name from the description tag
      # works for XMP from NPG
      if ( $f->{volume} and $f->{start_page} and !$f->{journal} and defined $f->{description} ) {

          # Quoted so that metacharacters in the values are matched literally.
          my $volume = quotemeta $f->{volume};
          my $start  = quotemeta $f->{start_page};
          if ( $f->{description} =~ m/^([^\d]+)\s+$volume,\s+$start/ ) {
              $f->{journal} = $1;
          }
      }

      return;
  }
  415. 1;