PageRenderTime 70ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/modules/bibrecord/lib/bibrecord.py

https://github.com/chokribr/invenio-1
Python | 1898 lines | 1837 code | 14 blank | 47 comment | 24 complexity | 71aa8a445c3535b820ce0b586bb83959 MD5 | raw file
Possible License(s): GPL-2.0
  1. # -*- coding: utf-8 -*-
  2. ##
  3. ## This file is part of Invenio.
  4. ## Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 CERN.
  5. ##
  6. ## Invenio is free software; you can redistribute it and/or
  7. ## modify it under the terms of the GNU General Public License as
  8. ## published by the Free Software Foundation; either version 2 of the
  9. ## License, or (at your option) any later version.
  10. ##
  11. ## Invenio is distributed in the hope that it will be useful, but
  12. ## WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. ## General Public License for more details.
  15. ##
  16. ## You should have received a copy of the GNU General Public License
  17. ## along with Invenio; if not, write to the Free Software Foundation, Inc.,
  18. ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
  19. """BibRecord - XML MARC processing library for Invenio.
  20. For API, see create_record(), record_get_field_instances() and friends
  21. in the source code of this file in the section entitled INTERFACE.
  22. Note: Does not access the database, the input is MARCXML only."""
  23. ### IMPORT INTERESTING MODULES AND XML PARSERS
  24. import re
  25. import sys
  26. from cStringIO import StringIO
  27. if sys.hexversion < 0x2040000:
  28. # pylint: disable=W0622
  29. from sets import Set as set
  30. # pylint: enable=W0622
  31. from invenio.bibrecord_config import CFG_MARC21_DTD, \
  32. CFG_BIBRECORD_WARNING_MSGS, CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL, \
  33. CFG_BIBRECORD_DEFAULT_CORRECT, CFG_BIBRECORD_PARSERS_AVAILABLE, \
  34. InvenioBibRecordParserError, InvenioBibRecordFieldError
  35. from invenio.config import CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
  36. from invenio.textutils import encode_for_xml
  # Index constants used for the RXP parsing.
  TAG, ATTRS, CHILDREN = 0, 1, 2

  # Do we keep singletons (empty tags)?
  # NOTE: this is currently set to True as there are some external workflows
  # exploiting singletons, e.g. bibupload -c used to delete fields, and
  # bibdocfile --fix-marc called on a record where the latest document
  # has been deleted.
  CFG_BIBRECORD_KEEP_SINGLETONS = True

  # Names of the parsers that are both importable and enabled in
  # CFG_BIBRECORD_PARSERS_AVAILABLE, in the order they are probed below.
  AVAILABLE_PARSERS = []

  try:
      import pyRXP
  except ImportError:
      pass
  else:
      if 'pyrxp' in CFG_BIBRECORD_PARSERS_AVAILABLE:
          AVAILABLE_PARSERS.append('pyrxp')

  try:
      from lxml import etree
  except ImportError:
      pass
  else:
      if 'lxml' in CFG_BIBRECORD_PARSERS_AVAILABLE:
          AVAILABLE_PARSERS.append('lxml')

  try:
      import Ft.Xml.Domlette
      if '4suite' in CFG_BIBRECORD_PARSERS_AVAILABLE:
          AVAILABLE_PARSERS.append('4suite')
  except ImportError:
      pass
  except Exception:
      # Any other failure while importing 4suite is reported as a warning
      # instead of aborting the import of this module.
      err = sys.exc_info()[1]
      from warnings import warn
      warn("Error when importing 4suite: %s" % err)

  try:
      import xml.dom.minidom
      import xml.parsers.expat
  except ImportError:
      pass
  else:
      if 'minidom' in CFG_BIBRECORD_PARSERS_AVAILABLE:
          AVAILABLE_PARSERS.append('minidom')
  76. ### INTERFACE / VISIBLE FUNCTIONS
  def create_field(subfields=None, ind1=' ', ind2=' ', controlfield_value='',
                   global_position=-1):
      """
      Build a field tuple from the provided elements.

      The indicators are passed through _wash_indicators() and the resulting
      tuple is handed to _check_field_validity() before being returned.

      @param subfields: list of (code, value) tuples; a new empty list is
          used when None
      @param ind1: the first indicator
      @param ind2: the second indicator
      @param controlfield_value: the value of the controlfield
      @param global_position: the global position stored in the field
          (arbitrarily -1 by default)
      @return: the tuple (subfields, ind1, ind2, controlfield_value,
          global_position)
      """
      if subfields is None:
          subfields = []

      washed_ind1, washed_ind2 = _wash_indicators(ind1, ind2)
      new_field = (subfields, washed_ind1, washed_ind2, controlfield_value,
                   global_position)
      _check_field_validity(new_field)
      return new_field
  def create_records(marcxml, verbose=CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL,
                     correct=CFG_BIBRECORD_DEFAULT_CORRECT, parser='',
                     keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
      """
      Create a list of records from a MARCXML description.

      Every <record ...>...</record> chunk found in the input is passed to
      create_record(); see that function's docstring for the shape of each
      returned element.

      @param marcxml: an XML string that may contain several records
      @param verbose: forwarded to create_record()
      @param correct: forwarded to create_record()
      @param parser: forwarded to create_record()
      @param keep_singletons: forwarded to create_record()
      @return: a list with one create_record() result per record found
      """
      # DOTALL lets '.' span newlines, so a record may cover several lines.
      record_pattern = re.compile('<record.*?>.*?</record>', re.DOTALL)

      results = []
      for record_xml in record_pattern.findall(marcxml):
          results.append(create_record(record_xml, verbose=verbose,
                                       correct=correct, parser=parser,
                                       keep_singletons=keep_singletons))
      return results
  def create_record(marcxml, verbose=CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL,
                    correct=CFG_BIBRECORD_DEFAULT_CORRECT, parser='',
                    sort_fields_by_indicators=False,
                    keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
      """
      Create a record object from its MARCXML description.

      The parser is chosen by _select_parser() from the 'parser' argument.

      The record structure is a dictionary of lists of fields:

          Record    := {tag : [Field]}
          Field     := (Subfields, ind1, ind2, value)
          Subfields := [(code, value)]

      For example:

          record['001'] -> [Field 001]
          record['909'] -> [Field 909, Field 909]
          record['520'] -> [Field 520]

          record['909'][0][0] -> [('a', 'value'),
                                  ('b', 'value for subfield b'),
                                  ('a', 'value for another a')]
          record['909'][0][1] -> 'C'
          record['909'][0][2] -> '4'

      @param marcxml: an XML string representation of the record to create
      @param verbose: the level of verbosity: 0 (silent), 1-2 (warnings),
          3 (strict: stop when errors)
      @param correct: 1 to enable correction of marcxml syntax. Else 0.
      @param parser: name of the parser to use ('pyrxp', 'lxml', '4suite' or
          'minidom'), resolved through _select_parser()
      @param sort_fields_by_indicators: when true, the parsed record is passed
          to _record_sort_by_indicators()
      @param keep_singletons: forwarded to the parser-specific function
      @return: a tuple (record, status_code, list_of_errors), where
          status_code is 0 when there are errors and 1 when there are none.
          When the parser raises InvenioBibRecordParserError the result is
          (None, 0, message) with the message as a string.
      """
      # Select the appropriate parser.
      parser = _select_parser(parser)

      try:
          if parser == 'pyrxp':
              rec = _create_record_rxp(marcxml, verbose, correct,
                                       keep_singletons=keep_singletons)
          elif parser == 'lxml':
              rec = _create_record_lxml(marcxml, verbose, correct,
                                        keep_singletons=keep_singletons)
          elif parser == '4suite':
              rec = _create_record_4suite(marcxml,
                                          keep_singletons=keep_singletons)
          elif parser == 'minidom':
              rec = _create_record_minidom(marcxml,
                                           keep_singletons=keep_singletons)
      except InvenioBibRecordParserError:
          return (None, 0, str(sys.exc_info()[1]))

      if sort_fields_by_indicators:
          _record_sort_by_indicators(rec)

      if correct:
          # Correct the structure of the record.
          errors = _correct_record(rec)
      else:
          errors = []

      status_code = int(not errors)
      return (rec, status_code, errors)
  def filter_field_instances(field_instances, filter_subcode, filter_value, filter_mode='e'):
      """
      Keep only the field instances having a matching subfield.

      The input is typically the output of record_get_field_instances().
      Three modes are available:
          'e' - the subfield (filter_subcode, filter_value) must be present
          's' - filter_value must be a substring of the subfield value
          'r' - filter_value is a regular expression that must match at the
                beginning of the subfield value (re.match semantics)
      Any other mode yields an empty list.

      Example:
          filter_field_instances(
              record_get_field_instances(rec, '999', '%', '%'), 'y', '2001')
      Here filter_subcode is 'y' and filter_value is '2001'.

      @param field_instances: output from record_get_field_instances
      @param filter_subcode: name of the subfield
      @type filter_subcode: string
      @param filter_value: value of the subfield
      @type filter_value: string
      @param filter_mode: 'e', 's' or 'r'
      @return: a new list with the matching field instances, in input order
      """
      if filter_mode == 'e':
          wanted = (filter_subcode, filter_value)
          return [instance for instance in field_instances
                  if wanted in instance[0]]

      if filter_mode == 's':
          def value_matches(value):
              return value.find(filter_value) > -1
      elif filter_mode == 'r':
          pattern = re.compile(filter_value)

          def value_matches(value):
              return pattern.match(value) is not None
      else:
          return []

      selected = []
      for instance in field_instances:
          for subfield in instance[0]:
              # The value is only looked at when the code already matches.
              if subfield[0] == filter_subcode and value_matches(subfield[1]):
                  selected.append(instance)
                  break
      return selected
  def record_drop_duplicate_fields(record):
      """
      Return a copy of the record without duplicate fields.

      Within a tag, two fields are duplicates when their subfields (order
      included), indicators and controlfield value are all equal; the first
      occurrence is kept. Tags are visited in sorted order and the kept
      fields are renumbered with consecutive global positions starting at 1.

      @param record: a record structure ({tag: [field tuples]})
      @return: a new record dictionary; the input is not modified
      """
      deduplicated = {}
      last_position = 0
      for tag in sorted(record.keys()):
          seen = set()
          kept = []
          for candidate in record[tag]:
              # Subfields are a list, so freeze them to get a hashable key.
              fingerprint = (tuple(candidate[0]),) + candidate[1:4]
              if fingerprint in seen:
                  continue
              seen.add(fingerprint)
              last_position += 1
              kept.append(candidate[:4] + (last_position,))
          deduplicated[tag] = kept
      return deduplicated
  def records_identical(rec1, rec2, skip_005=True, ignore_field_order=False, ignore_subfield_order=False, ignore_duplicate_subfields=False, ignore_duplicate_controlfields=False):
      """
      Return True if rec1 is identical to rec2.

      Global field positions are never compared; they are only used to order
      the fields when ignore_field_order is false.

      @param rec1: first record structure ({tag: [field tuples]})
      @param rec2: second record structure
      @param skip_005: ignore a difference in the 005 tag (i.e. the timestamp)
      @param ignore_field_order: compare the fields of a tag regardless of
          their global position
      @param ignore_subfield_order: compare the subfields of a field
          regardless of their order
      @param ignore_duplicate_subfields: compare the subfields of a field as
          sets (this also ignores their order)
      @param ignore_duplicate_controlfields: for tags starting with '00',
          compare only the sets of controlfield values
      @return: a boolean
      """
      def _normalised_subfields(subfields):
          # Form of the subfields under which fields considered equal by the
          # chosen options also have equal sort keys.
          if ignore_duplicate_subfields:
              return sorted(set(subfields))
          if ignore_subfield_order:
              return sorted(subfields)
          return subfields

      def _unordered_key(field):
          # Indicators first, then anything else. The subfields are
          # normalised: sorting on the raw lists could pair up fields that
          # differ only by subfield order or duplicates with the wrong
          # partner, and report identical records as different.
          return (field[1], field[2], field[3],
                  _normalised_subfields(field[0]))

      def _positional_key(field):
          # Indicators first, then global position, then anything else.
          return (field[1], field[2], field[4], field[3], field[0])

      rec1_keys = set(rec1.keys())
      rec2_keys = set(rec2.keys())
      if skip_005:
          rec1_keys.discard("005")
          rec2_keys.discard("005")
      if rec1_keys != rec2_keys:
          return False

      if ignore_field_order:
          sort_key = _unordered_key
      else:
          sort_key = _positional_key

      for key in rec1_keys:
          if ignore_duplicate_controlfields and key.startswith('00'):
              if set(field[3] for field in rec1[key]) != set(field[3] for field in rec2[key]):
                  return False
              continue

          rec1_fields = rec1[key]
          rec2_fields = rec2[key]
          if len(rec1_fields) != len(rec2_fields):
              # They already differ in length.
              return False

          rec1_fields = sorted(rec1_fields, key=sort_key)
          rec2_fields = sorted(rec2_fields, key=sort_key)
          for field1, field2 in zip(rec1_fields, rec2_fields):
              if ignore_duplicate_subfields:
                  if field1[1:4] != field2[1:4] or set(field1[0]) != set(field2[0]):
                      return False
              elif ignore_subfield_order:
                  if field1[1:4] != field2[1:4] or sorted(field1[0]) != sorted(field2[0]):
                      return False
              elif field1[:4] != field2[:4]:
                  return False
      return True
  def record_get_field_instances(rec, tag="", ind1=" ", ind2=" "):
      """
      Return the field instances of the record matching tag and indicators.

      Returns an empty list if the record is empty or nothing matches.
      If tag is an empty string, the result is rec.items(), i.e. pairs of
      (tag, list of fields), and the indicators are not looked at.
      Parameters (tag, ind1, ind2) can contain the wildcard %.

      @param rec: a record structure as returned by create_record()
      @param tag: a 3 characters long string
      @param ind1: a 1 character long string
      @param ind2: a 1 character long string
      @return: a list of field tuples (Subfields, ind1, ind2, value,
          field_position_global) where subfields is list of (code, value)
      """
      if not rec:
          return []
      if not tag:
          return rec.items()

      ind1, ind2 = _wash_indicators(ind1, ind2)

      if '%' in tag:
          # Wildcard in tag: gather the fields of every matching tag.
          candidates = []
          for field_tag in rec:
              if _tag_matches_pattern(field_tag, tag):
                  candidates.extend(rec[field_tag])
      else:
          # Completely defined tag: direct dictionary lookup.
          candidates = rec.get(tag, [])

      return [instance for instance in candidates
              if ind1 in ('%', instance[1]) and ind2 in ('%', instance[2])]
  def record_add_field(rec, tag, ind1=' ', ind2=' ', controlfield_value='',
                       subfields=None, field_position_global=None,
                       field_position_local=None):
      """
      Add a new field into the record.

      If field_position_global or field_position_local is specified then
      this method will insert the new field at the desired position.
      Otherwise a global field position will be computed in order to
      insert the field at the best position (first we try to keep the
      order of the tags and then we insert the field at the end of the
      fields with the same tag).

      If both field_position_global and field_position_local are present,
      then field_position_local takes precedence.

      @param rec: the record data structure (modified in place)
      @param tag: the tag of the field to be added
      @param ind1: the first indicator
      @param ind2: the second indicator
      @param controlfield_value: the value of the controlfield
      @param subfields: the subfields (a list of tuples (code, value))
      @param field_position_global: the global field position (record wise)
      @param field_position_local: the local field position (tag wise)
      @return: the global field position of the newly inserted field, or -1
          when a controlfield value is combined with non-blank indicators or
          with subfields
      """
      error = _validate_record_field_positions_global(rec)
      if error:
          # FIXME one should write a message here
          pass

      # Clean the parameters.
      if subfields is None:
          subfields = []
      ind1, ind2 = _wash_indicators(ind1, ind2)

      if controlfield_value and (ind1 != ' ' or ind2 != ' ' or subfields):
          return -1

      # Detect field number to be used for insertion:
      # Dictionaries for uniqueness.
      tag_field_positions_global = {}.fromkeys([field[4]
          for field in rec.get(tag, [])])
      all_field_positions_global = {}.fromkeys([field[4]
          for fields in rec.values()
          for field in fields])

      def _position_after_last_field():
          # One past the last position of this tag, else one past the last
          # position of the whole record, else 1 for an empty record.
          if tag_field_positions_global:
              return max(tag_field_positions_global) + 1
          if all_field_positions_global:
              return max(all_field_positions_global) + 1
          return 1

      if field_position_global is None and field_position_local is None:
          # Let's determine the global field position of the new field.
          if tag in rec:
              try:
                  field_position_global = max([field[4]
                                               for field in rec[tag]]) + 1
              except ValueError:
                  # max() of an empty list raises ValueError: the tag is
                  # present but has no fields.
                  field_position_global = _position_after_last_field()
          elif tag in ('FMT', 'FFT', 'BDR', 'BDM'):
              # Add the new tag to the end of the record.
              field_position_global = _position_after_last_field()
          else:
              # Insert the tag in an ordered way by selecting the
              # right global field position.
              immediate_lower_tag = '000'
              for rec_tag in rec:
                  if immediate_lower_tag < rec_tag < tag:
                      immediate_lower_tag = rec_tag

              if immediate_lower_tag == '000':
                  field_position_global = 1
              else:
                  field_position_global = rec[immediate_lower_tag][-1][4] + 1

          field_position_local = len(rec.get(tag, []))
          _shift_field_positions_global(rec, field_position_global, 1)
      elif field_position_local is not None:
          if tag in rec:
              if field_position_local >= len(rec[tag]):
                  field_position_global = rec[tag][-1][4] + 1
              else:
                  field_position_global = rec[tag][field_position_local][4]
              _shift_field_positions_global(rec, field_position_global, 1)
          elif all_field_positions_global:
              field_position_global = max(all_field_positions_global) + 1
          else:
              # Empty record.
              field_position_global = 1
      elif field_position_global is not None:
          # If the user chose an existing global field position, shift all the
          # global field positions greater than the input global field position.
          if tag not in rec:
              if all_field_positions_global:
                  field_position_global = max(all_field_positions_global) + 1
              else:
                  field_position_global = 1
              field_position_local = 0
          elif field_position_global < min(tag_field_positions_global):
              field_position_global = min(tag_field_positions_global)
              _shift_field_positions_global(rec,
                                            min(tag_field_positions_global), 1)
              field_position_local = 0
          elif field_position_global > max(tag_field_positions_global):
              field_position_global = max(tag_field_positions_global) + 1
              _shift_field_positions_global(rec,
                  max(tag_field_positions_global) + 1, 1)
              field_position_local = len(rec.get(tag, []))
          else:
              if field_position_global in tag_field_positions_global:
                  _shift_field_positions_global(rec, field_position_global, 1)

              # The new field goes where the field now at position + 1 sits,
              # or to the front when no such field exists.
              field_position_local = 0
              for position, field in enumerate(rec[tag]):
                  if field[4] == field_position_global + 1:
                      field_position_local = position

      # Create the new field.
      newfield = (subfields, ind1, ind2, str(controlfield_value),
                  field_position_global)
      rec.setdefault(tag, []).insert(field_position_local, newfield)

      # Return new field number:
      return field_position_global
  def record_has_field(rec, tag):
      """
      Tell whether the tag exists in the record.

      @param rec: the record data structure
      @param tag: the tag to look for
      @return: a boolean
      """
      tag_is_present = tag in rec
      return tag_is_present
  def record_delete_field(rec, tag, ind1=' ', ind2=' ',
                          field_position_global=None,
                          field_position_local=None):
      """
      Delete fields of the given tag from the record, in place.

      If field_position_global is specified, deletes the field with the
      corresponding global field position (subject to the indicator test
      described in the code).

      Otherwise, if field_position_local is specified, deletes the field
      with the corresponding local field position and tag.

      Otherwise deletes all the fields of the tag whose indicators are equal
      to ind1 and ind2.

      If both field_position_global and field_position_local are present,
      then field_position_global takes precedence.

      @param rec: the record data structure
      @param tag: the tag of the field to be deleted
      @param ind1: the first indicator of the field to be deleted
      @param ind2: the second indicator of the field to be deleted
      @param field_position_global: the global field position (record wise)
      @param field_position_local: the local field position (tag wise)
      @return: the list of deleted fields (empty when the tag is missing or
          the local position does not exist)
      """
      error = _validate_record_field_positions_global(rec)
      if error:
          # FIXME one should write a message here.
          pass

      if tag not in rec:
          # Same type as every other return path (this used to be False).
          return []

      ind1, ind2 = _wash_indicators(ind1, ind2)

      deleted = []
      if field_position_global is None and field_position_local is None:
          # Remove all fields with tag 'tag' and matching indicators.
          kept = []
          for field in rec[tag]:
              if field[1] == ind1 and field[2] == ind2:
                  deleted.append(field)
              else:
                  kept.append(field)
          rec[tag] = kept
      elif field_position_global is not None:
          # Remove the field with 'field_position_global'.
          # NOTE(review): 'and' binds tighter than 'or', so a field at the
          # requested position is kept only when BOTH of its indicators
          # differ from ind1/ind2 - confirm this is the intended test.
          kept = []
          for field in rec[tag]:
              if (field[1] != ind1 and field[2] != ind2 or
                      field[4] != field_position_global):
                  kept.append(field)
              else:
                  deleted.append(field)
          rec[tag] = kept
      else:
          # Remove the field with 'field_position_local' and report it,
          # as the documented return value requires.
          try:
              deleted.append(rec[tag].pop(field_position_local))
          except IndexError:
              return []

      if not rec[tag]:
          # Tag is now empty, remove it.
          del rec[tag]

      return deleted
  def record_delete_fields(rec, tag, field_positions_local=None):
      """
      Delete all/some fields defined with MARC tag 'tag' from record 'rec'.

      @param rec: a record structure.
      @type rec: dict
      @param tag: three letter field.
      @type tag: string
      @param field_positions_local: if set, it is the collection of local
          positions, within all the fields with the specified tag, that
          should be deleted. If not set all the fields with the specified
          tag will be deleted.
      @type field_positions_local: sequence
      @return: the list of deleted fields.
      @rtype: list
      @note: the record is modified in place; the tag itself is removed when
          no field is left.
      """
      if tag not in rec:
          return []

      delete_everything = field_positions_local is None
      removed = []
      remaining = []
      for index, field in enumerate(rec[tag]):
          if delete_everything or index in field_positions_local:
              removed.append(field)
          else:
              remaining.append(field)

      if remaining:
          rec[tag] = remaining
      else:
          del rec[tag]
      return removed
  def record_add_fields(rec, tag, fields, field_position_local=None,
                        field_position_global=None):
      """
      Add the fields into the record at the required position.

      Each field is inserted with record_add_field(). When a position is
      given, the fields are inserted in reverse order at that same position
      so that they end up in their original order. The 'fields' argument
      itself is left untouched.

      @param rec: a record structure
      @param tag: the tag of the fields to be added
      @param fields: the sequence of field tuples to be added
      @param field_position_local: the field_position_local to which the
          fields will be inserted. If not specified, appends the fields to
          the tag.
      @param field_position_global: the global field position to which the
          fields will be inserted
      @return: field_position_local when a local or global position was
          given, None otherwise
      """
      if field_position_local is None and field_position_global is None:
          for field in fields:
              record_add_field(rec, tag, ind1=field[1],
                               ind2=field[2], subfields=field[0],
                               controlfield_value=field[3])
      else:
          # Walk a reversed copy: reversing in place would silently reorder
          # the caller's list.
          for field in fields[::-1]:
              record_add_field(rec, tag, ind1=field[1], ind2=field[2],
                               subfields=field[0],
                               controlfield_value=field[3],
                               field_position_local=field_position_local,
                               field_position_global=field_position_global)
          return field_position_local
  def record_move_fields(rec, tag, field_positions_local,
                         field_position_local=None):
      """
      Move some fields to the position specified by 'field_position_local'.

      The fields are first removed with record_delete_fields() and then put
      back with record_add_fields().

      @param rec: a record structure as returned by create_record()
      @param tag: the tag of the fields to be moved
      @param field_positions_local: the positions of the fields to move
      @param field_position_local: insert the fields before that
          field_position_local. If unspecified, appends the fields
      @return: whatever record_add_fields() returns for the re-insertion
      """
      moved_fields = record_delete_fields(
          rec, tag, field_positions_local=field_positions_local)
      outcome = record_add_fields(
          rec, tag, moved_fields, field_position_local=field_position_local)
      return outcome
  def record_delete_subfield(rec, tag, subfield_code, ind1=' ', ind2=' '):
      """
      Delete all subfields with subfield_code in the record.

      Only the fields of 'tag' whose indicators equal ind1 and ind2 (after
      _wash_indicators()) are touched; their subfield lists are updated in
      place.
      """
      ind1, ind2 = _wash_indicators(ind1, ind2)

      for field in rec.get(tag, []):
          if field[1] != ind1 or field[2] != ind2:
              continue
          surviving = []
          for subfield in field[0]:
              if subfield[0] != subfield_code:
                  surviving.append(subfield)
          # Slice assignment keeps the same list object inside the field.
          field[0][:] = surviving
  def record_get_field(rec, tag, field_position_global=None,
                       field_position_local=None):
      """
      Return the matching field. One has to enter either a global field
      position or a local field position, but not both.

      @param rec: the record data structure
      @param tag: the tag of the field
      @param field_position_global: the global field position (record wise)
      @param field_position_local: the local field position (tag wise)
      @return: the field tuple found at the given position
      @raise InvenioBibRecordFieldError: if no position or both positions
          are given, or if the tag or the position does not exist
      """
      if field_position_global is None and field_position_local is None:
          raise InvenioBibRecordFieldError("A field position is required to "
              "complete this operation.")
      if field_position_global is not None and field_position_local is not None:
          raise InvenioBibRecordFieldError("Only one field position is required "
              "to complete this operation.")

      # Test against None rather than truthiness: a global position of 0
      # must not fall through to the local lookup with a None index.
      if field_position_global is not None:
          if tag not in rec:
              raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)

          for field in rec[tag]:
              if field[4] == field_position_global:
                  return field
          raise InvenioBibRecordFieldError("No field has the tag '%s' and the "
              "global field position '%d'." % (tag, field_position_global))

      try:
          return rec[tag][field_position_local]
      except KeyError:
          raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)
      except IndexError:
          raise InvenioBibRecordFieldError("No field has the tag '%s' and "
              "the local field position '%d'." % (tag, field_position_local))
  def record_replace_field(rec, tag, new_field, field_position_global=None,
                           field_position_local=None):
      """
      Replace a field with a new field, in place.

      One has to enter either a global field position or a local field
      position, but not both. With a global position, every field of the
      tag carrying that position is replaced.

      @param rec: the record data structure
      @param tag: the tag of the field to be replaced
      @param new_field: the field tuple to store
      @param field_position_global: the global field position (record wise)
      @param field_position_local: the local field position (tag wise)
      @raise InvenioBibRecordFieldError: if no position or both positions
          are given, or if the tag or the position does not exist
      """
      if field_position_global is None and field_position_local is None:
          raise InvenioBibRecordFieldError("A field position is required to "
              "complete this operation.")
      if field_position_global is not None and field_position_local is not None:
          raise InvenioBibRecordFieldError("Only one field position is required "
              "to complete this operation.")

      # Test against None rather than truthiness: a global position of 0
      # must not fall through to the local assignment with a None index.
      if field_position_global is not None:
          if tag not in rec:
              raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)

          replaced = False
          for position, field in enumerate(rec[tag]):
              if field[4] == field_position_global:
                  rec[tag][position] = new_field
                  replaced = True

          if not replaced:
              raise InvenioBibRecordFieldError("No field has the tag '%s' and "
                  "the global field position '%d'." %
                  (tag, field_position_global))
      else:
          try:
              rec[tag][field_position_local] = new_field
          except KeyError:
              raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)
          except IndexError:
              raise InvenioBibRecordFieldError("No field has the tag '%s' and "
                  "the local field position '%d'." % (tag, field_position_local))
  def record_get_subfields(rec, tag, field_position_global=None,
                           field_position_local=None):
      """
      Return the subfields of the matching field. One has to enter either a
      global field position or a local field position.

      The field is looked up with record_get_field(); the list returned is
      the one stored in the field, not a copy.

      @return: a list of subfield tuples (subfield code, value).
      @rtype: list
      """
      matching_field = record_get_field(
          rec, tag,
          field_position_global=field_position_global,
          field_position_local=field_position_local)
      subfields = matching_field[0]
      return subfields
  def record_delete_subfield_from(rec, tag, subfield_position,
                                  field_position_global=None,
                                  field_position_local=None):
      """
      Delete the subfield at the position specified by tag, field number and
      subfield position.

      When the field is left without subfields it is removed from the
      record, and the tag is removed as well once it has no field left.

      @param rec: the record data structure (modified in place)
      @param tag: the tag of the field
      @param subfield_position: index of the subfield inside the field
      @param field_position_global: the global field position (record wise)
      @param field_position_local: the local field position (tag wise)
      @raise InvenioBibRecordFieldError: if the field cannot be found (see
          record_get_field()) or has no subfield at subfield_position
      """
      subfields = record_get_subfields(rec, tag,
          field_position_global=field_position_global,
          field_position_local=field_position_local)

      try:
          del subfields[subfield_position]
      except IndexError:
          # The textual dump of the record is only needed for the message.
          from invenio.xmlmarc2textmarc import create_marc_record
          recordMarc = create_marc_record(rec, 0, {"text-marc": 1, "aleph-marc": 0})
          raise InvenioBibRecordFieldError("The record : %(recordCode)s does not contain the subfield "
              "'%(subfieldIndex)s' inside the field (local: '%(fieldIndexLocal)s, global: '%(fieldIndexGlobal)s' ) of tag '%(tag)s'." % \
              {"subfieldIndex" : subfield_position, \
               "fieldIndexLocal" : str(field_position_local), \
               "fieldIndexGlobal" : str(field_position_global), \
               "tag" : tag, \
               "recordCode" : recordMarc})

      if not subfields:
          if field_position_global is not None:
              for position, field in enumerate(rec[tag]):
                  if field[4] == field_position_global:
                      del rec[tag][position]
                      # Stop here: the list was just modified, so carrying on
                      # would skip the next element.
                      break
          else:
              del rec[tag][field_position_local]

          if not rec[tag]:
              del rec[tag]
  def record_add_subfield_into(rec, tag, subfield_code, value,
                               subfield_position=None,
                               field_position_global=None,
                               field_position_local=None):
      """
      Add a subfield into the field specified by tag and field number.

      The subfield (subfield_code, value) is inserted at subfield_position,
      or appended at the end when no subfield position is given.
      """
      target_subfields = record_get_subfields(
          rec, tag,
          field_position_global=field_position_global,
          field_position_local=field_position_local)

      new_subfield = (subfield_code, value)
      if subfield_position is not None:
          target_subfields.insert(subfield_position, new_subfield)
      else:
          target_subfields.append(new_subfield)
  def record_modify_controlfield(rec, tag, controlfield_value,
                                 field_position_global=None,
                                 field_position_local=None):
      """
      Modify the controlfield at the position specified by tag and field
      number.

      The field is fetched with record_get_field() and replaced, through
      record_replace_field(), by a copy carrying the new controlfield value.
      """
      current = record_get_field(
          rec, tag,
          field_position_global=field_position_global,
          field_position_local=field_position_local)

      subfields, ind1, ind2 = current[0], current[1], current[2]
      global_position = current[4]
      updated = (subfields, ind1, ind2, controlfield_value, global_position)

      record_replace_field(
          rec, tag, updated,
          field_position_global=field_position_global,
          field_position_local=field_position_local)
  def record_modify_subfield(rec, tag, subfield_code, value, subfield_position,
                             field_position_global=None,
                             field_position_local=None):
      """
      Modify the subfield at the position specified by tag, field number and
      subfield position.

      @raise InvenioBibRecordFieldError: if there is no subfield at
          subfield_position
      """
      target_subfields = record_get_subfields(
          rec, tag,
          field_position_global=field_position_global,
          field_position_local=field_position_local)

      replacement = (subfield_code, value)
      try:
          target_subfields[subfield_position] = replacement
      except IndexError:
          raise InvenioBibRecordFieldError("There is no subfield with position "
              "'%d'." % subfield_position)
  def record_move_subfield(rec, tag, subfield_position, new_subfield_position,
                           field_position_global=None,
                           field_position_local=None):
      """
      Move the subfield at the position specified by tag, field number and
      subfield position to a new subfield position.

      @raise InvenioBibRecordFieldError: if there is no subfield at
          subfield_position
      """
      target_subfields = record_get_subfields(
          rec, tag,
          field_position_global=field_position_global,
          field_position_local=field_position_local)

      try:
          moved = target_subfields.pop(subfield_position)
          target_subfields.insert(new_subfield_position, moved)
      except IndexError:
          raise InvenioBibRecordFieldError("There is no subfield with position "
              "'%d'." % subfield_position)
  720. def record_get_field_value(rec, tag, ind1=" ", ind2=" ", code=""):
  721. """Returns first (string) value that matches specified field
  722. (tag, ind1, ind2, code) of the record (rec).
  723. Returns empty string if not found.
  724. Parameters (tag, ind1, ind2, code) can contain wildcard %.
  725. Difference between wildcard % and empty '':
  726. - Empty char specifies that we are not interested in a field which
  727. has one of the indicator(s)/subfield specified.
  728. - Wildcard specifies that we are interested in getting the value
  729. of the field whatever the indicator(s)/subfield is.
  730. For e.g. consider the following record in MARC:
  731. 100C5 $$a val1
  732. 555AB $$a val2
  733. 555AB val3
  734. 555 $$a val4
  735. 555A val5
  736. >> record_get_field_value(record, '555', 'A', '', '')
  737. >> "val5"
  738. >> record_get_field_value(record, '555', 'A', '%', '')
  739. >> "val3"
  740. >> record_get_field_value(record, '555', 'A', '%', '%')
  741. >> "val2"
  742. >> record_get_field_value(record, '555', 'A', 'B', '')
  743. >> "val3"
  744. >> record_get_field_value(record, '555', '', 'B', 'a')
  745. >> ""
  746. >> record_get_field_value(record, '555', '', '', 'a')
  747. >> "val4"
  748. >> record_get_field_value(record, '555', '', '', '')
  749. >> ""
  750. >> record_get_field_value(record, '%%%', '%', '%', '%')
  751. >> "val1"
  752. @param rec: a record structure as returned by create_record()
  753. @param tag: a 3 characters long string
  754. @param ind1: a 1 character long string
  755. @param ind2: a 1 character long string
  756. @param code: a 1 character long string
  757. @return: string value (empty if nothing found)"""
  758. # Note: the code is quite redundant for speed reasons (avoid calling
  759. # functions or doing tests inside loops)
  760. ind1, ind2 = _wash_indicators(ind1, ind2)
  761. if '%' in tag:
  762. # Wild card in tag. Must find all corresponding fields
  763. if code == '':
  764. # Code not specified.
  765. for field_tag, fields in rec.items():
  766. if _tag_matches_pattern(field_tag, tag):
  767. for field in fields:
  768. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  769. # Return matching field value if not empty
  770. if field[3]:
  771. return field[3]
  772. elif code == '%':
  773. # Code is wildcard. Take first subfield of first matching field
  774. for field_tag, fields in rec.items():
  775. if _tag_matches_pattern(field_tag, tag):
  776. for field in fields:
  777. if (ind1 in ('%', field[1]) and ind2 in ('%', field[2])
  778. and field[0]):
  779. return field[0][0][1]
  780. else:
  781. # Code is specified. Take corresponding one
  782. for field_tag, fields in rec.items():
  783. if _tag_matches_pattern(field_tag, tag):
  784. for field in fields:
  785. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  786. for subfield in field[0]:
  787. if subfield[0] == code:
  788. return subfield[1]
  789. else:
  790. # Tag is completely specified. Use tag as dict key
  791. if tag in rec:
  792. if code == '':
  793. # Code not specified.
  794. for field in rec[tag]:
  795. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  796. # Return matching field value if not empty
  797. # or return "" empty if not exist.
  798. if field[3]:
  799. return field[3]
  800. elif code == '%':
  801. # Code is wildcard. Take first subfield of first matching field
  802. for field in rec[tag]:
  803. if (ind1 in ('%', field[1]) and ind2 in ('%', field[2]) and
  804. field[0]):
  805. return field[0][0][1]
  806. else:
  807. # Code is specified. Take corresponding one
  808. for field in rec[tag]:
  809. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  810. for subfield in field[0]:
  811. if subfield[0] == code:
  812. return subfield[1]
  813. # Nothing was found
  814. return ""
  815. def record_get_field_values(rec, tag, ind1=" ", ind2=" ", code="",
  816. filter_subfield_code="",
  817. filter_subfield_value="",
  818. filter_subfield_mode="e"):
  819. """Returns the list of (string) values for the specified field
  820. (tag, ind1, ind2, code) of the record (rec).
  821. List can be filtered. Use filter_subfield_code
  822. and filter_subfield_value to search
  823. only in fields that have these values inside them as a subfield.
  824. filter_subfield_mode can have 3 different values:
  825. 'e' for exact search
  826. 's' for substring search
  827. 'r' for regexp search
  828. Returns empty list if nothing was found.
  829. Parameters (tag, ind1, ind2, code) can contain wildcard %.
  830. @param rec: a record structure as returned by create_record()
  831. @param tag: a 3 characters long string
  832. @param ind1: a 1 character long string
  833. @param ind2: a 1 character long string
  834. @param code: a 1 character long string
  835. @return: a list of strings"""
  836. tmp = []
  837. ind1, ind2 = _wash_indicators(ind1, ind2)
  838. if filter_subfield_code and filter_subfield_mode == "r":
  839. reg_exp = re.compile(filter_subfield_value)
  840. tags = []
  841. if '%' in tag:
  842. # Wild card in tag. Must find all corresponding tags and fields
  843. tags = [k for k in rec if _tag_matches_pattern(k, tag)]
  844. elif rec and tag in rec:
  845. tags = [tag]
  846. if code == '':
  847. # Code not specified. Consider field value (without subfields)
  848. for tag in tags:
  849. for field in rec[tag]:
  850. if (ind1 in ('%', field[1]) and ind2 in ('%', field[2]) and
  851. field[3]):
  852. tmp.append(field[3])
  853. elif code == '%':
  854. # Code is wildcard. Consider all subfields
  855. for tag in tags:
  856. for field in rec[tag]:
  857. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  858. if filter_subfield_code:
  859. if filter_subfield_mode == "e":
  860. subfield_to_match = (filter_subfield_code, filter_subfield_value)
  861. if subfield_to_match in field[0]:
  862. for subfield in field[0]:
  863. tmp.append(subfield[1])
  864. elif filter_subfield_mode == "s":
  865. if (dict(field[0]).get(filter_subfield_code, '')).find(filter_subfield_value) > -1:
  866. for subfield in field[0]:
  867. tmp.append(subfield[1])
  868. elif filter_subfield_mode == "r":
  869. if reg_exp.match(dict(field[0]).get(filter_subfield_code, '')):
  870. for subfield in field[0]:
  871. tmp.append(subfield[1])
  872. else:
  873. for subfield in field[0]:
  874. tmp.append(subfield[1])
  875. else:
  876. # Code is specified. Consider all corresponding subfields
  877. for tag in tags:
  878. for field in rec[tag]:
  879. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  880. if filter_subfield_code:
  881. if filter_subfield_mode == "e":
  882. subfield_to_match = (filter_subfield_code, filter_subfield_value)
  883. if subfield_to_match in field[0]:
  884. for subfield in field[0]:
  885. if subfield[0] == code:
  886. tmp.append(subfield[1])
  887. elif filter_subfield_mode == "s":
  888. if (dict(field[0]).get(filter_subfield_code, '')).find(filter_subfield_value) > -1:
  889. for subfield in field[0]:
  890. if subfield[0] == code:
  891. tmp.append(subfield[1])
  892. elif filter_subfield_mode == "r":
  893. if reg_exp.match(dict(field[0]).get(filter_subfield_code, '')):
  894. for subfield in field[0]:
  895. if subfield[0] == code:
  896. tmp.append(subfield[1])
  897. else:
  898. for subfield in field[0]:
  899. if subfield[0] == code:
  900. tmp.append(subfield[1])
  901. # If tmp was not set, nothing was found
  902. return tmp
  903. def record_xml_output(rec, tags=None, order_fn=None):
  904. """Generates the XML for record 'rec' and returns it as a string
  905. @rec: record
  906. @tags: list of tags to be printed"""
  907. if tags is None:
  908. tags = []
  909. if isinstance(tags, str):
  910. tags = [tags]
  911. if tags and '001' not in tags:
  912. # Add the missing controlfield.
  913. tags.append('001')
  914. marcxml = ['<record>']
  915. # Add the tag 'tag' to each field in rec[tag]
  916. fields = []
  917. if rec is not None:
  918. for tag in rec:
  919. if not tags or tag in tags:
  920. for field in rec[tag]:
  921. fields.append((tag, field))
  922. if order_fn is None:
  923. record_order_fields(fields)
  924. else:
  925. record_order_fields(fields, order_fn)
  926. for field in fields:
  927. marcxml.append(field_xml_output(field[1], field[0]))
  928. marcxml.append('</record>')
  929. return '\n'.join(marcxml)
  930. def field_get_subfield_instances(field):
  931. """Returns the list of subfields associated with field 'field'"""
  932. return field[0]
  933. def field_get_subfield_values(field_instance, code):
  934. """Return subfield CODE values of the field instance FIELD."""
  935. return [subfield_value
  936. for subfield_code, subfield_value in field_instance[0]
  937. if subfield_code == code]
  938. def field_get_subfield_codes(field_instance):
  939. """Return subfield codes of the field instance FIELD."""
  940. return [subfield_code
  941. for subfield_code, subfield_value in field_instance[0]]
  942. def field_add_subfield(field, code, value):
  943. """Adds a subfield to field 'field'"""
  944. field[0].append((code, value))
  945. def record_order_fields(rec, fun="_order_by_ord"):
  946. """Orders field inside record 'rec' according to a function"""
  947. rec.sort(eval(fun))
  948. def field_xml_output(field, tag):
  949. """Generates the XML for field 'field' and returns it as a string."""
  950. marcxml = []
  951. if field[3]:
  952. marcxml.append(' <controlfield tag="%s">%s</controlfield>' %
  953. (tag, encode_for_xml(field[3])))
  954. else:
  955. marcxml.append(' <datafield tag="%s" ind1="%s" ind2="%s">' %
  956. (tag, field[1], field[2]))
  957. marcxml += [_subfield_xml_output(subfield) for subfield in field[0]]
  958. marcxml.append(' </datafield>')
  959. return '\n'.join(marcxml)
  960. def record_extract_oai_id(record):
  961. """Returns the OAI ID of the record."""
  962. tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3]
  963. ind1 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3]
  964. ind2 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]
  965. subfield = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]
  966. values = record_get_field_values(record, tag, ind1, ind2, subfield)
  967. oai_id_regex = re.compile("oai[a-zA-Z0-9/.:]+")
  968. for value in [value.strip() for value in values]:
  969. if oai_id_regex.match(value):
  970. return value
  971. return ""
  972. def record_extract_dois(record):
  973. """Returns the DOI(s) of the record."""
  974. record_dois = []
  975. tag = "024"
  976. ind1 = "7"
  977. ind2 = "_"
  978. subfield_source_code = "2"
  979. subfield_value_code = "a"
  980. identifiers_fields = record_get_field_instances(record, tag, ind1, ind2)
  981. for identifer_field in identifiers_fields:
  982. if 'doi' in [val.lower() for val in field_get_subfield_values(identifer_field, subfield_source_code)]:
  983. record_dois.extend(field_get_subfield_values(identifer_field, subfield_value_code))
  984. return record_dois
  985. def print_rec(rec, format=1, tags=None):
  986. """
  987. prints a record
  988. format = 1 -- XML
  989. format = 2 -- HTML (not implemented)
  990. @param tags: list of tags to be printed
  991. """
  992. if tags is None:
  993. tags = []
  994. if format == 1:
  995. text = record_xml_output(rec, tags)
  996. else:
  997. return ''
  998. return text
  999. def print_recs(listofrec, format=1, tags=None):
  1000. """
  1001. prints a list of records
  1002. @param format: 1 XML, 2 HTML (not implemented)
  1003. @param tags: list of tags to be printed
  1004. if 'listofrec' is not a list it returns empty string
  1005. """
  1006. if tags is None:
  1007. tags = []
  1008. text = ""
  1009. if type(listofrec).__name__ !='list':
  1010. return ""
  1011. else:
  1012. for rec in listofrec:
  1013. text = "%s\n%s" % (text, print_rec(rec, format, tags))
  1014. return text
  1015. def concat(alist):
  1016. """Concats a list of lists"""
  1017. newl = []
  1018. for l in alist:
  1019. newl.extend(l)
  1020. return newl
  1021. def record_find_field(rec, tag, field, strict=False):
  1022. """
  1023. Returns the global and local positions of the first occurrence
  1024. of the field in a record.
  1025. @param rec: A record dictionary structure
  1026. @type rec: dictionary
  1027. @param tag: The tag of the field to search for
  1028. @type tag: string
  1029. @param field: A field tuple as returned by create_field()
  1030. @type field: tuple
  1031. @param strict: A boolean describing the search method. If strict
  1032. is False, then the order of the subfields doesn't
  1033. matter. Default search method is strict.
  1034. @type strict: boolean
  1035. @return: A tuple of (global_position, local_position) or a
  1036. tuple (None, None) if the field is not present.
  1037. @rtype: tuple
  1038. @raise InvenioBibRecordFieldError: If the provided field is invalid.
  1039. """
  1040. try:
  1041. _check_field_validity(field)
  1042. except InvenioBibRecordFieldError:
  1043. raise
  1044. for local_position, field1 in enumerate(rec.get(tag, [])):
  1045. if _compare_fields(field, field1, strict):
  1046. return (field1[4], local_position)
  1047. return (None, None)
  1048. def record_match_subfields(rec, tag, ind1=" ", ind2=" ", sub_key=None,
  1049. sub_value='', sub_key2=None, sub_value2='',
  1050. case_sensitive=True):
  1051. """ Finds subfield instances in a particular field and tests
  1052. values in 1 of 3 possible ways:
  1053. - Does a subfield code exist? (ie does 773__a exist?)
  1054. - Does a subfield have a particular value? (ie 773__a == 'PhysX')
  1055. - Do a pair of subfields have particular values?
  1056. (ie 035__2 == 'CDS' and 035__a == '123456')
  1057. Parameters:
  1058. * rec - dictionary: a bibrecord structure
  1059. * tag - string: the tag of the field (ie '773')
  1060. * ind1, ind2 - char: a single characters for the MARC indicators
  1061. * sub_key - char: subfield key to find
  1062. * sub_value - string: subfield value of that key
  1063. * sub_key2 - char: key of subfield to compare against
  1064. * sub_value2 - string: expected value of second subfield
  1065. * case_sensitive - bool: be case sensitive when matching values
  1066. Returns: false if no match found, else provides the field position (int) """
  1067. if sub_key is None:
  1068. raise TypeError("None object passed for parameter sub_key.")
  1069. if sub_key2 is not None and sub_value2 is '':
  1070. raise TypeError("Parameter sub_key2 defined but sub_value2 is None, "
  1071. + "function requires a value for comparrison.")
  1072. ind1, ind2 = _wash_indicators(ind1, ind2)
  1073. if not case_sensitive:
  1074. sub_value = sub_value.lower()
  1075. sub_value2 = sub_value2.lower()
  1076. for field in record_get_field_instances(rec, tag, ind1, ind2):
  1077. subfields = dict(field_get_subfield_instances(field))
  1078. if not case_sensitive:
  1079. for k, v in subfields.iteritems():
  1080. subfields[k] = v.lower()
  1081. if sub_key in subfields:
  1082. if sub_value is '':
  1083. return field[4]
  1084. else:
  1085. if sub_value == subfields[sub_key]:
  1086. if sub_key2 is None:
  1087. return field[4]
  1088. else:
  1089. if sub_key2 in subfields:
  1090. if sub_value2 == subfields[sub_key2]:
  1091. return field[4]
  1092. return False
  1093. def record_strip_empty_volatile_subfields(rec):
  1094. """
  1095. Removes unchanged volatile subfields from the record
  1096. """
  1097. for tag in rec.keys():
  1098. for field in rec[tag]:
  1099. field[0][:] = [subfield for subfield in field[0] if subfield[1][:9] != "VOLATILE:"]
  1100. def record_strip_empty_fields(rec, tag=None):
  1101. """
  1102. Removes empty subfields and fields from the record. If 'tag' is not None, only
  1103. a specific tag of the record will be stripped, otherwise the whole record.
  1104. @param rec: A record dictionary structure
  1105. @type rec: dictionary
  1106. @param tag: The tag of the field to strip empty fields from
  1107. @type tag: string
  1108. """
  1109. # Check whole record
  1110. if tag is None:
  1111. tags = rec.keys()
  1112. for tag in tags:
  1113. record_strip_empty_fields(rec, tag)
  1114. # Check specific tag of the record
  1115. elif tag in rec:
  1116. # in case of a controlfield
  1117. if tag[:2] == '00':
  1118. if len(rec[tag]) == 0 or not rec[tag][0][3]:
  1119. del rec[tag]
  1120. #in case of a normal field
  1121. else:
  1122. fields = []
  1123. for field in rec[tag]:
  1124. subfields = []
  1125. for subfield in field[0]:
  1126. # check if the subfield has been given a value
  1127. if subfield[1]:
  1128. subfield = (subfield[0], subfield[1].strip()) # Always strip values
  1129. subfields.append(subfield)
  1130. if len(subfields) > 0:
  1131. new_field = create_field(subfields, field[1], field[2],
  1132. field[3])
  1133. fields.append(new_field)
  1134. if len(fields) > 0:
  1135. rec[tag] = fields
  1136. else:
  1137. del rec[tag]
  1138. def record_strip_controlfields(rec):
  1139. """
  1140. Removes all non-empty controlfields from the record
  1141. @param rec: A record dictionary structure
  1142. @type rec: dictionary
  1143. """
  1144. for tag in rec.keys():
  1145. if tag[:2] == '00' and rec[tag][0][3]:
  1146. del rec[tag]
  1147. def record_order_subfields(rec, tag=None):
  1148. """ Orders subfields from a record alphabetically based on subfield code.
  1149. If 'tag' is not None, only a specific tag of the record will be reordered,
  1150. otherwise the whole record.
  1151. @param rec: bibrecord
  1152. @type rec: bibrec
  1153. @param tag: tag where the subfields will be ordered
  1154. @type tag: string
  1155. """
  1156. if rec is None:
  1157. return rec
  1158. if tag is None:
  1159. tags = rec.keys()
  1160. for tag in tags:
  1161. record_order_subfields(rec, tag)
  1162. elif tag in rec:
  1163. for i in xrange(len(rec[tag])):
  1164. field = rec[tag][i]
  1165. # Order subfields alphabetically by subfield code
  1166. ordered_subfields = sorted(field[0], key=lambda subfield: subfield[0])
  1167. rec[tag][i] = (ordered_subfields, field[1], field[2], field[3], field[4])
  1168. def record_empty(rec):
  1169. for key in rec.iterkeys():
  1170. if key not in ('001', '005'):
  1171. return False
  1172. return True
  1173. ### IMPLEMENTATION / INVISIBLE FUNCTIONS
  1174. def _compare_fields(field1, field2, strict=True):
  1175. """
  1176. Compares 2 fields. If strict is True, then the order of the
  1177. subfield will be taken care of, if not then the order of the
  1178. subfields doesn't matter.
  1179. @return: True if the field are equivalent, False otherwise.
  1180. """
  1181. if strict:
  1182. # Return a simple equal test on the field minus the position.
  1183. return field1[:4] == field2[:4]
  1184. else:
  1185. if field1[1:4] != field2[1:4]:
  1186. # Different indicators or controlfield value.
  1187. return False
  1188. else:
  1189. # Compare subfields in a loose way.
  1190. return set(field1[0]) == set(field2[0])
  1191. def _check_field_validity(field):
  1192. """
  1193. Checks if a field is well-formed.
  1194. @param field: A field tuple as returned by create_field()
  1195. @type field: tuple
  1196. @raise InvenioBibRecordFieldError: If the field is invalid.
  1197. """
  1198. if type(field) not in (list, tuple):
  1199. raise InvenioBibRecordFieldError("Field of type '%s' should be either "
  1200. "a list or a tuple." % type(field))
  1201. if len(field) != 5:
  1202. raise InvenioBibRecordFieldError("Field of length '%d' should have 5 "
  1203. "elements." % len(field))
  1204. if type(field[0]) not in (list, tuple):
  1205. raise InvenioBibRecordFieldError("Subfields of type '%s' should be "
  1206. "either a list or a tuple." % type(field[0]))
  1207. if type(field[1]) is not str:
  1208. raise InvenioBibRecordFieldError("Indicator 1 of type '%s' should be "
  1209. "a string." % type(field[1]))
  1210. if type(field[2]) is not str:
  1211. raise InvenioBibRecordFieldError("Indicator 2 of type '%s' should be "
  1212. "a string." % type(field[2]))
  1213. if type(field[3]) is not str:
  1214. raise InvenioBibRecordFieldError("Controlfield value of type '%s' "
  1215. "should be a string." % type(field[3]))
  1216. if type(field[4]) is not int:
  1217. raise InvenioBibRecordFieldError("Global position of type '%s' should "
  1218. "be an int." % type(field[4]))
  1219. for subfield in field[0]:
  1220. if (type(subfield) not in (list, tuple) or
  1221. len(subfield) != 2 or
  1222. type(subfield[0]) is not str or
  1223. type(subfield[1]) is not str):
  1224. raise InvenioBibRecordFieldError("Subfields are malformed. "
  1225. "Should a list of tuples of 2 strings.")
  1226. def _shift_field_positions_global(record, start, delta=1):
  1227. """Shifts all global field positions with global field positions
  1228. higher or equal to 'start' from the value 'delta'."""
  1229. if not delta:
  1230. return
  1231. for tag, fields in record.items():
  1232. newfields = []
  1233. for field in fields:
  1234. if field[4] < start:
  1235. newfields.append(field)
  1236. else:
  1237. # Increment the global field position by delta.
  1238. newfields.append(tuple(list(field[:4]) + [field[4] + delta]))
  1239. record[tag] = newfields
  1240. def _tag_matches_pattern(tag, pattern):
  1241. """Returns true if MARC 'tag' matches a 'pattern'.
  1242. 'pattern' is plain text, with % as wildcard
  1243. Both parameters must be 3 characters long strings.
  1244. For e.g.
  1245. >> _tag_matches_pattern("909", "909") -> True
  1246. >> _tag_matches_pattern("909", "9%9") -> True
  1247. >> _tag_matches_pattern("909", "9%8") -> False
  1248. @param tag: a 3 characters long string
  1249. @param pattern: a 3 characters long string
  1250. @return: False or True"""
  1251. for char1, char2 in zip(tag, pattern):
  1252. if char2 not in ('%', char1):
  1253. return False
  1254. return True
  1255. def _validate_record_field_positions_global(record):
  1256. """
  1257. Checks if the global field positions in the record are valid ie no
  1258. duplicate global field positions and local field positions in the
  1259. list of fields are ascending.
  1260. @param record: the record data structure
  1261. @return: the first error found as a string or None if no error was found
  1262. """
  1263. all_fields = []
  1264. for tag, fields in record.items():
  1265. previous_field_position_global = -1
  1266. for field in fields:
  1267. if field[4] < previous_field_position_global:
  1268. return "Non ascending global field positions in tag '%s'." % tag
  1269. previous_field_position_global = field[4]
  1270. if field[4] in all_fields:
  1271. return ("Duplicate global field position '%d' in tag '%s'" %
  1272. (field[4], tag))
  1273. def _record_sort_by_indicators(record):
  1274. """Sorts the fields inside the record by indicators."""
  1275. for tag, fields in record.items():
  1276. record[tag] = _fields_sort_by_indicators(fields)
  1277. def _fields_sort_by_indicators(fields):
  1278. """Sorts a set of fields by their indicators. Returns a sorted list
  1279. with correct global field positions."""
  1280. field_dict = {}
  1281. field_positions_global = []
  1282. for field in fields:
  1283. field_dict.setdefault(field[1:3], []).append(field)
  1284. field_positions_global.append(field[4])
  1285. indicators = field_dict.keys()
  1286. indicators.sort()
  1287. field_list = []
  1288. for indicator in indicators:
  1289. for field in field_dict[indicator]:
  1290. field_list.append(field[:4] + (field_positions_global.pop(0),))
  1291. return field_list
  1292. def _select_parser(parser=None):
  1293. """Selects the more relevant parser based on the parsers available
  1294. and on the parser desired by the user."""
  1295. if not AVAILABLE_PARSERS:
  1296. # No parser is available. This is bad.
  1297. return None
  1298. if parser is None or parser not in AVAILABLE_PARSERS:
  1299. # Return the best available parser.
  1300. return AVAILABLE_PARSERS[0]
  1301. else:
  1302. return parser
def _create_record_lxml(marcxml,
                        verbose=CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL,
                        correct=CFG_BIBRECORD_DEFAULT_CORRECT,
                        keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
    """Creates a record object using the LXML parser.

    If correct == 1, then perform DTD validation
    If correct == 0, then do not perform DTD validation

    If verbose == 0, the parser will not give warnings.
    If 1 <= verbose <= 3, the parser will not give errors, but will warn
        the user about possible mistakes (implement me!)
    If verbose > 3 then the parser will be strict and will stop in case of
        well-formedness errors or DTD errors.

    All controlfields of the parsed tree are stored first, then all
    datafields; global positions are numbered from 1 in that order.

    @param marcxml: the MARCXML to parse, as a string
    @param verbose: verbosity level; the parser recovers from errors
        when it is <= 3
    @param correct: when true, the input is wrapped in a collection
        element with a DOCTYPE pointing to CFG_MARC21_DTD and validated
    @param keep_singletons: when true, fields and subfields without any
        content are kept instead of being dropped
    @return: the record structure (dictionary of tag -> list of fields)
    @raise InvenioBibRecordParserError: if the parsing fails"""
    parser = etree.XMLParser(dtd_validation = correct,
                             recover = verbose <= 3)
    if correct:
        marcxml = ('<?xml version="1.0" encoding="UTF-8"?>\n'
                   '<!DOCTYPE collection SYSTEM "file://%s">\n'
                   '<collection>\n%s\n</collection>' \
                   % (CFG_MARC21_DTD, marcxml))
    try:
        tree = etree.parse(StringIO(marcxml), parser)
        # parser errors are located in parser.error_log
        # if 1 <= verbose <=3 then show them to the user?
        # if verbose == 0 then continue
        # if verbose >3 then an exception will be thrown
    except Exception, e:
        # Any failure is reported as a bibrecord parser error.
        raise InvenioBibRecordParserError(str(e))

    record = {}
    # Incremented before each field is stored: the first field gets 1.
    field_position_global = 0

    controlfield_iterator = tree.iter(tag='controlfield')
    for controlfield in controlfield_iterator:
        # A missing 'tag' attribute is replaced by '!'.
        tag = controlfield.attrib.get('tag', '!').encode("UTF-8")
        ind1 = ' '
        ind2 = ' '
        text = controlfield.text
        if text is None:
            text = ''
        else:
            text = text.encode("UTF-8")
        subfields = []
        if text or keep_singletons:
            field_position_global += 1
            record.setdefault(tag, []).append((subfields, ind1, ind2, text, field_position_global))

    datafield_iterator = tree.iter(tag='datafield')
    for datafield in datafield_iterator:
        # Missing 'tag', 'ind1' and 'ind2' attributes are replaced by '!'.
        tag = datafield.attrib.get('tag', '!').encode("UTF-8")
        ind1 = datafield.attrib.get('ind1', '!').encode("UTF-8")
        ind2 = datafield.attrib.get('ind2', '!').encode("UTF-8")
        #ind1, ind2 = _wash_indicators(ind1, ind2)
        # Empty and '_' indicators are both stored as a blank.
        if ind1 in ('', '_'): ind1 = ' '
        if ind2 in ('', '_'): ind2 = ' '
        subfields = []
        subfield_iterator = datafield.iter(tag='subfield')
        for subfield in subfield_iterator:
            # A missing 'code' attribute is replaced by '!'.
            code = subfield.attrib.get('code', '!').encode("UTF-8")
            text = subfield.text
            if text is None:
                text = ''
            else:
                text = text.encode("UTF-8")
            if text or keep_singletons:
                subfields.append((code, text))
        if subfields or keep_singletons:
            # Datafields carry no value of their own.
            text = ''
            field_position_global += 1
            record.setdefault(tag, []).append((subfields, ind1, ind2, text, field_position_global))

    return record
def _create_record_rxp(marcxml, verbose=CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL,
                       correct=CFG_BIBRECORD_DEFAULT_CORRECT,
                       keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
    """Creates a record object using the RXP parser.

    If verbose>3 then the parser will be strict and will stop in case of
        well-formedness errors or DTD errors.
    If verbose=0, the parser will not give warnings.
    If 0 < verbose <= 3, the parser will not give errors, but will warn
        the user about possible mistakes

    correct != 0 -> We will try to correct errors such as missing
        attributes
    correct = 0 -> there will not be any attempt to correct errors

    Only the first record of a collection is considered. Controlfields
    are stored first, then datafields; global positions are numbered
    from 1 in that order.

    @param marcxml: the MARCXML to parse, as a string
    @param verbose: verbosity level (see above)
    @param correct: when true, the input is wrapped in a collection
        element with a DOCTYPE pointing to CFG_MARC21_DTD
    @param keep_singletons: when true, fields and subfields without any
        content are kept instead of being dropped
    @return: the record structure, or {} for a collection without record
    @raise InvenioBibRecordParserError: if pyRXP fails to parse the input"""
    if correct:
        # Note that with pyRXP < 1.13 a memory leak has been found
        # involving DTD parsing. So enable correction only if you have
        # pyRXP 1.13 or greater.
        marcxml = ('<?xml version="1.0" encoding="UTF-8"?>\n'
                   '<!DOCTYPE collection SYSTEM "file://%s">\n'
                   '<collection>\n%s\n</collection>' % (CFG_MARC21_DTD, marcxml))

    # Create the pyRXP parser.
    pyrxp_parser = pyRXP.Parser(ErrorOnValidityErrors=0, ProcessDTD=1,
        ErrorOnUnquotedAttributeValues=0, srcName='string input')

    if verbose > 3:
        # Strict mode: validity problems become parsing errors.
        pyrxp_parser.ErrorOnValidityErrors = 1
        pyrxp_parser.ErrorOnUnquotedAttributeValues = 1

    try:
        root = pyrxp_parser.parse(marcxml)
    except pyRXP.error, ex1:
        raise InvenioBibRecordParserError(str(ex1))

    # TAG, ATTRS and CHILDREN are defined elsewhere in this file;
    # presumably the indexes of the node tuples built by pyRXP -- TODO confirm.

    # If record is enclosed in a collection tag, extract it.
    if root[TAG] == 'collection':
        children = _get_children_by_tag_name_rxp(root, 'record')
        if not children:
            return {}
        root = children[0]

    record = {}
    # This is needed because of the record_xml_output function, where we
    # need to know the order of the fields.
    field_position_global = 1

    # Consider the control fields.
    for controlfield in _get_children_by_tag_name_rxp(root, 'controlfield'):
        if controlfield[CHILDREN]:
            value = ''.join([n for n in controlfield[CHILDREN]])
            # Construct the field tuple.
            field = ([], ' ', ' ', value, field_position_global)
            record.setdefault(controlfield[ATTRS]['tag'], []).append(field)
            field_position_global += 1
        elif keep_singletons:
            # Controlfield without content, kept with an empty value.
            field = ([], ' ', ' ', '', field_position_global)
            record.setdefault(controlfield[ATTRS]['tag'], []).append(field)
            field_position_global += 1

    # Consider the data fields.
    for datafield in _get_children_by_tag_name_rxp(root, 'datafield'):
        subfields = []
        for subfield in _get_children_by_tag_name_rxp(datafield, 'subfield'):
            # A missing 'code' attribute is replaced by '!'.
            if subfield[CHILDREN]:
                value = _get_children_as_string_rxp(subfield[CHILDREN])
                subfields.append((subfield[ATTRS].get('code', '!'), value))
            elif keep_singletons:
                subfields.append((subfield[ATTRS].get('code', '!'), ''))

        if subfields or keep_singletons:
            # Create the field.
            # Missing 'tag', 'ind1' and 'ind2' attributes are replaced by '!'.
            tag = datafield[ATTRS].get('tag', '!')
            ind1 = datafield[ATTRS].get('ind1', '!')
            ind2 = datafield[ATTRS].get('ind2', '!')
            ind1, ind2 = _wash_indicators(ind1, ind2)
            # Construct the field tuple.
            field = (subfields, ind1, ind2, '', field_position_global)
            record.setdefault(tag, []).append(field)
            field_position_global += 1

    return record
def _create_record_from_document(document,
                                 keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
    """Creates a record from the document (of type
    xml.dom.minidom.Document or Ft.Xml.Domlette.Document).

    The first element node of the document is used as root; when it is
    a collection, its first record is used instead. Controlfields are
    stored first, then datafields; global positions are numbered from 1
    in that order.

    @param document: the parsed DOM document
    @param keep_singletons: when true, fields and subfields without any
        content are kept instead of being dropped
    @return: the record structure, or {} when the document has no
        element node or the collection holds no record"""
    # Look for the first element node (others, e.g. comments, are skipped).
    root = None
    for node in document.childNodes:
        if node.nodeType == node.ELEMENT_NODE:
            root = node
            break

    if root is None:
        return {}

    if root.tagName == 'collection':
        children = _get_children_by_tag_name(root, 'record')
        if not children:
            return {}
        root = children[0]

    field_position_global = 1
    record = {}

    for controlfield in _get_children_by_tag_name(root, "controlfield"):
        tag = controlfield.getAttributeNS(None, "tag").encode('utf-8')

        # The value is the concatenation of the data of all child nodes.
        text_nodes = controlfield.childNodes
        value = ''.join([n.data for n in text_nodes]).encode("utf-8")

        if value or keep_singletons:
            field = ([], " ", " ", value, field_position_global)
            record.setdefault(tag, []).append(field)
            field_position_global += 1

    for datafield in _get_children_by_tag_name(root, "datafield"):
        subfields = []

        for subfield in _get_children_by_tag_name(datafield, "subfield"):
            value = _get_children_as_string(subfield.childNodes).encode("utf-8")
            if value or keep_singletons:
                code = subfield.getAttributeNS(None, 'code').encode("utf-8")
                # An empty code is replaced by '!'.
                subfields.append((code or '!', value))

        if subfields or keep_singletons:
            # An empty tag is replaced by '!'.
            tag = datafield.getAttributeNS(None, "tag").encode("utf-8") or '!'

            ind1 = datafield.getAttributeNS(None, "ind1").encode("utf-8")
            ind2 = datafield.getAttributeNS(None, "ind2").encode("utf-8")
            ind1, ind2 = _wash_indicators(ind1, ind2)

            field = (subfields, ind1, ind2, "", field_position_global)
            record.setdefault(tag, []).append(field)
            field_position_global += 1

    return record
  1483. def _create_record_minidom(marcxml,
  1484. keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
  1485. """Creates a record using minidom."""
  1486. try:
  1487. dom = xml.dom.minidom.parseString(marcxml)
  1488. except xml.parsers.expat.ExpatError, ex1:
  1489. raise InvenioBibRecordParserError(str(ex1))
  1490. return _create_record_from_document(dom, keep_singletons=keep_singletons)
  1491. def _create_record_4suite(marcxml,
  1492. keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
  1493. """Creates a record using the 4suite parser."""
  1494. try:
  1495. dom = Ft.Xml.Domlette.NonvalidatingReader.parseString(marcxml,
  1496. "urn:dummy")
  1497. except Ft.Xml.ReaderException, ex1:
  1498. raise InvenioBibRecordParserError(ex1.message)
  1499. return _create_record_from_document(dom, keep_singletons=keep_singletons)
  1500. def _concat(alist):
  1501. """Concats a list of lists"""
  1502. return [element for single_list in alist for element in single_list]
  1503. def _subfield_xml_output(subfield):
  1504. """Generates the XML for a subfield object and return it as a string"""
  1505. return ' <subfield code="%s">%s</subfield>' % (subfield[0],
  1506. encode_for_xml(subfield[1]))
  1507. def _order_by_ord(field1, field2):
  1508. """Function used to order the fields according to their ord value"""
  1509. return cmp(field1[1][4], field2[1][4])
  1510. def _order_by_tags(field1, field2):
  1511. """Function used to order the fields according to the tags"""
  1512. return cmp(field1[0], field2[0])
  1513. def _get_children_by_tag_name(node, name):
  1514. """Retrieves all children from node 'node' with name 'name' and
  1515. returns them as a list."""
  1516. try:
  1517. return [child for child in node.childNodes if child.nodeName == name]
  1518. except TypeError:
  1519. return []
  1520. def _get_children_by_tag_name_rxp(node, name):
  1521. """Retrieves all children from 'children' with tag name 'tag' and
  1522. returns them as a list.
  1523. children is a list returned by the RXP parser"""
  1524. try:
  1525. return [child for child in node[CHILDREN] if child[TAG] == name]
  1526. except TypeError:
  1527. return []
  1528. def _get_children_as_string(node):
  1529. """
  1530. Iterates through all the children of a node and returns one string
  1531. containing the values from all the text-nodes recursively.
  1532. """
  1533. out = []
  1534. if node:
  1535. for child in node:
  1536. if child.nodeType == child.TEXT_NODE:
  1537. out.append(child.data)
  1538. else:
  1539. out.append(_get_children_as_string(child.childNodes))
  1540. return ''.join(out)
  1541. def _get_children_as_string_rxp(node):
  1542. """
  1543. RXP version of _get_children_as_string():
  1544. Iterates through all the children of a node and returns one string
  1545. containing the values from all the text-nodes recursively.
  1546. """
  1547. out = []
  1548. if node:
  1549. for child in node:
  1550. if type(child) is str:
  1551. out.append(child)
  1552. else:
  1553. out.append(_get_children_as_string_rxp(child[CHILDREN]))
  1554. return ''.join(out)
  1555. def _wash_indicators(*indicators):
  1556. """
  1557. Washes the values of the indicators. An empty string or an
  1558. underscore is replaced by a blank space.
  1559. @param indicators: a series of indicators to be washed
  1560. @return: a list of washed indicators
  1561. """
  1562. return [indicator in ('', '_') and ' ' or indicator
  1563. for indicator in indicators]
  1564. def _correct_record(record):
  1565. """
  1566. Checks and corrects the structure of the record.
  1567. @param record: the record data structure
  1568. @return: a list of errors found
  1569. """
  1570. errors = []
  1571. for tag in record.keys():
  1572. upper_bound = '999'
  1573. n = len(tag)
  1574. if n > 3:
  1575. i = n - 3
  1576. while i > 0:
  1577. upper_bound = '%s%s' % ('0', upper_bound)
  1578. i -= 1
  1579. # Missing tag. Replace it with dummy tag '000'.
  1580. if tag == '!':
  1581. errors.append((1, '(field number(s): ' +
  1582. str([f[4] for f in record[tag]]) + ')'))
  1583. record['000'] = record.pop(tag)
  1584. tag = '000'
  1585. elif not ('001' <= tag <= upper_bound or tag in ('FMT', 'FFT', 'BDR', 'BDM')):
  1586. errors.append(2)
  1587. record['000'] = record.pop(tag)
  1588. tag = '000'
  1589. fields = []
  1590. for field in record[tag]:
  1591. # Datafield without any subfield.
  1592. if field[0] == [] and field[3] == '':
  1593. errors.append((8, '(field number: ' + str(field[4]) + ')'))
  1594. subfields = []
  1595. for subfield in field[0]:
  1596. if subfield[0] == '!':
  1597. errors.append((3, '(field number: ' + str(field[4]) + ')'))
  1598. newsub = ('', subfield[1])
  1599. else:
  1600. newsub = subfield
  1601. subfields.append(newsub)
  1602. if field[1] == '!':
  1603. errors.append((4, '(field number: ' + str(field[4]) + ')'))
  1604. ind1 = " "
  1605. else:
  1606. ind1 = field[1]
  1607. if field[2] == '!':
  1608. errors.append((5, '(field number: ' + str(field[4]) + ')'))
  1609. ind2 = " "
  1610. else:
  1611. ind2 = field[2]
  1612. fields.append((subfields, ind1, ind2, field[3], field[4]))
  1613. record[tag] = fields
  1614. return errors
  1615. def _warning(code):
  1616. """It returns a warning message of code 'code'.
  1617. If code = (cd, str) it returns the warning message of code 'cd'
  1618. and appends str at the end"""
  1619. if isinstance(code, str):
  1620. return code
  1621. message = ''
  1622. if isinstance(code, tuple):
  1623. if isinstance(code[0], str):
  1624. message = code[1]
  1625. code = code[0]
  1626. return CFG_BIBRECORD_WARNING_MSGS.get(code, '') + message
  1627. def _warnings(alist):
  1628. """Applies the function _warning() to every element in l."""
  1629. return [_warning(element) for element in alist]
  1630. def _compare_lists(list1, list2, custom_cmp):
  1631. """Compares twolists using given comparing function
  1632. @param list1: first list to compare
  1633. @param list2: second list to compare
  1634. @param custom_cmp: a function taking two arguments (element of
  1635. list 1, element of list 2) and
  1636. @return: True or False depending if the values are the same"""
  1637. if len(list1) != len(list2):
  1638. return False
  1639. for element1, element2 in zip(list1, list2):
  1640. if not custom_cmp(element1, element2):
  1641. return False
  1642. return True