PageRenderTime 51ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/invenio/legacy/websubmit/file_metadata_plugins/extractor_plugin.py

https://github.com/MSusik/invenio
Python | 74 lines | 19 code | 9 blank | 46 comment | 1 complexity | c0e5251bc2ee835753e2a6c0b53c26a5 MD5 | raw file
Possible License(s): GPL-2.0
  1. ## This file is part of Invenio.
  2. ## Copyright (C) 2010, 2011 CERN.
  3. ##
  4. ## Invenio is free software; you can redistribute it and/or
  5. ## modify it under the terms of the GNU General Public License as
  6. ## published by the Free Software Foundation; either version 2 of the
  7. ## License, or (at your option) any later version.
  8. ##
  9. ## Invenio is distributed in the hope that it will be useful, but
  10. ## WITHOUT ANY WARRANTY; without even the implied warranty of
  11. ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. ## General Public License for more details.
  13. ##
  14. ## You should have received a copy of the GNU General Public License
  15. ## along with Invenio; if not, write to the Free Software Foundation, Inc.,
  16. ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
  17. """
  18. WebSubmit Metadata Plugin - This is the generic metadata extraction
  19. plugin. Contains methods to extract metadata from many kinds of files.
  20. Dependencies: extractor
  21. """
  22. __plugin_version__ = "WebSubmit File Metadata Plugin API 1.0"
  23. import extractor
  24. from invenio.legacy.bibdocfile.api import decompose_file
  25. def can_read_local(inputfile):
  26. """
  27. Checks if inputfile is among metadata-readable file types
  28. @param inputfile: path to the image
  29. @type inputfile: string
  30. @rtype: boolean
  31. @return: True if file can be processed
  32. """
  33. # Check file type (0 base, 1 name, 2 ext)
  34. ext = decompose_file(inputfile)[2]
  35. return ext.lower() in ['.html', '.doc', '.ps', '.xls', '.ppt',
  36. '.ps', '.sxw', '.sdw', '.dvi', '.man', '.flac',
  37. '.mp3', '.nsf', '.sid', '.ogg', '.wav', '.png',
  38. '.deb', '.rpm', '.tar.gz', '.zip', '.elf',
  39. '.s3m', '.xm', '.it', '.flv', '.real', '.avi',
  40. '.mpeg', '.qt', '.asf']
  41. def read_metadata_local(inputfile, verbose):
  42. """
  43. Metadata extraction from many kind of files
  44. @param inputfile: path to the image
  45. @type inputfile: string
  46. @param verbose: verbosity
  47. @type verbose: int
  48. @rtype: dict
  49. @return: dictionary with metadata
  50. """
  51. # Initialization dict
  52. meta_info = {}
  53. # Extraction
  54. xtract = extractor.Extractor()
  55. # Get the keywords
  56. keys = xtract.extract(inputfile)
  57. # Loop to dump data to the dict
  58. for keyword_type, keyword in keys:
  59. meta_info[keyword_type.encode('iso-8859-1')] = \
  60. keyword.encode('iso-8859-1')
  61. # Return the dictionary
  62. return meta_info