invenio /modules/websubmit/lib/wsm_extractor_plugin.py

Language Python Lines 75
MD5 Hash 3c9277317c4a7e48a78a99fd4d713e04
Repository https://github.com/gardenunez/invenio.git View Raw File View Project SPDX
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
## This file is part of Invenio.
## Copyright (C) 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
WebSubmit Metadata Plugin - This is the generic metadata extraction
plugin. Contains methods to extract metadata from many kinds of files.

Dependencies: extractor
"""

# Version string of the plugin API that this module declares.
# NOTE(review): presumably read by the WebSubmit metadata plugin loader to
# check compatibility -- confirm against the loader code.
__plugin_version__ = "WebSubmit File Metadata Plugin API 1.0"

import extractor
from invenio.bibdocfile import decompose_file

def can_read_local(inputfile):
    """
    Checks if inputfile is among the file types this plugin can read
    metadata from.

    The decision is based only on the file extension returned by
    decompose_file(), compared case-insensitively.

    @param inputfile: path to the file
    @type inputfile: string
    @rtype: boolean
    @return: True if file can be processed
    """
    # Supported extensions, lower case with leading dot. Each extension
    # is listed exactly once.
    readable_extensions = (
        '.html', '.doc', '.ps', '.xls', '.ppt',
        '.sxw', '.sdw', '.dvi', '.man', '.flac',
        '.mp3', '.nsf', '.sid', '.ogg', '.wav', '.png',
        '.deb', '.rpm', '.tar.gz', '.zip', '.elf',
        '.s3m', '.xm', '.it', '.flv', '.real', '.avi',
        '.mpeg', '.qt', '.asf',
    )

    # decompose_file returns a tuple (0 base, 1 name, 2 ext)
    ext = decompose_file(inputfile)[2]
    return ext.lower() in readable_extensions

def read_metadata_local(inputfile, verbose):
    """
    Extracts metadata from a file by means of the extractor library.

    @param inputfile: path to the file
    @type inputfile: string
    @param verbose: verbosity (not used by this function)
    @type verbose: int
    @rtype: dict
    @return: dictionary mapping each keyword type to its keyword, both
        encoded as ISO-8859-1
    """
    latin1 = 'iso-8859-1'
    result = {}

    # The extractor yields (keyword type, keyword) pairs for the file
    for kind, value in extractor.Extractor().extract(inputfile):
        # A later pair with the same keyword type replaces an earlier one
        encoded_value = value.encode(latin1)
        result[kind.encode(latin1)] = encoded_value

    return result
Back to Top