PageRenderTime 81ms CodeModel.GetById 56ms app.highlight 20ms RepoModel.GetById 1ms app.codeStats 0ms

/Lib/mimetypes.py

http://unladen-swallow.googlecode.com/
Python | 538 lines | 447 code | 17 blank | 74 comment | 34 complexity | d8c0df35c90110d54022647a5e5ee5f6 MD5 | raw file
  1"""Guess the MIME type of a file.
  2
  3This module defines two useful functions:
  4
  5guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
  6
  7guess_extension(type, strict=1) -- guess the extension for a given MIME type.
  8
  9It also contains the following, for tuning the behavior:
 10
 11Data:
 12
 13knownfiles -- list of files to parse
 14inited -- flag set when init() has been called
 15suffix_map -- dictionary mapping suffixes to suffixes
 16encodings_map -- dictionary mapping suffixes to encodings
 17types_map -- dictionary mapping suffixes to types
 18
 19Functions:
 20
 21init([files]) -- parse a list of files, default knownfiles
 22read_mime_types(file) -- parse one file, return a dictionary or None
 23"""
 24
 25import os
 26import posixpath
 27import urllib
 28
# Names exported by ``from mimetypes import *``.
__all__ = [
    "guess_type","guess_extension","guess_all_extensions",
    "add_type","read_mime_types","init"
]

# Default mime.types-style files that init() tries when it is called without
# an explicit file list.  init() skips any path that is not an existing file.
# NOTE(review): "/usr/local/etc/httpd/conf/mime.types" is listed twice below,
# so init() parses that file twice when it exists -- confirm whether that is
# intentional before removing either entry.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Set to True by init(); MimeTypes.__init__() checks it to decide whether it
# has to call init() itself.
inited = False
# Shared MimeTypes instance used by the module-level helper functions.
# It stays None until init() has finished building it.
_db = None
 48
 49
 50class MimeTypes:
 51    """MIME-types datastore.
 52
 53    This datastore can handle information from mime.types-style files
 54    and supports basic determination of MIME type from a filename or
 55    URL, and can guess a reasonable extension given a MIME type.
 56    """
 57
 58    def __init__(self, filenames=(), strict=True):
 59        if not inited:
 60            init()
 61        self.encodings_map = encodings_map.copy()
 62        self.suffix_map = suffix_map.copy()
 63        self.types_map = ({}, {}) # dict for (non-strict, strict)
 64        self.types_map_inv = ({}, {})
 65        for (ext, type) in types_map.items():
 66            self.add_type(type, ext, True)
 67        for (ext, type) in common_types.items():
 68            self.add_type(type, ext, False)
 69        for name in filenames:
 70            self.read(name, strict)
 71
 72    def add_type(self, type, ext, strict=True):
 73        """Add a mapping between a type and an extension.
 74
 75        When the extension is already known, the new
 76        type will replace the old one. When the type
 77        is already known the extension will be added
 78        to the list of known extensions.
 79
 80        If strict is true, information will be added to
 81        list of standard types, else to the list of non-standard
 82        types.
 83        """
 84        self.types_map[strict][ext] = type
 85        exts = self.types_map_inv[strict].setdefault(type, [])
 86        if ext not in exts:
 87            exts.append(ext)
 88
 89    def guess_type(self, url, strict=True):
 90        """Guess the type of a file based on its URL.
 91
 92        Return value is a tuple (type, encoding) where type is None if
 93        the type can't be guessed (no or unknown suffix) or a string
 94        of the form type/subtype, usable for a MIME Content-type
 95        header; and encoding is None for no encoding or the name of
 96        the program used to encode (e.g. compress or gzip).  The
 97        mappings are table driven.  Encoding suffixes are case
 98        sensitive; type suffixes are first tried case sensitive, then
 99        case insensitive.
100
101        The suffixes .tgz, .taz and .tz (case sensitive!) are all
102        mapped to '.tar.gz'.  (This is table-driven too, using the
103        dictionary suffix_map.)
104
105        Optional `strict' argument when False adds a bunch of commonly found,
106        but non-standard types.
107        """
108        scheme, url = urllib.splittype(url)
109        if scheme == 'data':
110            # syntax of data URLs:
111            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
112            # mediatype := [ type "/" subtype ] *( ";" parameter )
113            # data      := *urlchar
114            # parameter := attribute "=" value
115            # type/subtype defaults to "text/plain"
116            comma = url.find(',')
117            if comma < 0:
118                # bad data URL
119                return None, None
120            semi = url.find(';', 0, comma)
121            if semi >= 0:
122                type = url[:semi]
123            else:
124                type = url[:comma]
125            if '=' in type or '/' not in type:
126                type = 'text/plain'
127            return type, None           # never compressed, so encoding is None
128        base, ext = posixpath.splitext(url)
129        while ext in self.suffix_map:
130            base, ext = posixpath.splitext(base + self.suffix_map[ext])
131        if ext in self.encodings_map:
132            encoding = self.encodings_map[ext]
133            base, ext = posixpath.splitext(base)
134        else:
135            encoding = None
136        types_map = self.types_map[True]
137        if ext in types_map:
138            return types_map[ext], encoding
139        elif ext.lower() in types_map:
140            return types_map[ext.lower()], encoding
141        elif strict:
142            return None, encoding
143        types_map = self.types_map[False]
144        if ext in types_map:
145            return types_map[ext], encoding
146        elif ext.lower() in types_map:
147            return types_map[ext.lower()], encoding
148        else:
149            return None, encoding
150
151    def guess_all_extensions(self, type, strict=True):
152        """Guess the extensions for a file based on its MIME type.
153
154        Return value is a list of strings giving the possible filename
155        extensions, including the leading dot ('.').  The extension is not
156        guaranteed to have been associated with any particular data stream,
157        but would be mapped to the MIME type `type' by guess_type().
158
159        Optional `strict' argument when false adds a bunch of commonly found,
160        but non-standard types.
161        """
162        type = type.lower()
163        extensions = self.types_map_inv[True].get(type, [])
164        if not strict:
165            for ext in self.types_map_inv[False].get(type, []):
166                if ext not in extensions:
167                    extensions.append(ext)
168        return extensions
169
170    def guess_extension(self, type, strict=True):
171        """Guess the extension for a file based on its MIME type.
172
173        Return value is a string giving a filename extension,
174        including the leading dot ('.').  The extension is not
175        guaranteed to have been associated with any particular data
176        stream, but would be mapped to the MIME type `type' by
177        guess_type().  If no extension can be guessed for `type', None
178        is returned.
179
180        Optional `strict' argument when false adds a bunch of commonly found,
181        but non-standard types.
182        """
183        extensions = self.guess_all_extensions(type, strict)
184        if not extensions:
185            return None
186        return extensions[0]
187
188    def read(self, filename, strict=True):
189        """
190        Read a single mime.types-format file, specified by pathname.
191
192        If strict is true, information will be added to
193        list of standard types, else to the list of non-standard
194        types.
195        """
196        fp = open(filename)
197        self.readfp(fp, strict)
198        fp.close()
199
200    def readfp(self, fp, strict=True):
201        """
202        Read a single mime.types-format file.
203
204        If strict is true, information will be added to
205        list of standard types, else to the list of non-standard
206        types.
207        """
208        while 1:
209            line = fp.readline()
210            if not line:
211                break
212            words = line.split()
213            for i in range(len(words)):
214                if words[i][0] == '#':
215                    del words[i:]
216                    break
217            if not words:
218                continue
219            type, suffixes = words[0], words[1:]
220            for suff in suffixes:
221                self.add_type(type, '.' + suff, strict)
222
def guess_type(url, strict=True):
    """Guess the MIME type of a file from its URL, using the shared database.

    Returns a (type, encoding) tuple.  `type' is a 'type/subtype' string
    suitable for a MIME Content-type header, or None when the suffix is
    missing or unknown.  `encoding' is the name of the program used to
    encode the data (e.g. compress or gzip), or None for no encoding.

    All lookups are table driven.  Encoding suffixes are matched case
    sensitively; type suffixes are tried case sensitively first and then
    case insensitively.  The suffixes .tgz, .taz and .tz (case sensitive!)
    are expanded to ".tar.gz" through the suffix_map dictionary.

    When the optional `strict' argument is false, a number of commonly
    found but non-standard types are recognised as well.
    """
    if _db is None:
        # First use of the module-level API: build the shared database.
        init()
    lookup = _db.guess_type
    return lookup(url, strict)
244
245
def guess_all_extensions(type, strict=True):
    """Guess every extension for a MIME type, using the shared database.

    Returns a list of filename extensions, each including the leading
    dot ('.').  The extensions are not guaranteed to have been associated
    with any particular data stream, but guess_type() would map them to
    the MIME type `type'.  The result is produced by the shared
    database's guess_all_extensions() method, which returns a list
    (empty when nothing is known about `type').

    When the optional `strict' argument is false, a number of commonly
    found but non-standard types are considered as well.
    """
    if _db is None:
        # The shared database is created lazily on first use.
        init()
    shared_db = _db
    return shared_db.guess_all_extensions(type, strict)
262
def guess_extension(type, strict=True):
    """Guess one extension for a MIME type, using the shared database.

    Returns a filename extension string including the leading dot ('.'),
    or None when no extension can be guessed for `type'.  The extension
    is not guaranteed to have been associated with any particular data
    stream, but guess_type() would map it to the MIME type `type'.

    When the optional `strict' argument is false, a number of commonly
    found but non-standard types are considered as well.
    """
    if _db is None:
        # Nothing has initialised the shared database yet; do it now.
        init()
    best_guess = _db.guess_extension(type, strict)
    return best_guess
278
def add_type(type, ext, strict=True):
    """Register a type <-> extension mapping in the shared database.

    If the extension is already known, its type is replaced by the new
    one.  If the type is already known, the extension is added to that
    type's list of known extensions.

    A true `strict' stores the mapping with the standard types; a false
    one stores it with the non-standard types.
    """
    if _db is None:
        # Make sure the shared database exists before modifying it.
        init()
    register = _db.add_type
    return register(type, ext, strict)
294
295
def init(files=None):
    """(Re)build the module-level tables and the shared database.

    A new MimeTypes instance is seeded from the current module-level
    tables and then extended with every entry of `files' that is an
    existing file (default: knownfiles); other paths are skipped
    silently.  Afterwards suffix_map, encodings_map, types_map and
    common_types are rebound to the new instance's tables, and the
    instance becomes the database used by the module-level functions.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for path in files:
        if os.path.isfile(path):
            # Close each file as soon as it has been parsed instead of
            # leaving the handle to the garbage collector.
            with open(path) as fp:
                db.readfp(fp)
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Make the DB a global variable now that it is fully initialized
    _db = db
312
313
def read_mime_types(file):
    """Parse one mime.types-format file and return its strict type table.

    `file' is the pathname of the file to read.  Returns None when the
    file cannot be opened (IOError).  Otherwise returns the dictionary
    mapping suffixes (with the leading dot) to types of a fresh
    MimeTypes instance after the file's entries were added to it as
    strict types -- so the default strict types are included as well.
    """
    try:
        f = open(file)
    except IOError:
        return None
    # Only the open() failure is turned into None; the file is closed once
    # parsing is done, whether or not parsing raises.
    with f:
        db = MimeTypes()
        db.readfp(f, True)
    return db.types_map[True]
322
323
def _default_mime_types():
    """Install the built-in default tables as module-level globals.

    Rebinds suffix_map, encodings_map, types_map and common_types to
    freshly built dictionaries.
    """
    global suffix_map
    global encodings_map
    global types_map
    global common_types

    # Shorthand suffixes and the suffix sequence each one stands for.
    suffix_map = {
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        }

    # Suffixes that denote an encoding rather than a content type.
    encodings_map = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.isi.edu/in-notes/iana/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted!
    types_map = {
        '.a'      : 'application/octet-stream',
        '.ai'     : 'application/postscript',
        '.aif'    : 'audio/x-aiff',
        '.aifc'   : 'audio/x-aiff',
        '.aiff'   : 'audio/x-aiff',
        '.au'     : 'audio/basic',
        '.avi'    : 'video/x-msvideo',
        '.bat'    : 'text/plain',
        '.bcpio'  : 'application/x-bcpio',
        '.bin'    : 'application/octet-stream',
        '.bmp'    : 'image/x-ms-bmp',
        '.c'      : 'text/plain',
        # '.cdf' is also seen as 'application/x-cdf'.  A dict can hold only
        # one value per key, so only the effective mapping is listed.
        '.cdf'    : 'application/x-netcdf',
        '.cpio'   : 'application/x-cpio',
        '.csh'    : 'application/x-csh',
        '.css'    : 'text/css',
        '.dll'    : 'application/octet-stream',
        '.doc'    : 'application/msword',
        '.dot'    : 'application/msword',
        '.dvi'    : 'application/x-dvi',
        '.eml'    : 'message/rfc822',
        '.eps'    : 'application/postscript',
        '.etx'    : 'text/x-setext',
        '.exe'    : 'application/octet-stream',
        '.gif'    : 'image/gif',
        '.gtar'   : 'application/x-gtar',
        '.h'      : 'text/plain',
        '.hdf'    : 'application/x-hdf',
        '.htm'    : 'text/html',
        '.html'   : 'text/html',
        '.ief'    : 'image/ief',
        '.jpe'    : 'image/jpeg',
        '.jpeg'   : 'image/jpeg',
        '.jpg'    : 'image/jpeg',
        '.js'     : 'application/x-javascript',
        '.ksh'    : 'text/plain',
        '.latex'  : 'application/x-latex',
        '.m1v'    : 'video/mpeg',
        '.man'    : 'application/x-troff-man',
        '.me'     : 'application/x-troff-me',
        '.mht'    : 'message/rfc822',
        '.mhtml'  : 'message/rfc822',
        '.mif'    : 'application/x-mif',
        '.mov'    : 'video/quicktime',
        '.movie'  : 'video/x-sgi-movie',
        '.mp2'    : 'audio/mpeg',
        '.mp3'    : 'audio/mpeg',
        '.mp4'    : 'video/mp4',
        '.mpa'    : 'video/mpeg',
        '.mpe'    : 'video/mpeg',
        '.mpeg'   : 'video/mpeg',
        '.mpg'    : 'video/mpeg',
        '.ms'     : 'application/x-troff-ms',
        '.nc'     : 'application/x-netcdf',
        '.nws'    : 'message/rfc822',
        '.o'      : 'application/octet-stream',
        '.obj'    : 'application/octet-stream',
        '.oda'    : 'application/oda',
        '.p12'    : 'application/x-pkcs12',
        '.p7c'    : 'application/pkcs7-mime',
        '.pbm'    : 'image/x-portable-bitmap',
        '.pdf'    : 'application/pdf',
        '.pfx'    : 'application/x-pkcs12',
        '.pgm'    : 'image/x-portable-graymap',
        '.pl'     : 'text/plain',
        '.png'    : 'image/png',
        '.pnm'    : 'image/x-portable-anymap',
        '.pot'    : 'application/vnd.ms-powerpoint',
        '.ppa'    : 'application/vnd.ms-powerpoint',
        '.ppm'    : 'image/x-portable-pixmap',
        '.pps'    : 'application/vnd.ms-powerpoint',
        '.ppt'    : 'application/vnd.ms-powerpoint',
        '.ps'     : 'application/postscript',
        '.pwz'    : 'application/vnd.ms-powerpoint',
        '.py'     : 'text/x-python',
        '.pyc'    : 'application/x-python-code',
        '.pyo'    : 'application/x-python-code',
        '.qt'     : 'video/quicktime',
        '.ra'     : 'audio/x-pn-realaudio',
        '.ram'    : 'application/x-pn-realaudio',
        '.ras'    : 'image/x-cmu-raster',
        '.rdf'    : 'application/xml',
        '.rgb'    : 'image/x-rgb',
        '.roff'   : 'application/x-troff',
        '.rtx'    : 'text/richtext',
        '.sgm'    : 'text/x-sgml',
        '.sgml'   : 'text/x-sgml',
        '.sh'     : 'application/x-sh',
        '.shar'   : 'application/x-shar',
        '.snd'    : 'audio/basic',
        '.so'     : 'application/octet-stream',
        '.src'    : 'application/x-wais-source',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc' : 'application/x-sv4crc',
        '.swf'    : 'application/x-shockwave-flash',
        '.t'      : 'application/x-troff',
        '.tar'    : 'application/x-tar',
        '.tcl'    : 'application/x-tcl',
        '.tex'    : 'application/x-tex',
        '.texi'   : 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.tif'    : 'image/tiff',
        '.tiff'   : 'image/tiff',
        '.tr'     : 'application/x-troff',
        '.tsv'    : 'text/tab-separated-values',
        '.txt'    : 'text/plain',
        '.ustar'  : 'application/x-ustar',
        '.vcf'    : 'text/x-vcard',
        '.wav'    : 'audio/x-wav',
        '.wiz'    : 'application/msword',
        '.wsdl'   : 'application/xml',
        '.xbm'    : 'image/x-xbitmap',
        '.xlb'    : 'application/vnd.ms-excel',
        # '.xls' is also seen as 'application/excel'.  As with '.cdf', only
        # the effective mapping is listed.
        '.xls'    : 'application/vnd.ms-excel',
        '.xml'    : 'text/xml',
        '.xpdl'   : 'application/xml',
        '.xpm'    : 'image/x-xpixmap',
        '.xsl'    : 'application/xml',
        '.xwd'    : 'image/x-xwindowdump',
        '.zip'    : 'application/zip',
        }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = {
        '.jpg' : 'image/jpg',
        '.mid' : 'audio/midi',
        '.midi': 'audio/midi',
        '.pct' : 'image/pict',
        '.pic' : 'image/pict',
        '.pict': 'image/pict',
        '.rtf' : 'application/rtf',
        '.xul' : 'text/xul',
        }
489
490
491_default_mime_types()
492
493
494if __name__ == '__main__':
495    import sys
496    import getopt
497
498    USAGE = """\
499Usage: mimetypes.py [options] type
500
501Options:
502    --help / -h       -- print this message and exit
503    --lenient / -l    -- additionally search of some common, but non-standard
504                         types.
505    --extension / -e  -- guess extension instead of type
506
507More than one type argument may be given.
508"""
509
510    def usage(code, msg=''):
511        print USAGE
512        if msg: print msg
513        sys.exit(code)
514
515    try:
516        opts, args = getopt.getopt(sys.argv[1:], 'hle',
517                                   ['help', 'lenient', 'extension'])
518    except getopt.error, msg:
519        usage(1, msg)
520
521    strict = 1
522    extension = 0
523    for opt, arg in opts:
524        if opt in ('-h', '--help'):
525            usage(0)
526        elif opt in ('-l', '--lenient'):
527            strict = 0
528        elif opt in ('-e', '--extension'):
529            extension = 1
530    for gtype in args:
531        if extension:
532            guess = guess_extension(gtype, strict)
533            if not guess: print "I don't know anything about type", gtype
534            else: print guess
535        else:
536            guess, encoding = guess_type(gtype, strict)
537            if not guess: print "I don't know anything about type", gtype
538            else: print 'type:', guess, 'encoding:', encoding