/Lib/mimetypes.py

http://unladen-swallow.googlecode.com/ · Python · 538 lines · 417 code · 37 blank · 84 comment · 48 complexity · d8c0df35c90110d54022647a5e5ee5f6 MD5 · raw file

  1. """Guess the MIME type of a file.
  2. This module defines two useful functions:
  3. guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
  4. guess_extension(type, strict=1) -- guess the extension for a given MIME type.
  5. It also contains the following, for tuning the behavior:
  6. Data:
  7. knownfiles -- list of files to parse
  8. inited -- flag set when init() has been called
  9. suffix_map -- dictionary mapping suffixes to suffixes
  10. encodings_map -- dictionary mapping suffixes to encodings
  11. types_map -- dictionary mapping suffixes to types
  12. Functions:
  13. init([files]) -- parse a list of files, default knownfiles
  14. read_mime_types(file) -- parse one file, return a dictionary or None
  15. """
  16. import os
  17. import posixpath
  18. import urllib
  19. __all__ = [
  20. "guess_type","guess_extension","guess_all_extensions",
  21. "add_type","read_mime_types","init"
  22. ]
  23. knownfiles = [
  24. "/etc/mime.types",
  25. "/etc/httpd/mime.types", # Mac OS X
  26. "/etc/httpd/conf/mime.types", # Apache
  27. "/etc/apache/mime.types", # Apache 1
  28. "/etc/apache2/mime.types", # Apache 2
  29. "/usr/local/etc/httpd/conf/mime.types",
  30. "/usr/local/lib/netscape/mime.types",
  31. "/usr/local/etc/httpd/conf/mime.types", # Apache 1.2
  32. "/usr/local/etc/mime.types", # Apache 1.3
  33. ]
  34. inited = False
  35. _db = None
  36. class MimeTypes:
  37. """MIME-types datastore.
  38. This datastore can handle information from mime.types-style files
  39. and supports basic determination of MIME type from a filename or
  40. URL, and can guess a reasonable extension given a MIME type.
  41. """
  42. def __init__(self, filenames=(), strict=True):
  43. if not inited:
  44. init()
  45. self.encodings_map = encodings_map.copy()
  46. self.suffix_map = suffix_map.copy()
  47. self.types_map = ({}, {}) # dict for (non-strict, strict)
  48. self.types_map_inv = ({}, {})
  49. for (ext, type) in types_map.items():
  50. self.add_type(type, ext, True)
  51. for (ext, type) in common_types.items():
  52. self.add_type(type, ext, False)
  53. for name in filenames:
  54. self.read(name, strict)
  55. def add_type(self, type, ext, strict=True):
  56. """Add a mapping between a type and an extension.
  57. When the extension is already known, the new
  58. type will replace the old one. When the type
  59. is already known the extension will be added
  60. to the list of known extensions.
  61. If strict is true, information will be added to
  62. list of standard types, else to the list of non-standard
  63. types.
  64. """
  65. self.types_map[strict][ext] = type
  66. exts = self.types_map_inv[strict].setdefault(type, [])
  67. if ext not in exts:
  68. exts.append(ext)
  69. def guess_type(self, url, strict=True):
  70. """Guess the type of a file based on its URL.
  71. Return value is a tuple (type, encoding) where type is None if
  72. the type can't be guessed (no or unknown suffix) or a string
  73. of the form type/subtype, usable for a MIME Content-type
  74. header; and encoding is None for no encoding or the name of
  75. the program used to encode (e.g. compress or gzip). The
  76. mappings are table driven. Encoding suffixes are case
  77. sensitive; type suffixes are first tried case sensitive, then
  78. case insensitive.
  79. The suffixes .tgz, .taz and .tz (case sensitive!) are all
  80. mapped to '.tar.gz'. (This is table-driven too, using the
  81. dictionary suffix_map.)
  82. Optional `strict' argument when False adds a bunch of commonly found,
  83. but non-standard types.
  84. """
  85. scheme, url = urllib.splittype(url)
  86. if scheme == 'data':
  87. # syntax of data URLs:
  88. # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
  89. # mediatype := [ type "/" subtype ] *( ";" parameter )
  90. # data := *urlchar
  91. # parameter := attribute "=" value
  92. # type/subtype defaults to "text/plain"
  93. comma = url.find(',')
  94. if comma < 0:
  95. # bad data URL
  96. return None, None
  97. semi = url.find(';', 0, comma)
  98. if semi >= 0:
  99. type = url[:semi]
  100. else:
  101. type = url[:comma]
  102. if '=' in type or '/' not in type:
  103. type = 'text/plain'
  104. return type, None # never compressed, so encoding is None
  105. base, ext = posixpath.splitext(url)
  106. while ext in self.suffix_map:
  107. base, ext = posixpath.splitext(base + self.suffix_map[ext])
  108. if ext in self.encodings_map:
  109. encoding = self.encodings_map[ext]
  110. base, ext = posixpath.splitext(base)
  111. else:
  112. encoding = None
  113. types_map = self.types_map[True]
  114. if ext in types_map:
  115. return types_map[ext], encoding
  116. elif ext.lower() in types_map:
  117. return types_map[ext.lower()], encoding
  118. elif strict:
  119. return None, encoding
  120. types_map = self.types_map[False]
  121. if ext in types_map:
  122. return types_map[ext], encoding
  123. elif ext.lower() in types_map:
  124. return types_map[ext.lower()], encoding
  125. else:
  126. return None, encoding
  127. def guess_all_extensions(self, type, strict=True):
  128. """Guess the extensions for a file based on its MIME type.
  129. Return value is a list of strings giving the possible filename
  130. extensions, including the leading dot ('.'). The extension is not
  131. guaranteed to have been associated with any particular data stream,
  132. but would be mapped to the MIME type `type' by guess_type().
  133. Optional `strict' argument when false adds a bunch of commonly found,
  134. but non-standard types.
  135. """
  136. type = type.lower()
  137. extensions = self.types_map_inv[True].get(type, [])
  138. if not strict:
  139. for ext in self.types_map_inv[False].get(type, []):
  140. if ext not in extensions:
  141. extensions.append(ext)
  142. return extensions
  143. def guess_extension(self, type, strict=True):
  144. """Guess the extension for a file based on its MIME type.
  145. Return value is a string giving a filename extension,
  146. including the leading dot ('.'). The extension is not
  147. guaranteed to have been associated with any particular data
  148. stream, but would be mapped to the MIME type `type' by
  149. guess_type(). If no extension can be guessed for `type', None
  150. is returned.
  151. Optional `strict' argument when false adds a bunch of commonly found,
  152. but non-standard types.
  153. """
  154. extensions = self.guess_all_extensions(type, strict)
  155. if not extensions:
  156. return None
  157. return extensions[0]
  158. def read(self, filename, strict=True):
  159. """
  160. Read a single mime.types-format file, specified by pathname.
  161. If strict is true, information will be added to
  162. list of standard types, else to the list of non-standard
  163. types.
  164. """
  165. fp = open(filename)
  166. self.readfp(fp, strict)
  167. fp.close()
  168. def readfp(self, fp, strict=True):
  169. """
  170. Read a single mime.types-format file.
  171. If strict is true, information will be added to
  172. list of standard types, else to the list of non-standard
  173. types.
  174. """
  175. while 1:
  176. line = fp.readline()
  177. if not line:
  178. break
  179. words = line.split()
  180. for i in range(len(words)):
  181. if words[i][0] == '#':
  182. del words[i:]
  183. break
  184. if not words:
  185. continue
  186. type, suffixes = words[0], words[1:]
  187. for suff in suffixes:
  188. self.add_type(type, '.' + suff, strict)
  189. def guess_type(url, strict=True):
  190. """Guess the type of a file based on its URL.
  191. Return value is a tuple (type, encoding) where type is None if the
  192. type can't be guessed (no or unknown suffix) or a string of the
  193. form type/subtype, usable for a MIME Content-type header; and
  194. encoding is None for no encoding or the name of the program used
  195. to encode (e.g. compress or gzip). The mappings are table
  196. driven. Encoding suffixes are case sensitive; type suffixes are
  197. first tried case sensitive, then case insensitive.
  198. The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
  199. to ".tar.gz". (This is table-driven too, using the dictionary
  200. suffix_map).
  201. Optional `strict' argument when false adds a bunch of commonly found, but
  202. non-standard types.
  203. """
  204. if _db is None:
  205. init()
  206. return _db.guess_type(url, strict)
  207. def guess_all_extensions(type, strict=True):
  208. """Guess the extensions for a file based on its MIME type.
  209. Return value is a list of strings giving the possible filename
  210. extensions, including the leading dot ('.'). The extension is not
  211. guaranteed to have been associated with any particular data
  212. stream, but would be mapped to the MIME type `type' by
  213. guess_type(). If no extension can be guessed for `type', None
  214. is returned.
  215. Optional `strict' argument when false adds a bunch of commonly found,
  216. but non-standard types.
  217. """
  218. if _db is None:
  219. init()
  220. return _db.guess_all_extensions(type, strict)
  221. def guess_extension(type, strict=True):
  222. """Guess the extension for a file based on its MIME type.
  223. Return value is a string giving a filename extension, including the
  224. leading dot ('.'). The extension is not guaranteed to have been
  225. associated with any particular data stream, but would be mapped to the
  226. MIME type `type' by guess_type(). If no extension can be guessed for
  227. `type', None is returned.
  228. Optional `strict' argument when false adds a bunch of commonly found,
  229. but non-standard types.
  230. """
  231. if _db is None:
  232. init()
  233. return _db.guess_extension(type, strict)
  234. def add_type(type, ext, strict=True):
  235. """Add a mapping between a type and an extension.
  236. When the extension is already known, the new
  237. type will replace the old one. When the type
  238. is already known the extension will be added
  239. to the list of known extensions.
  240. If strict is true, information will be added to
  241. list of standard types, else to the list of non-standard
  242. types.
  243. """
  244. if _db is None:
  245. init()
  246. return _db.add_type(type, ext, strict)
  247. def init(files=None):
  248. global suffix_map, types_map, encodings_map, common_types
  249. global inited, _db
  250. inited = True # so that MimeTypes.__init__() doesn't call us again
  251. db = MimeTypes()
  252. if files is None:
  253. files = knownfiles
  254. for file in files:
  255. if os.path.isfile(file):
  256. db.readfp(open(file))
  257. encodings_map = db.encodings_map
  258. suffix_map = db.suffix_map
  259. types_map = db.types_map[True]
  260. common_types = db.types_map[False]
  261. # Make the DB a global variable now that it is fully initialized
  262. _db = db
  263. def read_mime_types(file):
  264. try:
  265. f = open(file)
  266. except IOError:
  267. return None
  268. db = MimeTypes()
  269. db.readfp(f, True)
  270. return db.types_map[True]
  271. def _default_mime_types():
  272. global suffix_map
  273. global encodings_map
  274. global types_map
  275. global common_types
  276. suffix_map = {
  277. '.tgz': '.tar.gz',
  278. '.taz': '.tar.gz',
  279. '.tz': '.tar.gz',
  280. '.tbz2': '.tar.bz2',
  281. }
  282. encodings_map = {
  283. '.gz': 'gzip',
  284. '.Z': 'compress',
  285. '.bz2': 'bzip2',
  286. }
  287. # Before adding new types, make sure they are either registered with IANA,
  288. # at http://www.isi.edu/in-notes/iana/assignments/media-types
  289. # or extensions, i.e. using the x- prefix
  290. # If you add to these, please keep them sorted!
  291. types_map = {
  292. '.a' : 'application/octet-stream',
  293. '.ai' : 'application/postscript',
  294. '.aif' : 'audio/x-aiff',
  295. '.aifc' : 'audio/x-aiff',
  296. '.aiff' : 'audio/x-aiff',
  297. '.au' : 'audio/basic',
  298. '.avi' : 'video/x-msvideo',
  299. '.bat' : 'text/plain',
  300. '.bcpio' : 'application/x-bcpio',
  301. '.bin' : 'application/octet-stream',
  302. '.bmp' : 'image/x-ms-bmp',
  303. '.c' : 'text/plain',
  304. # Duplicates :(
  305. '.cdf' : 'application/x-cdf',
  306. '.cdf' : 'application/x-netcdf',
  307. '.cpio' : 'application/x-cpio',
  308. '.csh' : 'application/x-csh',
  309. '.css' : 'text/css',
  310. '.dll' : 'application/octet-stream',
  311. '.doc' : 'application/msword',
  312. '.dot' : 'application/msword',
  313. '.dvi' : 'application/x-dvi',
  314. '.eml' : 'message/rfc822',
  315. '.eps' : 'application/postscript',
  316. '.etx' : 'text/x-setext',
  317. '.exe' : 'application/octet-stream',
  318. '.gif' : 'image/gif',
  319. '.gtar' : 'application/x-gtar',
  320. '.h' : 'text/plain',
  321. '.hdf' : 'application/x-hdf',
  322. '.htm' : 'text/html',
  323. '.html' : 'text/html',
  324. '.ief' : 'image/ief',
  325. '.jpe' : 'image/jpeg',
  326. '.jpeg' : 'image/jpeg',
  327. '.jpg' : 'image/jpeg',
  328. '.js' : 'application/x-javascript',
  329. '.ksh' : 'text/plain',
  330. '.latex' : 'application/x-latex',
  331. '.m1v' : 'video/mpeg',
  332. '.man' : 'application/x-troff-man',
  333. '.me' : 'application/x-troff-me',
  334. '.mht' : 'message/rfc822',
  335. '.mhtml' : 'message/rfc822',
  336. '.mif' : 'application/x-mif',
  337. '.mov' : 'video/quicktime',
  338. '.movie' : 'video/x-sgi-movie',
  339. '.mp2' : 'audio/mpeg',
  340. '.mp3' : 'audio/mpeg',
  341. '.mp4' : 'video/mp4',
  342. '.mpa' : 'video/mpeg',
  343. '.mpe' : 'video/mpeg',
  344. '.mpeg' : 'video/mpeg',
  345. '.mpg' : 'video/mpeg',
  346. '.ms' : 'application/x-troff-ms',
  347. '.nc' : 'application/x-netcdf',
  348. '.nws' : 'message/rfc822',
  349. '.o' : 'application/octet-stream',
  350. '.obj' : 'application/octet-stream',
  351. '.oda' : 'application/oda',
  352. '.p12' : 'application/x-pkcs12',
  353. '.p7c' : 'application/pkcs7-mime',
  354. '.pbm' : 'image/x-portable-bitmap',
  355. '.pdf' : 'application/pdf',
  356. '.pfx' : 'application/x-pkcs12',
  357. '.pgm' : 'image/x-portable-graymap',
  358. '.pl' : 'text/plain',
  359. '.png' : 'image/png',
  360. '.pnm' : 'image/x-portable-anymap',
  361. '.pot' : 'application/vnd.ms-powerpoint',
  362. '.ppa' : 'application/vnd.ms-powerpoint',
  363. '.ppm' : 'image/x-portable-pixmap',
  364. '.pps' : 'application/vnd.ms-powerpoint',
  365. '.ppt' : 'application/vnd.ms-powerpoint',
  366. '.ps' : 'application/postscript',
  367. '.pwz' : 'application/vnd.ms-powerpoint',
  368. '.py' : 'text/x-python',
  369. '.pyc' : 'application/x-python-code',
  370. '.pyo' : 'application/x-python-code',
  371. '.qt' : 'video/quicktime',
  372. '.ra' : 'audio/x-pn-realaudio',
  373. '.ram' : 'application/x-pn-realaudio',
  374. '.ras' : 'image/x-cmu-raster',
  375. '.rdf' : 'application/xml',
  376. '.rgb' : 'image/x-rgb',
  377. '.roff' : 'application/x-troff',
  378. '.rtx' : 'text/richtext',
  379. '.sgm' : 'text/x-sgml',
  380. '.sgml' : 'text/x-sgml',
  381. '.sh' : 'application/x-sh',
  382. '.shar' : 'application/x-shar',
  383. '.snd' : 'audio/basic',
  384. '.so' : 'application/octet-stream',
  385. '.src' : 'application/x-wais-source',
  386. '.sv4cpio': 'application/x-sv4cpio',
  387. '.sv4crc' : 'application/x-sv4crc',
  388. '.swf' : 'application/x-shockwave-flash',
  389. '.t' : 'application/x-troff',
  390. '.tar' : 'application/x-tar',
  391. '.tcl' : 'application/x-tcl',
  392. '.tex' : 'application/x-tex',
  393. '.texi' : 'application/x-texinfo',
  394. '.texinfo': 'application/x-texinfo',
  395. '.tif' : 'image/tiff',
  396. '.tiff' : 'image/tiff',
  397. '.tr' : 'application/x-troff',
  398. '.tsv' : 'text/tab-separated-values',
  399. '.txt' : 'text/plain',
  400. '.ustar' : 'application/x-ustar',
  401. '.vcf' : 'text/x-vcard',
  402. '.wav' : 'audio/x-wav',
  403. '.wiz' : 'application/msword',
  404. '.wsdl' : 'application/xml',
  405. '.xbm' : 'image/x-xbitmap',
  406. '.xlb' : 'application/vnd.ms-excel',
  407. # Duplicates :(
  408. '.xls' : 'application/excel',
  409. '.xls' : 'application/vnd.ms-excel',
  410. '.xml' : 'text/xml',
  411. '.xpdl' : 'application/xml',
  412. '.xpm' : 'image/x-xpixmap',
  413. '.xsl' : 'application/xml',
  414. '.xwd' : 'image/x-xwindowdump',
  415. '.zip' : 'application/zip',
  416. }
  417. # These are non-standard types, commonly found in the wild. They will
  418. # only match if strict=0 flag is given to the API methods.
  419. # Please sort these too
  420. common_types = {
  421. '.jpg' : 'image/jpg',
  422. '.mid' : 'audio/midi',
  423. '.midi': 'audio/midi',
  424. '.pct' : 'image/pict',
  425. '.pic' : 'image/pict',
  426. '.pict': 'image/pict',
  427. '.rtf' : 'application/rtf',
  428. '.xul' : 'text/xul'
  429. }
  430. _default_mime_types()
  431. if __name__ == '__main__':
  432. import sys
  433. import getopt
  434. USAGE = """\
  435. Usage: mimetypes.py [options] type
  436. Options:
  437. --help / -h -- print this message and exit
  438. --lenient / -l -- additionally search of some common, but non-standard
  439. types.
  440. --extension / -e -- guess extension instead of type
  441. More than one type argument may be given.
  442. """
  443. def usage(code, msg=''):
  444. print USAGE
  445. if msg: print msg
  446. sys.exit(code)
  447. try:
  448. opts, args = getopt.getopt(sys.argv[1:], 'hle',
  449. ['help', 'lenient', 'extension'])
  450. except getopt.error, msg:
  451. usage(1, msg)
  452. strict = 1
  453. extension = 0
  454. for opt, arg in opts:
  455. if opt in ('-h', '--help'):
  456. usage(0)
  457. elif opt in ('-l', '--lenient'):
  458. strict = 0
  459. elif opt in ('-e', '--extension'):
  460. extension = 1
  461. for gtype in args:
  462. if extension:
  463. guess = guess_extension(gtype, strict)
  464. if not guess: print "I don't know anything about type", gtype
  465. else: print guess
  466. else:
  467. guess, encoding = guess_type(gtype, strict)
  468. if not guess: print "I don't know anything about type", gtype
  469. else: print 'type:', guess, 'encoding:', encoding