# Source listing: python/html/mimetypes_8py_source.html

"""Guess the MIME type of a file.


This module defines two useful functions:


guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.


guess_extension(type, strict=1) -- guess the extension for a given MIME type.


It also contains the following, for tuning the behavior:


Data:


knownfiles -- list of files to parse

inited -- flag set when init() has been called

suffix_map -- dictionary mapping suffixes to suffixes

encodings_map -- dictionary mapping suffixes to encodings

types_map -- dictionary mapping suffixes to types

common_types -- dictionary mapping suffixes to non-standard types (used when strict is false)


Functions:


init([files]) -- parse a list of files, default knownfiles

read_mime_types(file) -- parse one file, return a dictionary or None

"""


import os

import posixpath

import urllib


# Names exported by a star-import of this module.
__all__ = ["guess_type","guess_extension","read_mime_types","init"]


# mime.types-style files that init() parses by default, in this order.
# Later files override earlier ones for the same suffix.  Each path is
# listed once; the httpd/conf path used to appear twice, which only made
# init() parse the same file a second time.
knownfiles = [
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
    ]

# Flag set to 1 by init() once it has been called.
inited = 0


class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Each instance starts from copies of the module-level tables
    (encodings_map, suffix_map, types_map and common_types).
    """

    def __init__(self, filenames=()):
        """Copy the module-level tables, then read each file in `filenames`.

        init() is called first if the module has not been initialised yet.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = types_map.copy()
        self.common_types = common_types.copy()
        for name in filenames:
            self.read(name)

    def guess_type(self, url, strict=1):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        For a "data:" URL the media type is taken from the URL itself
        (defaulting to 'text/plain'); a data URL without a comma yields
        (None, None).

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mediatype = url[:semi]
            else:
                mediatype = url[:comma]
            # A leading parameter ("charset=...") or a value without a
            # slash is not a type/subtype pair, so fall back to the default.
            if '=' in mediatype or '/' not in mediatype:
                mediatype = 'text/plain'
            return mediatype, None      # never compressed, so encoding is None

        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. .tgz -> .tar.gz) until none is left.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            # The type is determined by the suffix underneath the encoding.
            base, ext = posixpath.splitext(base)
        else:
            encoding = None

        types_map = self.types_map
        common_types = self.common_types
        if ext in types_map:
            return types_map[ext], encoding
        elif ext.lower() in types_map:
            return types_map[ext.lower()], encoding
        elif strict:
            return None, encoding
        elif ext in common_types:
            return common_types[ext], encoding
        elif ext.lower() in common_types:
            return common_types[ext.lower()], encoding
        else:
            return None, encoding

    def guess_extension(self, type, strict=1):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        for ext, stype in self.types_map.items():
            if type == stype:
                return ext
        if not strict:
            # Use this instance's table, not the module-level one, so the
            # result agrees with what guess_type() on the same object sees.
            for ext, stype in self.common_types.items():
                if type == stype:
                    return ext
        return None

    def read(self, filename):
        """Read a single mime.types-format file, specified by pathname.

        Errors from open() or from parsing propagate to the caller; the
        file is closed in either case.
        """
        fp = open(filename)
        try:
            self.readfp(fp)
        finally:
            fp.close()

    def readfp(self, fp):
        """Read a single mime.types-format file.

        `fp` only needs a readline() method.  Each non-empty line has the
        form "type suffix suffix ..."; everything from the first word that
        starts with '#' to the end of the line is ignored.  Every suffix is
        stored in self.types_map with a leading dot.
        """
        map = self.types_map
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop a trailing comment (and whole-line comments).
            for i in range(len(words)):
                if words[i][0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                map['.' + suff] = type

def guess_type(url, strict=1):
    """Guess the MIME type and encoding of the file named by a URL.

    Returns a (type, encoding) tuple.  `type' is a type/subtype string
    usable for a MIME Content-type header, or None when the suffix is
    missing or unknown.  `encoding' is the name of the program used to
    encode the file (e.g. compress or gzip), or None when the file is
    not encoded.

    All lookups are table driven.  Encoding suffixes are matched case
    sensitively; type suffixes are tried case sensitively first and
    then case insensitively.  The suffixes .tgz, .taz and .tz (case
    sensitive!) are treated as ".tar.gz" by way of suffix_map.

    When the optional `strict' argument is false, a set of commonly
    found but non-standard types is consulted as well.
    """
    # init() rebinds the module-level name `guess_type' to the bound
    # method of the database it builds, so the name looked up below is
    # that method and not this function.
    init()
    lookup = guess_type
    return lookup(url, strict)


def guess_extension(type, strict=1):
    """Guess a filename extension for the MIME type `type'.

    Returns the extension as a string including its leading dot ('.'),
    or None when no extension can be guessed.  The extension is not
    guaranteed to have been associated with any particular data stream,
    but guess_type() would map it back to `type'.

    When the optional `strict' argument is false, a set of commonly
    found but non-standard types is consulted as well.
    """
    # init() rebinds the module-level name `guess_extension' to the bound
    # method of the database it builds, so the name looked up below is
    # that method and not this function.
    init()
    lookup = guess_extension
    return lookup(type, strict)


def init(files=None):
    """Build the module-level MIME database and publish it as globals.

    files -- sequence of mime.types-style file names to parse; defaults
             to knownfiles.  Entries that are not existing files are
             skipped.

    A fresh MimeTypes instance is created, the files are read into it,
    and the module-level names encodings_map, suffix_map, types_map,
    common_types, guess_extension and guess_type are rebound to that
    instance's tables and bound methods.
    """
    global guess_extension, guess_type
    global suffix_map, types_map, encodings_map, common_types
    global inited
    # Must be set before MimeTypes() is constructed: its __init__ calls
    # init() whenever `inited' is false, which would recurse endlessly.
    inited = 1
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for filename in files:
        if os.path.isfile(filename):
            fp = open(filename)
            try:
                db.readfp(fp)
            finally:
                # Close explicitly instead of relying on garbage collection.
                fp.close()
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map
    guess_extension = db.guess_extension
    guess_type = db.guess_type
    common_types = db.common_types


def read_mime_types(file):
    """Parse one mime.types-format file and return the resulting type map.

    file -- pathname of the file to parse.

    Returns None if the file cannot be opened.  Otherwise returns the
    types_map dictionary of a fresh MimeTypes instance after the file has
    been read into it, i.e. the module's current suffix-to-type table
    plus the entries from `file'.
    """
    try:
        f = open(file)
    except IOError:
        return None
    try:
        db = MimeTypes()
        db.readfp(f)
    finally:
        # Always release the handle, even if parsing fails.
        f.close()
    return db.types_map


# Shorthand suffixes and the suffix chain they stand for.  guess_type()
# replaces a matching suffix with its value before looking up the
# encoding and the type.
suffix_map = {

    '.tgz': '.tar.gz',

    '.taz': '.tar.gz',

    '.tz': '.tar.gz',

    }


# Suffixes that denote an encoding.  guess_type() reports the value as the
# encoding and determines the type from the suffix underneath it.
encodings_map = {

    '.gz': 'gzip',

    '.Z': 'compress',

    }


# Before adding new types, make sure they are either registered with IANA, at

# http://www.isi.edu/in-notes/iana/assignments/media-types

# or extensions, i.e. using the x- prefix


# If you add to these, please keep them sorted!

# Suffix (including the leading dot) -> MIME type.  Every key appears once:
# a repeated key in a dict display silently keeps only the last value, so
# the two overridden entries that used to be listed here were dead code.
types_map = {
    '.a'      : 'application/octet-stream',
    '.ai'     : 'application/postscript',
    '.aif'    : 'audio/x-aiff',
    '.aifc'   : 'audio/x-aiff',
    '.aiff'   : 'audio/x-aiff',
    '.au'     : 'audio/basic',
    '.avi'    : 'video/x-msvideo',
    '.bat'    : 'text/plain',
    '.bcpio'  : 'application/x-bcpio',
    '.bin'    : 'application/octet-stream',
    '.bmp'    : 'image/x-ms-bmp',
    '.c'      : 'text/plain',
    # '.cdf' is also used for 'application/x-cdf'; that earlier duplicate
    # entry was always overridden by this one.
    '.cdf'    : 'application/x-netcdf',
    '.cpio'   : 'application/x-cpio',
    '.csh'    : 'application/x-csh',
    '.css'    : 'text/css',
    '.dll'    : 'application/octet-stream',
    '.doc'    : 'application/msword',
    '.dot'    : 'application/msword',
    '.dvi'    : 'application/x-dvi',
    '.eml'    : 'message/rfc822',
    '.eps'    : 'application/postscript',
    '.etx'    : 'text/x-setext',
    '.exe'    : 'application/octet-stream',
    '.gif'    : 'image/gif',
    '.gtar'   : 'application/x-gtar',
    '.h'      : 'text/plain',
    '.hdf'    : 'application/x-hdf',
    '.htm'    : 'text/html',
    '.html'   : 'text/html',
    '.ief'    : 'image/ief',
    '.jpe'    : 'image/jpeg',
    '.jpeg'   : 'image/jpeg',
    '.jpg'    : 'image/jpeg',
    '.js'     : 'application/x-javascript',
    '.ksh'    : 'text/plain',
    '.latex'  : 'application/x-latex',
    '.m1v'    : 'video/mpeg',
    '.man'    : 'application/x-troff-man',
    '.me'     : 'application/x-troff-me',
    '.mht'    : 'message/rfc822',
    '.mhtml'  : 'message/rfc822',
    '.mif'    : 'application/x-mif',
    '.mov'    : 'video/quicktime',
    '.movie'  : 'video/x-sgi-movie',
    '.mp2'    : 'audio/mpeg',
    '.mp3'    : 'audio/mpeg',
    '.mpa'    : 'video/mpeg',
    '.mpe'    : 'video/mpeg',
    '.mpeg'   : 'video/mpeg',
    '.mpg'    : 'video/mpeg',
    '.ms'     : 'application/x-troff-ms',
    '.nc'     : 'application/x-netcdf',
    '.nws'    : 'message/rfc822',
    '.o'      : 'application/octet-stream',
    '.obj'    : 'application/octet-stream',
    '.oda'    : 'application/oda',
    '.p12'    : 'application/x-pkcs12',
    '.p7c'    : 'application/pkcs7-mime',
    '.pbm'    : 'image/x-portable-bitmap',
    '.pdf'    : 'application/pdf',
    '.pfx'    : 'application/x-pkcs12',
    '.pgm'    : 'image/x-portable-graymap',
    '.pl'     : 'text/plain',
    '.png'    : 'image/png',
    '.pnm'    : 'image/x-portable-anymap',
    '.pot'    : 'application/vnd.ms-powerpoint',
    '.ppa'    : 'application/vnd.ms-powerpoint',
    '.ppm'    : 'image/x-portable-pixmap',
    '.pps'    : 'application/vnd.ms-powerpoint',
    '.ppt'    : 'application/vnd.ms-powerpoint',
    '.ps'     : 'application/postscript',
    '.pwz'    : 'application/vnd.ms-powerpoint',
    '.py'     : 'text/x-python',
    '.pyc'    : 'application/x-python-code',
    '.pyo'    : 'application/x-python-code',
    '.qt'     : 'video/quicktime',
    '.ra'     : 'audio/x-pn-realaudio',
    '.ram'    : 'application/x-pn-realaudio',
    '.ras'    : 'image/x-cmu-raster',
    '.rdf'    : 'application/xml',
    '.rgb'    : 'image/x-rgb',
    '.roff'   : 'application/x-troff',
    '.rtx'    : 'text/richtext',
    '.sgm'    : 'text/x-sgml',
    '.sgml'   : 'text/x-sgml',
    '.sh'     : 'application/x-sh',
    '.shar'   : 'application/x-shar',
    '.snd'    : 'audio/basic',
    '.so'     : 'application/octet-stream',
    '.src'    : 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc' : 'application/x-sv4crc',
    '.t'      : 'application/x-troff',
    '.tar'    : 'application/x-tar',
    '.tcl'    : 'application/x-tcl',
    '.tex'    : 'application/x-tex',
    '.texi'   : 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif'    : 'image/tiff',
    '.tiff'   : 'image/tiff',
    '.tr'     : 'application/x-troff',
    '.tsv'    : 'text/tab-separated-values',
    '.txt'    : 'text/plain',
    '.ustar'  : 'application/x-ustar',
    '.vcf'    : 'text/x-vcard',
    '.wav'    : 'audio/x-wav',
    '.wiz'    : 'application/msword',
    '.xbm'    : 'image/x-xbitmap',
    '.xlb'    : 'application/vnd.ms-excel',
    # '.xls' is also used for 'application/excel'; that earlier duplicate
    # entry was always overridden by this one.
    '.xls'    : 'application/vnd.ms-excel',
    '.xml'    : 'text/xml',
    '.xpm'    : 'image/x-xpixmap',
    '.xsl'    : 'application/xml',
    '.xwd'    : 'image/x-xwindowdump',
    '.zip'    : 'application/zip',
    }


# These are non-standard types, commonly found in the wild.  They will only

# match if strict=0 flag is given to the API methods.


# Please sort these too

# Suffix -> non-standard MIME type.  '.jpg' is also a key of types_map,
# which guess_type() consults first, so the '.jpg' entry here only matters
# when mapping the type 'image/jpg' back to an extension.
common_types = {

    '.jpg' : 'image/jpg',

    '.mid' : 'audio/midi',

    '.midi': 'audio/midi',

    '.pct' : 'image/pict',

    '.pic' : 'image/pict',

    '.pict': 'image/pict',

    '.rtf' : 'application/rtf',

    '.xul' : 'text/xul'

    }


# Command-line interface: guess the type (or, with -e, the extension) for
# each argument.  Written without print statements or the
# "except X, name" form so the module parses on Python 2 and Python 3;
# the text written to stdout is unchanged.
if __name__ == '__main__':
    import sys
    import getopt

    # Usage text, reproduced verbatim (including its blank lines).
    USAGE = """\

Usage: mimetypes.py [options] type


Options:

    --help / -h       -- print this message and exit

    --lenient / -l    -- additionally search of some common, but non-standard

                         types.

    --extension / -e  -- guess extension instead of type


More than one type argument may be given.

"""

    def usage(code, msg=''):
        """Print the usage text, then `msg' if it is true, and exit with `code'."""
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error:
        # sys.exc_info() fetches the exception object in a way that is
        # valid syntax on every Python version.
        usage(1, sys.exc_info()[1])

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1

    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % gtype)
            else:
                print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % gtype)
            else:
                # Same text the print statement produced: items joined
                # by single spaces.
                print('type: %s encoding: %s' % (guess, encoding))