python/html/urllib_8py_source.html

"""Open an arbitrary URL.


See the following document for more info on URLs:

"Names and Addresses, URIs, URLs, URNs, URCs", at

http://www.w3.org/pub/WWW/Addressing/Overview.html


See also the HTTP spec (from which the error codes are derived):

"HTTP - Hypertext Transfer Protocol", at

http://www.w3.org/pub/WWW/Protocols/


Related standards and specs:

- RFC1808: the "relative URL" spec. (authoritative status)

- RFC1738 - the "URL standard". (authoritative status)

- RFC1630 - the "URI spec". (informational status)


The object returned by URLopener().open(file) will differ per

protocol.  All you know is that is has methods read(), readline(),

readlines(), fileno(), close() and info().  The read*(), fileno()

and close() methods work like those of open files.

The info() method returns a mimetools.Message object which can be

used to query various info about the object, if available.

(mimetools.Message objects are queried with the getheader() method.)

"""


import string

import socket

import os

import stat

import time

import sys

import types


__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",

           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",

           "urlencode", "url2pathname", "pathname2url", "splittag",

           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",

           "splittype", "splithost", "splituser", "splitpasswd", "splitport",

           "splitnport", "splitquery", "splitattr", "splitvalue",

           "splitgophertype", "getproxies"]


__version__ = '1.15'    # XXX This version is not always updated :-(


MAXFTPCACHE = 10        # Trim the ftp cache beyond this size


# Helper for non-unix systems

if os.name == 'mac':

    from macurl2path import url2pathname, pathname2url

elif os.name == 'nt':

    from nturl2path import url2pathname, pathname2url

elif os.name == 'riscos':

    from rourl2path import url2pathname, pathname2url

else:

    def url2pathname(pathname):

        return unquote(pathname)

    def pathname2url(pathname):

        return quote(pathname)


# This really consists of two pieces:

# (1) a class which handles opening of all sorts of URLs

#     (plus assorted utilities etc.)

# (2) a set of functions for parsing URLs

# XXX Should these be separated out into different modules?


# Shortcut for basic usage

_urlopener = None

def urlopen(url, data=None):

    """urlopen(url [, data]) -> open file-like object"""

    global _urlopener

    if not _urlopener:

        _urlopener = FancyURLopener()

    if data is None:

        return _urlopener.open(url)

    else:

        return _urlopener.open(url, data)

def urlretrieve(url, filename=None, reporthook=None, data=None):

    global _urlopener

    if not _urlopener:

        _urlopener = FancyURLopener()

    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():

    if _urlopener:

        _urlopener.cleanup()


ftpcache = {}

class URLopener:

    """Class to open URLs.

    This is a class rather than just a subroutine because we may need

    more than one set of global protocol-specific options.

    Note -- this is a base class for those who don't want the

    automatic handling of errors type 302 (relocated) and 401

    (authorization needed)."""


    __tempfiles = None


    version = "Python-urllib/%s" % __version__


    # Constructor

    def __init__(self, proxies=None, **x509):

        if proxies is None:

            proxies = getproxies()

        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"

        self.proxies = proxies

        self.key_file = x509.get('key_file')

        self.cert_file = x509.get('cert_file')

        self.addheaders = [('User-agent', self.version)]

        self.__tempfiles = []

        self.__unlink = os.unlink # See cleanup()

        self.tempcache = None

        # Undocumented feature: if you assign {} to tempcache,

        # it is used to cache files retrieved with

        # self.retrieve().  This is not enabled by default

        # since it does not work for changing documents (and I

        # haven't got the logic to check expiration headers

        # yet).

        self.ftpcache = ftpcache

        # Undocumented feature: you can use a different

        # ftp cache by assigning to the .ftpcache member;

        # in case you want logically independent URL openers

        # XXX This is not threadsafe.  Bah.


    def __del__(self):

        self.close()


    def close(self):

        self.cleanup()


    def cleanup(self):

        # This code sometimes runs when the rest of this module

        # has already been deleted, so it can't use any globals

        # or import anything.

        if self.__tempfiles:

            for file in self.__tempfiles:

                try:

                    self.__unlink(file)

                except OSError:

                    pass

            del self.__tempfiles[:]

        if self.tempcache:

            self.tempcache.clear()


    def addheader(self, *args):

        """Add a header to be used by the HTTP interface only

        e.g. u.addheader('Accept', 'sound/basic')"""

        self.addheaders.append(args)


    # External interface

    def open(self, fullurl, data=None):

        """Use URLopener().open(file) instead of open(file, 'r')."""

        fullurl = unwrap(toBytes(fullurl))

        if self.tempcache and self.tempcache.has_key(fullurl):

            filename, headers = self.tempcache[fullurl]

            fp = open(filename, 'rb')

            return addinfourl(fp, headers, fullurl)

        urltype, url = splittype(fullurl)

        if not urltype:

            urltype = 'file'

        if self.proxies.has_key(urltype):

            proxy = self.proxies[urltype]

            urltype, proxyhost = splittype(proxy)

            host, selector = splithost(proxyhost)

            url = (host, fullurl) # Signal special case to open_*()

        else:

            proxy = None

        name = 'open_' + urltype

        self.type = urltype

        if '-' in name:

            # replace - with _

            name = '_'.join(name.split('-'))

        if not hasattr(self, name):

            if proxy:

                return self.open_unknown_proxy(proxy, fullurl, data)

            else:

                return self.open_unknown(fullurl, data)

        try:

            if data is None:

                return getattr(self, name)(url)

            else:

                return getattr(self, name)(url, data)

        except socket.error, msg:

            raise IOError, ('socket error', msg), sys.exc_info()[2]


    def open_unknown(self, fullurl, data=None):

        """Overridable interface to open unknown URL type."""

        type, url = splittype(fullurl)

        raise IOError, ('url error', 'unknown url type', type)


    def open_unknown_proxy(self, proxy, fullurl, data=None):

        """Overridable interface to open unknown URL type."""

        type, url = splittype(fullurl)

        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)


    # External interface

    def retrieve(self, url, filename=None, reporthook=None, data=None):

        """retrieve(url) returns (filename, None) for a local object

        or (tempfilename, headers) for a remote object."""

        url = unwrap(toBytes(url))

        if self.tempcache and self.tempcache.has_key(url):

            return self.tempcache[url]

        type, url1 = splittype(url)

        if not filename and (not type or type == 'file'):

            try:

                fp = self.open_local_file(url1)

                hdrs = fp.info()

                del fp

                return url2pathname(splithost(url1)[1]), hdrs

            except IOError, msg:

                pass

        fp = self.open(url, data)

        headers = fp.info()

        if not filename:

            import tempfile

            garbage, path = splittype(url)

            garbage, path = splithost(path or "")

            path, garbage = splitquery(path or "")

            path, garbage = splitattr(path or "")

            suffix = os.path.splitext(path)[1]

            filename = tempfile.mktemp(suffix)

            self.__tempfiles.append(filename)

        result = filename, headers

        if self.tempcache is not None:

            self.tempcache[url] = result

        tfp = open(filename, 'wb')

        bs = 1024*8

        size = -1

        blocknum = 1

        if reporthook:

            if headers.has_key("content-length"):

                size = int(headers["Content-Length"])

            reporthook(0, bs, size)

        block = fp.read(bs)

        if reporthook:

            reporthook(1, bs, size)

        while block:

            tfp.write(block)

            block = fp.read(bs)

            blocknum = blocknum + 1

            if reporthook:

                reporthook(blocknum, bs, size)

        fp.close()

        tfp.close()

        del fp

        del tfp

        return result


    # Each method named open_<type> knows how to open that type of URL


    def open_http(self, url, data=None):

        """Use HTTP protocol."""

        import httplib

        user_passwd = None

        if type(url) is types.StringType:

            host, selector = splithost(url)

            if host:

                user_passwd, host = splituser(host)

                host = unquote(host)

            realhost = host

        else:

            host, selector = url

            urltype, rest = splittype(selector)

            url = rest

            user_passwd = None

            if urltype.lower() != 'http':

                realhost = None

            else:

                realhost, rest = splithost(rest)

                if realhost:

                    user_passwd, realhost = splituser(realhost)

                if user_passwd:

                    selector = "%s://%s%s" % (urltype, realhost, rest)

                if proxy_bypass(realhost):

                    host = realhost


            #print "proxy via http:", host, selector

        if not host: raise IOError, ('http error', 'no host given')

        if user_passwd:

            import base64

            auth = base64.encodestring(user_passwd).strip()

        else:

            auth = None

        h = httplib.HTTP(host)

        if data is not None:

            h.putrequest('POST', selector)

            h.putheader('Content-type', 'application/x-www-form-urlencoded')

            h.putheader('Content-length', '%d' % len(data))

        else:

            h.putrequest('GET', selector)

        if auth: h.putheader('Authorization', 'Basic %s' % auth)

        if realhost: h.putheader('Host', realhost)

        for args in self.addheaders: apply(h.putheader, args)

        h.endheaders()

        if data is not None:

            h.send(data)

        errcode, errmsg, headers = h.getreply()

        fp = h.getfile()

        if errcode == 200:

            return addinfourl(fp, headers, "http:" + url)

        else:

            if data is None:

                return self.http_error(url, fp, errcode, errmsg, headers)

            else:

                return self.http_error(url, fp, errcode, errmsg, headers, data)


    def http_error(self, url, fp, errcode, errmsg, headers, data=None):

        """Handle http errors.

        Derived class can override this, or provide specific handlers

        named http_error_DDD where DDD is the 3-digit error code."""

        # First check if there's a specific handler for this error

        name = 'http_error_%d' % errcode

        if hasattr(self, name):

            method = getattr(self, name)

            if data is None:

                result = method(url, fp, errcode, errmsg, headers)

            else:

                result = method(url, fp, errcode, errmsg, headers, data)

            if result: return result

        return self.http_error_default(url, fp, errcode, errmsg, headers)


    def http_error_default(self, url, fp, errcode, errmsg, headers):

        """Default error handler: close the connection and raise IOError."""

        void = fp.read()

        fp.close()

        raise IOError, ('http error', errcode, errmsg, headers)


    if hasattr(socket, "ssl"):

        def open_https(self, url, data=None):

            """Use HTTPS protocol."""

            import httplib

            user_passwd = None

            if type(url) is types.StringType:

                host, selector = splithost(url)

                if host:

                    user_passwd, host = splituser(host)

                    host = unquote(host)

                realhost = host

            else:

                host, selector = url

                urltype, rest = splittype(selector)

                url = rest

                user_passwd = None

                if urltype.lower() != 'https':

                    realhost = None

                else:

                    realhost, rest = splithost(rest)

                    if realhost:

                        user_passwd, realhost = splituser(realhost)

                    if user_passwd:

                        selector = "%s://%s%s" % (urltype, realhost, rest)

                #print "proxy via https:", host, selector

            if not host: raise IOError, ('https error', 'no host given')

            if user_passwd:

                import base64

                auth = base64.encodestring(user_passwd).strip()

            else:

                auth = None

            h = httplib.HTTPS(host, 0,

                              key_file=self.key_file,

                              cert_file=self.cert_file)

            if data is not None:

                h.putrequest('POST', selector)

                h.putheader('Content-type',

                            'application/x-www-form-urlencoded')

                h.putheader('Content-length', '%d' % len(data))

            else:

                h.putrequest('GET', selector)

            if auth: h.putheader('Authorization: Basic %s' % auth)

            if realhost: h.putheader('Host', realhost)

            for args in self.addheaders: apply(h.putheader, args)

            h.endheaders()

            if data is not None:

                h.send(data)

            errcode, errmsg, headers = h.getreply()

            fp = h.getfile()

            if errcode == 200:

                return addinfourl(fp, headers, "https:" + url)

            else:

                if data is None:

                    return self.http_error(url, fp, errcode, errmsg, headers)

                else:

                    return self.http_error(url, fp, errcode, errmsg, headers,

                                           data)


    def open_gopher(self, url):

        """Use Gopher protocol."""

        import gopherlib

        host, selector = splithost(url)

        if not host: raise IOError, ('gopher error', 'no host given')

        host = unquote(host)

        type, selector = splitgophertype(selector)

        selector, query = splitquery(selector)

        selector = unquote(selector)

        if query:

            query = unquote(query)

            fp = gopherlib.send_query(selector, query, host)

        else:

            fp = gopherlib.send_selector(selector, host)

        return addinfourl(fp, noheaders(), "gopher:" + url)


    def open_file(self, url):

        """Use local file or FTP depending on form of URL."""

        if url[:2] == '//' and url[2:3] != '/':

            return self.open_ftp(url)

        else:

            return self.open_local_file(url)


    def open_local_file(self, url):

        """Use local file."""

        import mimetypes, mimetools, rfc822, StringIO

        host, file = splithost(url)

        localname = url2pathname(file)

        stats = os.stat(localname)

        size = stats[stat.ST_SIZE]

        modified = rfc822.formatdate(stats[stat.ST_MTIME])

        mtype = mimetypes.guess_type(url)[0]

        headers = mimetools.Message(StringIO.StringIO(

            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %

            (mtype or 'text/plain', size, modified)))

        if not host:

            urlfile = file

            if file[:1] == '/':

                urlfile = 'file://' + file

            return addinfourl(open(localname, 'rb'),

                              headers, urlfile)

        host, port = splitport(host)

        if not port \

           and socket.gethostbyname(host) in (localhost(), thishost()):

            urlfile = file

            if file[:1] == '/':

                urlfile = 'file://' + file

            return addinfourl(open(localname, 'rb'),

                              headers, urlfile)

        raise IOError, ('local file error', 'not on local host')


    def open_ftp(self, url):

        """Use FTP protocol."""

        import mimetypes, mimetools, StringIO

        host, path = splithost(url)

        if not host: raise IOError, ('ftp error', 'no host given')

        host, port = splitport(host)

        user, host = splituser(host)

        if user: user, passwd = splitpasswd(user)

        else: passwd = None

        host = unquote(host)

        user = unquote(user or '')

        passwd = unquote(passwd or '')

        host = socket.gethostbyname(host)

        if not port:

            import ftplib

            port = ftplib.FTP_PORT

        else:

            port = int(port)

        path, attrs = splitattr(path)

        path = unquote(path)

        dirs = path.split('/')

        dirs, file = dirs[:-1], dirs[-1]

        if dirs and not dirs[0]: dirs = dirs[1:]

        if dirs and not dirs[0]: dirs[0] = '/'

        key = user, host, port, '/'.join(dirs)

        # XXX thread unsafe!

        if len(self.ftpcache) > MAXFTPCACHE:

            # Prune the cache, rather arbitrarily

            for k in self.ftpcache.keys():

                if k != key:

                    v = self.ftpcache[k]

                    del self.ftpcache[k]

                    v.close()

        try:

            if not self.ftpcache.has_key(key):

                self.ftpcache[key] = \

                    ftpwrapper(user, passwd, host, port, dirs)

            if not file: type = 'D'

            else: type = 'I'

            for attr in attrs:

                attr, value = splitvalue(attr)

                if attr.lower() == 'type' and \

                   value in ('a', 'A', 'i', 'I', 'd', 'D'):

                    type = value.upper()

            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)

            mtype = mimetypes.guess_type("ftp:" + url)[0]

            headers = ""

            if mtype:

                headers += "Content-Type: %s\n" % mtype

            if retrlen is not None and retrlen >= 0:

                headers += "Content-Length: %d\n" % retrlen

            headers = mimetools.Message(StringIO.StringIO(headers))

            return addinfourl(fp, headers, "ftp:" + url)

        except ftperrors(), msg:

            raise IOError, ('ftp error', msg), sys.exc_info()[2]


    def open_data(self, url, data=None):

        """Use "data" URL."""

        # ignore POSTed data

        #

        # syntax of data URLs:

        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data

        # mediatype := [ type "/" subtype ] *( ";" parameter )

        # data      := *urlchar

        # parameter := attribute "=" value

        import StringIO, mimetools, time

        try:

            [type, data] = url.split(',', 1)

        except ValueError:

            raise IOError, ('data error', 'bad data URL')

        if not type:

            type = 'text/plain;charset=US-ASCII'

        semi = type.rfind(';')

        if semi >= 0 and '=' not in type[semi:]:

            encoding = type[semi+1:]

            type = type[:semi]

        else:

            encoding = ''

        msg = []

        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',

                                            time.gmtime(time.time())))

        msg.append('Content-type: %s' % type)

        if encoding == 'base64':

            import base64

            data = base64.decodestring(data)

        else:

            data = unquote(data)

        msg.append('Content-length: %d' % len(data))

        msg.append('')

        msg.append(data)

        msg = '\n'.join(msg)

        f = StringIO.StringIO(msg)

        headers = mimetools.Message(f, 0)

        f.fileno = None     # needed for addinfourl

        return addinfourl(f, headers, url)


class FancyURLopener(URLopener):

    """Derived class with handlers for errors we can handle (perhaps)."""


    def __init__(self, *args):

        apply(URLopener.__init__, (self,) + args)

        self.auth_cache = {}

        self.tries = 0

        self.maxtries = 10


    def http_error_default(self, url, fp, errcode, errmsg, headers):

        """Default error handling -- don't raise an exception."""

        return addinfourl(fp, headers, "http:" + url)


    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):

        """Error 302 -- relocated (temporarily)."""

        self.tries += 1

        if self.maxtries and self.tries >= self.maxtries:

            if hasattr(self, "http_error_500"):

                meth = self.http_error_500

            else:

                meth = self.http_error_default

            self.tries = 0

            return meth(url, fp, 500,

                        "Internal Server Error: Redirect Recursion", headers)

        result = self.redirect_internal(url, fp, errcode, errmsg, headers,

                                        data)

        self.tries = 0

        return result


    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):

        if headers.has_key('location'):

            newurl = headers['location']

        elif headers.has_key('uri'):

            newurl = headers['uri']

        else:

            return

        void = fp.read()

        fp.close()

        # In case the server sent a relative URL, join with original:

        newurl = basejoin(self.type + ":" + url, newurl)

        if data is None:

            return self.open(newurl)

        else:

            return self.open(newurl, data)


    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):

        """Error 301 -- also relocated (permanently)."""

        return self.http_error_302(url, fp, errcode, errmsg, headers, data)


    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):

        """Error 401 -- authentication required.

        See this URL for a description of the basic authentication scheme:

        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""

        if not headers.has_key('www-authenticate'):

            URLopener.http_error_default(self, url, fp,

                                         errcode, errmsg, headers)

        stuff = headers['www-authenticate']

        import re

        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)

        if not match:

            URLopener.http_error_default(self, url, fp,

                                         errcode, errmsg, headers)

        scheme, realm = match.groups()

        if scheme.lower() != 'basic':

            URLopener.http_error_default(self, url, fp,

                                         errcode, errmsg, headers)

        name = 'retry_' + self.type + '_basic_auth'

        if data is None:

            return getattr(self,name)(url, realm)

        else:

            return getattr(self,name)(url, realm, data)


    def retry_http_basic_auth(self, url, realm, data=None):

        host, selector = splithost(url)

        i = host.find('@') + 1

        host = host[i:]

        user, passwd = self.get_user_passwd(host, realm, i)

        if not (user or passwd): return None

        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host

        newurl = 'http://' + host + selector

        if data is None:

            return self.open(newurl)

        else:

            return self.open(newurl, data)


    def retry_https_basic_auth(self, url, realm, data=None):

        host, selector = splithost(url)

        i = host.find('@') + 1

        host = host[i:]

        user, passwd = self.get_user_passwd(host, realm, i)

        if not (user or passwd): return None

        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host

        newurl = '//' + host + selector

        return self.open_https(newurl, data)


    def get_user_passwd(self, host, realm, clear_cache = 0):

        key = realm + '@' + host.lower()

        if self.auth_cache.has_key(key):

            if clear_cache:

                del self.auth_cache[key]

            else:

                return self.auth_cache[key]

        user, passwd = self.prompt_user_passwd(host, realm)

        if user or passwd: self.auth_cache[key] = (user, passwd)

        return user, passwd


    def prompt_user_passwd(self, host, realm):

        """Override this in a GUI environment!"""

        import getpass

        try:

            user = raw_input("Enter username for %s at %s: " % (realm,

                                                                host))

            passwd = getpass.getpass("Enter password for %s in %s at %s: " %

                (user, realm, host))

            return user, passwd

        except KeyboardInterrupt:

            print

            return None, None


# Utility functions


_localhost = None

def localhost():

    """Return the IP address of the magic hostname 'localhost'."""

    global _localhost

    if not _localhost:

        _localhost = socket.gethostbyname('localhost')

    return _localhost


_thishost = None

def thishost():

    """Return the IP address of the current host."""

    global _thishost

    if not _thishost:

        _thishost = socket.gethostbyname(socket.gethostname())

    return _thishost


_ftperrors = None

def ftperrors():

    """Return the set of errors raised by the FTP class."""

    global _ftperrors

    if not _ftperrors:

        import ftplib

        _ftperrors = ftplib.all_errors

    return _ftperrors


_noheaders = None

def noheaders():

    """Return an empty mimetools.Message object."""

    global _noheaders

    if not _noheaders:

        import mimetools

        import StringIO

        _noheaders = mimetools.Message(StringIO.StringIO(), 0)

        _noheaders.fp.close()   # Recycle file descriptor

    return _noheaders


# Utility classes


class ftpwrapper:

    """Class used by open_ftp() for cache of open FTP connections."""


    def __init__(self, user, passwd, host, port, dirs):

        self.user = user

        self.passwd = passwd

        self.host = host

        self.port = port

        self.dirs = dirs

        self.init()


    def init(self):

        import ftplib

        self.busy = 0

        self.ftp = ftplib.FTP()

        self.ftp.connect(self.host, self.port)

        self.ftp.login(self.user, self.passwd)

        for dir in self.dirs:

            self.ftp.cwd(dir)


    def retrfile(self, file, type):

        import ftplib

        self.endtransfer()

        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1

        else: cmd = 'TYPE ' + type; isdir = 0

        try:

            self.ftp.voidcmd(cmd)

        except ftplib.all_errors:

            self.init()

            self.ftp.voidcmd(cmd)

        conn = None

        if file and not isdir:

            # Use nlst to see if the file exists at all

            try:

                self.ftp.nlst(file)

            except ftplib.error_perm, reason:

                raise IOError, ('ftp error', reason), sys.exc_info()[2]

            # Restore the transfer mode!

            self.ftp.voidcmd(cmd)

            # Try to retrieve as a file

            try:

                cmd = 'RETR ' + file

                conn = self.ftp.ntransfercmd(cmd)

            except ftplib.error_perm, reason:

                if str(reason)[:3] != '550':

                    raise IOError, ('ftp error', reason), sys.exc_info()[2]

        if not conn:

            # Set transfer mode to ASCII!

            self.ftp.voidcmd('TYPE A')

            # Try a directory listing

            if file: cmd = 'LIST ' + file

            else: cmd = 'LIST'

            conn = self.ftp.ntransfercmd(cmd)

        self.busy = 1

        # Pass back both a suitably decorated object and a retrieval length

        return (addclosehook(conn[0].makefile('rb'),

                             self.endtransfer), conn[1])

    def endtransfer(self):

        if not self.busy:

            return

        self.busy = 0

        try:

            self.ftp.voidresp()

        except ftperrors():

            pass


    def close(self):

        self.endtransfer()

        try:

            self.ftp.close()

        except ftperrors():

            pass


class addbase:

    """Base class for addinfo and addclosehook."""


    def __init__(self, fp):

        self.fp = fp

        self.read = self.fp.read

        self.readline = self.fp.readline

        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines

        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno


    def __repr__(self):

        return '<%s at %s whose fp = %s>' % (self.__class__.__name__,

                                             `id(self)`, `self.fp`)


    def close(self):

        self.read = None

        self.readline = None

        self.readlines = None

        self.fileno = None

        if self.fp: self.fp.close()

        self.fp = None


class addclosehook(addbase):

    """Class to add a close hook to an open file."""


    def __init__(self, fp, closehook, *hookargs):

        addbase.__init__(self, fp)

        self.closehook = closehook

        self.hookargs = hookargs


    def close(self):

        addbase.close(self)

        if self.closehook:

            apply(self.closehook, self.hookargs)

            self.closehook = None

            self.hookargs = None


class addinfo(addbase):

    """class to add an info() method to an open file."""


    def __init__(self, fp, headers):

        addbase.__init__(self, fp)

        self.headers = headers


    def info(self):

        return self.headers


class addinfourl(addbase):

    """class to add info() and geturl() methods to an open file."""


    def __init__(self, fp, headers, url):

        addbase.__init__(self, fp)

        self.headers = headers

        self.url = url


    def info(self):

        return self.headers


    def geturl(self):

        return self.url


def basejoin(base, url):

    """Utility to combine a URL with a base URL to form a new URL."""

    type, path = splittype(url)

    if type:

        # if url is complete (i.e., it contains a type), return it

        return url

    host, path = splithost(path)

    type, basepath = splittype(base) # inherit type from base

    if host:

        # if url contains host, just inherit type

        if type: return type + '://' + host + path

        else:

            # no type inherited, so url must have started with //

            # just return it

            return url

    host, basepath = splithost(basepath) # inherit host

    basepath, basetag = splittag(basepath) # remove extraneous cruft

    basepath, basequery = splitquery(basepath) # idem

    if path[:1] != '/':

        # non-absolute path name

        if path[:1] in ('#', '?'):

            # path is just a tag or query, attach to basepath

            i = len(basepath)

        else:

            # else replace last component

            i = basepath.rfind('/')

        if i < 0:

            # basepath not absolute

            if host:

                # host present, make absolute

                basepath = '/'

            else:

                # else keep non-absolute

                basepath = ''

        else:

            # remove last file component

            basepath = basepath[:i+1]

        # Interpret ../ (important because of symlinks)

        while basepath and path[:3] == '../':

            path = path[3:]

            i = basepath[:-1].rfind('/')

            if i > 0:

                basepath = basepath[:i+1]

            elif i == 0:

                basepath = '/'

                break

            else:

                basepath = ''


        path = basepath + path

    if host and path and path[0] != '/':

        path = '/' + path

    if type and host: return type + '://' + host + path

    elif type: return type + ':' + path

    elif host: return '//' + host + path # don't know what this means

    else: return path


# Utilities to parse URLs (most of these return None for missing parts):

# unwrap('<URL:type://host/path>') --> 'type://host/path'

# splittype('type:opaquestring') --> 'type', 'opaquestring'

# splithost('//host[:port]/path') --> 'host[:port]', '/path'

# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'

# splitpasswd('user:passwd') -> 'user', 'passwd'

# splitport('host:port') --> 'host', 'port'

# splitquery('/path?query') --> '/path', 'query'

# splittag('/path#tag') --> '/path', 'tag'

# splitattr('/path;attr1=value1;attr2=value2;...') ->

#   '/path', ['attr1=value1', 'attr2=value2', ...]

# splitvalue('attr=value') --> 'attr', 'value'

# splitgophertype('/Xselector') --> 'X', 'selector'

# unquote('abc%20def') -> 'abc def'

# quote('abc def') -> 'abc%20def')


def toBytes(url):

    """toBytes(u"URL") --> 'URL'."""

    # Most URL schemes require ASCII. If that changes, the conversion

    # can be relaxed

    if type(url) is types.UnicodeType:

        try:

            url = url.encode("ASCII")

        except UnicodeError:

            raise UnicodeError("URL " + repr(url) +

                               " contains non-ASCII characters")

    return url


def unwrap(url):

    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""

    url = url.strip()

    if url[:1] == '<' and url[-1:] == '>':

        url = url[1:-1].strip()

    if url[:4] == 'URL:': url = url[4:].strip()

    return url


_typeprog = None

def splittype(url):

    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""

    global _typeprog

    if _typeprog is None:

        import re

        _typeprog = re.compile('^([^/:]+):')


    match = _typeprog.match(url)

    if match:

        scheme = match.group(1)

        return scheme.lower(), url[len(scheme) + 1:]

    return None, url


_hostprog = None

def splithost(url):

    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""

    global _hostprog

    if _hostprog is None:

        import re

        _hostprog = re.compile('^//([^/]*)(.*)$')


    match = _hostprog.match(url)

    if match: return match.group(1, 2)

    return None, url


_userprog = None

def splituser(host):

    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""

    global _userprog

    if _userprog is None:

        import re

        _userprog = re.compile('^([^@]*)@(.*)$')


    match = _userprog.match(host)

    if match: return map(unquote, match.group(1, 2))

    return None, host


_passwdprog = None

def splitpasswd(user):

    """splitpasswd('user:passwd') -> 'user', 'passwd'."""

    global _passwdprog

    if _passwdprog is None:

        import re

        _passwdprog = re.compile('^([^:]*):(.*)$')


    match = _passwdprog.match(user)

    if match: return match.group(1, 2)

    return user, None


# splittag('/path#tag') --> '/path', 'tag'

_portprog = None

def splitport(host):

    """splitport('host:port') --> 'host', 'port'."""

    global _portprog

    if _portprog is None:

        import re

        _portprog = re.compile('^(.*):([0-9]+)$')


    match = _portprog.match(host)

    if match: return match.group(1, 2)

    return host, None


_nportprog = None

def splitnport(host, defport=-1):

    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.

    Return numerical port if a valid number are found after ':'.

    Return None if ':' but not a valid number."""

    global _nportprog

    if _nportprog is None:

        import re

        _nportprog = re.compile('^(.*):(.*)$')


    match = _nportprog.match(host)

    if match:

        host, port = match.group(1, 2)

        try:

            if not port: raise ValueError, "no digits"

            nport = int(port)

        except ValueError:

            nport = None

        return host, nport

    return host, defport


_queryprog = None

def splitquery(url):

    """splitquery('/path?query') --> '/path', 'query'."""

    global _queryprog

    if _queryprog is None:

        import re

        _queryprog = re.compile('^(.*)\?([^?]*)$')


    match = _queryprog.match(url)

    if match: return match.group(1, 2)

    return url, None


_tagprog = None

def splittag(url):

    """splittag('/path#tag') --> '/path', 'tag'."""

    global _tagprog

    if _tagprog is None:

        import re

        _tagprog = re.compile('^(.*)#([^#]*)$')


    match = _tagprog.match(url)

    if match: return match.group(1, 2)

    return url, None


def splitattr(url):

    """splitattr('/path;attr1=value1;attr2=value2;...') ->

        '/path', ['attr1=value1', 'attr2=value2', ...]."""

    words = url.split(';')

    return words[0], words[1:]


_valueprog = None

def splitvalue(attr):

    """splitvalue('attr=value') --> 'attr', 'value'."""

    global _valueprog

    if _valueprog is None:

        import re

        _valueprog = re.compile('^([^=]*)=(.*)$')


    match = _valueprog.match(attr)

    if match: return match.group(1, 2)

    return attr, None


def splitgophertype(selector):

    """splitgophertype('/Xselector') --> 'X', 'selector'."""

    if selector[:1] == '/' and selector[1:2]:

        return selector[1], selector[2:]

    return None, selector


def unquote(s):

    """unquote('abc%20def') -> 'abc def'."""

    mychr = chr

    myatoi = int

    list = s.split('%')

    res = [list[0]]

    myappend = res.append

    del list[0]

    for item in list:

        if item[1:2]:

            try:

                myappend(mychr(myatoi(item[:2], 16))

                     + item[2:])

            except ValueError:

                myappend('%' + item)

        else:

            myappend('%' + item)

    return "".join(res)


def unquote_plus(s):

    """unquote('%7e/abc+def') -> '~/abc def'"""

    if '+' in s:

        # replace '+' with ' '

        s = ' '.join(s.split('+'))

    return unquote(s)


always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'

               'abcdefghijklmnopqrstuvwxyz'

               '0123456789' '_.-')


_fast_safe_test = always_safe + '/'

_fast_safe = None


def _fast_quote(s):

    global _fast_safe

    if _fast_safe is None:

        _fast_safe = {}

        for c in _fast_safe_test:

            _fast_safe[c] = c

    res = list(s)

    for i in range(len(res)):

        c = res[i]

        if not _fast_safe.has_key(c):

            res[i] = '%%%02X' % ord(c)

    return ''.join(res)


def quote(s, safe = '/'):

    """quote('abc def') -> 'abc%20def'


    Each part of a URL, e.g. the path info, the query, etc., has a

    different set of reserved characters that must be quoted.


    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists

    the following reserved characters.


    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |

                  "$" | ","


    Each of these characters is reserved in some component of a URL,

    but not necessarily in all of them.


    By default, the quote function is intended for quoting the path

    section of a URL.  Thus, it will not encode '/'.  This character

    is reserved, but in typical usage the quote function is being

    called on a path where the existing slash characters are used as

    reserved characters.

    """

    safe = always_safe + safe

    if _fast_safe_test == safe:

        return _fast_quote(s)

    res = list(s)

    for i in range(len(res)):

        c = res[i]

        if c not in safe:

            res[i] = '%%%02X' % ord(c)

    return ''.join(res)


def quote_plus(s, safe = ''):

    """Quote the query fragment of a URL; replacing ' ' with '+'"""

    if ' ' in s:

        l = s.split(' ')

        for i in range(len(l)):

            l[i] = quote(l[i], safe)

        return '+'.join(l)

    else:

        return quote(s, safe)


def urlencode(query,doseq=0):

    """Encode a sequence of two-element tuples or dictionary into a URL query string.


    If any values in the query arg are sequences and doseq is true, each

    sequence element is converted to a separate parameter.


    If the query arg is a sequence of two-element tuples, the order of the

    parameters in the output will match the order of parameters in the

    input.

    """


    if hasattr(query,"items"):

        # mapping objects

        query = query.items()

    else:

        # it's a bother at times that strings and string-like objects are

        # sequences...

        try:

            # non-sequence items should not work with len()

            x = len(query)

            # non-empty strings will fail this

            if len(query) and type(query[0]) != types.TupleType:

                raise TypeError

            # zero-length sequences of all types will get here and succeed,

            # but that's a minor nit - since the original implementation

            # allowed empty dicts that type of behavior probably should be

            # preserved for consistency

        except TypeError:

            ty,va,tb = sys.exc_info()

            raise TypeError, "not a valid non-string sequence or mapping object", tb


    l = []

    if not doseq:

        # preserve old behavior

        for k, v in query:

            k = quote_plus(str(k))

            v = quote_plus(str(v))

            l.append(k + '=' + v)

    else:

        for k, v in query:

            k = quote_plus(str(k))

            if type(v) == types.StringType:

                v = quote_plus(v)

                l.append(k + '=' + v)

            elif type(v) == types.UnicodeType:

                # is there a reasonable way to convert to ASCII?

                # encode generates a string, but "replace" or "ignore"

                # lose information and "strict" can raise UnicodeError

                v = quote_plus(v.encode("ASCII","replace"))

                l.append(k + '=' + v)

            else:

                try:

                    # is this a sufficient test for sequence-ness?

                    x = len(v)

                except TypeError:

                    # not a sequence

                    v = quote_plus(str(v))

                    l.append(k + '=' + v)

                else:

                    # loop over the sequence

                    for elt in v:

                        l.append(k + '=' + quote_plus(str(elt)))

    return '&'.join(l)


# Proxy handling

def getproxies_environment():

    """Return a dictionary of scheme -> proxy server URL mappings.


    Scan the environment for variables named <scheme>_proxy;

    this seems to be the standard convention.  If you need a

    different way, you can pass a proxies dictionary to the

    [Fancy]URLopener constructor.


    """

    proxies = {}

    for name, value in os.environ.items():

        name = name.lower()

        if value and name[-6:] == '_proxy':

            proxies[name[:-6]] = value

    return proxies


if os.name == 'mac':

    def getproxies():

        """Return a dictionary of scheme -> proxy server URL mappings.


        By convention the mac uses Internet Config to store

        proxies.  An HTTP proxy, for instance, is stored under

        the HttpProxy key.


        """

        try:

            import ic

        except ImportError:

            return {}


        try:

            config = ic.IC()

        except ic.error:

            return {}

        proxies = {}

        # HTTP:

        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:

            try:

                value = config['HTTPProxyHost']

            except ic.error:

                pass

            else:

                proxies['http'] = 'http://%s' % value

        # FTP: XXXX To be done.

        # Gopher: XXXX To be done.

        return proxies


    def proxy_bypass(x):

        return 0


elif os.name == 'nt':

    def getproxies_registry():

        """Return a dictionary of scheme -> proxy server URL mappings.


        Win32 uses the registry to store proxies.


        """

        proxies = {}

        try:

            import _winreg

        except ImportError:

            # Std module, so should be around - but you never know!

            return proxies

        try:

            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,

                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')

            proxyEnable = _winreg.QueryValueEx(internetSettings,

                                               'ProxyEnable')[0]

            if proxyEnable:

                # Returned as Unicode but problems if not converted to ASCII

                proxyServer = str(_winreg.QueryValueEx(internetSettings,

                                                       'ProxyServer')[0])

                if '=' in proxyServer:

                    # Per-protocol settings

                    for p in proxyServer.split(';'):

                        protocol, address = p.split('=', 1)

                        # See if address has a type:// prefix

                        import re

                        if not re.match('^([^/:]+)://', address):

                            address = '%s://%s' % (protocol, address)

                        proxies[protocol] = address

                else:

                    # Use one setting for all protocols

                    if proxyServer[:5] == 'http:':

                        proxies['http'] = proxyServer

                    else:

                        proxies['http'] = 'http://%s' % proxyServer

                        proxies['ftp'] = 'ftp://%s' % proxyServer

            internetSettings.Close()

        except (WindowsError, ValueError, TypeError):

            # Either registry key not found etc, or the value in an

            # unexpected format.

            # proxies already set up to be empty so nothing to do

            pass

        return proxies


    def getproxies():

        """Return a dictionary of scheme -> proxy server URL mappings.


        Returns settings gathered from the environment, if specified,

        or the registry.


        """

        return getproxies_environment() or getproxies_registry()


    def proxy_bypass(host):

        try:

            import _winreg

            import re

            import socket

        except ImportError:

            # Std modules, so should be around - but you never know!

            return 0

        try:

            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,

                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')

            proxyEnable = _winreg.QueryValueEx(internetSettings,

                                               'ProxyEnable')[0]

            proxyOverride = str(_winreg.QueryValueEx(internetSettings,

                                                     'ProxyOverride')[0])

            # ^^^^ Returned as Unicode but problems if not converted to ASCII

        except WindowsError:

            return 0

        if not proxyEnable or not proxyOverride:

            return 0

        # try to make a host list from name and IP address.

        host = [host]

        try:

            addr = socket.gethostbyname(host[0])

            if addr != host:

                host.append(addr)

        except socket.error:

            pass

        # make a check value list from the registry entry: replace the

        # '<local>' string by the localhost entry and the corresponding

        # canonical entry.

        proxyOverride = proxyOverride.split(';')

        i = 0

        while i < len(proxyOverride):

            if proxyOverride[i] == '<local>':

                proxyOverride[i:i+1] = ['localhost',

                                        '127.0.0.1',

                                        socket.gethostname(),

                                        socket.gethostbyname(

                                            socket.gethostname())]

            i += 1

        # print proxyOverride

        # now check if we match one of the registry values.

        for test in proxyOverride:

            test = test.replace(".", r"\.")     # mask dots

            test = test.replace("*", r".*")     # change glob sequence

            test = test.replace("?", r".")      # change glob char

            for val in host:

                # print "%s <--> %s" %( test, val )

                if re.match(test, val, re.I):

                    return 1

        return 0


else:

    # By default use environment variables

    getproxies = getproxies_environment


    def proxy_bypass(host):

        return 0


# Test and time quote() and unquote()

def test1():

    import time

    s = ''

    for i in range(256): s = s + chr(i)

    s = s*4

    t0 = time.time()

    qs = quote(s)

    uqs = unquote(qs)

    t1 = time.time()

    if uqs != s:

        print 'Wrong!'

    print `s`

    print `qs`

    print `uqs`

    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):

    # Report during remote transfers

    print "Block number: %d, Block size: %d, Total size: %d" % (

        blocknum, blocksize, totalsize)


# Test program

def test(args=[]):

    if not args:

        args = [

            '/etc/passwd',

            'file:/etc/passwd',

            'file://localhost/etc/passwd',

            'ftp://ftp.python.org/pub/python/README',

##          'gopher://gopher.micro.umn.edu/1/',

            'http://www.python.org/index.html',

            ]

        if hasattr(URLopener, "open_https"):

            args.append('https://synergy.as.cmu.edu/~geek/')

    try:

        for url in args:

            print '-'*10, url, '-'*10

            fn, h = urlretrieve(url, None, reporthook)

            print fn

            if h:

                print '======'

                for k in h.keys(): print k + ':', h[k]

                print '======'

            fp = open(fn, 'rb')

            data = fp.read()

            del fp

            if '\r' in data:

                table = string.maketrans("", "")

                data = data.translate(table, "\r")

            print data

            fn, h = None, None

        print '-'*40

    finally:

        urlcleanup()


def main():

    import getopt, sys

    try:

        opts, args = getopt.getopt(sys.argv[1:], "th")

    except getopt.error, msg:

        print msg

        print "Use -h for help"

        return

    t = 0

    for o, a in opts:

        if o == '-t':

            t = t + 1

        if o == '-h':

            print "Usage: python urllib.py [-t] [url ...]"

            print "-t runs self-test;",

            print "otherwise, contents of urls are printed"

            return

    if t:

        if t > 1:

            test1()

        test(args)

    else:

        if not args:

            print "Use -h for help"

        for url in args:

            print urlopen(url).read(),


# Run test program when run as a script

if __name__ == '__main__':

    main()