1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as a file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301 & 302 redirect errors, and the HTTPDigestAuthHandler deals
15 with digest authentication.
17 urlopen(url, data=None) -- basic usage is the same as original
18 urllib. pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- function that creates a new OpenerDirector instance.
25 will install the default handlers. accepts one or more Handlers as
26 arguments, either instances or Handler classes that it will
27 instantiate. if one of the arguments is a subclass of the default
28 handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
35 Request -- an object that encapsulates the state of a request. the
36 state can be as simple as the URL. it can also include extra HTTP
37 headers, e.g. a User-Agent.
42 URLError-- a subclass of IOError, individual protocols have their own
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response
49 BaseHandler and parent
50 _call_chain conventions
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62 # build a new opener that adds authentication and caching FTP handlers
63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
66 urllib2.install_opener(opener)
68 f = urllib2.urlopen('http://www.python.org/')
110 from cStringIO
import StringIO
112 from StringIO
import StringIO
121 from urllib
import unwrap, unquote, splittype, splithost, \
122 addinfourl, splitport, splitgophertype, splitquery, \
123 splitattr, ftpwrapper, noheaders
126 from urllib
import getproxies
129 from urllib
import localhost, url2pathname
131 __version__ =
"2.0a1"
138 return _opener.open(url, data)
155 return '<urlopen error %s>' % self.
reason
158 """Raised when HTTP error occurs, but also acts like non-error return"""
159 __super_init = addinfourl.__init__
171 return 'HTTP Error %s: %s' % (self.
code, self.
msg)
194 self.headers.update(headers)
201 if attr[:12] ==
'_Request__r_':
203 if hasattr(Request,
'get_' + name):
204 getattr(self,
'get_' + name)()
205 return getattr(self, attr)
206 raise AttributeError, attr
212 return self.
data is not None
221 if self.
type is None:
223 if self.
type is None:
224 raise ValueError,
"unknown url type: %s" % self.
__original
228 if self.
host is None:
247 server_version =
"Python-urllib/%s" % __version__
256 for meth
in dir(handler):
257 if meth[-5:] ==
'_open':
259 if self.handle_open.has_key(protocol):
266 j = meth[i+1:].
find(
'_') + i + 1
267 if j != -1
and meth[i+1:j] ==
'error':
274 dict = self.handle_error.get(proto, {})
275 if dict.has_key(kind):
276 dict[kind].
append(handler)
278 dict[kind] = [handler]
283 self.handlers.append(handler)
284 handler.add_parent(self)
294 def _call_chain(self, chain, kind, meth_name, *args):
297 handlers = chain.get(kind, ())
298 for handler
in handlers:
299 func = getattr(handler, meth_name)
302 if result
is not None:
305 def open(self, fullurl, data=None):
307 if isinstance(fullurl, (types.StringType, types.UnicodeType)):
313 assert isinstance(req, Request)
320 type_ = req.get_type()
330 if proto
in [
'http',
'https']:
334 meth_name =
'http_error_%d' % proto
339 meth_name = proto +
'_error'
341 args = (dict, proto, meth_name) + args
347 args = (dict,
'default',
'http_error_default') + orig_args
356 """Create an opener object from a list of handlers.
358 The opener will use several default handlers, including support
359 for HTTP and FTP. If there is a ProxyHandler, it must be at the
360 front of the list of handlers. (Yuck.)
362 If any of the handlers passed as arguments are subclasses of the
363 default handlers, the default handlers will not be used.
367 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
368 HTTPDefaultErrorHandler, HTTPRedirectHandler,
369 FTPHandler, FileHandler]
370 if hasattr(httplib,
'HTTPS'):
371 default_classes.append(HTTPSHandler)
373 for klass
in default_classes:
374 for check
in handlers:
376 if issubclass(check, klass):
378 elif isinstance(check, klass):
381 default_classes.remove(klass)
383 for klass
in default_classes:
384 opener.add_handler(klass())
389 opener.add_handler(h)
400 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
408 if headers.has_key(
'location'):
409 newurl = headers[
'location']
410 elif headers.has_key(
'uri'):
411 newurl = headers[
'uri']
419 new =
Request(newurl, req.get_data(), req.headers)
420 new.error_302_dict = {}
421 if hasattr(req,
'error_302_dict'):
422 if len(req.error_302_dict)>10
or \
423 req.error_302_dict.has_key(newurl):
424 raise HTTPError(req.get_full_url(), code,
425 self.
inf_msg + msg, headers, fp)
426 new.error_302_dict.update(req.error_302_dict)
427 new.error_302_dict[newurl] = newurl
434 return self.parent.open(new)
436 http_error_301 = http_error_302
438 inf_msg =
"The HTTP server returned a redirect error that would" \
439 "lead to an infinite loop.\n" \
440 "The last 302 error message was:\n"
446 assert hasattr(proxies,
'has_key'),
"proxies must be a mapping"
448 for type, url
in proxies.items():
449 setattr(self,
'%s_open' % type,
450 lambda r, proxy=url, type=type, meth=self.
proxy_open: \
451 meth(r, proxy, type))
454 orig_type = req.get_type()
458 user_pass, host = host.split(
'@', 1)
460 req.add_header(
'Proxy-Authorization',
'Basic '+user_pass)
462 req.set_proxy(host, type)
463 if orig_type == type:
471 return self.parent.open(req)
477 def __init__(self, proto, func=None, proxy_addr=None):
494 proto = req.get_type()
501 req.set_proxy(p.get_proxy())
502 return self.parent.open(req)
506 return self.parent.open(req)
509 if self.proxies.has_key(cpo.proto):
512 self.
proxies[cpo.proto] = [cpo]
520 if isinstance(uri, (types.StringType, types.UnicodeType)):
523 if not self.passwd.has_key(realm):
525 self.
passwd[realm][uri] = (user, passwd)
528 domains = self.passwd.get(realm, {})
530 for uris, authinfo
in domains.items():
537 """Accept netloc or URI and extract only the netloc and path"""
540 return parts[1], parts[2]
or '/'
545 """Check if test is below base in a URI tree
547 Both args must be URIs in reduced form.
551 if base[0] != test[0]:
554 if len(common) == len(base[1]):
562 user, password = HTTPPasswordMgr.find_user_password(self,realm,authuri)
564 return user, password
565 return HTTPPasswordMgr.find_user_password(self,
None, authuri)
570 rx = re.compile(
'[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"')
576 def __init__(self, password_mgr=None):
577 if password_mgr
is None:
579 self.passwd = password_mgr
580 self.add_password = self.passwd.add_password
582 def http_error_auth_reqed(self, authreq, host, req, headers):
584 authreq = headers.get(authreq,
None)
586 mo = AbstractBasicAuthHandler.rx.match(authreq)
588 scheme, realm = mo.groups()
589 if scheme.lower() ==
'basic':
590 return self.retry_http_basic_auth(host, req, realm)
592 def retry_http_basic_auth(self, host, req, realm):
593 user,pw = self.passwd.find_user_password(realm, host)
595 raw =
"%s:%s" % (user, pw)
597 if req.headers.get(self.auth_header,
None) == auth:
599 req.add_header(self.auth_header, auth)
600 return self.parent.open(req)
606 auth_header =
'Authorization'
610 return self.http_error_auth_reqed(
'www-authenticate',
616 auth_header =
'Proxy-Authorization'
619 host = req.get_host()
620 return self.http_error_auth_reqed(
'proxy-authenticate',
633 authreq = headers.get(self.auth_header,
None)
635 kind = authreq.split()[0]
640 token, challenge = auth.split(
' ', 1)
644 auth_val =
'Digest %s' % auth
645 if req.headers.get(self.auth_header,
None) == auth_val:
647 req.add_header(self.auth_header, auth_val)
648 resp = self.parent.open(req)
653 realm = chal[
'realm']
654 nonce = chal[
'nonce']
655 algorithm = chal.get(
'algorithm',
'MD5')
658 opaque = chal.get(
'opaque',
None)
666 user, pw = self.passwd.find_user_password(realm,
677 A1 =
"%s:%s:%s" % (user, realm, pw)
678 A2 =
"%s:%s" % (req.has_data()
and 'POST' or 'GET',
681 respdig = KD(H(A1),
"%s:%s" % (nonce, H(A2)))
684 base =
'username="%s", realm="%s", nonce="%s", uri="%s", ' \
685 'response="%s"' % (user, realm, nonce, req.get_selector(),
688 base = base +
', opaque="%s"' % opaque
690 base = base +
', digest="%s"' % entdig
691 if algorithm !=
'MD5':
692 base = base +
', algorithm="%s"' % algorithm
697 if algorithm ==
'MD5':
698 H =
lambda x, e=encode_digest:
e(md5.new(x).
digest())
699 elif algorithm ==
'SHA':
700 H =
lambda x, e=encode_digest:
e(sha.new(x).
digest())
702 KD =
lambda s, d, H=H: H(
"%s:%s" % (s, d))
711 """An authentication protocol defined by RFC 2069
713 Digest authentication improves on basic authentication because it
714 does not transmit passwords in the clear.
717 header =
'Authorization'
726 header =
'Proxy-Authorization'
729 host = req.get_host()
736 n = (ord(c) >> 4) & 0xf
737 hexrep.append(hex(n)[-1])
739 hexrep.append(hex(n)[-1])
740 return ''.
join(hexrep)
746 host = req.get_host()
753 data = req.get_data()
754 h.putrequest(
'POST', req.get_selector())
755 if not req.headers.has_key(
'Content-type'):
756 h.putheader(
'Content-type',
757 'application/x-www-form-urlencoded')
758 if not req.headers.has_key(
'Content-length'):
759 h.putheader(
'Content-length',
'%d' % len(data))
761 h.putrequest(
'GET', req.get_selector())
762 except socket.error, err:
765 h.putheader(
'Host', host)
766 for args
in self.parent.addheaders:
768 for k, v
in req.headers.items():
774 code, msg, hdrs = h.getreply()
777 return addinfourl(fp, hdrs, req.get_full_url())
779 return self.parent.error(
'http', req, fp, code, msg, hdrs)
788 if hasattr(httplib,
'HTTPS'):
797 type = req.get_type()
798 raise URLError(
'unknown url type: %s' % type)
801 """Parse list of key=value strings where keys are not duplicated."""
804 k, v = elt.split(
'=', 1)
805 if v[0] ==
'"' and v[-1] ==
'"':
811 """Parse lists as described by RFC 2068 Section 2.
813 In particular, parse comma-separated lists where the elements of
814 the list may include quoted-strings. A quoted-string could
829 list.append(s[start:])
833 raise ValueError,
"unbalanced quotes"
835 list.append(s[start:i+c])
840 list.append(s[start:i+c])
848 list.append(s[start:i+c])
854 return map(
lambda x: x.strip(), list)
859 url = req.get_selector()
860 if url[:2] ==
'//' and url[2:3] !=
'/':
862 return self.parent.open(req)
869 if FileHandler.names
is None:
870 FileHandler.names = (socket.gethostbyname(
'localhost'),
871 socket.gethostbyname(socket.gethostname()))
872 return FileHandler.names
876 host = req.get_host()
877 file = req.get_selector()
879 stats = os.stat(localfile)
880 size = stats[stat.ST_SIZE]
883 stats = os.stat(localfile)
885 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
886 (mtype
or 'text/plain', size, modified)))
890 (
not port
and socket.gethostbyname(host)
in self.
get_names()):
891 return addinfourl(
open(localfile,
'rb'),
892 headers,
'file:'+file)
893 raise URLError(
'file not on local host')
897 host = req.get_host()
899 raise IOError, (
'ftp error',
'no host given')
902 host = socket.gethostbyname(host)
903 except socket.error, msg:
907 port = ftplib.FTP_PORT
908 path, attrs =
splitattr(req.get_selector())
910 dirs = path.split(
'/')
911 dirs, file = dirs[:-1], dirs[-1]
912 if dirs
and not dirs[0]:
916 fw = self.
connect_ftp(user, passwd, host, port, dirs)
917 type = file
and 'I' or 'D'
920 if attr.lower() ==
'type' and \
921 value
in (
'a',
'A',
'i',
'I',
'd',
'D'):
923 fp, retrlen = fw.retrfile(file, type)
927 headers +=
"Content-Type: %s\n" % mtype
928 if retrlen
is not None and retrlen >= 0:
929 headers +=
"Content-Length: %d\n" % retrlen
932 return addinfourl(fp, headers, req.get_full_url())
933 except ftplib.all_errors, msg:
934 raise IOError, (
'ftp error', msg), sys.exc_info()[2]
937 fw = ftpwrapper(user, passwd, host, port, dirs)
958 key = user, passwd, host, port
959 if self.cache.has_key(key):
962 self.
cache[key] = ftpwrapper(user, passwd, host, port, dirs)
965 return self.
cache[key]
971 for k, v
in self.timeout.items():
980 for k, v
in self.timeout.items():
989 host = req.get_host()
993 selector = req.get_selector()
1002 return addinfourl(fp,
noheaders(), req.get_full_url())
1007 default_handlers = [UnknownHandler, HTTPHandler,
1008 HTTPDefaultErrorHandler, HTTPRedirectHandler,
1009 FTPHandler, FileHandler]
1010 proxy_handlers = [ProxyHandler]
1012 replacement_handlers = []
1024 opener = OpenerDirector()
1025 for ph
in self.proxy_handlers:
1028 opener.add_handler(ph)
1030 if __name__ ==
"__main__":
1034 if socket.gethostname() ==
'bitdiddle':
1035 localhost =
'bitdiddle.cnri.reston.va.us'
1036 elif socket.gethostname() ==
'bitdiddle.concentric.net':
1037 localhost =
'localhost'
1042 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1043 'gopher://gopher.vt.edu:10010/10/33',
1046 'file://nonsensename/etc/passwd',
1047 'ftp://www.python.org/pub/python/misc/sousa.au',
1048 'ftp://www.python.org/pub/tmp/blat',
1049 'http://www.espn.com/',
1050 'http://www.python.org/Spanish/Inquistion/',
1051 (
'http://www.python.org/cgi-bin/faqw.py',
1052 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1053 'http://www.python.org/',
1054 'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
1089 if isinstance(url, types.TupleType):
1096 except IOError, err:
1097 print "IOError:", err
1098 except socket.error, err:
1099 print "socket.error:", err
1103 print "read %d bytes" % len(buf)