"""Open an arbitrary URL.

See the following document for more info on URLs:
"Names and Addresses, URIs, URLs, URNs, URCs", at
http://www.w3.org/pub/WWW/Addressing/Overview.html

See also the HTTP spec (from which the error codes are derived):
"HTTP - Hypertext Transfer Protocol", at
http://www.w3.org/pub/WWW/Protocols/

Related standards and specs:
- RFC1808: the "relative URL" spec. (authoritative status)
- RFC1738 - the "URL standard". (authoritative status)
- RFC1630 - the "URI spec". (informational status)

The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
readlines(), fileno(), close() and info().  The read*(), fileno()
and close() methods work like those of open files.
The info() method returns a mimetools.Message object which can be
used to query various info about the object, if available.
(mimetools.Message objects are queried with the getheader() method.)
"""
33 __all__ = [
"urlopen",
"URLopener",
"FancyURLopener",
"urlretrieve",
34 "urlcleanup",
"quote",
"quote_plus",
"unquote",
"unquote_plus",
35 "urlencode",
"url2pathname",
"pathname2url",
"splittag",
36 "localhost",
"thishost",
"ftperrors",
"basejoin",
"unwrap",
37 "splittype",
"splithost",
"splituser",
"splitpasswd",
"splitport",
38 "splitnport",
"splitquery",
"splitattr",
"splitvalue",
39 "splitgophertype",
"getproxies"]
47 from macurl2path
import url2pathname, pathname2url
49 from nturl2path
import url2pathname, pathname2url
50 elif os.name ==
'riscos':
51 from rourl2path
import url2pathname, pathname2url
56 return quote(pathname)
68 """urlopen(url [, data]) -> open file-like object"""
73 return _urlopener.open(url)
75 return _urlopener.open(url, data)
76 def urlretrieve(url, filename=None, reporthook=None, data=None):
80 return _urlopener.retrieve(url, filename, reporthook, data)
88 """Class to open URLs.
89 This is a class rather than just a subroutine because we may need
90 more than one set of global protocol-specific options.
91 Note -- this is a base class for those who don't want the
92 automatic handling of errors type 302 (relocated) and 401
93 (authorization needed)."""
97 version =
"Python-urllib/%s" % __version__
103 assert hasattr(proxies,
'has_key'),
"proxies must be a mapping"
141 self.tempcache.clear()
144 """Add a header to be used by the HTTP interface only
145 e.g. u.addheader('Accept', 'sound/basic')"""
146 self.addheaders.append(args)
149 def open(self, fullurl, data=None):
150 """Use URLopener().open(file) instead of open(file, 'r')."""
152 if self.
tempcache and self.tempcache.has_key(fullurl):
153 filename, headers = self.
tempcache[fullurl]
154 fp =
open(filename,
'rb')
159 if self.proxies.has_key(urltype):
163 url = (host, fullurl)
166 name =
'open_' + urltype
170 name =
'_'.
join(name.split(
'-'))
171 if not hasattr(self, name):
178 return getattr(self, name)(url)
180 return getattr(self, name)(url, data)
181 except socket.error, msg:
182 raise IOError, (
'socket error', msg), sys.exc_info()[2]
185 """Overridable interface to open unknown URL type."""
187 raise IOError, (
'url error',
'unknown url type', type)
"""Overridable interface to handle an invalid proxy for a URL type."""
192 raise IOError, (
'url error',
'invalid proxy for %s' % type, proxy)
195 def retrieve(self, url, filename=None, reporthook=None, data=None):
196 """retrieve(url) returns (filename, None) for a local object
197 or (tempfilename, headers) for a remote object."""
199 if self.
tempcache and self.tempcache.has_key(url):
202 if not filename
and (
not type
or type ==
'file'):
210 fp = self.
open(url, data)
218 suffix = os.path.splitext(path)[1]
220 self.__tempfiles.append(filename)
221 result = filename, headers
224 tfp =
open(filename,
'wb')
229 if headers.has_key(
"content-length"):
230 size = int(headers[
"Content-Length"])
238 blocknum = blocknum + 1
250 """Use HTTP protocol."""
253 if type(url)
is types.StringType:
264 if urltype.lower() !=
'http':
269 user_passwd, realhost =
splituser(realhost)
271 selector =
"%s://%s%s" % (urltype, realhost, rest)
276 if not host:
raise IOError, (
'http error',
'no host given')
284 h.putrequest(
'POST', selector)
285 h.putheader(
'Content-type',
'application/x-www-form-urlencoded')
286 h.putheader(
'Content-length',
'%d' % len(data))
288 h.putrequest(
'GET', selector)
289 if auth: h.putheader(
'Authorization',
'Basic %s' % auth)
290 if realhost: h.putheader(
'Host', realhost)
291 for args
in self.
addheaders: apply(h.putheader, args)
295 errcode, errmsg, headers = h.getreply()
301 return self.
http_error(url, fp, errcode, errmsg, headers)
303 return self.
http_error(url, fp, errcode, errmsg, headers, data)
305 def http_error(self, url, fp, errcode, errmsg, headers, data=None):
306 """Handle http errors.
307 Derived class can override this, or provide specific handlers
308 named http_error_DDD where DDD is the 3-digit error code."""
310 name =
'http_error_%d' % errcode
311 if hasattr(self, name):
312 method = getattr(self, name)
314 result = method(url, fp, errcode, errmsg, headers)
316 result = method(url, fp, errcode, errmsg, headers, data)
317 if result:
return result
321 """Default error handler: close the connection and raise IOError."""
324 raise IOError, (
'http error', errcode, errmsg, headers)
326 if hasattr(socket,
"ssl"):
328 """Use HTTPS protocol."""
331 if type(url)
is types.StringType:
342 if urltype.lower() !=
'https':
347 user_passwd, realhost =
splituser(realhost)
349 selector =
"%s://%s%s" % (urltype, realhost, rest)
351 if not host:
raise IOError, (
'https error',
'no host given')
361 h.putrequest(
'POST', selector)
362 h.putheader(
'Content-type',
363 'application/x-www-form-urlencoded')
364 h.putheader(
'Content-length',
'%d' % len(data))
366 h.putrequest(
'GET', selector)
# Pass the header name and its value as separate arguments, exactly as the
# HTTP opener does; fusing them into one string yields a malformed header.
if auth: h.putheader('Authorization', 'Basic %s' % auth)
368 if realhost: h.putheader(
'Host', realhost)
369 for args
in self.
addheaders: apply(h.putheader, args)
373 errcode, errmsg, headers = h.getreply()
376 return addinfourl(fp, headers,
"https:" + url)
379 return self.
http_error(url, fp, errcode, errmsg, headers)
381 return self.
http_error(url, fp, errcode, errmsg, headers,
385 """Use Gopher protocol."""
388 if not host:
raise IOError, (
'gopher error',
'no host given')
401 """Use local file or FTP depending on form of URL."""
402 if url[:2] ==
'//' and url[2:3] !=
'/':
408 """Use local file."""
409 import mimetypes, mimetools, rfc822, StringIO
412 stats = os.stat(localname)
413 size = stats[stat.ST_SIZE]
417 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
418 (mtype
or 'text/plain', size, modified)))
422 urlfile =
'file://' + file
430 urlfile =
'file://' + file
433 raise IOError, (
'local file error',
'not on local host')
436 """Use FTP protocol."""
437 import mimetypes, mimetools, StringIO
439 if not host:
raise IOError, (
'ftp error',
'no host given')
447 host = socket.gethostbyname(host)
450 port = ftplib.FTP_PORT
455 dirs = path.split(
'/')
456 dirs, file = dirs[:-1], dirs[-1]
457 if dirs
and not dirs[0]: dirs = dirs[1:]
458 if dirs
and not dirs[0]: dirs[0] =
'/'
459 key = user, host, port,
'/'.
join(dirs)
461 if len(self.
ftpcache) > MAXFTPCACHE:
463 for k
in self.ftpcache.keys():
469 if not self.ftpcache.has_key(key):
472 if not file: type =
'D'
476 if attr.lower() ==
'type' and \
477 value
in (
'a',
'A',
'i',
'I',
'd',
'D'):
479 (fp, retrlen) = self.
ftpcache[key].retrfile(file, type)
483 headers +=
"Content-Type: %s\n" % mtype
484 if retrlen
is not None and retrlen >= 0:
485 headers +=
"Content-Length: %d\n" % retrlen
489 raise IOError, (
'ftp error', msg), sys.exc_info()[2]
492 """Use "data" URL."""
500 import StringIO, mimetools, time
502 [type, data] = url.split(
',', 1)
504 raise IOError, (
'data error',
'bad data URL')
506 type =
'text/plain;charset=US-ASCII'
507 semi = type.rfind(
';')
508 if semi >= 0
and '=' not in type[semi:]:
509 encoding = type[semi+1:]
514 msg.append(
'Date: %s'%time.strftime(
'%a, %d %b %Y %T GMT',
515 time.gmtime(time.time())))
516 msg.append(
'Content-type: %s' % type)
517 if encoding ==
'base64':
522 msg.append(
'Content-length: %d' % len(data))
533 """Derived class with handlers for errors we can handle (perhaps)."""
536 apply(URLopener.__init__, (self,) + args)
542 """Default error handling -- don't raise an exception."""
546 """Error 302 -- relocated (temporarily)."""
549 if hasattr(self,
"http_error_500"):
550 meth = self.http_error_500
554 return meth(url, fp, 500,
555 "Internal Server Error: Redirect Recursion", headers)
562 if headers.has_key(
'location'):
563 newurl = headers[
'location']
564 elif headers.has_key(
'uri'):
565 newurl = headers[
'uri']
573 return self.
open(newurl)
575 return self.
open(newurl, data)
578 """Error 301 -- also relocated (permanently)."""
579 return self.
http_error_302(url, fp, errcode, errmsg, headers, data)
582 """Error 401 -- authentication required.
583 See this URL for a description of the basic authentication scheme:
584 http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
585 if not headers.has_key(
'www-authenticate'):
586 URLopener.http_error_default(self, url, fp,
587 errcode, errmsg, headers)
588 stuff = headers[
'www-authenticate']
590 match = re.match(
'[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
592 URLopener.http_error_default(self, url, fp,
593 errcode, errmsg, headers)
594 scheme, realm = match.groups()
595 if scheme.lower() !=
'basic':
596 URLopener.http_error_default(self, url, fp,
597 errcode, errmsg, headers)
598 name =
'retry_' + self.
type +
'_basic_auth'
600 return getattr(self,name)(url, realm)
602 return getattr(self,name)(url, realm, data)
606 i = host.find(
'@') + 1
609 if not (user
or passwd):
return None
610 host =
quote(user, safe=
'') +
':' +
quote(passwd, safe=
'') +
'@' + host
611 newurl =
'http://' + host + selector
613 return self.
open(newurl)
615 return self.
open(newurl, data)
619 i = host.find(
'@') + 1
622 if not (user
or passwd):
return None
623 host =
quote(user, safe=
'') +
':' +
quote(passwd, safe=
'') +
'@' + host
624 newurl =
'//' + host + selector
628 key = realm +
'@' + host.lower()
629 if self.auth_cache.has_key(key):
635 if user
or passwd: self.
auth_cache[key] = (user, passwd)
639 """Override this in a GUI environment!"""
642 user = raw_input(
"Enter username for %s at %s: " % (realm,
647 except KeyboardInterrupt:
656 """Return the IP address of the magic hostname 'localhost'."""
659 _localhost = socket.gethostbyname(
'localhost')
664 """Return the IP address of the current host."""
667 _thishost = socket.gethostbyname(socket.gethostname())
672 """Return the set of errors raised by the FTP class."""
676 _ftperrors = ftplib.all_errors
681 """Return an empty mimetools.Message object."""
687 _noheaders.fp.close()
694 """Class used by open_ftp() for cache of open FTP connections."""
696 def __init__(self, user, passwd, host, port, dirs):
708 self.ftp.connect(self.
host, self.
port)
710 for dir
in self.
dirs:
716 if type
in (
'd',
'D'): cmd =
'TYPE A'; isdir = 1
717 else: cmd =
'TYPE ' + type; isdir = 0
719 self.ftp.voidcmd(cmd)
720 except ftplib.all_errors:
722 self.ftp.voidcmd(cmd)
724 if file
and not isdir:
729 raise IOError, (
'ftp error', reason), sys.exc_info()[2]
731 self.ftp.voidcmd(cmd)
735 conn = self.ftp.ntransfercmd(cmd)
737 if str(reason)[:3] !=
'550':
738 raise IOError, (
'ftp error', reason), sys.exc_info()[2]
741 self.ftp.voidcmd(
'TYPE A')
743 if file: cmd =
'LIST ' + file
745 conn = self.ftp.ntransfercmd(cmd)
767 """Base class for addinfo and addclosehook."""
773 if hasattr(self.
fp,
"readlines"): self.
readlines = self.fp.readlines
774 if hasattr(self.
fp,
"fileno"): self.
fileno = self.fp.fileno
777 return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
778 `id(self)`, `self.
fp`)
785 if self.
fp: self.fp.close()
789 """Class to add a close hook to an open file."""
792 addbase.__init__(self, fp)
804 """class to add an info() method to an open file."""
807 addbase.__init__(self, fp)
814 """class to add info() and geturl() methods to an open file."""
817 addbase.__init__(self, fp)
829 """Utility to combine a URL with a base URL to form a new URL."""
838 if type:
return type +
'://' + host + path
844 basepath, basetag =
splittag(basepath)
848 if path[:1]
in (
'#',
'?'):
853 i = basepath.rfind(
'/')
864 basepath = basepath[:i+1]
866 while basepath
and path[:3] ==
'../':
868 i = basepath[:-1].
rfind(
'/')
870 basepath = basepath[:i+1]
877 path = basepath + path
878 if host
and path
and path[0] !=
'/':
880 if type
and host:
return type +
'://' + host + path
881 elif type:
return type +
':' + path
882 elif host:
return '//' + host + path
903 """toBytes(u"URL") --> 'URL'."""
906 if type(url)
is types.UnicodeType:
908 url = url.encode(
"ASCII")
910 raise UnicodeError(
"URL " +
repr(url) +
911 " contains non-ASCII characters")
915 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
917 if url[:1] ==
'<' and url[-1:] ==
'>':
918 url = url[1:-1].
strip()
919 if url[:4] ==
'URL:': url = url[4:].
strip()
924 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
926 if _typeprog
is None:
928 _typeprog = re.compile(
'^([^/:]+):')
930 match = _typeprog.match(url)
932 scheme = match.group(1)
933 return scheme.lower(), url[len(scheme) + 1:]
938 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
940 if _hostprog
is None:
942 _hostprog = re.compile(
'^//([^/]*)(.*)$')
944 match = _hostprog.match(url)
945 if match:
return match.group(1, 2)
950 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
952 if _userprog
is None:
954 _userprog = re.compile(
'^([^@]*)@(.*)$')
956 match = _userprog.match(host)
957 if match:
return map(unquote, match.group(1, 2))
962 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
964 if _passwdprog
is None:
966 _passwdprog = re.compile(
'^([^:]*):(.*)$')
968 match = _passwdprog.match(user)
969 if match:
return match.group(1, 2)
975 """splitport('host:port') --> 'host', 'port'."""
977 if _portprog
is None:
979 _portprog = re.compile(
'^(.*):([0-9]+)$')
981 match = _portprog.match(host)
982 if match:
return match.group(1, 2)
"""Split host and port, returning numeric port.
Return given default port if no ':' found; defaults to -1.
Return numerical port if a valid number is found after ':'.
Return None if ':' but not a valid number."""
992 if _nportprog
is None:
994 _nportprog = re.compile(
'^(.*):(.*)$')
996 match = _nportprog.match(host)
998 host, port = match.group(1, 2)
1000 if not port:
raise ValueError,
"no digits"
1005 return host, defport
1009 """splitquery('/path?query') --> '/path', 'query'."""
1011 if _queryprog
is None:
1013 _queryprog = re.compile(
'^(.*)\?([^?]*)$')
1015 match = _queryprog.match(url)
1016 if match:
return match.group(1, 2)
1021 """splittag('/path#tag') --> '/path', 'tag'."""
1023 if _tagprog
is None:
1025 _tagprog = re.compile(
'^(.*)#([^#]*)$')
1027 match = _tagprog.match(url)
1028 if match:
return match.group(1, 2)
1032 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1033 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1034 words = url.split(
';')
1035 return words[0], words[1:]
1039 """splitvalue('attr=value') --> 'attr', 'value'."""
1041 if _valueprog
is None:
1043 _valueprog = re.compile(
'^([^=]*)=(.*)$')
1045 match = _valueprog.match(attr)
1046 if match:
return match.group(1, 2)
1050 """splitgophertype('/Xselector') --> 'X', 'selector'."""
1051 if selector[:1] ==
'/' and selector[1:2]:
1052 return selector[1], selector[2:]
1053 return None, selector
1056 """unquote('abc%20def') -> 'abc def'."""
1061 myappend = res.append
1066 myappend(mychr(myatoi(item[:2], 16))
1069 myappend(
'%' + item)
1071 myappend(
'%' + item)
"""unquote_plus('%7e/abc+def') -> '~/abc def'"""
1078 s =
' '.
join(s.split(
'+'))
1081 always_safe = (
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1082 'abcdefghijklmnopqrstuvwxyz'
1085 _fast_safe_test = always_safe +
'/'
1090 if _fast_safe
is None:
1092 for c
in _fast_safe_test:
1095 for i
in range(len(res)):
1097 if not _fast_safe.has_key(c):
1098 res[i] =
'%%%02X' % ord(c)
1102 """quote('abc def') -> 'abc%20def'
1104 Each part of a URL, e.g. the path info, the query, etc., has a
1105 different set of reserved characters that must be quoted.
1107 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1108 the following reserved characters.
1110 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1113 Each of these characters is reserved in some component of a URL,
1114 but not necessarily in all of them.
1116 By default, the quote function is intended for quoting the path
1117 section of a URL. Thus, it will not encode '/'. This character
1118 is reserved, but in typical usage the quote function is being
1119 called on a path where the existing slash characters are used as
1120 reserved characters.
1122 safe = always_safe + safe
1123 if _fast_safe_test == safe:
1124 return _fast_quote(s)
1126 for i
in range(len(res)):
1129 res[i] =
'%%%02X' % ord(c)
1133 """Quote the query fragment of a URL; replacing ' ' with '+'"""
1136 for i
in range(len(l)):
1137 l[i] =
quote(l[i], safe)
1140 return quote(s, safe)
1143 """Encode a sequence of two-element tuples or dictionary into a URL query string.
1145 If any values in the query arg are sequences and doseq is true, each
1146 sequence element is converted to a separate parameter.
1148 If the query arg is a sequence of two-element tuples, the order of the
1149 parameters in the output will match the order of parameters in the
1153 if hasattr(query,
"items"):
1155 query = query.items()
1163 if len(query)
and type(query[0]) != types.TupleType:
1170 ty,va,tb = sys.exc_info()
1171 raise TypeError,
"not a valid non-string sequence or mapping object", tb
1179 l.append(k +
'=' + v)
1183 if type(v) == types.StringType:
1185 l.append(k +
'=' + v)
1186 elif type(v) == types.UnicodeType:
1191 l.append(k +
'=' + v)
1199 l.append(k +
'=' + v)
1208 """Return a dictionary of scheme -> proxy server URL mappings.
1210 Scan the environment for variables named <scheme>_proxy;
1211 this seems to be the standard convention. If you need a
1212 different way, you can pass a proxies dictionary to the
1213 [Fancy]URLopener constructor.
1217 for name, value
in os.environ.items():
1219 if value
and name[-6:] ==
'_proxy':
1220 proxies[name[:-6]] = value
1223 if os.name ==
'mac':
1225 """Return a dictionary of scheme -> proxy server URL mappings.
1227 By convention the mac uses Internet Config to store
1228 proxies. An HTTP proxy, for instance, is stored under
1243 if config.has_key(
'UseHTTPProxy')
and config[
'UseHTTPProxy']:
1245 value = config[
'HTTPProxyHost']
1249 proxies[
'http'] =
'http://%s' % value
1257 elif os.name ==
'nt':
1259 """Return a dictionary of scheme -> proxy server URL mappings.
1261 Win32 uses the registry to store proxies.
1271 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1272 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1273 proxyEnable = _winreg.QueryValueEx(internetSettings,
1277 proxyServer =
str(_winreg.QueryValueEx(internetSettings,
1279 if '=' in proxyServer:
1281 for p
in proxyServer.split(
';'):
1282 protocol, address = p.split(
'=', 1)
1285 if not re.match(
'^([^/:]+)://', address):
1286 address =
'%s://%s' % (protocol, address)
1287 proxies[protocol] = address
1290 if proxyServer[:5] ==
'http:':
1291 proxies[
'http'] = proxyServer
1293 proxies[
'http'] =
'http://%s' % proxyServer
1294 proxies[
'ftp'] =
'ftp://%s' % proxyServer
1295 internetSettings.Close()
1296 except (WindowsError, ValueError, TypeError):
1304 """Return a dictionary of scheme -> proxy server URL mappings.
1306 Returns settings gathered from the environment, if specified,
1321 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1322 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1323 proxyEnable = _winreg.QueryValueEx(internetSettings,
1325 proxyOverride =
str(_winreg.QueryValueEx(internetSettings,
1326 'ProxyOverride')[0])
1328 except WindowsError:
1330 if not proxyEnable
or not proxyOverride:
1335 addr = socket.gethostbyname(host[0])
1338 except socket.error:
1343 proxyOverride = proxyOverride.split(
';')
1345 while i < len(proxyOverride):
1346 if proxyOverride[i] ==
'<local>':
1347 proxyOverride[i:i+1] = [
'localhost',
1349 socket.gethostname(),
1350 socket.gethostbyname(
1351 socket.gethostname())]
1355 for test
in proxyOverride:
1356 test = test.replace(
".",
r"\.")
1357 test = test.replace(
"*",
r".*")
1358 test = test.replace(
"?",
r".")
1361 if re.match(test, val, re.I):
1367 getproxies = getproxies_environment
1376 for i
in range(256): s = s + chr(i)
1387 print round(t1 - t0, 3),
'sec'
1392 print "Block number: %d, Block size: %d, Total size: %d" % (
1393 blocknum, blocksize, totalsize)
1401 'file://localhost/etc/passwd',
1402 'ftp://ftp.python.org/pub/python/README',
1404 'http://www.python.org/index.html',
1406 if hasattr(URLopener,
"open_https"):
1407 args.append(
'https://synergy.as.cmu.edu/~geek/')
1410 print '-'*10, url,
'-'*10
1415 for k
in h.keys():
print k +
':', h[k]
1422 data = data.translate(table,
"\r")
1433 except getopt.error, msg:
1435 print "Use -h for help"
1442 print "Usage: python urllib.py [-t] [url ...]"
1443 print "-t runs self-test;",
1444 print "otherwise, contents of urls are printed"
1452 print "Use -h for help"
1457 if __name__ ==
'__main__':