Vega Strike Python Modules documentation 0.5.1
Documentation of the "Modules" folder of Vega Strike
 All Data Structures Namespaces Files Functions Variables
urllib.py
Go to the documentation of this file.
1 """Open an arbitrary URL.
2 
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
6 
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
10 
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
15 
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
23 """
24 
25 import string
26 import socket
27 import os
28 import stat
29 import time
30 import sys
31 import types
32 
33 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
34  "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
35  "urlencode", "url2pathname", "pathname2url", "splittag",
36  "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
37  "splittype", "splithost", "splituser", "splitpasswd", "splitport",
38  "splitnport", "splitquery", "splitattr", "splitvalue",
39  "splitgophertype", "getproxies"]
40 
41 __version__ = '1.15' # XXX This version is not always updated :-(
42 
43 MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
44 
45 # Helper for non-unix systems
46 if os.name == 'mac':
47  from macurl2path import url2pathname, pathname2url
48 elif os.name == 'nt':
49  from nturl2path import url2pathname, pathname2url
50 elif os.name == 'riscos':
51  from rourl2path import url2pathname, pathname2url
52 else:
53  def url2pathname(pathname):
54  return unquote(pathname)
55  def pathname2url(pathname):
56  return quote(pathname)
57 
58 # This really consists of two pieces:
59 # (1) a class which handles opening of all sorts of URLs
60 # (plus assorted utilities etc.)
61 # (2) a set of functions for parsing URLs
62 # XXX Should these be separated out into different modules?
63 
64 
65 # Shortcut for basic usage
66 _urlopener = None
67 def urlopen(url, data=None):
68  """urlopen(url [, data]) -> open file-like object"""
69  global _urlopener
70  if not _urlopener:
71  _urlopener = FancyURLopener()
72  if data is None:
73  return _urlopener.open(url)
74  else:
75  return _urlopener.open(url, data)
76 def urlretrieve(url, filename=None, reporthook=None, data=None):
77  global _urlopener
78  if not _urlopener:
79  _urlopener = FancyURLopener()
80  return _urlopener.retrieve(url, filename, reporthook, data)
81 def urlcleanup():
82  if _urlopener:
83  _urlopener.cleanup()
84 
85 
86 ftpcache = {}
87 class URLopener:
88  """Class to open URLs.
89  This is a class rather than just a subroutine because we may need
90  more than one set of global protocol-specific options.
91  Note -- this is a base class for those who don't want the
92  automatic handling of errors type 302 (relocated) and 401
93  (authorization needed)."""
94 
95  __tempfiles = None
96 
97  version = "Python-urllib/%s" % __version__
98 
99  # Constructor
100  def __init__(self, proxies=None, **x509):
101  if proxies is None:
102  proxies = getproxies()
103  assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
104  self.proxies = proxies
105  self.key_file = x509.get('key_file')
106  self.cert_file = x509.get('cert_file')
107  self.addheaders = [('User-agent', self.version)]
108  self.__tempfiles = []
109  self.__unlink = os.unlink # See cleanup()
110  self.tempcache = None
111  # Undocumented feature: if you assign {} to tempcache,
112  # it is used to cache files retrieved with
113  # self.retrieve(). This is not enabled by default
114  # since it does not work for changing documents (and I
115  # haven't got the logic to check expiration headers
116  # yet).
117  self.ftpcache = ftpcache
118  # Undocumented feature: you can use a different
119  # ftp cache by assigning to the .ftpcache member;
120  # in case you want logically independent URL openers
121  # XXX This is not threadsafe. Bah.
122 
123  def __del__(self):
124  self.close()
125 
126  def close(self):
127  self.cleanup()
128 
129  def cleanup(self):
130  # This code sometimes runs when the rest of this module
131  # has already been deleted, so it can't use any globals
132  # or import anything.
133  if self.__tempfiles:
134  for file in self.__tempfiles:
135  try:
136  self.__unlink(file)
137  except OSError:
138  pass
139  del self.__tempfiles[:]
140  if self.tempcache:
141  self.tempcache.clear()
142 
143  def addheader(self, *args):
144  """Add a header to be used by the HTTP interface only
145  e.g. u.addheader('Accept', 'sound/basic')"""
146  self.addheaders.append(args)
147 
148  # External interface
149  def open(self, fullurl, data=None):
150  """Use URLopener().open(file) instead of open(file, 'r')."""
151  fullurl = unwrap(toBytes(fullurl))
152  if self.tempcache and self.tempcache.has_key(fullurl):
153  filename, headers = self.tempcache[fullurl]
154  fp = open(filename, 'rb')
155  return addinfourl(fp, headers, fullurl)
156  urltype, url = splittype(fullurl)
157  if not urltype:
158  urltype = 'file'
159  if self.proxies.has_key(urltype):
160  proxy = self.proxies[urltype]
161  urltype, proxyhost = splittype(proxy)
162  host, selector = splithost(proxyhost)
163  url = (host, fullurl) # Signal special case to open_*()
164  else:
165  proxy = None
166  name = 'open_' + urltype
167  self.type = urltype
168  if '-' in name:
169  # replace - with _
170  name = '_'.join(name.split('-'))
171  if not hasattr(self, name):
172  if proxy:
173  return self.open_unknown_proxy(proxy, fullurl, data)
174  else:
175  return self.open_unknown(fullurl, data)
176  try:
177  if data is None:
178  return getattr(self, name)(url)
179  else:
180  return getattr(self, name)(url, data)
181  except socket.error, msg:
182  raise IOError, ('socket error', msg), sys.exc_info()[2]
183 
184  def open_unknown(self, fullurl, data=None):
185  """Overridable interface to open unknown URL type."""
186  type, url = splittype(fullurl)
187  raise IOError, ('url error', 'unknown url type', type)
188 
189  def open_unknown_proxy(self, proxy, fullurl, data=None):
190  """Overridable interface to open unknown URL type."""
191  type, url = splittype(fullurl)
192  raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
193 
194  # External interface
195  def retrieve(self, url, filename=None, reporthook=None, data=None):
196  """retrieve(url) returns (filename, None) for a local object
197  or (tempfilename, headers) for a remote object."""
198  url = unwrap(toBytes(url))
199  if self.tempcache and self.tempcache.has_key(url):
200  return self.tempcache[url]
201  type, url1 = splittype(url)
202  if not filename and (not type or type == 'file'):
203  try:
204  fp = self.open_local_file(url1)
205  hdrs = fp.info()
206  del fp
207  return url2pathname(splithost(url1)[1]), hdrs
208  except IOError, msg:
209  pass
210  fp = self.open(url, data)
211  headers = fp.info()
212  if not filename:
213  import tempfile
214  garbage, path = splittype(url)
215  garbage, path = splithost(path or "")
216  path, garbage = splitquery(path or "")
217  path, garbage = splitattr(path or "")
218  suffix = os.path.splitext(path)[1]
219  filename = tempfile.mktemp(suffix)
220  self.__tempfiles.append(filename)
221  result = filename, headers
222  if self.tempcache is not None:
223  self.tempcache[url] = result
224  tfp = open(filename, 'wb')
225  bs = 1024*8
226  size = -1
227  blocknum = 1
228  if reporthook:
229  if headers.has_key("content-length"):
230  size = int(headers["Content-Length"])
231  reporthook(0, bs, size)
232  block = fp.read(bs)
233  if reporthook:
234  reporthook(1, bs, size)
235  while block:
236  tfp.write(block)
237  block = fp.read(bs)
238  blocknum = blocknum + 1
239  if reporthook:
240  reporthook(blocknum, bs, size)
241  fp.close()
242  tfp.close()
243  del fp
244  del tfp
245  return result
246 
247  # Each method named open_<type> knows how to open that type of URL
248 
249  def open_http(self, url, data=None):
250  """Use HTTP protocol."""
251  import httplib
252  user_passwd = None
253  if type(url) is types.StringType:
254  host, selector = splithost(url)
255  if host:
256  user_passwd, host = splituser(host)
257  host = unquote(host)
258  realhost = host
259  else:
260  host, selector = url
261  urltype, rest = splittype(selector)
262  url = rest
263  user_passwd = None
264  if urltype.lower() != 'http':
265  realhost = None
266  else:
267  realhost, rest = splithost(rest)
268  if realhost:
269  user_passwd, realhost = splituser(realhost)
270  if user_passwd:
271  selector = "%s://%s%s" % (urltype, realhost, rest)
272  if proxy_bypass(realhost):
273  host = realhost
274 
275  #print "proxy via http:", host, selector
276  if not host: raise IOError, ('http error', 'no host given')
277  if user_passwd:
278  import base64
279  auth = base64.encodestring(user_passwd).strip()
280  else:
281  auth = None
282  h = httplib.HTTP(host)
283  if data is not None:
284  h.putrequest('POST', selector)
285  h.putheader('Content-type', 'application/x-www-form-urlencoded')
286  h.putheader('Content-length', '%d' % len(data))
287  else:
288  h.putrequest('GET', selector)
289  if auth: h.putheader('Authorization', 'Basic %s' % auth)
290  if realhost: h.putheader('Host', realhost)
291  for args in self.addheaders: apply(h.putheader, args)
292  h.endheaders()
293  if data is not None:
294  h.send(data)
295  errcode, errmsg, headers = h.getreply()
296  fp = h.getfile()
297  if errcode == 200:
298  return addinfourl(fp, headers, "http:" + url)
299  else:
300  if data is None:
301  return self.http_error(url, fp, errcode, errmsg, headers)
302  else:
303  return self.http_error(url, fp, errcode, errmsg, headers, data)
304 
305  def http_error(self, url, fp, errcode, errmsg, headers, data=None):
306  """Handle http errors.
307  Derived class can override this, or provide specific handlers
308  named http_error_DDD where DDD is the 3-digit error code."""
309  # First check if there's a specific handler for this error
310  name = 'http_error_%d' % errcode
311  if hasattr(self, name):
312  method = getattr(self, name)
313  if data is None:
314  result = method(url, fp, errcode, errmsg, headers)
315  else:
316  result = method(url, fp, errcode, errmsg, headers, data)
317  if result: return result
318  return self.http_error_default(url, fp, errcode, errmsg, headers)
319 
320  def http_error_default(self, url, fp, errcode, errmsg, headers):
321  """Default error handler: close the connection and raise IOError."""
322  void = fp.read()
323  fp.close()
324  raise IOError, ('http error', errcode, errmsg, headers)
325 
326  if hasattr(socket, "ssl"):
327  def open_https(self, url, data=None):
328  """Use HTTPS protocol."""
329  import httplib
330  user_passwd = None
331  if type(url) is types.StringType:
332  host, selector = splithost(url)
333  if host:
334  user_passwd, host = splituser(host)
335  host = unquote(host)
336  realhost = host
337  else:
338  host, selector = url
339  urltype, rest = splittype(selector)
340  url = rest
341  user_passwd = None
342  if urltype.lower() != 'https':
343  realhost = None
344  else:
345  realhost, rest = splithost(rest)
346  if realhost:
347  user_passwd, realhost = splituser(realhost)
348  if user_passwd:
349  selector = "%s://%s%s" % (urltype, realhost, rest)
350  #print "proxy via https:", host, selector
351  if not host: raise IOError, ('https error', 'no host given')
352  if user_passwd:
353  import base64
354  auth = base64.encodestring(user_passwd).strip()
355  else:
356  auth = None
357  h = httplib.HTTPS(host, 0,
358  key_file=self.key_file,
359  cert_file=self.cert_file)
360  if data is not None:
361  h.putrequest('POST', selector)
362  h.putheader('Content-type',
363  'application/x-www-form-urlencoded')
364  h.putheader('Content-length', '%d' % len(data))
365  else:
366  h.putrequest('GET', selector)
367  if auth: h.putheader('Authorization: Basic %s' % auth)
368  if realhost: h.putheader('Host', realhost)
369  for args in self.addheaders: apply(h.putheader, args)
370  h.endheaders()
371  if data is not None:
372  h.send(data)
373  errcode, errmsg, headers = h.getreply()
374  fp = h.getfile()
375  if errcode == 200:
376  return addinfourl(fp, headers, "https:" + url)
377  else:
378  if data is None:
379  return self.http_error(url, fp, errcode, errmsg, headers)
380  else:
381  return self.http_error(url, fp, errcode, errmsg, headers,
382  data)
383 
384  def open_gopher(self, url):
385  """Use Gopher protocol."""
386  import gopherlib
387  host, selector = splithost(url)
388  if not host: raise IOError, ('gopher error', 'no host given')
389  host = unquote(host)
390  type, selector = splitgophertype(selector)
391  selector, query = splitquery(selector)
392  selector = unquote(selector)
393  if query:
394  query = unquote(query)
395  fp = gopherlib.send_query(selector, query, host)
396  else:
397  fp = gopherlib.send_selector(selector, host)
398  return addinfourl(fp, noheaders(), "gopher:" + url)
399 
400  def open_file(self, url):
401  """Use local file or FTP depending on form of URL."""
402  if url[:2] == '//' and url[2:3] != '/':
403  return self.open_ftp(url)
404  else:
405  return self.open_local_file(url)
406 
407  def open_local_file(self, url):
408  """Use local file."""
409  import mimetypes, mimetools, rfc822, StringIO
410  host, file = splithost(url)
411  localname = url2pathname(file)
412  stats = os.stat(localname)
413  size = stats[stat.ST_SIZE]
414  modified = rfc822.formatdate(stats[stat.ST_MTIME])
415  mtype = mimetypes.guess_type(url)[0]
417  'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
418  (mtype or 'text/plain', size, modified)))
419  if not host:
420  urlfile = file
421  if file[:1] == '/':
422  urlfile = 'file://' + file
423  return addinfourl(open(localname, 'rb'),
424  headers, urlfile)
425  host, port = splitport(host)
426  if not port \
427  and socket.gethostbyname(host) in (localhost(), thishost()):
428  urlfile = file
429  if file[:1] == '/':
430  urlfile = 'file://' + file
431  return addinfourl(open(localname, 'rb'),
432  headers, urlfile)
433  raise IOError, ('local file error', 'not on local host')
434 
435  def open_ftp(self, url):
436  """Use FTP protocol."""
437  import mimetypes, mimetools, StringIO
438  host, path = splithost(url)
439  if not host: raise IOError, ('ftp error', 'no host given')
440  host, port = splitport(host)
441  user, host = splituser(host)
442  if user: user, passwd = splitpasswd(user)
443  else: passwd = None
444  host = unquote(host)
445  user = unquote(user or '')
446  passwd = unquote(passwd or '')
447  host = socket.gethostbyname(host)
448  if not port:
449  import ftplib
450  port = ftplib.FTP_PORT
451  else:
452  port = int(port)
453  path, attrs = splitattr(path)
454  path = unquote(path)
455  dirs = path.split('/')
456  dirs, file = dirs[:-1], dirs[-1]
457  if dirs and not dirs[0]: dirs = dirs[1:]
458  if dirs and not dirs[0]: dirs[0] = '/'
459  key = user, host, port, '/'.join(dirs)
460  # XXX thread unsafe!
461  if len(self.ftpcache) > MAXFTPCACHE:
462  # Prune the cache, rather arbitrarily
463  for k in self.ftpcache.keys():
464  if k != key:
465  v = self.ftpcache[k]
466  del self.ftpcache[k]
467  v.close()
468  try:
469  if not self.ftpcache.has_key(key):
470  self.ftpcache[key] = \
471  ftpwrapper(user, passwd, host, port, dirs)
472  if not file: type = 'D'
473  else: type = 'I'
474  for attr in attrs:
475  attr, value = splitvalue(attr)
476  if attr.lower() == 'type' and \
477  value in ('a', 'A', 'i', 'I', 'd', 'D'):
478  type = value.upper()
479  (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
480  mtype = mimetypes.guess_type("ftp:" + url)[0]
481  headers = ""
482  if mtype:
483  headers += "Content-Type: %s\n" % mtype
484  if retrlen is not None and retrlen >= 0:
485  headers += "Content-Length: %d\n" % retrlen
486  headers = mimetools.Message(StringIO.StringIO(headers))
487  return addinfourl(fp, headers, "ftp:" + url)
488  except ftperrors(), msg:
489  raise IOError, ('ftp error', msg), sys.exc_info()[2]
490 
491  def open_data(self, url, data=None):
492  """Use "data" URL."""
493  # ignore POSTed data
494  #
495  # syntax of data URLs:
496  # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
497  # mediatype := [ type "/" subtype ] *( ";" parameter )
498  # data := *urlchar
499  # parameter := attribute "=" value
500  import StringIO, mimetools, time
501  try:
502  [type, data] = url.split(',', 1)
503  except ValueError:
504  raise IOError, ('data error', 'bad data URL')
505  if not type:
506  type = 'text/plain;charset=US-ASCII'
507  semi = type.rfind(';')
508  if semi >= 0 and '=' not in type[semi:]:
509  encoding = type[semi+1:]
510  type = type[:semi]
511  else:
512  encoding = ''
513  msg = []
514  msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
515  time.gmtime(time.time())))
516  msg.append('Content-type: %s' % type)
517  if encoding == 'base64':
518  import base64
519  data = base64.decodestring(data)
520  else:
521  data = unquote(data)
522  msg.append('Content-length: %d' % len(data))
523  msg.append('')
524  msg.append(data)
525  msg = '\n'.join(msg)
526  f = StringIO.StringIO(msg)
527  headers = mimetools.Message(f, 0)
528  f.fileno = None # needed for addinfourl
529  return addinfourl(f, headers, url)
530 
531 
533  """Derived class with handlers for errors we can handle (perhaps)."""
534 
535  def __init__(self, *args):
536  apply(URLopener.__init__, (self,) + args)
537  self.auth_cache = {}
538  self.tries = 0
539  self.maxtries = 10
540 
541  def http_error_default(self, url, fp, errcode, errmsg, headers):
542  """Default error handling -- don't raise an exception."""
543  return addinfourl(fp, headers, "http:" + url)
544 
545  def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
546  """Error 302 -- relocated (temporarily)."""
547  self.tries += 1
548  if self.maxtries and self.tries >= self.maxtries:
549  if hasattr(self, "http_error_500"):
550  meth = self.http_error_500
551  else:
552  meth = self.http_error_default
553  self.tries = 0
554  return meth(url, fp, 500,
555  "Internal Server Error: Redirect Recursion", headers)
556  result = self.redirect_internal(url, fp, errcode, errmsg, headers,
557  data)
558  self.tries = 0
559  return result
560 
561  def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
562  if headers.has_key('location'):
563  newurl = headers['location']
564  elif headers.has_key('uri'):
565  newurl = headers['uri']
566  else:
567  return
568  void = fp.read()
569  fp.close()
570  # In case the server sent a relative URL, join with original:
571  newurl = basejoin(self.type + ":" + url, newurl)
572  if data is None:
573  return self.open(newurl)
574  else:
575  return self.open(newurl, data)
576 
577  def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
578  """Error 301 -- also relocated (permanently)."""
579  return self.http_error_302(url, fp, errcode, errmsg, headers, data)
580 
581  def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
582  """Error 401 -- authentication required.
583  See this URL for a description of the basic authentication scheme:
584  http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
585  if not headers.has_key('www-authenticate'):
586  URLopener.http_error_default(self, url, fp,
587  errcode, errmsg, headers)
588  stuff = headers['www-authenticate']
589  import re
590  match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
591  if not match:
592  URLopener.http_error_default(self, url, fp,
593  errcode, errmsg, headers)
594  scheme, realm = match.groups()
595  if scheme.lower() != 'basic':
596  URLopener.http_error_default(self, url, fp,
597  errcode, errmsg, headers)
598  name = 'retry_' + self.type + '_basic_auth'
599  if data is None:
600  return getattr(self,name)(url, realm)
601  else:
602  return getattr(self,name)(url, realm, data)
603 
604  def retry_http_basic_auth(self, url, realm, data=None):
605  host, selector = splithost(url)
606  i = host.find('@') + 1
607  host = host[i:]
608  user, passwd = self.get_user_passwd(host, realm, i)
609  if not (user or passwd): return None
610  host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
611  newurl = 'http://' + host + selector
612  if data is None:
613  return self.open(newurl)
614  else:
615  return self.open(newurl, data)
616 
617  def retry_https_basic_auth(self, url, realm, data=None):
618  host, selector = splithost(url)
619  i = host.find('@') + 1
620  host = host[i:]
621  user, passwd = self.get_user_passwd(host, realm, i)
622  if not (user or passwd): return None
623  host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
624  newurl = '//' + host + selector
625  return self.open_https(newurl, data)
626 
627  def get_user_passwd(self, host, realm, clear_cache = 0):
628  key = realm + '@' + host.lower()
629  if self.auth_cache.has_key(key):
630  if clear_cache:
631  del self.auth_cache[key]
632  else:
633  return self.auth_cache[key]
634  user, passwd = self.prompt_user_passwd(host, realm)
635  if user or passwd: self.auth_cache[key] = (user, passwd)
636  return user, passwd
637 
638  def prompt_user_passwd(self, host, realm):
639  """Override this in a GUI environment!"""
640  import getpass
641  try:
642  user = raw_input("Enter username for %s at %s: " % (realm,
643  host))
644  passwd = getpass.getpass("Enter password for %s in %s at %s: " %
645  (user, realm, host))
646  return user, passwd
647  except KeyboardInterrupt:
648  print
649  return None, None
650 
651 
652 # Utility functions
653 
654 _localhost = None
655 def localhost():
656  """Return the IP address of the magic hostname 'localhost'."""
657  global _localhost
658  if not _localhost:
659  _localhost = socket.gethostbyname('localhost')
660  return _localhost
661 
662 _thishost = None
663 def thishost():
664  """Return the IP address of the current host."""
665  global _thishost
666  if not _thishost:
667  _thishost = socket.gethostbyname(socket.gethostname())
668  return _thishost
669 
670 _ftperrors = None
671 def ftperrors():
672  """Return the set of errors raised by the FTP class."""
673  global _ftperrors
674  if not _ftperrors:
675  import ftplib
676  _ftperrors = ftplib.all_errors
677  return _ftperrors
678 
679 _noheaders = None
680 def noheaders():
681  """Return an empty mimetools.Message object."""
682  global _noheaders
683  if not _noheaders:
684  import mimetools
685  import StringIO
686  _noheaders = mimetools.Message(StringIO.StringIO(), 0)
687  _noheaders.fp.close() # Recycle file descriptor
688  return _noheaders
689 
690 
691 # Utility classes
692 
694  """Class used by open_ftp() for cache of open FTP connections."""
695 
696  def __init__(self, user, passwd, host, port, dirs):
697  self.user = user
698  self.passwd = passwd
699  self.host = host
700  self.port = port
701  self.dirs = dirs
702  self.init()
703 
704  def init(self):
705  import ftplib
706  self.busy = 0
707  self.ftp = ftplib.FTP()
708  self.ftp.connect(self.host, self.port)
709  self.ftp.login(self.user, self.passwd)
710  for dir in self.dirs:
711  self.ftp.cwd(dir)
712 
713  def retrfile(self, file, type):
714  import ftplib
715  self.endtransfer()
716  if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
717  else: cmd = 'TYPE ' + type; isdir = 0
718  try:
719  self.ftp.voidcmd(cmd)
720  except ftplib.all_errors:
721  self.init()
722  self.ftp.voidcmd(cmd)
723  conn = None
724  if file and not isdir:
725  # Use nlst to see if the file exists at all
726  try:
727  self.ftp.nlst(file)
728  except ftplib.error_perm, reason:
729  raise IOError, ('ftp error', reason), sys.exc_info()[2]
730  # Restore the transfer mode!
731  self.ftp.voidcmd(cmd)
732  # Try to retrieve as a file
733  try:
734  cmd = 'RETR ' + file
735  conn = self.ftp.ntransfercmd(cmd)
736  except ftplib.error_perm, reason:
737  if str(reason)[:3] != '550':
738  raise IOError, ('ftp error', reason), sys.exc_info()[2]
739  if not conn:
740  # Set transfer mode to ASCII!
741  self.ftp.voidcmd('TYPE A')
742  # Try a directory listing
743  if file: cmd = 'LIST ' + file
744  else: cmd = 'LIST'
745  conn = self.ftp.ntransfercmd(cmd)
746  self.busy = 1
747  # Pass back both a suitably decorated object and a retrieval length
748  return (addclosehook(conn[0].makefile('rb'),
749  self.endtransfer), conn[1])
750  def endtransfer(self):
751  if not self.busy:
752  return
753  self.busy = 0
754  try:
755  self.ftp.voidresp()
756  except ftperrors():
757  pass
758 
759  def close(self):
760  self.endtransfer()
761  try:
762  self.ftp.close()
763  except ftperrors():
764  pass
765 
766 class addbase:
767  """Base class for addinfo and addclosehook."""
768 
769  def __init__(self, fp):
770  self.fp = fp
771  self.read = self.fp.read
772  self.readline = self.fp.readline
773  if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
774  if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno
775 
776  def __repr__(self):
777  return '<%s at %s whose fp = %s>' % (self.__class__.__name__,
778  `id(self)`, `self.fp`)
779 
780  def close(self):
781  self.read = None
782  self.readline = None
783  self.readlines = None
784  self.fileno = None
785  if self.fp: self.fp.close()
786  self.fp = None
787 
789  """Class to add a close hook to an open file."""
790 
791  def __init__(self, fp, closehook, *hookargs):
792  addbase.__init__(self, fp)
793  self.closehook = closehook
794  self.hookargs = hookargs
795 
796  def close(self):
797  addbase.close(self)
798  if self.closehook:
799  apply(self.closehook, self.hookargs)
800  self.closehook = None
801  self.hookargs = None
802 
804  """class to add an info() method to an open file."""
805 
806  def __init__(self, fp, headers):
807  addbase.__init__(self, fp)
808  self.headers = headers
809 
810  def info(self):
811  return self.headers
812 
814  """class to add info() and geturl() methods to an open file."""
815 
816  def __init__(self, fp, headers, url):
817  addbase.__init__(self, fp)
818  self.headers = headers
819  self.url = url
820 
821  def info(self):
822  return self.headers
823 
824  def geturl(self):
825  return self.url
826 
827 
828 def basejoin(base, url):
829  """Utility to combine a URL with a base URL to form a new URL."""
830  type, path = splittype(url)
831  if type:
832  # if url is complete (i.e., it contains a type), return it
833  return url
834  host, path = splithost(path)
835  type, basepath = splittype(base) # inherit type from base
836  if host:
837  # if url contains host, just inherit type
838  if type: return type + '://' + host + path
839  else:
840  # no type inherited, so url must have started with //
841  # just return it
842  return url
843  host, basepath = splithost(basepath) # inherit host
844  basepath, basetag = splittag(basepath) # remove extraneous cruft
845  basepath, basequery = splitquery(basepath) # idem
846  if path[:1] != '/':
847  # non-absolute path name
848  if path[:1] in ('#', '?'):
849  # path is just a tag or query, attach to basepath
850  i = len(basepath)
851  else:
852  # else replace last component
853  i = basepath.rfind('/')
854  if i < 0:
855  # basepath not absolute
856  if host:
857  # host present, make absolute
858  basepath = '/'
859  else:
860  # else keep non-absolute
861  basepath = ''
862  else:
863  # remove last file component
864  basepath = basepath[:i+1]
865  # Interpret ../ (important because of symlinks)
866  while basepath and path[:3] == '../':
867  path = path[3:]
868  i = basepath[:-1].rfind('/')
869  if i > 0:
870  basepath = basepath[:i+1]
871  elif i == 0:
872  basepath = '/'
873  break
874  else:
875  basepath = ''
876 
877  path = basepath + path
878  if host and path and path[0] != '/':
879  path = '/' + path
880  if type and host: return type + '://' + host + path
881  elif type: return type + ':' + path
882  elif host: return '//' + host + path # don't know what this means
883  else: return path
884 
885 
886 # Utilities to parse URLs (most of these return None for missing parts):
887 # unwrap('<URL:type://host/path>') --> 'type://host/path'
888 # splittype('type:opaquestring') --> 'type', 'opaquestring'
889 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
890 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
891 # splitpasswd('user:passwd') -> 'user', 'passwd'
892 # splitport('host:port') --> 'host', 'port'
893 # splitquery('/path?query') --> '/path', 'query'
894 # splittag('/path#tag') --> '/path', 'tag'
895 # splitattr('/path;attr1=value1;attr2=value2;...') ->
896 # '/path', ['attr1=value1', 'attr2=value2', ...]
897 # splitvalue('attr=value') --> 'attr', 'value'
898 # splitgophertype('/Xselector') --> 'X', 'selector'
899 # unquote('abc%20def') -> 'abc def'
900 # quote('abc def') -> 'abc%20def')
901 
902 def toBytes(url):
903  """toBytes(u"URL") --> 'URL'."""
904  # Most URL schemes require ASCII. If that changes, the conversion
905  # can be relaxed
906  if type(url) is types.UnicodeType:
907  try:
908  url = url.encode("ASCII")
909  except UnicodeError:
910  raise UnicodeError("URL " + repr(url) +
911  " contains non-ASCII characters")
912  return url
913 
914 def unwrap(url):
915  """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
916  url = url.strip()
917  if url[:1] == '<' and url[-1:] == '>':
918  url = url[1:-1].strip()
919  if url[:4] == 'URL:': url = url[4:].strip()
920  return url
921 
922 _typeprog = None
923 def splittype(url):
924  """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
925  global _typeprog
926  if _typeprog is None:
927  import re
928  _typeprog = re.compile('^([^/:]+):')
929 
930  match = _typeprog.match(url)
931  if match:
932  scheme = match.group(1)
933  return scheme.lower(), url[len(scheme) + 1:]
934  return None, url
935 
936 _hostprog = None
937 def splithost(url):
938  """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
939  global _hostprog
940  if _hostprog is None:
941  import re
942  _hostprog = re.compile('^//([^/]*)(.*)$')
943 
944  match = _hostprog.match(url)
945  if match: return match.group(1, 2)
946  return None, url
947 
948 _userprog = None
949 def splituser(host):
950  """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
951  global _userprog
952  if _userprog is None:
953  import re
954  _userprog = re.compile('^([^@]*)@(.*)$')
955 
956  match = _userprog.match(host)
957  if match: return map(unquote, match.group(1, 2))
958  return None, host
959 
960 _passwdprog = None
961 def splitpasswd(user):
962  """splitpasswd('user:passwd') -> 'user', 'passwd'."""
963  global _passwdprog
964  if _passwdprog is None:
965  import re
966  _passwdprog = re.compile('^([^:]*):(.*)$')
967 
968  match = _passwdprog.match(user)
969  if match: return match.group(1, 2)
970  return user, None
971 
972 # splittag('/path#tag') --> '/path', 'tag'
973 _portprog = None
974 def splitport(host):
975  """splitport('host:port') --> 'host', 'port'."""
976  global _portprog
977  if _portprog is None:
978  import re
979  _portprog = re.compile('^(.*):([0-9]+)$')
980 
981  match = _portprog.match(host)
982  if match: return match.group(1, 2)
983  return host, None
984 
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None for the port if ':' is present but not followed by a
    valid number.
    """
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port:
                # ':' with nothing after it counts as an invalid port.
                # Call-form raise is identical in behavior to the old
                # 'raise ValueError, "no digits"' but is portable syntax.
                raise ValueError("no digits")
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
1006 
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        # Split on the last '?' so the query part itself contains none.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    m = _queryprog.match(url)
    if m is None:
        return url, None
    return m.group(1, 2)
1018 
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        # Split on the last '#' so the tag itself contains none.
        _tagprog = re.compile('^(.*)#([^#]*)$')

    m = _tagprog.match(url)
    if m is None:
        return url, None
    return m.group(1, 2)
1030 
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
    '/path', ['attr1=value1', 'attr2=value2', ...]."""
    # First ';'-separated piece is the path; the rest are attributes.
    pieces = url.split(';')
    return pieces[0], pieces[1:]
1036 
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        # Split at the first '='; the value may itself contain '='.
        _valueprog = re.compile('^([^=]*)=(.*)$')

    m = _valueprog.match(attr)
    if m is None:
        return attr, None
    return m.group(1, 2)
1048 
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    # A gopher selector is '/' followed by a one-character type code;
    # require at least one character after the slash.
    if selector.startswith('/') and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector
1054 
def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    # Split on '%': every piece after the first starts where an escape
    # was; decode its first two characters as a hex byte when possible.
    pieces = s.split('%')
    out = [pieces[0]]
    for piece in pieces[1:]:
        if piece[1:2]:
            try:
                out.append(chr(int(piece[:2], 16)) + piece[2:])
            except ValueError:
                # Not two hex digits: keep the '%' literally.
                out.append('%' + piece)
        else:
            # Fewer than two characters follow the '%'.
            out.append('%' + piece)
    return "".join(out)
1073 
1075  """unquote('%7e/abc+def') -> '~/abc def'"""
1076  if '+' in s:
1077  # replace '+' with ' '
1078  s = ' '.join(s.split('+'))
1079  return unquote(s)
1080 
1081 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1082  'abcdefghijklmnopqrstuvwxyz'
1083  '0123456789' '_.-')
1084 
1085 _fast_safe_test = always_safe + '/'
1086 _fast_safe = None
1087 
1088 def _fast_quote(s):
1089  global _fast_safe
1090  if _fast_safe is None:
1091  _fast_safe = {}
1092  for c in _fast_safe_test:
1093  _fast_safe[c] = c
1094  res = list(s)
1095  for i in range(len(res)):
1096  c = res[i]
1097  if not _fast_safe.has_key(c):
1098  res[i] = '%%%02X' % ord(c)
1099  return ''.join(res)
1100 
def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters:

        reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                   "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default this function is meant for quoting the path section of
    a URL, so it leaves '/' unescaped: slashes there are already being
    used as reserved characters.
    """
    safe = always_safe + safe
    if _fast_safe_test == safe:
        # Default safe set: take the cached-table fast path.
        return _fast_quote(s)
    chars = list(s)
    for i, ch in enumerate(chars):
        if ch not in safe:
            chars[i] = '%%%02X' % ord(ch)
    return ''.join(chars)
1131 
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' not in s:
        return quote(s, safe)
    # Quote each space-separated word, then rejoin with '+'.
    return '+'.join([quote(word, safe) for word in s.split(' ')])
1141 
def urlencode(query,doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            x = len(query)
            # non-empty strings will fail this
            if len(query) and type(query[0]) != types.TupleType:
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            # Re-raise with the original traceback so the caller sees where
            # len()/indexing actually failed (Python 2 three-argument raise).
            ty,va,tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior: every value is stringified and quoted
        # as a single parameter, sequences included.
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if type(v) == types.StringType:
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif type(v) == types.UnicodeType:
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII","replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence: one k=elt pair per element
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
1205 
1206 # Proxy handling
1208  """Return a dictionary of scheme -> proxy server URL mappings.
1209 
1210  Scan the environment for variables named <scheme>_proxy;
1211  this seems to be the standard convention. If you need a
1212  different way, you can pass a proxies dictionary to the
1213  [Fancy]URLopener constructor.
1214 
1215  """
1216  proxies = {}
1217  for name, value in os.environ.items():
1218  name = name.lower()
1219  if value and name[-6:] == '_proxy':
1220  proxies[name[:-6]] = value
1221  return proxies
1222 
if os.name == 'mac':
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            # No Internet Config support available.
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if config.has_key('UseHTTPProxy') and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(host):
        # No per-host bypass support on the mac; never bypass the proxy.
        # (The 'def' header was lost in the doc extraction and has been
        # restored to match the nt/else branches below.)
        return 0
1257 elif os.name == 'nt':
1259  """Return a dictionary of scheme -> proxy server URL mappings.
1260 
1261  Win32 uses the registry to store proxies.
1262 
1263  """
1264  proxies = {}
1265  try:
1266  import _winreg
1267  except ImportError:
1268  # Std module, so should be around - but you never know!
1269  return proxies
1270  try:
1271  internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1272  r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1273  proxyEnable = _winreg.QueryValueEx(internetSettings,
1274  'ProxyEnable')[0]
1275  if proxyEnable:
1276  # Returned as Unicode but problems if not converted to ASCII
1277  proxyServer = str(_winreg.QueryValueEx(internetSettings,
1278  'ProxyServer')[0])
1279  if '=' in proxyServer:
1280  # Per-protocol settings
1281  for p in proxyServer.split(';'):
1282  protocol, address = p.split('=', 1)
1283  # See if address has a type:// prefix
1284  import re
1285  if not re.match('^([^/:]+)://', address):
1286  address = '%s://%s' % (protocol, address)
1287  proxies[protocol] = address
1288  else:
1289  # Use one setting for all protocols
1290  if proxyServer[:5] == 'http:':
1291  proxies['http'] = proxyServer
1292  else:
1293  proxies['http'] = 'http://%s' % proxyServer
1294  proxies['ftp'] = 'ftp://%s' % proxyServer
1295  internetSettings.Close()
1296  except (WindowsError, ValueError, TypeError):
1297  # Either registry key not found etc, or the value in an
1298  # unexpected format.
1299  # proxies already set up to be empty so nothing to do
1300  pass
1301  return proxies
1302 
1303  def getproxies():
1304  """Return a dictionary of scheme -> proxy server URL mappings.
1305 
1306  Returns settings gathered from the environment, if specified,
1307  or the registry.
1308 
1309  """
1311 
1312  def proxy_bypass(host):
1313  try:
1314  import _winreg
1315  import re
1316  import socket
1317  except ImportError:
1318  # Std modules, so should be around - but you never know!
1319  return 0
1320  try:
1321  internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1322  r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1323  proxyEnable = _winreg.QueryValueEx(internetSettings,
1324  'ProxyEnable')[0]
1325  proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1326  'ProxyOverride')[0])
1327  # ^^^^ Returned as Unicode but problems if not converted to ASCII
1328  except WindowsError:
1329  return 0
1330  if not proxyEnable or not proxyOverride:
1331  return 0
1332  # try to make a host list from name and IP address.
1333  host = [host]
1334  try:
1335  addr = socket.gethostbyname(host[0])
1336  if addr != host:
1337  host.append(addr)
1338  except socket.error:
1339  pass
1340  # make a check value list from the registry entry: replace the
1341  # '<local>' string by the localhost entry and the corresponding
1342  # canonical entry.
1343  proxyOverride = proxyOverride.split(';')
1344  i = 0
1345  while i < len(proxyOverride):
1346  if proxyOverride[i] == '<local>':
1347  proxyOverride[i:i+1] = ['localhost',
1348  '127.0.0.1',
1349  socket.gethostname(),
1350  socket.gethostbyname(
1351  socket.gethostname())]
1352  i += 1
1353  # print proxyOverride
1354  # now check if we match one of the registry values.
1355  for test in proxyOverride:
1356  test = test.replace(".", r"\.") # mask dots
1357  test = test.replace("*", r".*") # change glob sequence
1358  test = test.replace("?", r".") # change glob char
1359  for val in host:
1360  # print "%s <--> %s" %( test, val )
1361  if re.match(test, val, re.I):
1362  return 1
1363  return 0
1364 
else:
    # By default use environment variables
    # (<scheme>_proxy, as read by getproxies_environment above).
    getproxies = getproxies_environment

    def proxy_bypass(host):
        # No per-host bypass support on this platform; never bypass.
        return 0
1371 
1372 # Test and time quote() and unquote()
def test1():
    # Round-trip all 256 byte values (repeated 4x) through quote() and
    # unquote(); report a mismatch and the elapsed time.
    import time
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
        print `s`
        print `qs`
        print `uqs`
    print round(t1 - t0, 3), 'sec'
1388 
1389 
def reporthook(blocknum, blocksize, totalsize):
    # Progress callback for urlretrieve() during remote transfers.
    # Parenthesized print form: a single expression prints identically
    # under the Python 2 print statement and also works under Python 3.
    print("Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize))
1394 
1395 # Test program
def test(args=[]):
    # Fetch each URL with urlretrieve() and dump its headers and body.
    # NOTE(review): the mutable default list is rebound when empty, so it
    # is not shared across calls -- but a caller-supplied non-empty list
    # IS mutated by the https append below.
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/pub/python/README',
##          'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
    if hasattr(URLopener, "open_https"):
        args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                # Strip carriage returns for cleaner terminal output.
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
            print '-'*40
    finally:
        # Always remove temporary files left behind by urlretrieve().
        urlcleanup()
1428 
def main():
    # Command-line driver: -t runs the self-test (give it twice to also
    # run the quote/unquote timing test); otherwise fetch and print URLs.
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            # -tt: also time the quote()/unquote() round-trip.
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            # Trailing comma suppresses print's newline (Python 2).
            print urlopen(url).read(),
1455 
# Run the command-line test driver when executed as a script.
if __name__ == '__main__':
    main()