Vega Strike Python Modules doc 0.5.1
Documentation of the "Modules" folder of Vega Strike
urlparse.py
1 """Parse (absolute and relative) URLs.
2 
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4 UC Irvine, June 1995.
5 """
6 
7 __all__ = ["urlparse", "urlunparse", "urljoin"]
8 
# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    global _parse_cache
    _parse_cache = {}


def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return scheme, netloc, url, params, query, fragment

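# Usage sketch for urlparse: the host and path in this example are made up
# for illustration and are not used elsewhere in this module.
#
#   >>> urlparse('http://host/path;type=a?x=1#frag')
#   ('http', 'host', '/path', 'type=a', 'x=1', 'frag')
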
def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def urlsplit(url, scheme='', allow_fragments=1):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                i = url.find('/', 2)
                if i < 0:
                    i = url.find('#')
                    if i < 0:
                        i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            tuple = scheme, netloc, url, query, fragment
            _parse_cache[key] = tuple
            return tuple
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    tuple = scheme, netloc, url, query, fragment
    _parse_cache[key] = tuple
    return tuple

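# Usage sketch for urlsplit: same made-up URL as in the urlparse example,
# but without the params split, so the path keeps any ';' portion.
#
#   >>> urlsplit('http://host/path;type=a?x=1#frag')
#   ('http', 'host', '/path;type=a', 'x=1', 'frag')
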
def urlunparse((scheme, netloc, url, params, query, fragment)):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit((scheme, netloc, url, query, fragment)):
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

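# Usage sketch: urlunparse/urlunsplit invert the parse step; with the same
# made-up URL as above, the round trip reproduces the original string.
#
#   >>> urlunparse(('http', 'host', '/path', 'type=a', 'x=1', 'frag'))
#   'http://host/path;type=a?x=1#frag'
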
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

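# Usage sketch for urljoin, taken from the test table below: a relative
# reference is resolved against the base URL http://a/b/c/d.
#
#   >>> urljoin('http://a/b/c/d', '../g')
#   'http://a/b/g'
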
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''


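# Usage sketch for urldefrag; the host and fragment here are made-up examples.
#
#   >>> urldefrag('http://host/path#frag')
#   ('http://host/path', 'frag')
#   >>> urldefrag('http://host/path')
#   ('http://host/path', '')
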
209 test_input = """
210  http://a/b/c/d
211 
212  g:h = <URL:g:h>
213  http:g = <URL:http://a/b/c/g>
214  http: = <URL:http://a/b/c/d>
215  g = <URL:http://a/b/c/g>
216  ./g = <URL:http://a/b/c/g>
217  g/ = <URL:http://a/b/c/g/>
218  /g = <URL:http://a/g>
219  //g = <URL:http://g>
220  ?y = <URL:http://a/b/c/d?y>
221  g?y = <URL:http://a/b/c/g?y>
222  g?y/./x = <URL:http://a/b/c/g?y/./x>
223  . = <URL:http://a/b/c/>
224  ./ = <URL:http://a/b/c/>
225  .. = <URL:http://a/b/>
226  ../ = <URL:http://a/b/>
227  ../g = <URL:http://a/b/g>
228  ../.. = <URL:http://a/>
229  ../../g = <URL:http://a/g>
230  ../../../g = <URL:http://a/../g>
231  ./../g = <URL:http://a/b/g>
232  ./g/. = <URL:http://a/b/c/g/>
233  /./g = <URL:http://a/./g>
234  g/./h = <URL:http://a/b/c/g/h>
235  g/../h = <URL:http://a/b/c/h>
236  http:g = <URL:http://a/b/c/g>
237  http: = <URL:http://a/b/c/d>
238  http:?y = <URL:http://a/b/c/d?y>
239  http:g?y = <URL:http://a/b/c/g?y>
240  http:g?y/./x = <URL:http://a/b/c/g?y/./x>
241 """
242 # XXX The result for //g is actually http://g/; is this a problem?
243 
def test():
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'

if __name__ == '__main__':
    test()