1 | """Python module for web browsing and scraping.
|
---|
2 |
|
---|
3 | Done:
|
---|
4 | - navigate to absolute and relative URLs
|
---|
5 | - follow links in page or region
|
---|
6 | - find first or all occurrences of string or RE in page or region
|
---|
7 | - find first, last, next, previous, or all tags with given name/attributes
|
---|
8 | - find first, last, next, previous, enclosing, or all elements with given
|
---|
9 | name/attributes/content
|
---|
10 | - set form fields
|
---|
11 | - submit forms
|
---|
12 | - strip tags from arbitrary strings of HTML
|
---|
13 |
|
---|
14 | Todo:
|
---|
15 | - cookie-handling is dumb (sends all cookies to all sites)
|
---|
16 | - handle CDATA and RCDATA marked sections
|
---|
17 | - support for submitting forms with file upload
|
---|
18 | - use Regions in striptags instead of duplicating work
|
---|
19 | - map of enders
|
---|
20 | """
|
---|
21 |
|
---|
22 | __author__ = 'Ka-Ping Yee'
|
---|
23 | __date__ = '2005-03-29'
|
---|
24 | __version__ = '$Revision: 1.16 $'
|
---|
25 |
|
---|
26 | import os, socket, re, marshal, subprocess
|
---|
27 | from tempfile import gettempdir
|
---|
28 | from urlparse import urljoin, urlsplit
|
---|
29 | from urllib import urlencode
|
---|
30 |
|
---|
def connect(server, port):
    """Return a TCP socket connected to the given server and port.

    server is a host name or IPv4 address and port is an integer.  Any
    error raised by the connection attempt propagates to the caller; the
    half-open socket is closed first so that a failed call does not leave
    a descriptor behind.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect((server, port))
    except BaseException:
        sock.close()
        raise
    return sock
36 |
|
---|
def receive(sock):
    """Read from a socket until the peer closes it and return all the data.

    The data is read in 4096-byte chunks and returned as one byte string
    (which is a plain str on Python 2).  An empty result means the peer
    closed the connection without sending anything.
    """
    chunks = []
    while True:
        chunk = sock.recv(4096)
        if not chunk:
            # recv() returns an empty string only once the peer has closed.
            return b''.join(chunks)
        chunks.append(chunk)
44 |
|
---|
def request(host, method, path, headers, entity=None):
    """Make an HTTP/1.0 request and return (status, message, headers, document).

    host is the network location from the URL; it may carry an explicit
    port ("example.com:8080"), otherwise port 80 is used.  headers is a
    mapping of header names to values; each name is sent with every
    dash-separated part capitalized.  entity, if given, is sent as the
    request body.

    The reply is parsed with splitreply().  If it cannot be parsed, the
    result is (0, '', {}, data) where data is the raw reply.
    """
    server, port = host, 80
    if ':' in host:
        name, _, portnum = host.rpartition(':')
        if portnum.isdigit():
            server, port = name, int(portnum)

    sock = connect(server, port)
    try:
        lines = [method + ' ' + path + ' HTTP/1.0']
        for name in headers:
            capname = '-'.join([part.capitalize() for part in name.split('-')])
            lines.append(capname + ': ' + str(headers[name]))
        message = '\r\n'.join(lines) + '\r\n\r\n'
        if entity:
            message += entity
        sock.sendall(message)
        data = receive(sock)
    finally:
        # The exchange is one-shot, so the socket is never reused.
        sock.close()

    try:
        return splitreply(data)
    except Exception:
        # Best effort: hand back the raw reply with a status of 0.
        return (0, '', {}, data)
59 |
|
---|
def splitreply(reply):
    """Split an HTTP response into (status, message, headers, document).

    headers is a list of (lowercased name, value) pairs in the order they
    were received.  When the reply contains several response heads in a
    row (as "curl -i -L" prints for each redirect hop), the status and
    message come from the last head, the headers of all heads are
    accumulated, and the document is whatever follows the last head.

    Header lines are split at the first colon and surrounding whitespace
    is dropped, so "Name:value" is accepted as well as "Name: value".
    Lines without a colon are ignored.  A malformed status line raises
    ValueError or IndexError.
    """
    if '\r\n\r\n' in reply:
        head, document = reply.split('\r\n\r\n', 1)
    else:
        head, document = reply, ''
    headers = []
    while True:
        lines = head.split('\r\n')
        response = lines[0]
        for line in lines[1:]:
            name, colon, value = line.partition(':')
            if colon and name.strip():
                headers.append((name.strip().lower(), value.strip()))
        parts = response.split()
        status = int(parts[1])
        message = ' '.join(parts[2:])
        # Another response head directly after this one: parse it as well.
        if document.startswith('HTTP/1.') and '\r\n\r\n' in document:
            head, document = document.split('\r\n\r\n', 1)
        else:
            return status, message, headers, document
81 |
|
---|
def shellquote(text):
    """Quote a string so that sh reads it as a single literal word.

    The text is wrapped in single quotes; each embedded single quote is
    written as '\\'' (close the quoted run, an escaped quote, reopen).
    """
    escaped = text.replace("'", "'\\''")
    return "'" + escaped + "'"
85 |
|
---|
def curl(url, entity=None, follow=1, cookies=(), referrer=None):
    """Invoke the curl program to perform an HTTP request.

    url      - the URL to fetch
    entity   - request body; a string is sent as is, anything else is
               form-encoded with urlencode(); a body makes curl POST
    follow   - if true, let curl follow redirects (-L)
    cookies  - sequence of "name=value" strings sent in one Cookie header
    referrer - value for the Referer header, if any

    Returns what splitreply() makes of curl's output: (status, message,
    headers, document).
    """
    command = ['curl', '-s', '-i']
    if referrer:
        command += ['-e', referrer]
    if entity:
        if not isinstance(entity, str):
            entity = urlencode(entity, doseq=1)
        # NOTE(review): curl's -d reads from a file when the data starts
        # with "@"; callers passing raw strings should be aware of that.
        command += ['-d', entity]
    if follow:
        command.append('-L')
    # When follow is false nothing is added: curl only follows redirects
    # if asked to.  (The "-Z 0" passed here previously is not a redirect
    # switch; in current curl -Z means --parallel and "0" was then taken
    # as a second URL.)
    if cookies:
        command += ['-b', '; '.join(cookies)]
    command.append(url)
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    # communicate() reads all output and waits for the child to exit.
    reply = process.communicate()[0]
    return splitreply(reply)
104 |
|
---|
def fetch(url, entity=None, follow=1):
    """Fetch one document in a one-shot session.

    A fresh Session is created for the call and discarded afterwards, so
    no cookies or history carry over.  Returns the result of
    Session.fetch(): (status, message, headers, document).
    """
    session = Session()
    return session.fetch(url, entity, follow)
108 |
|
---|
class ScrapeError(Exception):
    """Base class for the errors raised by this module."""


class HTTPError(ScrapeError):
    """Raised by Session.go() when the final status is not 200.

    The exception arguments are (status, message) of the failed request.
    """


# Default for the referrer arguments meaning "use the current location".
# A dedicated sentinel is needed because None means "send no referrer".
LAST_URL = object()


class Session:
    """A Web-browsing session.

    Exposed attributes:

        agent - set or get the User-Agent string
        location - get the current (i.e. last successfully fetched) URL
        status - get the status code of the last successful request
        message - get the status message of the last successful request
        headers - get the dictionary of headers from the last successful request
        document - get the document returned by the last successful request
        region - get a Region spanning the entire document
    """

    def __init__(self, agent=None):
        """Start a session with no cookies, no history and no current page."""
        self.cookies = []
        self.agent = agent
        self.location = self.status = self.message = None
        self.headers = self.document = self.region = None
        self.history = []

    def fetch(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Fetch an absolute URL and return (status, message, headers, document).

        http URLs are requested directly with request(); https URLs are
        delegated to curl().  entity, if given, is sent as the body of a
        POST (non-string entities are form-encoded).  referrer defaults to
        the current location; pass None to send no Referer header.

        Cookies set by the response are remembered in self.cookies, and
        the remaining headers are returned as a dictionary.  If follow is
        true, 301 and 302 responses with a Location header are followed.
        self.location is updated to the URL being fetched.

        Raises ValueError for any other URL scheme; in that case the
        session is left untouched.
        """
        scheme, host, path, query, fragment = urlsplit(url)
        # Validate before touching any session state.
        if scheme not in ('http', 'https'):
            raise ValueError(scheme + ' not supported')
        if referrer is LAST_URL:
            referrer = self.location
        self.location = url

        if scheme == 'https':
            status, message, headers, document = \
                curl(url, entity, follow, self.cookies, referrer)
        else:
            # An empty path ("http://host") must still be requested as "/".
            target = path or '/'
            if query:
                target += '?' + query
            outgoing = {'host': host, 'accept': '*/*'}
            if referrer:
                outgoing['referer'] = referrer
            if self.agent:
                outgoing['user-agent'] = self.agent
            if self.cookies:
                outgoing['cookie'] = '; '.join(self.cookies)
            if entity:
                if not isinstance(entity, str):
                    entity = urlencode(entity, doseq=1)
                outgoing['content-type'] = 'application/x-www-form-urlencoded'
                outgoing['content-length'] = len(entity)
            if entity:
                method = 'POST'
            else:
                method = 'GET'
            status, message, headers, document = \
                request(host, method, target, outgoing, entity)

        headerdict = {}
        for name, value in headers:
            if name == 'set-cookie':
                # Keep only the "name=value" part, dropping the attributes.
                cookie = value.split(';')[0]
                if cookie not in self.cookies:
                    self.cookies.append(cookie)
            else:
                headerdict[name] = value
        if follow and status in [301, 302] and 'location' in headerdict:
            return self.fetch(urljoin(url, headerdict['location']))
        return status, message, headerdict, document

    def go(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Navigate to a given URL.  If the URL is relative, it is resolved
        with respect to the current location.  If the document is successfully
        fetched (status 200), the previous page is pushed onto the history and
        a Region spanning the entire new document is returned.

        Otherwise HTTPError(status, message) is raised for the failed
        request and the session keeps its previous location and page.
        """
        historyentry = (self.location, self.status, self.message,
                        self.headers, self.document, self.region)
        previous = self.location
        if previous:
            url = urljoin(previous, url)
        try:
            results = self.fetch(url, entity, follow, referrer)
        except BaseException:
            # fetch() may already have moved location; undo that.
            self.location = previous
            raise
        if results[0] != 200:
            self.location = previous
            raise HTTPError(results[0], results[1])
        self.history.append(historyentry)
        self.status, self.message, self.headers, self.document = results
        self.region = Region(self.document)
        return self.region

    def back(self):
        """Return to the previous page and return its URL.

        Raises IndexError if the history is empty.
        """
        (self.location, self.status, self.message,
         self.headers, self.document, self.region) = self.history.pop()
        return self.location

    def follow(self, anchor, region=None):
        """Follow the first link with the given anchor text.  The anchor may
        be given as a string or a compiled RE.  If a region is given, the
        link is sought within that region instead of the whole document.
        Raises ScrapeError if no such link exists or it has no href."""
        link = (region or self.region).first('a', content=anchor)
        if not link:
            raise ScrapeError('link %r not found' % anchor)
        if not link['href']:
            raise ScrapeError('link %r has no href' % link)
        return self.go(link['href'])

    def submit(self, form, button=None, **params):
        """Submit a form, optionally by clicking a given button.

        form is the Region of a form element.  The form's default
        parameters are used, then the button's name and value (if the
        button has a name), then any keyword arguments, which override
        the defaults.  Raises ScrapeError if the region is not a form or
        its method is neither GET nor POST.
        """
        if form.tagname != 'form':
            raise ScrapeError('%r is not a form' % form)
        p = form.params
        if button and button['name']:
            p[button['name']] = button['value']
        p.update(params)
        method = form['method'].lower() or 'get'
        if method == 'post':
            return self.go(form['action'], p)
        elif method == 'get':
            return self.go(form['action'] + '?' + urlencode(p, doseq=1))
        else:
            raise ScrapeError('unknown form method %r' % method)
225 |
|
---|
# RE source for everything in a tag after its name: any run of quoted
# strings, "-- ... --" comments, single hyphens, and characters that are
# not a quote, a hyphen or ">".  A ">" inside quotes or inside a comment
# therefore does not end the tag.
tagcontent_re = r'''(('[^']*'|"[^"]*"|--([^-]|-[^-])*--|-(?!-)|[^'">-])*)'''


def tag_re(tagname_re):
    """Return RE source matching one whole tag whose name matches tagname_re.

    tagname_re should be a single parenthesized group so that group 1 of
    a match is the tag name and group 2 is the rest of the tag.
    """
    return '<' + tagname_re + tagcontent_re + '>'


# Any tag: "<?...>", "<!...>" (declarations and comments), start tags
# and end tags.
anytag_re = tag_re(r'(\?|!\w*|/?[a-zA-Z_:][\w:.-]*)')
tagpat = re.compile(anytag_re)
233 |
|
---|
def htmldec(text):
    """Decode HTML entities in the given text.

    Numeric character references (decimal "&#65;" and hexadecimal
    "&#x41;") are decoded with chr(); a reference that is unterminated,
    not a number, or outside the range chr() accepts is left as it is.
    Non-breaking spaces become ordinary spaces, and the named entities
    &nbsp; &lt; &gt; &quot; and &amp; are decoded.
    """
    chunks = text.split('&#')
    for i in range(1, len(chunks)):
        number, semicolon, rest = chunks[i].partition(';')
        char = None
        if semicolon:
            try:
                if number[:1] in ('x', 'X'):
                    char = chr(int(number[1:], 16))
                else:
                    char = chr(int(number))
            except (ValueError, OverflowError):
                char = None
        if char is None:
            # Not a usable reference: put back the "&#" that split() ate.
            chunks[i] = '&#' + chunks[i]
        else:
            chunks[i] = char + rest
    text = ''.join(chunks)
    text = text.replace('\xa0', ' ')
    text = text.replace('&nbsp;', ' ')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&quot;', '"')
    # &amp; goes last so that "&amp;lt;" decodes to "&lt;", not "<".
    text = text.replace('&amp;', '&')
    return text
248 |
|
---|
def htmlenc(text):
    """Use HTML entities to encode special characters in the given text.

    The characters & " < and > are replaced by &amp; &quot; &lt; and &gt;.
    """
    # The ampersand goes first so the entities added below are not
    # themselves encoded again.
    text = text.replace('&', '&amp;')
    text = text.replace('"', '&quot;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    return text
256 |
|
---|
def no_groups(re):
    """Return the given RE source with all capturing groups made non-capturing.

    Every "(" becomes "(?:", and groups that already started with "(?"
    (such as lookaheads) are then restored.  This is a plain text
    substitution: it does not recognize escaped parentheses or
    parentheses inside character classes.

    Note that the parameter shadows the re module inside this function.
    """
    return re.replace('(', '(?:').replace('(?:?', '(?')
259 |
|
---|
# Patterns used by striptags().  The splitters are built without capturing
# groups so that split() returns only the text between the tags.
tagsplitter = re.compile(no_groups(anytag_re))
# Tags that separate paragraphs (rendered as a blank line).
parasplitter = re.compile(no_groups(tag_re('(p|table|form)')), re.I)
# Tags that separate lines (rendered as a line break).
linesplitter = re.compile(no_groups(tag_re('(div|br|tr)')), re.I)
# Start of a script element, its end tag, and the end of a comment.
scriptpat = re.compile(r'<script\b', re.I)
endscriptpat = re.compile(r'</script[^>]*>', re.I)
endcommentpat = re.compile(r'--\s*>')
266 |
|
---|
def striptags(text):
    """Strip HTML tags from the given text, yielding line breaks for DIV,
    BR, or TR tags and blank lines for P, TABLE, or FORM tags.

    Script elements, comments and other "<!" declarations are removed
    together with their content.  If one of them is never terminated,
    everything after its start is dropped.  Entities in the remaining
    text are decoded, runs of whitespace are collapsed to single spaces,
    and more than one consecutive blank line is reduced to one.
    """
    # Drop each script element, from "<script" through its end tag.
    chunks = scriptpat.split(text)
    for i in range(1, len(chunks)):
        pieces = endscriptpat.split(chunks[i], 1)
        chunks[i] = pieces[1] if len(pieces) > 1 else ''
    text = ''.join(chunks)

    # Drop comments (through "-->") and other declarations (through ">").
    chunks = text.split('<!')
    for i in range(1, len(chunks)):
        if chunks[i].split('>', 1)[0].find('--') >= 0:
            pieces = endcommentpat.split(chunks[i], 1)
        else:
            pieces = chunks[i].split('>', 1)
        chunks[i] = pieces[1] if len(pieces) > 1 else ''
    text = ''.join(chunks)

    paragraphs = []
    for paragraph in parasplitter.split(text):
        lines = []
        for line in linesplitter.split(paragraph):
            line = ''.join(tagsplitter.split(line))
            line = htmldec(line)
            line = ' '.join(line.split())
            lines.append(line)
        paragraphs.append('\n'.join(lines))
    return re.sub('\n\n+', '\n\n', '\n\n'.join(paragraphs)).strip()
292 |
|
---|
# One attribute: a name, optionally followed by "=" and a value that is
# single-quoted, double-quoted, or a run of characters up to whitespace
# or ">".  Group 1 is the name and group 3 the value including quotes.
attr_re = r'''\s*([\w:.-]+)(\s*=\s*('[^']*'|"[^"]*"|[^\s>]*))?'''
attrpat = re.compile(attr_re)


def parseattrs(text):
    """Turn a string of name=value pairs into an attribute dictionary.

    Names are lowercased.  Surrounding quotes are removed from values and
    entities in them are decoded with htmldec().  An attribute given
    without a value maps to the empty string.  If a name occurs more than
    once, the last occurrence wins.
    """
    attrs = {}
    for match in attrpat.finditer(text):
        name, value = match.group(1), match.group(3) or ''
        if value[:1] in ("'", '"'):
            if len(value) > 1 and value[-1] == value[0]:
                value = value[1:-1]
            else:
                # Opening quote without a closing one: drop only the
                # quote instead of also cutting off the last character.
                value = value[1:]
        attrs[name.lower()] = htmldec(value)
    return attrs
309 |
|
---|
def matchcontent(specimen, desired):
    """Test a specimen string against a desired value.

    If desired has a match() method (e.g. a compiled RE), return the
    result of desired.match(specimen).  Otherwise, if desired is
    callable, return desired(specimen).  Otherwise return whether the
    two are equal.
    """
    if hasattr(desired, 'match'):
        return desired.match(specimen)
    if callable(desired):
        return desired(specimen)
    return specimen == desired
317 |
|
---|
def matchattrs(specimen, desired):
    """Return 1 if the specimen attributes satisfy all desired ones, else 0.

    specimen is an attribute dictionary; desired maps attribute names to
    values accepted by matchcontent().  Desired names are normalized
    first: leading and trailing underscores are removed and the remaining
    underscores become dashes, so keyword arguments such as class_ and
    http_equiv select the "class" and "http-equiv" attributes.
    """
    for name, value in desired.items():
        name = name.strip('_').replace('_', '-')
        if name not in specimen or not matchcontent(specimen[name], value):
            return 0
    return 1
324 |
|
---|
class Region:
    """A Region object represents a contiguous region of a document together
    with an associated HTML or XML tag and its attributes.

    All Regions derived from one document share the document string and
    the list of scanned tags.  Each entry of that list is
    [start, end, tagname, attrs], where attrs is the raw attribute text
    until the tag is first examined and an attribute dictionary after.
    """

    def __init__(self, parent, start=0, end=None, starttag=None, endtag=None):
        """Create a Region.  The parent argument is a string or another
        Region.  The start and end arguments, if given, specify non-negative
        indices into the original string (not into a parent subregion).

        If starttag is given (an index into the tag list), the Region
        covers that tag and carries its name and attributes.  If endtag is
        given as well, the Region covers the content between the two tags.
        In both cases start and end are ignored.
        """
        if isinstance(parent, Region):
            self.document = parent.document
            self.tags = parent.tags
        else:
            self.document = parent
            self.tags = self.scantags(self.document)
        if end is None:
            end = len(self.document)
        self.start, self.end = start, end
        self.tagname, self.attrs = None, {}

        # If only starttag is specified, this Region is a tag.
        # If starttag and endtag are specified, this Region is an element.
        self.starttag, self.endtag = starttag, endtag
        if starttag is not None:
            self.start, self.end, self.tagname, self.attrs = self.tags[starttag]
            if not isinstance(self.attrs, dict):
                # The tag has not been examined yet; parse it now.
                self.attrs = self.tags[starttag][3] = parseattrs(self.attrs)
            if endtag is not None:
                self.start, self.end = self.tags[starttag][1], self.tags[endtag][0]

        # Find the minimum and maximum indices of tags within this Region.
        # Tag index 0 is a valid tag, so compare against None explicitly.
        if starttag is not None and endtag is not None:
            self.tagmin, self.tagmax = starttag + 1, endtag - 1
        else:
            self.tagmin, self.tagmax = len(self.tags), -1
            for i, (tstart, tend, tagname, attrs) in enumerate(self.tags):
                if tstart >= self.start and i < self.tagmin:
                    self.tagmin = i
                if tend <= self.end and i > self.tagmax:
                    self.tagmax = i

    def __repr__(self):
        if self.tagname:
            attrs = ''.join([' %s=%r' % item for item in self.attrs.items()])
            return '<Region %d:%d %s%s>' % (
                self.start, self.end, self.tagname, attrs)
        else:
            return '<Region %d:%d>' % (self.start, self.end)

    # Utilities that operate on the array of scanned tags.
    def scantags(self, document):
        """Generate a list of all the tags in a document.

        Each entry is [start, end, lowercased tag name, raw attribute
        text].  The content of a script element is not scanned: after a
        script start tag the scan resumes at the matching end tag, and
        stops altogether if there is none.
        """
        tags = []
        pos = 0
        while True:
            match = tagpat.search(document, pos)
            if not match:
                break
            start, end = match.span()
            tagname = match.group(1).lower()
            attrs = match.group(2)
            tags.append([start, end, tagname, attrs])
            if tagname == 'script':
                match = endscriptpat.search(document, end)
                if not match:
                    break
                start, end = match.span()
                tags.append([start, end, '/' + tagname, ''])
            pos = end
        return tags

    def matchtag(self, i, tagname, attrs):
        """Return 1 if the ith tag matches the given tagname and attributes.

        Only start tags can match.  A tagname of None matches any name.
        The tag's attribute text is parsed on first use and the result is
        stored back into the shared tag list.
        """
        itagname, iattrs = self.tags[i][2], self.tags[i][3]
        if itagname[:1] in ['', '?', '!', '/']:
            return 0
        if tagname is not None and itagname != tagname:
            return 0
        if not isinstance(iattrs, dict):
            self.tags[i][3] = iattrs = parseattrs(iattrs)
        return matchattrs(iattrs, attrs)

    def findendtag(self, starttag, outside=0):
        """Find the index of the matching end tag for the given start tag.
        If outside is 0, look for the end tag within the current region;
        if outside is 1, look beyond the end of the current region.
        Return None if there is no such end tag."""
        tagname = self.tags[starttag][2]
        depth = 1
        for i in range(starttag + 1, len(self.tags)):
            if self.tags[i][2] == tagname:
                depth += 1
            if self.tags[i][2] == '/' + tagname:
                depth -= 1
            if depth == 0:
                if not outside and i <= self.tagmax:
                    return i
                if outside and i > self.tagmax:
                    return i
                break
        return None

    def matchelement(self, starttag, content=None, outside=0):
        """If the element with the given start tag matches the given content,
        return the index of the matching end tag.  See findendtag() for the
        meaning of the outside flag.  The content is compared after being
        passed through striptags().  Return None if nothing matches."""
        endtag = self.findendtag(starttag, outside)
        if endtag is not None:
            start, end = self.tags[starttag][1], self.tags[endtag][0]
            stripped = striptags(self.document[start:end])
            if content is None or matchcontent(stripped, content):
                return endtag
        return None

    # Provide the "content" and "text" attributes to access the contents.
    content = property(lambda self: self.document[self.start:self.end])
    text = property(lambda self: striptags(self.content))

    def getparams(self):
        """Get a dictionary of default values for all the form parameters.

        Returns None if this Region is not a form.  Disabled controls and
        controls without a name are left out.  An input without a type
        attribute counts as a text field.  Checkboxes and radio buttons
        are included only when checked.  A select with the "multiple"
        attribute yields the list of selected values, any other select
        the first selected value, if there is one.
        """
        if self.tagname != 'form':
            return None
        params = {}
        for field in self.alltags('input'):
            if 'disabled' in field or not field['name']:
                continue
            kind = field['type'].lower() or 'text'
            if kind in ['text', 'password', 'hidden'] or (
                    kind in ['checkbox', 'radio'] and 'checked' in field):
                params[field['name']] = field['value']
        for select in self.all('select'):
            if 'disabled' in select or not select['name']:
                continue
            selections = [option['value']
                          for option in select.alltags('option')
                          if 'selected' in option]
            if 'multiple' in select:
                params[select['name']] = selections
            elif selections:
                params[select['name']] = selections[0]
        for textarea in self.all('textarea'):
            if 'disabled' in textarea or not textarea['name']:
                continue
            params[textarea['name']] = textarea.content
        return params

    def getbuttons(self):
        """Get a list of all the form submission buttons.

        These are the input tags of type submit or image and the button
        tags of type submit or with no type.  Returns None if this Region
        is not a form.
        """
        if self.tagname != 'form':
            return None
        inputs = [tag for tag in self.alltags('input')
                  if tag['type'].lower() in ['submit', 'image']]
        buttons = [tag for tag in self.alltags('button')
                   if tag['type'].lower() in ['submit', '']]
        return inputs + buttons

    params = property(getparams)
    buttons = property(getbuttons)

    # Provide a dictionary-like interface to the tag attributes.
    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        """Return the value of the named attribute, or '' if it is absent.

        A slice returns the corresponding subregion (see __getslice__);
        this is how slicing arrives on Pythons without __getslice__.
        """
        if isinstance(name, slice):
            if name.step not in (None, 1):
                raise ValueError('Region slices do not support a step')
            if name.start is None:
                start = 0
            else:
                start = name.start
            if name.stop is None:
                stop = self.end - self.start
            else:
                stop = name.stop
            return self.__getslice__(start, stop)
        return self.attrs.get(name, '')

    # Provide subregions by slicing.
    def __getslice__(self, start, end):
        """Return the subregion for indices relative to this Region.

        Non-negative indices count from the start of this Region and
        negative indices from its end.
        """
        start += self.end if start < 0 else self.start
        end += self.end if end < 0 else self.start
        return Region(self, start, end)

    # Search for text.
    def find(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        Region representing the match.  The optional group argument specifies
        which grouped subexpression should be returned as the match.
        Return None if there is no match."""
        if hasattr(target, 'search'):
            match = target.search(self.content)
            if match:
                return self[match.start(group):match.end(group)]
        else:
            start = self.content.find(target)
            if start > -1:
                return self[start:start+len(target)]
        return None

    def findall(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        sequence of Regions representing all the matches.

        For an RE, each Region covers the requested group, and the search
        resumes where that group ended.  Matches in which the group took
        no part are skipped.  Empty matches advance the search by one
        character so that the scan always terminates.
        """
        pos = 0
        content = self.content
        matches = []
        if hasattr(target, 'search'):
            while pos <= len(content):
                match = target.search(content, pos)
                if not match:
                    break
                start, stop = match.span(group)
                if start < 0:
                    # The group did not participate; just move past the match.
                    start, stop = match.span()
                else:
                    matches.append(self[start:stop])
                if stop == start:
                    pos = stop + 1
                else:
                    pos = stop
        else:
            step = len(target) or 1
            while True:
                start = content.find(target, pos)
                if start < 0:
                    break
                matches.append(self[start:start + len(target)])
                pos = start + step
        return matches

    # Search for tags.
    def firsttag(self, tagname=None, **attrs):
        """Return the Region for the first tag entirely within this Region
        with the given tag name and attributes, or None."""
        for i in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)
        return None

    def lasttag(self, tagname=None, **attrs):
        """Return the Region for the last tag entirely within this Region
        with the given tag name and attributes, or None."""
        for i in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)
        return None

    def alltags(self, tagname=None, **attrs):
        """Return a list of Regions for all the tags entirely within this
        Region with the given tag name and attributes."""
        return [Region(self, 0, 0, i)
                for i in range(self.tagmin, self.tagmax + 1)
                if self.matchtag(i, tagname, attrs)]

    def nexttag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag after the end of this Region
        with the given tag name and attributes."""
        return Region(self, self.end).firsttag(tagname, **attrs)

    def previoustag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag before the start of this
        Region with the given tag name and attributes."""
        return Region(self, 0, self.start).lasttag(tagname, **attrs)

    # Search for elements.
    def first(self, tagname=None, content=None, **attrs):
        """Return the Region for the first properly balanced element entirely
        within this Region with the given tag name, content, and attributes.
        The element content is passed through striptags().  If the content
        argument has a match() method, the stripped content is passed into
        this method; otherwise it is compared directly as a string.
        Return None if there is no such element."""
        for starttag in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)
        return None

    def last(self, tagname=None, content=None, **attrs):
        """Return the Region for the last properly balanced element entirely
        within this Region with the given tag name, content, and attributes."""
        for starttag in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)
        return None

    def all(self, tagname=None, content=None, **attrs):
        """Return Regions for all non-overlapping balanced elements entirely
        within this Region with the given tag name, content, and attributes."""
        elements = []
        starttag = self.tagmin
        while starttag <= self.tagmax:
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    elements.append(Region(self, 0, 0, starttag, endtag))
                    # Continue after this element so results never overlap.
                    starttag = endtag
            starttag += 1
        return elements

    def next(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element after the end of
        this Region with the given tag name, content, and attributes."""
        return Region(self, self.end).first(tagname, content, **attrs)

    def previous(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element before the start
        of this Region with the given tag name, content, and attributes."""
        return Region(self, 0, self.start).last(tagname, content, **attrs)

    def enclosing(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element that encloses
        this Region with the given tag name, content, and attributes.
        Return None if no element encloses this Region."""
        if self.starttag is not None and self.endtag is not None:
            # Skip our own start tag, even when it is tag number 0.
            laststarttag = self.starttag - 1
        else:
            laststarttag = self.tagmin - 1
        for starttag in range(laststarttag, -1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content, outside=1)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)
        return None
609 |
|
---|
def read(path):
    """Read and return the entire contents of the file at the given path.

    The file is opened in text mode and closed before returning.
    """
    with open(path) as stream:
        return stream.read()
613 |
|
---|
def write(path, text):
    """Write the given text to a file at the given path.

    An existing file is overwritten.  The file is closed even if the
    write fails.
    """
    with open(path, 'w') as stream:
        stream.write(text)
619 |
|
---|
def load(path):
    """Return the deserialized contents of the file at the given path.

    The file must hold data written by dump() (marshal format).  It is
    opened in binary mode, as marshal data is not text.

    NOTE: the marshal module is not meant to be safe against malicious
    input; only load files from a trusted source.
    """
    with open(path, 'rb') as stream:
        return marshal.load(stream)
623 |
|
---|
def dump(path, data):
    """Serialize the given data and write it to a file at the given path.

    The data is written in marshal format, so it must consist of the
    types marshal supports.  The file is opened in binary mode and closed
    even if serialization fails.
    """
    with open(path, 'wb') as stream:
        marshal.dump(data, stream)
629 |
|
---|
def getnumber(text):
    """Find and parse a floating-point or integer number in the given text,
    ignoring commas, percentage signs, and non-numeric words.

    Tags are stripped from the text first.  The first word that parses
    as an int, or failing that as a float, is returned.  Words without
    any digit are skipped, so that "nan" or "inf" are not taken for
    numbers.  Returns None if no word is a number.
    """
    words = striptags(text).replace(',', '').replace('%', '').split()
    for word in words:
        if not any(ch.isdigit() for ch in word):
            continue
        for convert in (int, float):
            try:
                return convert(word)
            except ValueError:
                pass
    return None