1 | """Python module for web browsing and scraping. |
---|
2 | |
---|
3 | Done: |
---|
4 | - navigate to absolute and relative URLs |
---|
5 | - follow links in page or region |
---|
6 | - find first or all occurrences of string or RE in page or region |
---|
7 | - find first, last, next, previous, or all tags with given name/attributes |
---|
8 | - find first, last, next, previous, enclosing, or all elements with given |
---|
9 | name/attributes/content |
---|
10 | - set form fields |
---|
11 | - submit forms |
---|
12 | - strip tags from arbitrary strings of HTML |
---|
13 | |
---|
14 | Todo: |
---|
15 | - cookie-handling is dumb (sends all cookies to all sites) |
---|
16 | - handle CDATA and RCDATA marked sections |
---|
17 | - support for submitting forms with file upload |
---|
18 | - use Regions in striptags instead of duplicating work |
---|
19 | - map of enders |
---|
20 | """ |
---|
21 | |
---|
22 | __author__ = 'Ka-Ping Yee' |
---|
23 | __date__ = '2005-03-29' |
---|
24 | __version__ = '$Revision: 1.16 $' |
---|
25 | |
---|
26 | import os, socket, re, marshal, subprocess |
---|
27 | from tempfile import gettempdir |
---|
28 | from urlparse import urljoin, urlsplit |
---|
29 | from urllib import urlencode |
---|
30 | |
---|
def connect(server, port):
    """Open a TCP (IPv4) connection to the given server and port.

    Returns the connected socket object; any socket error raised while
    connecting propagates to the caller."""
    address = (server, port)
    conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    conn.connect(address)
    return conn
36 | |
---|
def receive(sock):
    """Collect everything a socket sends until the peer closes it.

    Data is read in blocks of at most 4096 bytes.  An empty read marks the
    end of the stream, at which point the blocks are joined into a single
    string and returned."""
    pieces = []
    while True:
        piece = sock.recv(4096)
        if not piece:
            break
        pieces.append(piece)
    return ''.join(pieces)
44 | |
---|
def request(host, method, path, headers, entity=None):
    """Make an HTTP/1.0 request and return (status, message, headers, document).

    host     - server name, optionally followed by ':port'; port 80 is used
               when no numeric port is present
    method   - request method, e.g. 'GET' or 'POST'
    path     - request target placed on the request line
    headers  - dictionary of header names to values; each name is sent with
               its dash-separated parts capitalized
    entity   - optional request body, appended after the blank line

    The socket is always closed once the reply has been read.  If the reply
    cannot be parsed by splitreply(), the raw data is returned as
    (0, '', {}, data) instead of raising."""
    # Honour an explicit port in the host part of the URL instead of always
    # connecting to port 80.
    server, port = host, 80
    if ':' in host:
        name, portstr = host.rsplit(':', 1)
        if name and portstr.isdigit():
            server, port = name, int(portstr)

    lines = [method + ' ' + path + ' HTTP/1.0']
    for name in headers:
        capname = '-'.join([part.capitalize() for part in name.split('-')])
        lines.append(capname + ': ' + str(headers[name]))
    message = '\r\n'.join(lines) + '\r\n\r\n'
    if entity:
        message += entity

    sock = connect(server, port)
    try:
        sock.sendall(message)
        data = receive(sock)
    finally:
        sock.close()

    # Best effort: an unparseable reply is handed back raw.  Only ordinary
    # errors are swallowed, so KeyboardInterrupt/SystemExit still propagate.
    try:
        return splitreply(data)
    except Exception:
        return (0, '', {}, data)
59 | |
---|
def splitreply(reply):
    """Split an HTTP response into (status, message, headers, document).

    status is the integer status code and message the reason phrase from
    the status line.  headers is a list of (lowercased name, value) pairs in
    the order received.  document is everything after the first blank line.

    When the document itself starts with another 'HTTP/1.' response (as in
    the output of "curl -i -L", which prints one header block per hop), that
    response is parsed as well: the status and message of the last response
    are returned, while the headers of all responses are accumulated.

    Header lines without a colon are ignored, and whitespace around header
    names and values is stripped, so 'Name:value' is accepted.  A malformed
    status line still raises IndexError or ValueError."""
    if '\r\n\r\n' in reply:
        head, document = reply.split('\r\n\r\n', 1)
    else:
        head, document = reply, ''
    headers = []
    while True:
        lines = head.split('\r\n')
        words = lines[0].split()
        status = int(words[1])
        message = ' '.join(words[2:])
        for line in lines[1:]:
            if ':' not in line:
                continue            # blank or malformed header line
            name, value = line.split(':', 1)
            headers.append((name.strip().lower(), value.strip()))
        if document.startswith('HTTP/1.') and '\r\n\r\n' in document:
            # Another complete response follows; parse it on the next pass.
            head, document = document.split('\r\n\r\n', 1)
        else:
            return status, message, headers, document
81 | |
---|
def shellquote(text):
    """Return text wrapped in single quotes so that sh reads it literally.

    Every single quote inside the text is replaced by a sequence that closes
    the quoted string, emits a backslash-escaped quote, and reopens it."""
    escaped = text.replace("'", "'\\''")
    return ''.join(("'", escaped, "'"))
85 | |
---|
def curl(url, entity=None, follow=1, cookies=(), referrer=None):
    """Invoke curl to perform an HTTP request.

    url      - the URL to fetch
    entity   - optional request body; anything that is not a string is
               form-encoded with urlencode() first
    follow   - if true, let curl follow redirects (-L); otherwise redirects
               are limited to zero
    cookies  - sequence of 'name=value' strings sent with the request
    referrer - optional Referer URL

    The response headers are included in curl's output (-i) and the whole
    reply is parsed by splitreply(), whose result is returned."""
    command = ['curl', '-s', '-i']
    if referrer:
        command += ['-e', referrer]
    if entity:
        if not isinstance(entity, str):
            entity = urlencode(entity, doseq=1)
        command += ['-d', entity]
    if follow:
        command += ['-L']
    else:
        # Spell the option out: in current curl the short option -Z means
        # --parallel and takes no argument, so "-Z 0" would make curl treat
        # "0" as an additional URL.
        command += ['--max-redirs', '0']
    if cookies:
        command += ['-b', '; '.join(cookies)]
    command.append(url)
    # communicate() reads all output and waits for curl to exit, so neither
    # the pipe nor the child process is left dangling.
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    reply = process.communicate()[0]
    return splitreply(reply)
104 | |
---|
def fetch(url, entity=None, follow=1):
    """Fetch a single document using a throwaway Session.

    The arguments are handed to Session.fetch() on a freshly created
    Session, and its result is returned unchanged."""
    session = Session()
    return session.fetch(url, entity, follow)
108 | |
---|
class ScrapeError(Exception):
    """Base class for the errors raised by this module."""


class HTTPError(ScrapeError):
    """Raised by Session.go() when a fetch does not end with status 200."""


# Unique sentinel used as the default 'referrer' argument, meaning "use the
# session's current location"; it lets callers pass None explicitly.
LAST_URL = object()
112 | |
---|
class Session:
    """A Web-browsing session.

    Exposed attributes:

        agent - set or get the User-Agent string
        location - get the current URL (updated by every fetch, including
            each redirect that is followed)
        status - get the status code of the last successful request
        message - get the status message of the last successful request
        headers - get the dictionary of headers from the last successful request
        document - get the document returned by the last successful request
        region - get a Region spanning the entire document
        cookies - list of 'name=value' strings sent with every request
        history - stack of previous page states, consumed by back()
    """

    def __init__(self, agent=None):
        """Start a session with no cookies, no history, and no current page."""
        self.cookies = []
        self.agent = agent
        self.location = self.status = self.message = None
        self.headers = self.document = self.region = None
        self.history = []

    def fetch(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Fetch an absolute URL and return (status, message, headers, document).

        http URLs are requested directly with request(); https URLs are
        delegated to curl().  If an entity is given the request is a POST;
        an entity that is not a string is form-encoded first.  When referrer
        is left at its default, the session's current location is sent as the
        Referer.  Cookies from Set-Cookie headers are remembered (and sent to
        every site); the remaining headers are returned as a dictionary.
        If follow is true, 301 and 302 responses carrying a Location header
        are followed.  Raises ValueError for any other URL scheme."""
        scheme, host, path, query, fragment = urlsplit(url)
        if referrer is LAST_URL:
            referrer = self.location
        self.location = url
        if scheme == 'https':
            # Forward the referrer too, so both schemes behave alike.
            status, message, headers, document = \
                curl(url, entity, follow, self.cookies, referrer)
        elif scheme == 'http':
            # A URL such as 'http://host' has an empty path, but the request
            # line needs at least '/'.
            path = path or '/'
            if query:
                path += '?' + query
            outgoing = {'host': host, 'accept': '*/*'}
            if referrer:
                outgoing['referer'] = referrer
            if self.agent:
                outgoing['user-agent'] = self.agent
            if self.cookies:
                outgoing['cookie'] = '; '.join(self.cookies)
            if entity:
                if not isinstance(entity, str):
                    entity = urlencode(entity, doseq=1)
                outgoing['content-type'] = 'application/x-www-form-urlencoded'
                outgoing['content-length'] = len(entity)
            if entity:
                method = 'POST'
            else:
                method = 'GET'
            status, message, headers, document = \
                request(host, method, path, outgoing, entity)
        else:
            raise ValueError(scheme + ' not supported')

        headerdict = {}
        for name, value in headers:
            if name == 'set-cookie':
                # Keep only the 'name=value' part; attributes are ignored.
                cookie = value.split(';')[0]
                if cookie not in self.cookies:
                    self.cookies.append(cookie)
            else:
                headerdict[name] = value
        if follow and status in [301, 302] and 'location' in headerdict:
            return self.fetch(urljoin(url, headerdict['location']))
        return status, message, headerdict, document

    def go(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Navigate to a given URL.  If the URL is relative, it is resolved
        with respect to the current location.  If the document is successfully
        fetched, return a Region spanning the entire document.

        Raises HTTPError(status, message), carrying the status of the failed
        request, when the final response is anything other than 200."""
        historyentry = (self.location, self.status, self.message,
                        self.headers, self.document, self.region)
        if self.location:
            url = urljoin(self.location, url)
        results = self.fetch(url, entity, follow, referrer)
        status, message = results[0], results[1]
        if status != 200:
            # Report the failing response, not the previously loaded page.
            raise HTTPError(status, message)
        self.history.append(historyentry)
        self.status, self.message, self.headers, self.document = results
        self.region = Region(self.document)
        return self.region

    def back(self):
        """Return to the previous page and return its URL.

        Raises IndexError if there is no previous page in the history."""
        (self.location, self.status, self.message,
         self.headers, self.document, self.region) = self.history.pop()
        return self.location

    def follow(self, anchor, region=None):
        """Follow the first link with the given anchor text.  The anchor may
        be given as a string or a compiled RE.  If a region is given, the
        link is sought within that region instead of the whole document.

        Raises ScrapeError if no such link exists or it has no href."""
        link = (region or self.region).first('a', content=anchor)
        if not link:
            raise ScrapeError('link %r not found' % anchor)
        if not link['href']:
            raise ScrapeError('link %r has no href' % link)
        return self.go(link['href'])

    def submit(self, form, button=None, **params):
        """Submit a form, optionally by clicking a given button.

        The form's default parameters are used, then the button's name and
        value (if a button is given), then any keyword arguments, each
        overriding the previous.  A form without a method is submitted with
        GET.  Raises ScrapeError if the region is not a form element or the
        form method is neither 'get' nor 'post'."""
        if form.tagname != 'form':
            raise ScrapeError('%r is not a form' % form)
        p = form.params
        if button:
            p[button['name']] = button['value']
        p.update(params)
        method = form['method'].lower() or 'get'
        if method == 'post':
            return self.go(form['action'], p)
        elif method == 'get':
            return self.go(form['action'] + '?' + urlencode(p, doseq=1))
        else:
            raise ScrapeError('unknown form method %r' % method)
225 | |
---|
# RE source for everything between a tag's name and its closing '>':
# quoted strings (which may contain '>'), '--'-delimited comments, single
# dashes, and any other character except quotes, '>' and '-'.
tagcontent_re = r'''(('[^']*'|"[^"]*"|--([^-]|-[^-])*--|-(?!-)|[^'">-])*)'''


def tag_re(tagname_re):
    """Return RE source matching one whole tag whose name matches tagname_re."""
    return ''.join(['<', tagname_re, tagcontent_re, '>'])


# Any tag: processing instruction ('?'), declaration or comment ('!...'),
# or a start/end tag.  Group 1 is the tag name, group 2 the tag content.
anytag_re = tag_re(r'(\?|!\w*|/?[a-zA-Z_:][\w:.-]*)')
tagpat = re.compile(anytag_re)
233 | |
---|
# One numeric character reference (decimal or hex) or one of the named
# entities that htmldec understands.
_entitypat = re.compile(r'&(#\d+|#[xX][0-9a-fA-F]+|nbsp|lt|gt|quot|amp);')
_namedentities = {'nbsp': ' ', 'lt': '<', 'gt': '>', 'quot': '"', 'amp': '&'}


def _decodeentity(match):
    """Return the replacement text for a single match of _entitypat."""
    name = match.group(1)
    if name in _namedentities:
        return _namedentities[name]
    try:
        if name[1] in 'xX':
            code = int(name[2:], 16)
        else:
            code = int(name[1:])
        return chr(code)
    except (ValueError, OverflowError):
        # Code point not representable by chr(): leave the reference as is.
        return match.group(0)


def htmldec(text):
    """Decode HTML entities in the given text.

    Numeric character references (&#65; or &#x41;) and the named entities
    &nbsp;, &lt;, &gt;, &quot; and &amp; are decoded in a single pass, so
    text such as '&amp;lt;' becomes '&lt;' rather than '<'.  Anything that
    is not a well-formed reference (e.g. a bare '&#') is left untouched
    instead of raising.  Non-breaking spaces, whether literal or decoded
    from &#160;, are turned into ordinary spaces."""
    text = _entitypat.sub(_decodeentity, text)
    return text.replace('\xa0', ' ')
248 | |
---|
def htmlenc(text):
    """Use HTML entities to encode special characters in the given text.

    The characters &, ", < and > are replaced by &amp;, &quot;, &lt; and
    &gt;.  The ampersand is handled first so that the ampersands introduced
    by the other replacements are not encoded a second time."""
    text = text.replace('&', '&amp;')
    text = text.replace('"', '&quot;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    return text
256 | |
---|
def no_groups(re):
    """Return the given RE source with its capturing groups made non-capturing.

    Every '(' is first rewritten as '(?:'; groups that already began with
    '(?' and were turned into '(?:?' by that step are then restored."""
    widened = re.replace('(', '(?:')
    return widened.replace('(?:?', '(?')
259 | |
---|
# Splitting patterns used by striptags().  Their groups are made
# non-capturing so that split() returns only the text between the tags.
tagsplitter = re.compile(no_groups(anytag_re))
parasplitter = re.compile(no_groups(tag_re('(p|table|form)')), re.IGNORECASE)
linesplitter = re.compile(no_groups(tag_re('(div|br|tr)')), re.IGNORECASE)

# Start and end of a script element, and the end of a comment.
scriptpat = re.compile(r'<script\b', re.IGNORECASE)
endscriptpat = re.compile(r'</script[^>]*>', re.IGNORECASE)
endcommentpat = re.compile(r'--\s*>')
266 | |
---|
def striptags(text):
    """Strip HTML tags from the given text, yielding line breaks for DIV,
    BR, or TR tags and blank lines for P, TABLE, or FORM tags.

    Script elements, comments, and other '<!' declarations are removed
    together with their contents.  If one of them is never terminated,
    everything after its start is dropped instead of raising IndexError.
    Entities are decoded with htmldec(), runs of whitespace within a line
    are collapsed to single spaces, and runs of blank lines are collapsed
    to one."""
    # Drop script elements: each chunk after the first starts just past a
    # '<script'; keep only what follows the matching end tag (if any).
    chunks = scriptpat.split(text)
    for i in range(1, len(chunks)):
        chunks[i] = ''.join(endscriptpat.split(chunks[i], 1)[1:])
    text = ''.join(chunks)

    # Drop comments and declarations.  A '--' before the first '>' marks a
    # comment, which ends at '-->'; anything else ends at the first '>'.
    chunks = text.split('<!')
    for i in range(1, len(chunks)):
        if '--' in chunks[i].split('>', 1)[0]:
            rest = endcommentpat.split(chunks[i], 1)[1:]
        else:
            rest = chunks[i].split('>', 1)[1:]
        chunks[i] = ''.join(rest)
    text = ''.join(chunks)

    paragraphs = []
    for paragraph in parasplitter.split(text):
        lines = []
        for line in linesplitter.split(paragraph):
            line = ''.join(tagsplitter.split(line))
            line = htmldec(line)
            line = ' '.join(line.split())
            lines.append(line)
        paragraphs.append('\n'.join(lines))
    return re.sub('\n\n+', '\n\n', '\n\n'.join(paragraphs)).strip()
292 | |
---|
# One attribute inside a tag.  Group 1 is the attribute name; group 3 is the
# value (single-quoted, double-quoted, or bare), and is None when the
# attribute has no '=' part.
attr_re = r'''\s*([\w:.-]+)(\s*=\s*('[^']*'|"[^"]*"|[^\s>]*))?'''
attrpat = re.compile(attr_re)
295 | |
---|
def parseattrs(text):
    """Build an attribute dictionary from a string of name=value pairs.

    Names are lowercased.  Surrounding quotes are removed from values and
    entities in them are decoded with htmldec(); an attribute without a
    value maps to the empty string.  A later occurrence of a name replaces
    an earlier one."""
    attrs = {}
    for match in attrpat.finditer(text):
        name = match.group(1)
        value = match.group(3) or ''
        if value[:1] in ('"', "'"):
            value = value[1:-1]
        attrs[name.lower()] = htmldec(value)
    return attrs
309 | |
---|
def matchcontent(specimen, desired):
    """Test a string against a desired value.

    If desired has a match() method (e.g. a compiled RE), the result of
    desired.match(specimen) is returned.  Otherwise, if desired is callable,
    the result of desired(specimen) is returned.  Any other value is
    compared to the specimen for equality."""
    if hasattr(desired, 'match'):
        return desired.match(specimen)
    if callable(desired):
        return desired(specimen)
    return specimen == desired
317 | |
---|
def matchattrs(specimen, desired):
    """Return 1 if the specimen attributes satisfy every desired attribute,
    otherwise 0.

    Desired names are normalized before lookup: leading and trailing
    underscores are removed and the remaining underscores become dashes, so
    that 'class_' and 'http_equiv' can be passed as keyword arguments.  Each
    desired value is tested with matchcontent()."""
    for key in desired:
        name = key.strip('_').replace('_', '-')
        if name not in specimen:
            return 0
        if not matchcontent(specimen[name], desired[key]):
            return 0
    return 1
324 | |
---|
class Region:
    """A Region object represents a contiguous region of a document together
    with an associated HTML or XML tag and its attributes.

    All Regions derived from one document share the document string and the
    list of scanned tags.  Each entry of that list is
    [start, end, tagname, attrs], where attrs is the raw attribute text until
    matchtag() replaces it with a parsed dictionary.  tagmin and tagmax are
    the lowest and highest indices of the tags lying within the Region."""

    def __init__(self, parent, start=0, end=None, starttag=None, endtag=None):
        """Create a Region.  The parent argument is a string or another
        Region.  The start and end arguments, if given, specify non-negative
        indices into the original string (not into a parent subregion).

        If only starttag (an index into the tag list) is given, the Region
        spans that tag.  If starttag and endtag are both given, the Region
        spans the element content between the two tags.  In both cases the
        start and end arguments are ignored."""
        if isinstance(parent, Region):
            self.document = parent.document
            self.tags = parent.tags
        else:
            self.document = parent
            self.tags = self.scantags(self.document)
        if end is None:
            end = len(self.document)
        self.start, self.end = start, end
        self.tagname, self.attrs = None, {}

        # If only starttag is specified, this Region is a tag.
        # If starttag and endtag are specified, this Region is an element.
        self.starttag, self.endtag = starttag, endtag
        if starttag is not None:
            self.start, self.end, self.tagname, self.attrs = self.tags[starttag]
        if endtag is not None:
            self.start, self.end = self.tags[starttag][1], self.tags[endtag][0]

        # Find the minimum and maximum indices of tags within this Region.
        # Compare against None: tag index 0 is a valid start tag.
        if starttag is not None and endtag is not None:
            self.tagmin, self.tagmax = starttag + 1, endtag - 1
        else:
            self.tagmin, self.tagmax = len(self.tags), -1
            for i, (start, end, tagname, attrs) in enumerate(self.tags):
                if start >= self.start and i < self.tagmin:
                    self.tagmin = i
                if end <= self.end and i > self.tagmax:
                    self.tagmax = i

    def __repr__(self):
        if self.tagname:
            attrs = ''.join([' %s=%r' % item for item in self.attrs.items()])
            return '<Region %d:%d %s%s>' % (
                self.start, self.end, self.tagname, attrs)
        else:
            return '<Region %d:%d>' % (self.start, self.end)

    # Utilities that operate on the array of scanned tags.
    def scantags(self, document):
        """Generate a list of all the tags in a document.

        The content of a script element is not scanned: after a script start
        tag, the next tag recorded is the matching </script> end tag.  If
        that end tag is missing, scanning stops."""
        tags = []
        pos = 0
        while 1:
            match = tagpat.search(document, pos)
            if not match: break
            start, end = match.span()
            tagname = match.group(1).lower()
            attrs = match.group(2)
            tags.append([start, end, tagname, attrs])
            if tagname == 'script':
                match = endscriptpat.search(document, end)
                if not match: break
                start, end = match.span()
                tags.append([start, end, '/' + tagname, ''])
            pos = end
        return tags

    def matchtag(self, i, tagname, attrs):
        """Return 1 if the ith tag matches the given tagname and attributes.

        Only start tags can match.  A tagname of None matches any name.  The
        tag's attribute text is parsed on first use and the parsed dictionary
        is stored back into the tag list.  Returns None when the tag is not
        a start tag or its name differs."""
        itagname, iattrs = self.tags[i][2], self.tags[i][3]
        if itagname[:1] not in ['', '?', '!', '/']:
            if itagname == tagname or tagname is None:
                # Anything that is not yet a dictionary is still raw
                # attribute text (of whatever string type the document has).
                if not isinstance(iattrs, dict):
                    self.tags[i][3] = iattrs = parseattrs(iattrs)
                return matchattrs(iattrs, attrs)

    def findendtag(self, starttag, outside=0):
        """Find the index of the matching end tag for the given start tag.
        If outside is 0, look for the end tag within the current region;
        if outside is 1, look beyond the end of the current region.
        Returns None if there is no such end tag."""
        tagname = self.tags[starttag][2]
        depth = 1
        for i in range(starttag + 1, len(self.tags)):
            if self.tags[i][2] == tagname:
                depth += 1
            if self.tags[i][2] == '/' + tagname:
                depth -= 1
            if depth == 0:
                if not outside and i <= self.tagmax:
                    return i
                if outside and i > self.tagmax:
                    return i
                break

    def matchelement(self, starttag, content=None, outside=0):
        """If the element with the given start tag matches the given content,
        return the index of the matching end tag.  See findendtag() for the
        meaning of the outside flag.  The element content is passed through
        striptags() before it is compared.  Returns None otherwise."""
        endtag = self.findendtag(starttag, outside)
        if endtag is not None:
            start, end = self.tags[starttag][1], self.tags[endtag][0]
            stripped = striptags(self.document[start:end])
            if content is None or matchcontent(stripped, content):
                return endtag

    # Provide the "content" and "text" attributes to access the contents.
    content = property(lambda self: self.document[self.start:self.end])
    text = property(lambda self: striptags(self.content))

    def getparams(self):
        """Get a dictionary of default values for all the form parameters.

        Covers enabled input tags (text, password and hidden fields, plus
        checked checkboxes and radio buttons), select elements (a list of
        selected values if 'multiple' is set, otherwise the first selected
        value, if any) and textarea elements.  Returns None if this Region
        is not a form element."""
        if self.tagname == 'form':
            params = {}
            for field in self.alltags('input'):
                if 'disabled' not in field:
                    kind = field['type'].lower()
                    if kind in ['text', 'password', 'hidden'] or (
                        kind in ['checkbox', 'radio'] and 'checked' in field):
                        params[field['name']] = field['value']
            for select in self.all('select'):
                if 'disabled' not in select:
                    selections = [option['value']
                                  for option in select.alltags('option')
                                  if 'selected' in option]
                    if 'multiple' in select:
                        params[select['name']] = selections
                    elif selections:
                        params[select['name']] = selections[0]
            for textarea in self.all('textarea'):
                if 'disabled' not in textarea:
                    params[textarea['name']] = textarea.content
            return params

    def getbuttons(self):
        """Get a list of all the form submission buttons: input tags of type
        submit or image, followed by button tags of type submit or with no
        type.  Returns None if this Region is not a form element."""
        if self.tagname == 'form':
            inputs = [tag for tag in self.alltags('input')
                      if tag['type'].lower() in ['submit', 'image']]
            buttons = [tag for tag in self.alltags('button')
                       if tag['type'].lower() in ['submit', '']]
            return inputs + buttons

    params = property(getparams)
    buttons = property(getbuttons)

    # Provide a dictionary-like interface to the tag attributes.
    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        """Return the value of the named attribute, or '' if it is absent.

        A slice is also accepted and yields a subregion, which is how
        region[a:b] arrives on interpreters that do not call __getslice__.
        A missing bound means the start or end of this Region; a step is
        not supported."""
        if isinstance(name, slice):
            if name.step not in (None, 1):
                raise ValueError('Region slices do not support a step')
            start, end = name.start, name.stop
            if start is None:
                start = 0
            if end is None:
                end = self.end - self.start
            return self._subregion(start, end)
        return self.attrs.get(name, '')

    # Provide subregions by slicing.
    def __getslice__(self, start, end):
        return self._subregion(start, end)

    def _subregion(self, start, end):
        """Return the Region for offsets relative to this Region: non-negative
        offsets count from its start, negative offsets back from its end."""
        if start < 0:
            start += self.end
        else:
            start += self.start
        if end < 0:
            end += self.end
        else:
            end += self.start
        return Region(self, start, end)

    # Search for text.
    def find(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        Region representing the match.  The optional group argument specifies
        which grouped subexpression should be returned as the match.
        Returns None if nothing matches or the group took no part in the
        match."""
        if hasattr(target, 'search'):
            match = target.search(self.content)
            if match and match.start(group) >= 0:
                return self._subregion(match.start(group), match.end(group))
        else:
            start = self.content.find(target)
            if start > -1:
                return self._subregion(start, start + len(target))

    def findall(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        sequence of Regions representing all the matches.

        For an RE, each search resumes at the end of the selected group of
        the previous match.  Empty matches advance the search by one
        character so that the scan always terminates."""
        pos = 0
        content = self.content
        matches = []
        if hasattr(target, 'search'):
            while 1:
                match = target.search(content, pos)
                if not match:
                    break
                start, stop = match.span(group)
                if start >= 0:      # the group took part in this match
                    matches.append(self._subregion(start, stop))
                pos = stop
                if pos <= match.start():
                    # Resuming here would find the same match again.
                    pos = max(match.end(), match.start() + 1)
        else:
            step = len(target) or 1
            while 1:
                start = content.find(target, pos)
                if start < 0:
                    break
                matches.append(self._subregion(start, start + len(target)))
                pos = start + step
        return matches

    # Search for tags.
    def firsttag(self, tagname=None, **attrs):
        """Return the Region for the first tag entirely within this Region
        with the given tag name and attributes."""
        for i in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)

    def lasttag(self, tagname=None, **attrs):
        """Return the Region for the last tag entirely within this Region
        with the given tag name and attributes."""
        for i in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)

    def alltags(self, tagname=None, **attrs):
        """Return a list of Regions for all the tags entirely within this
        Region with the given tag name and attributes."""
        return [Region(self, 0, 0, i)
                for i in range(self.tagmin, self.tagmax + 1)
                if self.matchtag(i, tagname, attrs)]

    def nexttag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag after the end of this Region
        with the given tag name and attributes."""
        return Region(self, self.end).firsttag(tagname, **attrs)

    def previoustag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag before the start of this
        Region with the given tag name and attributes."""
        return Region(self, 0, self.start).lasttag(tagname, **attrs)

    # Search for elements.
    def first(self, tagname=None, content=None, **attrs):
        """Return the Region for the first properly balanced element entirely
        within this Region with the given tag name, content, and attributes.
        The element content is passed through striptags().  If the content
        argument has a match() method, the stripped content is passed into
        this method; otherwise it is compared directly as a string."""
        for starttag in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)

    def last(self, tagname=None, content=None, **attrs):
        """Return the Region for the last properly balanced element entirely
        within this Region with the given tag name, content, and attributes."""
        for starttag in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)

    def all(self, tagname=None, content=None, **attrs):
        """Return Regions for all non-overlapping balanced elements entirely
        within this Region with the given tag name, content, and attributes."""
        elements = []
        starttag = self.tagmin
        while starttag <= self.tagmax:
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    elements.append(Region(self, 0, 0, starttag, endtag))
                    # Continue after the element so results never overlap.
                    starttag = endtag
            starttag += 1
        return elements

    def next(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element after the end of
        this Region with the given tag name, content, and attributes."""
        return Region(self, self.end).first(tagname, content, **attrs)

    def previous(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element before the start
        of this Region with the given tag name, content, and attributes."""
        return Region(self, 0, self.start).last(tagname, content, **attrs)

    def enclosing(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element that encloses
        this Region with the given tag name, content, and attributes."""
        # Compare against None: an element whose start tag has index 0 must
        # also skip its own start tag, or it would be found to enclose itself.
        if self.starttag is not None and self.endtag is not None:
            laststarttag = self.starttag - 1
        else:
            laststarttag = self.tagmin - 1
        for starttag in range(laststarttag, -1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content, outside=1)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)
609 | |
---|
def read(path):
    """Read and return the entire contents of the file at the given path.

    The file is closed before returning, even if reading fails."""
    stream = open(path)
    try:
        return stream.read()
    finally:
        stream.close()
613 | |
---|
def write(path, text):
    """Write the given text to a file at the given path.

    Any existing file is replaced.  The file is closed even if writing
    fails."""
    stream = open(path, 'w')
    try:
        stream.write(text)
    finally:
        stream.close()
619 | |
---|
def load(path):
    """Return the deserialized contents of the file at the given path.

    The file must contain data written by marshal (see dump()).  It is
    opened in binary mode, because marshal data is binary, and is closed
    before returning."""
    stream = open(path, 'rb')
    try:
        return marshal.load(stream)
    finally:
        stream.close()
623 | |
---|
def dump(path, data):
    """Serialize the given data and write it to a file at the given path.

    The data is written with marshal, so it must consist of types marshal
    supports.  The file is opened in binary mode, because marshal data is
    binary, and is closed even if serialization fails."""
    stream = open(path, 'wb')
    try:
        marshal.dump(data, stream)
    finally:
        stream.close()
629 | |
---|
def getnumber(text):
    """Find and parse a floating-point or integer number in the given text,
    ignoring commas, percentage signs, and non-numeric words.

    Tags are stripped from the text first.  The first word that parses as an
    int is returned as an int, otherwise as a float.  Returns None if no
    word is a number."""
    for word in striptags(text).replace(',', '').replace('%', '').split():
        # float() also accepts words such as 'nan' and 'inf'; a number must
        # contain at least one digit.
        if not re.search(r'\d', word):
            continue
        try:
            return int(word)
        except ValueError:
            pass
        try:
            return float(word)
        except ValueError:
            continue