source: trunk/StreamVision/scrape.py@ 527

Last change on this file since 527 was 188, checked in by Nicholas Riley, 19 years ago

StreamVision

File size: 25.0 KB
RevLine 
[188]1"""Python module for web browsing and scraping.
2
3Done:
4 - navigate to absolute and relative URLs
5 - follow links in page or region
6 - find first or all occurrences of string or RE in page or region
7 - find first, last, next, previous, or all tags with given name/attributes
8 - find first, last, next, previous, enclosing, or all elements with given
9 name/attributes/content
10 - set form fields
11 - submit forms
12 - strip tags from arbitrary strings of HTML
13
14Todo:
15 - cookie-handling is dumb (sends all cookies to all sites)
16 - handle CDATA and RCDATA marked sections
17 - support for submitting forms with file upload
18 - use Regions in striptags instead of duplicating work
19 - map of enders
20"""
21
22__author__ = 'Ka-Ping Yee'
23__date__ = '2005-03-29'
24__version__ = '$Revision: 1.16 $'
25
26import os, socket, re, marshal, subprocess
27from tempfile import gettempdir
28from urlparse import urljoin, urlsplit
29from urllib import urlencode
30
def connect(server, port):
    """Return a TCP socket connected to the given server and port.

    If the connection attempt fails, the socket is closed before the
    exception propagates, so a failed connect does not leak a descriptor.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect((server, port))
    except Exception:
        sock.close()
        raise
    return sock
36
def receive(sock):
    """Collect everything the peer sends until it closes the connection.

    Data is read in chunks of up to 4096 bytes; an empty chunk from
    recv() marks the end of the stream.  The chunks are returned joined
    into a single string.
    """
    received = []
    while True:
        data = sock.recv(4096)
        if not data:
            break
        received.append(data)
    return ''.join(received)
44
def request(host, method, path, headers, entity=None):
    """Make an HTTP/1.0 request and return (status, message, headers, document).

    host    - server name, optionally followed by ':port'; port 80 is used
              when no numeric port is given
    method  - request method, e.g. 'GET' or 'POST'
    path    - request target, including any query string
    headers - dictionary of header names to values; each name is sent with
              every dash-separated part capitalized
    entity  - optional request body, sent verbatim after the headers

    The connection is always closed once the reply has been read.  If the
    reply cannot be parsed as an HTTP response, (0, '', {}, data) is
    returned, where data is the raw reply.
    """
    # Accept "name:port" so URLs with an explicit port reach the right port.
    server, port = host, 80
    if ':' in host:
        hostname, portstr = host.rsplit(':', 1)
        if portstr.isdigit():
            server, port = hostname, int(portstr)

    lines = [method + ' ' + path + ' HTTP/1.0']
    for name in headers:
        capname = '-'.join([part.capitalize() for part in name.split('-')])
        lines.append(capname + ': ' + str(headers[name]))
    message = '\r\n'.join(lines) + '\r\n\r\n'
    if entity:
        message += entity

    sock = connect(server, port)
    try:
        sock.sendall(message)
        data = receive(sock)
    finally:
        sock.close()

    # Only parse failures fall back to the raw reply; anything else is a
    # real error and should not be hidden.
    try:
        return splitreply(data)
    except (ValueError, IndexError):
        return (0, '', {}, data)
59
def splitreply(reply):
    """Split an HTTP response into (status, message, headers, document).

    headers is a list of (lowercased name, value) pairs.  When the reply
    contains several responses in a row (the document itself starts with
    'HTTP/1.' and contains a blank line), each one is unwrapped in turn:
    the headers of all of them are accumulated, and the status, message,
    and document come from the last one.

    Header lines are split at the first colon and stripped of surrounding
    whitespace, so 'Name:value' and 'Name:' are accepted.  Lines without a
    colon are ignored.

    Raises IndexError or ValueError if a status line has no numeric status
    code.
    """
    if '\r\n\r\n' in reply:
        head, document = reply.split('\r\n\r\n', 1)
    else:
        head, document = reply, ''
    headers = []
    while True:
        lines = head.split('\r\n')
        response = lines[0]
        for line in lines[1:]:
            if ':' not in line:
                continue  # blank or malformed header line
            name, value = line.split(':', 1)
            headers.append((name.strip().lower(), value.strip()))
        words = response.split()
        status = int(words[1])
        message = ' '.join(words[2:])
        if document.startswith('HTTP/1.') and '\r\n\r\n' in document:
            # Another response follows; peel it off and go around again.
            head, document = document.split('\r\n\r\n', 1)
        else:
            return status, message, headers, document
81
def shellquote(text):
    """Wrap text in single quotes so that sh reads it as one literal word."""
    # A single quote cannot appear inside a single-quoted sh string, so each
    # one closes the string, adds a backslash-escaped quote, and reopens it.
    escaped = "'\\''".join(text.split("'"))
    return "'%s'" % escaped
85
def curl(url, entity=None, follow=1, cookies=(), referrer=None):
    """Invoke curl to perform an HTTP request.

    url      - the URL to fetch
    entity   - optional request body; anything other than a str is
               form-encoded with urlencode(entity, doseq=1) and sent with -d
    follow   - if true, let curl follow redirects (-L)
    cookies  - sequence of 'name=value' strings sent with -b
    referrer - optional Referer URL sent with -e

    Returns (status, message, headers, document) as produced by
    splitreply() from curl's output, which includes the response headers
    (-i).
    """
    command = ['curl', '-s', '-i']
    if referrer:
        command += ['-e', referrer]
    if entity:
        if not isinstance(entity, str):
            entity = urlencode(entity, doseq=1)
        command += ['-d', entity]
    if follow:
        command += ['-L']
    else:
        # Use the long option: the short -Z flag means --parallel in
        # current curl releases, which would turn the '0' into a URL.
        command += ['--max-redirs', '0']
    if cookies:
        command += ['-b', '; '.join(cookies)]
    command.append(url)
    # communicate() reads all output and waits for curl to exit, so the
    # pipe is closed and the child process is reaped.
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    reply = process.communicate()[0]
    return splitreply(reply)
104
def fetch(url, entity=None, follow=1):
    """Fetch one document using a throwaway Session.

    Returns whatever Session.fetch() returns for the same arguments; the
    session is discarded afterwards.
    """
    session = Session()
    return session.fetch(url, entity, follow)
108
class ScrapeError(Exception):
    """Base class for the errors raised by this module."""


class HTTPError(ScrapeError):
    """Raised by Session.go() when a request does not come back with 200."""


# Unique sentinel used as the default "referrer" argument, meaning "use the
# session's current location"; None then remains free to mean "no referrer".
LAST_URL = object()
112
class Session:
    """A Web-browsing session.

    Exposed attributes:

        agent    - set or get the User-Agent string
        location - get the current (i.e. last successfully fetched) URL
        status   - get the status code of the last successful request
        message  - get the status message of the last successful request
        headers  - get the dictionary of headers from the last successful request
        document - get the document returned by the last successful request
        region   - get a Region spanning the entire document
    """

    def __init__(self, agent=None):
        """Start a session with no cookies, no history, and no location."""
        self.cookies = []
        self.agent = agent
        self.location = self.status = self.message = None
        self.headers = self.document = self.region = None
        self.history = []

    def fetch(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Fetch an absolute URL and return (status, message, headers,
        document), where headers is a dictionary keyed by lowercased name.

        https URLs are fetched with curl(); http URLs with request().  If
        entity is given the request is a POST; a non-string entity is
        form-encoded.  referrer defaults to the session's current location.
        Cookies from Set-Cookie headers are remembered (name=value part
        only) and sent with every later request.  If follow is true, 301
        and 302 responses with a Location header are followed.

        This sets self.location to the URL being fetched but does not touch
        status, message, headers, document, or region; see go() for that.

        Raises ValueError for any scheme other than http or https.
        """
        scheme, host, path, query, fragment = urlsplit(url)
        # Reject unsupported schemes before touching any session state.
        if scheme not in ('http', 'https'):
            raise ValueError(scheme + ' not supported')
        if referrer is LAST_URL:
            referrer = self.location
        self.location = url
        if scheme == 'https':
            status, message, headers, document = \
                curl(url, entity, follow, self.cookies, referrer)
        else:
            # A URL such as 'http://host' has an empty path, which would
            # produce an invalid request line.
            path = path or '/'
            if query:
                path += '?' + query
            headers = {}
            headers['host'] = host
            headers['accept'] = '*/*'
            if referrer:
                headers['referer'] = referrer
            if self.agent:
                headers['user-agent'] = self.agent
            if self.cookies:
                headers['cookie'] = '; '.join(self.cookies)
            if entity:
                if not isinstance(entity, str):
                    entity = urlencode(entity, doseq=1)
                headers['content-type'] = 'application/x-www-form-urlencoded'
                headers['content-length'] = len(entity)
            method = entity and 'POST' or 'GET'
            status, message, headers, document = \
                request(host, method, path, headers, entity)
        headerdict = {}
        for name, value in headers:
            if name == 'set-cookie':
                cookie = value.split(';')[0]
                if cookie not in self.cookies:
                    self.cookies.append(cookie)
            else:
                headerdict[name] = value
        if follow and status in [301, 302] and 'location' in headerdict:
            return self.fetch(urljoin(url, headerdict['location']))
        return status, message, headerdict, document

    def go(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Navigate to a given URL.  If the URL is relative, it is resolved
        with respect to the current location.  If the document is successfully
        fetched, return a Region spanning the entire document.

        On any failure the previous location is restored.  A response with
        a status other than 200 raises HTTPError(status, message) carrying
        the status and message of the failed response.
        """
        historyentry = (self.location, self.status, self.message,
                        self.headers, self.document, self.region)
        if self.location:
            url = urljoin(self.location, url)
        try:
            results = self.fetch(url, entity, follow, referrer)
        except Exception:
            # fetch() has already overwritten the location; undo that so
            # it keeps naming the last page that was actually loaded.
            self.location = historyentry[0]
            raise
        if results[0] == 200:
            self.history.append(historyentry)
            self.status, self.message, self.headers, self.document = results
            self.region = Region(self.document)
            return self.region
        self.location = historyentry[0]
        raise HTTPError(results[0], results[1])

    def back(self):
        """Return to the previous page and return its URL.

        Raises IndexError if there is no earlier page in the history.
        """
        (self.location, self.status, self.message,
         self.headers, self.document, self.region) = self.history.pop()
        return self.location

    def follow(self, anchor, region=None):
        """Follow the first link with the given anchor text.  The anchor may
        be given as a string or a compiled RE.  If a region is given, the
        link is sought within that region instead of the whole document.

        Raises ScrapeError if no such link exists or it has no href.
        """
        link = (region or self.region).first('a', content=anchor)
        if link is None:
            raise ScrapeError('link %r not found' % (anchor,))
        if not link['href']:
            raise ScrapeError('link %r has no href' % (link,))
        return self.go(link['href'])

    def submit(self, form, button=None, **params):
        """Submit a form, optionally by clicking a given button.

        The form's default parameters are used, overridden first by the
        button's name/value pair and then by any keyword arguments.  The
        form's method decides between a POST and a GET with a query string;
        a form without a method is submitted with GET.

        Raises ScrapeError if form is not a form element or its method is
        neither 'get' nor 'post'.
        """
        if form.tagname != 'form':
            raise ScrapeError('%r is not a form' % (form,))
        p = form.params
        if button:
            p[button['name']] = button['value']
        p.update(params)
        method = form['method'].lower() or 'get'
        if method == 'post':
            return self.go(form['action'], p)
        elif method == 'get':
            return self.go(form['action'] + '?' + urlencode(p, doseq=1))
        else:
            raise ScrapeError('unknown form method %r' % method)
225
# Everything between a tag's name and its closing '>': any mix of quoted
# strings, '--...--' comment bodies, lone hyphens, and other characters
# that are not a quote, '>', or '-'.
tagcontent_re = r'''(('[^']*'|"[^"]*"|--([^-]|-[^-])*--|-(?!-)|[^'">-])*)'''


def tag_re(tagname_re):
    """Return the source of an RE that matches one whole tag whose name
    matches the given RE source."""
    return ''.join(['<', tagname_re, tagcontent_re, '>'])


# A tag name is '?', '!' plus optional word characters, or an optionally
# slash-prefixed element name.
anytag_re = tag_re(r'(\?|!\w*|/?[a-zA-Z_:][\w:.-]*)')
tagpat = re.compile(anytag_re)
233
# The character references htmldec() understands: decimal and hexadecimal
# numeric references and a handful of named entities.
_entitypat = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|nbsp|lt|gt|quot|amp);')
_namedentities = {'nbsp': ' ', 'lt': '<', 'gt': '>', 'quot': '"', 'amp': '&'}


def _decodeentity(match):
    """Return the replacement text for one match of _entitypat."""
    name = match.group(1)
    if not name.startswith('#'):
        return _namedentities[name]
    if name[1] in 'xX':
        code = int(name[2:], 16)
    else:
        code = int(name[1:])
    try:
        return chr(code)
    except (ValueError, OverflowError):
        # Not representable by chr() here; leave the reference untouched.
        return match.group(0)


def htmldec(text):
    """Decode HTML entities in the given text.

    Handles decimal (&#65;) and hexadecimal (&#x41;) character references
    and the named entities &nbsp; &lt; &gt; &quot; and &amp;.  Non-breaking
    spaces ('\\xa0', whether literal or decoded) become ordinary spaces.

    All references are decoded in a single pass, so the output of one
    replacement is never decoded again ('&#38;lt;' yields '&lt;').  Anything
    that is not a well-formed reference, such as a bare '&#', is left as is.
    """
    return _entitypat.sub(_decodeentity, text).replace('\xa0', ' ')
248
def htmlenc(text):
    """Escape the characters & " < and > in text as HTML entities."""
    # '&' goes first so that the ampersands introduced by the later
    # replacements are not escaped a second time.
    for char, entity in (('&', '&amp;'), ('"', '&quot;'),
                         ('<', '&lt;'), ('>', '&gt;')):
        text = text.replace(char, entity)
    return text
256
def no_groups(re):
    """Return the given RE source with every capturing group made
    non-capturing.

    This is a plain textual substitution on '(' characters; it does not
    parse the RE.
    """
    noncapturing = re.replace('(', '(?:')
    # Extensions such as '(?!' were turned into '(?:?!' above; undo that.
    return noncapturing.replace('(?:?', '(?')
259
# Splitters used by striptags().  They are built from group-free versions of
# the tag patterns so that re.split() returns only the text between tags.
tagsplitter = re.compile(no_groups(anytag_re))
parasplitter = re.compile(no_groups(tag_re('(p|table|form)')), re.IGNORECASE)
linesplitter = re.compile(no_groups(tag_re('(div|br|tr)')), re.IGNORECASE)

# Delimiters of the spans whose contents are dropped wholesale: script
# elements and '<!-- ... -->' comments.
scriptpat = re.compile(r'<script\b', re.IGNORECASE)
endscriptpat = re.compile(r'</script[^>]*>', re.IGNORECASE)
endcommentpat = re.compile(r'--\s*>')
266
def _aftersplit(parts):
    """Return the text following a split delimiter, or '' if the delimiter
    was not found (i.e. the split produced a single part)."""
    if len(parts) > 1:
        return parts[1]
    return ''


def striptags(text):
    """Strip HTML tags from the given text, yielding line breaks for DIV,
    BR, or TR tags and blank lines for P, TABLE, or FORM tags.

    Script elements, comments, and other '<!...>' declarations are removed
    together with their contents.  One that is never terminated is taken
    to run to the end of the text.  Entities are decoded and runs of
    whitespace collapse to single spaces.
    """
    # Drop everything from each '<script' through its closing tag.
    chunks = scriptpat.split(text)
    for i in range(1, len(chunks)):
        chunks[i] = _aftersplit(endscriptpat.split(chunks[i], 1))
    text = ''.join(chunks)

    # Drop comments (ended by '-->') and other declarations (ended by '>').
    chunks = text.split('<!')
    for i in range(1, len(chunks)):
        if chunks[i].split('>', 1)[0].find('--') >= 0:
            chunks[i] = _aftersplit(endcommentpat.split(chunks[i], 1))
        else:
            chunks[i] = _aftersplit(chunks[i].split('>', 1))
    text = ''.join(chunks)

    paragraphs = []
    for paragraph in parasplitter.split(text):
        lines = []
        for line in linesplitter.split(paragraph):
            line = ''.join(tagsplitter.split(line))
            line = htmldec(line)
            line = ' '.join(line.split())
            lines.append(line)
        paragraphs.append('\n'.join(lines))
    # Collapse runs of blank lines left behind by adjacent block tags.
    return re.sub('\n\n+', '\n\n', '\n\n'.join(paragraphs)).strip()
292
# One attribute: a name, optionally followed by '=' and a value that is
# single-quoted, double-quoted, or a bare run of characters other than
# whitespace and '>'.  Group 1 is the name; group 3 is the raw value,
# still including any quotes.
attr_re = r'''\s*([\w:.-]+)(\s*=\s*('[^']*'|"[^"]*"|[^\s>]*))?'''
attrpat = re.compile(attr_re)
295
def parseattrs(text):
    """Turn a string of name=value pairs into an attribute dictionary.

    Names are lowercased.  Values lose their surrounding quotes and have
    their HTML entities decoded.  An attribute given without a value maps
    to the empty string.  A later occurrence of a name replaces an earlier
    one.
    """
    attrs = {}
    for match in attrpat.finditer(text):
        name = match.group(1)
        value = match.group(3) or ''
        if value[:1] in ("'", '"'):
            value = value[1:-1]
        attrs[name.lower()] = htmldec(value)
    return attrs
309
def matchcontent(specimen, desired):
    """Test a specimen string against a desired criterion.

    If desired has a match() method (e.g. a compiled RE), return the result
    of desired.match(specimen).  Otherwise, if it is callable, return the
    result of desired(specimen).  Otherwise, return whether the two are
    equal.
    """
    if hasattr(desired, 'match'):
        return desired.match(specimen)
    if callable(desired):
        return desired(specimen)
    return specimen == desired
317
def matchattrs(specimen, desired):
    """Return 1 if every desired attribute is present in the specimen
    dictionary with matching content, or 0 otherwise.

    Keys of desired are converted to attribute names by stripping leading
    and trailing underscores and turning the remaining underscores into
    dashes.  Values are compared using matchcontent().
    """
    for key in desired:
        wanted = desired[key]
        attrname = key.strip('_').replace('_', '-')
        if attrname not in specimen:
            return 0
        if not matchcontent(specimen[attrname], wanted):
            return 0
    return 1
324
class Region:
    """A Region object represents a contiguous region of a document together
    with an associated HTML or XML tag and its attributes.

    All Regions derived from one document share that document's string and
    its list of scanned tags.  Each entry in the tag list is a list
    [start, end, tagname, attrs]; attrs holds the raw attribute text until
    it is first needed, when it is replaced by a parsed dictionary.
    """

    def __init__(self, parent, start=0, end=None, starttag=None, endtag=None):
        """Create a Region.  The parent argument is a string or another
        Region.  The start and end arguments, if given, specify non-negative
        indices into the original string (not into a parent subregion).

        If starttag (an index into the tag list) is given, the Region
        spans that tag and takes its name and attributes.  If endtag is
        given as well, the Region spans the text between the two tags."""
        if isinstance(parent, Region):
            self.document = parent.document
            self.tags = parent.tags
        else:
            self.document = parent
            self.tags = self.scantags(self.document)
        if end is None:
            end = len(self.document)
        self.start, self.end = start, end
        self.tagname, self.attrs = None, {}

        # If only starttag is specified, this Region is a tag.
        # If starttag and endtag are specified, this Region is an element.
        self.starttag, self.endtag = starttag, endtag
        if starttag is not None:
            self.start, self.end, self.tagname = self.tags[starttag][:3]
            self.attrs = self._parsedattrs(starttag)
            if endtag is not None:
                self.start, self.end = self.tags[starttag][1], self.tags[endtag][0]

        # Find the minimum and maximum indices of tags within this Region.
        # Compare against None: tag index 0 is a perfectly valid start tag.
        if starttag is not None and endtag is not None:
            self.tagmin, self.tagmax = starttag + 1, endtag - 1
        else:
            self.tagmin, self.tagmax = len(self.tags), -1
            for i, (tagstart, tagend, tagname, attrs) in enumerate(self.tags):
                if tagstart >= self.start and i < self.tagmin:
                    self.tagmin = i
                if tagend <= self.end and i > self.tagmax:
                    self.tagmax = i

    def __repr__(self):
        if self.tagname:
            attrs = ''.join([' %s=%r' % item for item in self.attrs.items()])
            return '<Region %d:%d %s%s>' % (
                self.start, self.end, self.tagname, attrs)
        else:
            return '<Region %d:%d>' % (self.start, self.end)

    # Utilities that operate on the array of scanned tags.
    def scantags(self, document):
        """Generate a list of all the tags in a document.

        Each entry is [start, end, lowercased tag name, raw attribute
        text].  Nothing between a script start tag and its end tag is
        scanned; if the end tag is missing, scanning stops there."""
        tags = []
        pos = 0
        while 1:
            match = tagpat.search(document, pos)
            if not match: break
            start, end = match.span()
            tagname = match.group(1).lower()
            attrs = match.group(2)
            tags.append([start, end, tagname, attrs])
            if tagname == 'script':
                match = endscriptpat.search(document, end)
                if not match: break
                start, end = match.span()
                tags.append([start, end, '/' + tagname, ''])
            pos = end
        return tags

    def _parsedattrs(self, i):
        """Return the attribute dictionary of the ith tag, parsing the raw
        attribute text and caching the result on first use."""
        attrs = self.tags[i][3]
        # Test for "not yet a dict" rather than "is a str" so that unicode
        # documents get parsed too.
        if not isinstance(attrs, dict):
            self.tags[i][3] = attrs = parseattrs(attrs)
        return attrs

    def matchtag(self, i, tagname, attrs):
        """Return 1 if the ith tag matches the given tagname and attributes.

        Only start tags can match; end tags, comments, declarations, and
        processing instructions never do.  A tagname of None matches any
        start tag.  Returns 0 or None when there is no match."""
        itagname = self.tags[i][2]
        if itagname[:1] not in ['', '?', '!', '/']:
            if itagname == tagname or tagname is None:
                return matchattrs(self._parsedattrs(i), attrs)

    def findendtag(self, starttag, outside=0):
        """Find the index of the matching end tag for the given start tag.
        If outside is 0, look for the end tag within the current region;
        if outside is 1, look beyond the end of the current region.
        Returns None if there is no such end tag."""
        tagname = self.tags[starttag][2]
        depth = 1
        for i in range(starttag + 1, len(self.tags)):
            if self.tags[i][2] == tagname:
                depth += 1
            if self.tags[i][2] == '/' + tagname:
                depth -= 1
            if depth == 0:
                if not outside and i <= self.tagmax:
                    return i
                if outside and i > self.tagmax:
                    return i
                break

    def matchelement(self, starttag, content=None, outside=0):
        """If the element with the given start tag matches the given content,
        return the index of the matching end tag.  See findendtag() for the
        meaning of the outside flag.  Returns None otherwise."""
        endtag = self.findendtag(starttag, outside)
        if endtag is not None:
            start, end = self.tags[starttag][1], self.tags[endtag][0]
            stripped = striptags(self.document[start:end])
            if content is None or matchcontent(stripped, content):
                return endtag

    # Provide the "content" and "text" attributes to access the contents.
    content = property(lambda self: self.document[self.start:self.end])
    text = property(lambda self: striptags(self.content))

    def getparams(self):
        """Get a dictionary of default values for all the form parameters.

        Returns None if this Region is not a form element.  Disabled
        controls are skipped.  Checkboxes and radio buttons contribute only
        when checked.  Button-like and file inputs never contribute.  Every
        other input, including one with no type or an unrecognized type,
        is treated as a text field."""
        if self.tagname == 'form':
            params = {}
            for field in self.alltags('input'):
                if 'disabled' not in field:
                    # HTML treats a missing or unknown type as a text field.
                    kind = field['type'].lower() or 'text'
                    if kind in ['checkbox', 'radio']:
                        include = 'checked' in field
                    else:
                        include = kind not in ['submit', 'image', 'reset',
                                               'button', 'file']
                    if include:
                        params[field['name']] = field['value']
            for select in self.all('select'):
                if 'disabled' not in select:
                    selections = [option['value']
                                  for option in select.alltags('option')
                                  if 'selected' in option]
                    if 'multiple' in select:
                        params[select['name']] = selections
                    elif selections:
                        params[select['name']] = selections[0]
            for textarea in self.all('textarea'):
                if 'disabled' not in textarea:
                    params[textarea['name']] = textarea.content
            return params

    def getbuttons(self):
        """Get a list of all the form submission buttons.

        Returns None if this Region is not a form element."""
        if self.tagname == 'form':
            return [tag for tag in self.alltags('input')
                        if tag['type'].lower() in ['submit', 'image']
                   ] + [tag for tag in self.alltags('button')
                            if tag['type'].lower() in ['submit', '']]

    params = property(getparams)
    buttons = property(getbuttons)

    # Provide a dictionary-like interface to the tag attributes.
    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        """Return the value of the named attribute, or '' if it is absent.
        Given a slice instead of a name, return the corresponding
        subregion; stepped slices are not supported."""
        if isinstance(name, slice):
            # Interpreters without __getslice__ route region[a:b] here.
            if name.step not in (None, 1):
                raise ValueError('Region slices do not support a step')
            return self._subregion(name.start, name.stop)
        return self.attrs.get(name, '')

    # Provide subregions by slicing.
    def __getslice__(self, start, end):
        return self._subregion(start, end)

    def _subregion(self, start, end):
        """Return the Region for content[start:end].  Indices are relative
        to this Region; None and negative values work as in ordinary
        slicing, and the result never extends beyond this Region."""
        length = self.end - self.start
        if start is None:
            start = 0
        if end is None:
            end = length
        if start < 0:
            start = max(start + length, 0)
        if end < 0:
            end = max(end + length, 0)
        # Clamp so an open-ended or oversized slice stays inside the Region.
        start, end = min(start, length), min(end, length)
        return Region(self, self.start + start, self.start + end)

    # Search for text.
    def find(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        Region representing the match.  The optional group argument specifies
        which grouped subexpression should be returned as the match.
        Returns None if nothing matches or the group took no part in the
        match."""
        if hasattr(target, 'search'):
            match = target.search(self.content)
            if match and match.start(group) >= 0:
                return self._subregion(match.start(group), match.end(group))
        else:
            start = self.content.find(target)
            if start > -1:
                return self._subregion(start, start + len(target))

    def findall(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        sequence of Regions representing all the matches.

        For an RE, each search resumes at the end of the selected group of
        the previous match."""
        pos = 0
        content = self.content
        matches = []
        if hasattr(target, 'search'):
            while pos <= len(content):
                match = target.search(content, pos)
                if not match:
                    break
                start, end = match.span(group)
                if start >= 0:
                    matches.append(self._subregion(start, end))
                if end > pos:
                    pos = end
                else:
                    # A zero-width or unmatched group made no progress;
                    # step forward so the loop cannot spin forever.
                    pos = max(match.end(), pos + 1)
        else:
            # An empty target matches everywhere; advance by at least one.
            step = max(len(target), 1)
            while 1:
                start = content.find(target, pos)
                if start < 0:
                    break
                pos = start + step
                matches.append(self._subregion(start, start + len(target)))
        return matches

    # Search for tags.
    def firsttag(self, tagname=None, **attrs):
        """Return the Region for the first tag entirely within this Region
        with the given tag name and attributes."""
        for i in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)

    def lasttag(self, tagname=None, **attrs):
        """Return the Region for the last tag entirely within this Region
        with the given tag name and attributes."""
        for i in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)

    def alltags(self, tagname=None, **attrs):
        """Return a list of Regions for all the tags entirely within this
        Region with the given tag name and attributes."""
        tags = []
        for i in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(i, tagname, attrs):
                tags.append(Region(self, 0, 0, i))
        return tags

    def nexttag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag after the end of this Region
        with the given tag name and attributes."""
        return Region(self, self.end).firsttag(tagname, **attrs)

    def previoustag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag before the start of this
        Region with the given tag name and attributes."""
        return Region(self, 0, self.start).lasttag(tagname, **attrs)

    # Search for elements.
    def first(self, tagname=None, content=None, **attrs):
        """Return the Region for the first properly balanced element entirely
        within this Region with the given tag name, content, and attributes.
        The element content is passed through striptags().  If the content
        argument has a match() method, the stripped content is passed into
        this method; otherwise it is compared directly as a string."""
        for starttag in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)

    def last(self, tagname=None, content=None, **attrs):
        """Return the Region for the last properly balanced element entirely
        within this Region with the given tag name, content, and attributes."""
        for starttag in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)

    def all(self, tagname=None, content=None, **attrs):
        """Return Regions for all non-overlapping balanced elements entirely
        within this Region with the given tag name, content, and attributes."""
        elements = []
        starttag = self.tagmin
        while starttag <= self.tagmax:
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    elements.append(Region(self, 0, 0, starttag, endtag))
                    # Resume after this element so nested ones are skipped.
                    starttag = endtag
            starttag += 1
        return elements

    def next(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element after the end of
        this Region with the given tag name, content, and attributes."""
        return Region(self, self.end).first(tagname, content, **attrs)

    def previous(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element before the start
        of this Region with the given tag name, content, and attributes."""
        return Region(self, 0, self.start).last(tagname, content, **attrs)

    def enclosing(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element that encloses
        this Region with the given tag name, content, and attributes."""
        # Compare against None so an element whose start tag has index 0
        # is still recognized as an element and never encloses itself.
        if self.starttag is not None and self.endtag is not None:
            laststarttag = self.starttag - 1  # skip our own start tag
        else:
            laststarttag = self.tagmin - 1
        for starttag in range(laststarttag, -1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content, outside=1)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)
609
def read(path):
    """Read and return the entire contents of the file at the given path.

    The file is closed before returning, even if reading fails.
    """
    stream = open(path)
    try:
        return stream.read()
    finally:
        stream.close()
613
def write(path, text):
    """Write the given text to a file at the given path, replacing any
    existing contents.

    The file is closed even if writing fails.
    """
    stream = open(path, 'w')
    try:
        stream.write(text)
    finally:
        stream.close()
619
def load(path):
    """Return the deserialized contents of the file at the given path.

    The file must hold data written by dump().  It is opened in binary
    mode, because marshal data is binary, and is closed before returning.
    """
    # NOTE(review): marshal is not safe against malformed or malicious
    # data; only load files this program wrote itself.
    stream = open(path, 'rb')
    try:
        return marshal.load(stream)
    finally:
        stream.close()
623
def dump(path, data):
    """Serialize the given data and write it to a file at the given path.

    The data is written with marshal in binary mode, because marshal output
    is binary.  The file is closed even if serialization fails.
    """
    stream = open(path, 'wb')
    try:
        marshal.dump(data, stream)
    finally:
        stream.close()
629
def getnumber(text):
    """Find and parse a floating-point or integer number in the given text,
    ignoring commas, percentage signs, and non-numeric words.

    Tags are stripped first.  The first whitespace-separated word that
    parses as an int, or failing that as a float, is returned.  Returns
    None if no word parses as a number.
    """
    for word in striptags(text).replace(',', '').replace('%', '').split():
        # Only a failed conversion means "not a number"; any other
        # exception is a real error and is allowed to propagate.
        try:
            return int(word)
        except ValueError:
            pass
        try:
            return float(word)
        except ValueError:
            continue
Note: See TracBrowser for help on using the repository browser.