1 | """Python module for web browsing and scraping.
2 |
3 | Done:
4 | - navigate to absolute and relative URLs
5 | - follow links in page or region
6 | - find first or all occurrences of string or RE in page or region
7 | - find first, last, next, previous, or all tags with given name/attributes
8 | - find first, last, next, previous, enclosing, or all elements with given
9 | name/attributes/content
10 | - set form fields
11 | - submit forms
12 | - strip tags from arbitrary strings of HTML
13 |
14 | Todo:
15 | - cookie-handling is dumb (sends all cookies to all sites)
16 | - handle CDATA and RCDATA marked sections
17 | - support for submitting forms with file upload
18 | - use Regions in striptags instead of duplicating work
19 | - map of enders
20 | """
21 |
# Module metadata: original author, release date, and the revision keyword
# string ('$Revision: ... $') recorded for this file.
__author__ = 'Ka-Ping Yee'
__date__ = '2005-03-29'
__version__ = '$Revision: 1.16 $'
25 |
26 | import os, socket, re, marshal, subprocess
27 | from tempfile import gettempdir
28 | from urlparse import urljoin, urlsplit
29 | from urllib import urlencode
30 |
def connect(server, port):
    """Open an IPv4 TCP stream connection to (server, port).

    Returns the connected socket object; any socket error raised while
    connecting propagates to the caller.
    """
    address = (server, port)
    connection = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    connection.connect(address)
    return connection
36 |
def receive(sock):
    """Collect everything the peer sends until it closes the connection.

    Data is read with sock.recv(4096); an empty result marks end of stream.
    All pieces received are returned joined into a single string.
    """
    pieces = []
    piece = sock.recv(4096)
    while piece:
        pieces.append(piece)
        piece = sock.recv(4096)
    return ''.join(pieces)
44 |
def request(host, method, path, headers, entity=None):
    """Make an HTTP/1.0 request and return (status, message, headers, document).

    host    - server name, optionally followed by ':port' (default port 80)
    method  - request method, e.g. 'GET' or 'POST'
    path    - request target placed on the request line
    headers - dictionary of header names to values; names are re-capitalized
              part by part ('user-agent' is sent as 'User-Agent')
    entity  - optional request body appended after the blank line

    If the reply cannot be parsed by splitreply(), the raw data is returned
    as (0, '', {}, data) instead of raising.
    """
    # Connect to an explicit port when the host carries one; the Host header
    # supplied by the caller is left untouched.
    server, port = host, 80
    if ':' in host:
        name, number = host.rsplit(':', 1)
        if number.isdigit():
            server, port = name, int(number)

    lines = [method + ' ' + path + ' HTTP/1.0']
    for name in headers:
        capname = '-'.join([part.capitalize() for part in name.split('-')])
        lines.append(capname + ': ' + str(headers[name]))
    message = '\r\n'.join(lines) + '\r\n\r\n'
    if entity:
        message += entity

    sock = connect(server, port)
    try:
        sock.sendall(message)
        data = receive(sock)
    finally:
        # Always release the socket, even if sending or receiving fails.
        sock.close()

    try:
        return splitreply(data)
    except (ValueError, IndexError):
        # Malformed status line or headers: hand back the raw reply.
        return (0, '', {}, data)
59 |
def splitreply(reply):
    """Split an HTTP response into (status, message, headers, document).

    status is the integer status code and message the reason phrase of the
    last response found.  headers is a list of (lowercased name, value)
    pairs.  When the document itself begins with another HTTP response
    (it starts with 'HTTP/1.' and contains a blank line), that response is
    parsed too: its headers are appended to the same list and its status
    and message replace the earlier ones.

    Header lines without a colon are skipped; whitespace around names and
    values is removed, so 'Name:value' and 'Name:' are accepted.

    Raises ValueError or IndexError if a status line is malformed.
    """
    if '\r\n\r\n' in reply:
        head, document = reply.split('\r\n\r\n', 1)
    else:
        head, document = reply, ''
    headers = []
    while True:
        lines = head.split('\r\n')
        response = lines[0]
        for line in lines[1:]:
            fields = line.split(':', 1)
            if len(fields) < 2:
                continue  # not a 'name: value' line
            name, value = fields
            headers.append((name.strip().lower(), value.strip()))
        words = response.split()
        status = int(words[1])
        message = ' '.join(words[2:])
        if document.startswith('HTTP/1.') and '\r\n\r\n' in document:
            # Another response follows; parse it as well.
            head, document = document.split('\r\n\r\n', 1)
        else:
            return status, message, headers, document
81 |
def shellquote(text):
    """Return text wrapped in single quotes as one sh word.

    Each embedded single quote is emitted as '\\'' (close the quoted
    string, add an escaped quote, reopen the quoted string).
    """
    escaped = "'\\''".join(text.split("'"))
    return "'%s'" % escaped
85 |
def curl(url, entity=None, follow=1, cookies=(), referrer=None):
    """Invoke the curl command to perform an HTTP request.

    url      - the URL to fetch
    entity   - optional POST data; anything that is not a str is passed
               through urlencode(entity, doseq=1) first
    follow   - if true, let curl follow redirects (-L)
    cookies  - sequence of 'name=value' strings sent with -b
    referrer - optional Referer URL sent with -e

    Returns whatever splitreply() produces from curl's output, i.e.
    (status, message, headers, document).
    """
    command = ['curl', '-s', '-i']
    if referrer:
        command += ['-e', referrer]
    if entity:
        if not isinstance(entity, str):
            entity = urlencode(entity, doseq=1)
        command += ['-d', entity]
    if follow:
        command.append('-L')
    # curl only follows redirects when -L is given, so nothing is added for
    # follow=0.  (Passing '-Z 0' would not help: in current curl releases
    # -Z is the --parallel switch and '0' would be taken as another URL.)
    if cookies:
        command += ['-b', '; '.join(cookies)]
    command.append(url)
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    # communicate() reads all output and waits for curl to exit, so the
    # pipe is closed and the child process is reaped.
    reply = process.communicate()[0]
    return splitreply(reply)
104 |
def fetch(url, entity=None, follow=1):
    """Fetch a single document using a throwaway Session.

    The arguments are handed to Session.fetch() unchanged and its result
    is returned.
    """
    session = Session()
    return session.fetch(url, entity, follow)
108 |
class ScrapeError(Exception):
    """Base class for the errors raised by this module."""


class HTTPError(ScrapeError):
    """Raised by Session.go() when a request does not end with status 200."""


# Unique sentinel used as the default referrer argument; it means "use the
# session's current location as the referrer".
LAST_URL = object()
112 |
class Session:
    """A Web-browsing session.

    Exposed attributes:

        agent - set or get the User-Agent string
        location - get the current (i.e. last successfully fetched) URL
        status - get the status code of the last successful request
        message - get the status message of the last successful request
        headers - get the dictionary of headers from the last successful request
        document - get the document returned by the last successful request
        region - get a Region spanning the entire document

    Other attributes:

        cookies - list of 'name=value' strings collected from Set-Cookie
                  headers; all of them are sent with every request
        history - list of saved page states used by back()
    """

    def __init__(self, agent=None):
        """Create a session with no cookies, no history and no current page."""
        self.cookies = []
        self.agent = agent
        self.location = self.status = self.message = None
        self.headers = self.document = self.region = None
        self.history = []

    def fetch(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Fetch an absolute URL and return (status, message, headers, document).

        url      - absolute 'http' or 'https' URL
        entity   - optional POST data (a string, or anything urlencode accepts)
        follow   - if true, follow 301 and 302 redirects
        referrer - Referer URL; by default the session's current location

        'http' URLs are requested directly with request(); 'https' URLs are
        delegated to curl(), which does not receive the User-Agent string.
        Cookies from Set-Cookie headers are remembered on the session and
        left out of the returned header dictionary.  self.location is set
        to the URL requested (after redirects, the final one).

        Raises ValueError for any other URL scheme, before the session
        state is touched.
        """
        scheme, host, path, query, fragment = urlsplit(url)
        if scheme not in ('http', 'https'):
            raise ValueError(scheme + ' not supported')
        if referrer is LAST_URL:
            referrer = self.location
        self.location = url
        if scheme == 'https':
            status, message, headers, document = \
                curl(url, entity, follow, self.cookies, referrer)
        else:
            # An empty path (e.g. 'http://host') must be requested as '/'.
            path = path or '/'
            if query:
                path += '?' + query
            reqheaders = {'host': host, 'accept': '*/*'}
            if referrer:
                reqheaders['referer'] = referrer
            if self.agent:
                reqheaders['user-agent'] = self.agent
            if self.cookies:
                reqheaders['cookie'] = '; '.join(self.cookies)
            if entity:
                if not isinstance(entity, str):
                    entity = urlencode(entity, doseq=1)
                reqheaders['content-type'] = 'application/x-www-form-urlencoded'
                reqheaders['content-length'] = len(entity)
            method = entity and 'POST' or 'GET'
            status, message, headers, document = \
                request(host, method, path, reqheaders, entity)

        headerdict = {}
        for name, value in headers:
            if name == 'set-cookie':
                # Keep only the 'name=value' part, dropping cookie attributes.
                cookie = value.split(';')[0]
                if cookie not in self.cookies:
                    self.cookies.append(cookie)
            else:
                headerdict[name] = value
        if follow and status in [301, 302] and 'location' in headerdict:
            return self.fetch(urljoin(url, headerdict['location']))
        return status, message, headerdict, document

    def go(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Navigate to a given URL.  If the URL is relative, it is resolved
        with respect to the current location.  If the document is successfully
        fetched, return a Region spanning the entire document.

        On success the previous page state is pushed onto self.history.
        If the request does not end with status 200, HTTPError(status,
        message) is raised with the values of the failed request and the
        session (including self.location) is left on the previous page.
        """
        historyentry = (self.location, self.status, self.message,
                        self.headers, self.document, self.region)
        if self.location:
            url = urljoin(self.location, url)
        succeeded = False
        try:
            results = self.fetch(url, entity, follow, referrer)
            succeeded = results[0] == 200
        finally:
            if not succeeded:
                # fetch() has already moved self.location; put it back so
                # it keeps naming the last successfully fetched URL.
                self.location = historyentry[0]
        if succeeded:
            self.history.append(historyentry)
            self.status, self.message, self.headers, self.document = results
            self.region = Region(self.document)
            return self.region
        raise HTTPError(results[0], results[1])

    def back(self):
        """Return to the previous page and return its URL.

        Raises IndexError if the history is empty.
        """
        (self.location, self.status, self.message,
         self.headers, self.document, self.region) = self.history.pop()
        return self.location

    def follow(self, anchor, region=None):
        """Follow the first link with the given anchor text.  The anchor may
        be given as a string or a compiled RE.  If a region is given, the
        link is sought within that region instead of the whole document.

        Raises ScrapeError if no such link exists or it has no href.
        """
        link = (region or self.region).first('a', content=anchor)
        if not link:
            raise ScrapeError('link %r not found' % anchor)
        if not link['href']:
            raise ScrapeError('link %r has no href' % link)
        return self.go(link['href'])

    def submit(self, form, button=None, **params):
        """Submit a form, optionally by clicking a given button.

        form   - Region for a <form> element
        button - optional Region for the button whose name/value is sent
        params - extra field values overriding the form's defaults

        The form's method attribute selects POST or GET (GET if absent).
        Raises ScrapeError if form is not a form element or the method is
        neither 'get' nor 'post'.
        """
        if form.tagname != 'form':
            raise ScrapeError('%r is not a form' % form)
        p = form.params
        if button:
            p[button['name']] = button['value']
        p.update(params)
        method = form['method'].lower() or 'get'
        if method == 'post':
            return self.go(form['action'], p)
        elif method == 'get':
            return self.go(form['action'] + '?' + urlencode(p, doseq=1))
        else:
            raise ScrapeError('unknown form method %r' % method)
225 |
# Regex source for everything in a tag after its name: any mix of quoted
# strings, '--...--' comment bodies, lone hyphens, and characters other
# than quotes, '>' and '-'.
tagcontent_re = r'''(('[^']*'|"[^"]*"|--([^-]|-[^-])*--|-(?!-)|[^'">-])*)'''


def tag_re(tagname_re):
    """Build regex source for a complete tag whose name matches tagname_re."""
    return ''.join(['<', tagname_re, tagcontent_re, '>'])


# Matches any tag: group 1 is the tag name ('?', '!...', or an optionally
# '/'-prefixed name) and group 2 is the rest of the tag before '>'.
anytag_re = tag_re(r'(\?|!\w*|/?[a-zA-Z_:][\w:.-]*)')
tagpat = re.compile(anytag_re)
233 |
# Named entities understood by htmldec(), mapped to their replacement text.
_entity_text = {'nbsp': ' ', 'lt': '<', 'gt': '>', 'quot': '"', 'amp': '&'}

# One entity: a decimal or hexadecimal character reference, or a name.
_entitypat = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);')


def _decode_entity(match):
    """Return the text for one entity match, or the entity itself if it is
    unknown or its character code cannot be represented."""
    body = match.group(1)
    if body[0] != '#':
        return _entity_text.get(body, match.group(0))
    try:
        if body[1] in 'xX':
            return chr(int(body[2:], 16))
        return chr(int(body[1:]))
    except (ValueError, OverflowError):
        return match.group(0)


def htmldec(text):
    """Decode HTML entities in the given text.

    Decimal and hexadecimal character references and the named entities
    nbsp, lt, gt, quot and amp are decoded in a single pass, so the result
    of one replacement is never decoded again.  Non-breaking spaces
    (named, numeric or literal '\\xa0') become ordinary spaces.  Anything
    that is not a well-formed, known entity is left unchanged.
    """
    text = _entitypat.sub(_decode_entity, text)
    return text.replace('\xa0', ' ')
248 |
def htmlenc(text):
    """Use HTML entities to encode special characters in the given text.

    The characters '&', '"', '<' and '>' are replaced by their amp, quot,
    lt and gt entities respectively.
    """
    # '&' must be handled first so that the ampersands introduced by the
    # later replacements are not escaped a second time.
    for char, name in (('&', 'amp'), ('"', 'quot'), ('<', 'lt'), ('>', 'gt')):
        text = text.replace(char, '&%s;' % name)
    return text
256 |
def no_groups(re):
    """Return the regex source with capturing groups made non-capturing.

    Every '(' becomes '(?:'; a '(' that already began a '(?...' construct
    is then restored, so those constructs are unchanged.
    """
    non_capturing = re.replace('(', '(?:')
    return non_capturing.replace('(?:?', '(?')
259 |
# Precompiled patterns used when stripping and scanning markup.  The three
# splitters go through no_groups() because re.split() would otherwise
# include the text of every capturing group in its result.
tagsplitter = re.compile(no_groups(anytag_re))  # any tag at all
parasplitter = re.compile(no_groups(tag_re('(p|table|form)')), re.I)  # paragraph-level tags
linesplitter = re.compile(no_groups(tag_re('(div|br|tr)')), re.I)  # line-level tags
scriptpat = re.compile(r'<script\b', re.I)  # beginning of a <script ...> tag
endscriptpat = re.compile(r'</script[^>]*>', re.I)  # a complete </script> end tag
endcommentpat = re.compile(r'--\s*>')  # end of an SGML comment declaration
266 |
def _after_delimiter(parts):
    """Given the result of a split with maxsplit 1, return the text after
    the delimiter, or '' when the delimiter was not found."""
    if len(parts) > 1:
        return parts[1]
    return ''


def striptags(text):
    """Strip HTML tags from the given text, yielding line breaks for DIV,
    BR, or TR tags and blank lines for P, TABLE, or FORM tags.

    Scripts (from '<script' through '</script>') and '<!...>' declarations,
    including comments, are removed entirely.  If such a construct is never
    terminated, everything after its start is discarded.  Entities are
    decoded with htmldec() and runs of whitespace are collapsed.
    """
    # Drop script elements: each chunk after the first starts inside one.
    chunks = scriptpat.split(text)
    for i in range(1, len(chunks)):
        chunks[i] = _after_delimiter(endscriptpat.split(chunks[i], 1))
    text = ''.join(chunks)

    # Drop '<!...>' declarations; those containing '--' before the first
    # '>' are comments and end at '-->' rather than at the first '>'.
    chunks = text.split('<!')
    for i in range(1, len(chunks)):
        if chunks[i].split('>', 1)[0].find('--') >= 0:
            chunks[i] = _after_delimiter(endcommentpat.split(chunks[i], 1))
        else:
            chunks[i] = _after_delimiter(chunks[i].split('>', 1))
    text = ''.join(chunks)

    paragraphs = []
    for paragraph in parasplitter.split(text):
        lines = []
        for line in linesplitter.split(paragraph):
            line = ''.join(tagsplitter.split(line))
            line = htmldec(line)
            line = ' '.join(line.split())
            lines.append(line)
        paragraphs.append('\n'.join(lines))
    # Collapse runs of blank lines into a single blank line.
    return re.sub('\n\n+', '\n\n', '\n\n'.join(paragraphs)).strip()
292 |
# Matches one attribute: group 1 is the name; group 3, when an '=' part is
# present, is the value, which may be single-quoted, double-quoted or bare
# (quotes included in the group).
attr_re = r'''\s*([\w:.-]+)(\s*=\s*('[^']*'|"[^"]*"|[^\s>]*))?'''
attrpat = re.compile(attr_re)
295 |
def parseattrs(text):
    """Parse a run of name=value pairs into a dictionary of attributes.

    Names are lowercased; values lose their surrounding quotes and are
    passed through htmldec().  An attribute given without a value maps to
    ''.  When a name occurs more than once the last value is kept.
    """
    attrs = {}
    for found in attrpat.finditer(text):
        name = found.group(1)
        value = found.group(3) or ''
        if value[:1] in ["'", '"']:
            value = value[1:-1]
        attrs[name.lower()] = htmldec(value)
    return attrs
309 |
def matchcontent(specimen, desired):
    """Test the specimen string against the desired criterion.

    If desired has a match() method (e.g. a compiled RE), the result of
    desired.match(specimen) is returned; if it is callable, the result of
    desired(specimen); otherwise the result of specimen == desired.
    """
    if hasattr(desired, 'match'):
        return desired.match(specimen)
    if callable(desired):
        return desired(specimen)
    return specimen == desired
317 |
def matchattrs(specimen, desired):
    """Return 1 if the specimen attributes satisfy all desired ones, else 0.

    Keys of desired are turned into attribute names by stripping leading
    and trailing underscores and replacing the remaining underscores with
    hyphens.  Each named attribute must be present in specimen and its
    value must satisfy matchcontent().
    """
    for key in desired:
        attrname = key.strip('_').replace('_', '-')
        if attrname not in specimen:
            return 0
        if not matchcontent(specimen[attrname], desired[key]):
            return 0
    return 1
324 |
class Region:
    """A Region object represents a contiguous region of a document together
    with an associated HTML or XML tag and its attributes.

    Attributes:

        document - the complete document string
        tags - list of [start, end, tagname, attrs] entries for every tag
               in the document, shared by all Regions of that document
        start, end - character span of this Region within the document
        tagname, attrs - name and attribute dictionary of the associated
               tag (None and {} for a plain span)
        starttag, endtag - indices into tags of the associated tags, if any
        tagmin, tagmax - lowest and highest indices of the tags that lie
               entirely within this Region
        content - the text spanned by this Region
        text - the content with tags stripped
        params, buttons - form defaults and submit buttons (forms only)
    """

    def __init__(self, parent, start=0, end=None, starttag=None, endtag=None):
        """Create a Region.  The parent argument is a string or another
        Region.  The start and end arguments, if given, specify non-negative
        indices into the original string (not into a parent subregion).
        If starttag is given, the span is taken from that tag instead; if
        endtag is given as well, the span is the element content between
        the two tags."""
        if isinstance(parent, Region):
            self.document = parent.document
            self.tags = parent.tags
        else:
            self.document = parent
            self.tags = self.scantags(self.document)
        if end is None:
            end = len(self.document)
        self.start, self.end = start, end
        self.tagname, self.attrs = None, {}

        # If only starttag is specified, this Region is a tag.
        # If starttag and endtag are specified, this Region is an element.
        self.starttag, self.endtag = starttag, endtag
        if starttag is not None:
            self.start, self.end, self.tagname, self.attrs = self.tags[starttag]
            if not isinstance(self.attrs, dict):
                # Attributes are parsed lazily; make sure this Region gets
                # a dictionary even if the tag was never matched before.
                self.attrs = self.tags[starttag][3] = parseattrs(self.attrs)
            if endtag is not None:
                self.start, self.end = self.tags[starttag][1], self.tags[endtag][0]

        # Find the minimum and maximum indices of tags within this Region.
        # (Compare with None: tag index 0 is a valid start tag.)
        if starttag is not None and endtag is not None:
            self.tagmin, self.tagmax = starttag + 1, endtag - 1
        else:
            self.tagmin, self.tagmax = len(self.tags), -1
            for i, tag in enumerate(self.tags):
                if tag[0] >= self.start and i < self.tagmin:
                    self.tagmin = i
                if tag[1] <= self.end and i > self.tagmax:
                    self.tagmax = i

    def __repr__(self):
        if self.tagname:
            attrs = ''.join([' %s=%r' % item for item in self.attrs.items()])
            return '<Region %d:%d %s%s>' % (
                self.start, self.end, self.tagname, attrs)
        else:
            return '<Region %d:%d>' % (self.start, self.end)

    # Utilities that operate on the array of scanned tags.
    def scantags(self, document):
        """Generate a list of all the tags in a document.  Each entry is
        [start, end, tagname, attrs] where attrs is still the unparsed
        attribute text.  The body of a script element is skipped; scanning
        stops at a script that is never closed."""
        tags = []
        pos = 0
        while 1:
            match = tagpat.search(document, pos)
            if not match:
                break
            start, end = match.span()
            tagname = match.group(1).lower()
            attrs = match.group(2)
            tags.append([start, end, tagname, attrs])
            if tagname == 'script':
                match = endscriptpat.search(document, end)
                if not match:
                    break
                start, end = match.span()
                tags.append([start, end, '/' + tagname, ''])
            pos = end
        return tags

    def matchtag(self, i, tagname, attrs):
        """Return 1 if the ith tag matches the given tagname and attributes.
        Only start tags can match; otherwise the result is None or 0."""
        itagname, iattrs = self.tags[i][2], self.tags[i][3]
        if itagname[:1] not in ['', '?', '!', '/']:
            if itagname == tagname or tagname is None:
                if not isinstance(iattrs, dict):
                    # Parse the attribute text on first use and cache it.
                    self.tags[i][3] = iattrs = parseattrs(iattrs)
                return matchattrs(iattrs, attrs)

    def findendtag(self, starttag, outside=0):
        """Find the index of the matching end tag for the given start tag.
        If outside is 0, look for the end tag within the current region;
        if outside is 1, look beyond the end of the current region.
        Returns None if there is no such end tag."""
        tagname = self.tags[starttag][2]
        depth = 1
        for i in range(starttag + 1, len(self.tags)):
            if self.tags[i][2] == tagname:
                depth += 1
            if self.tags[i][2] == '/' + tagname:
                depth -= 1
            if depth == 0:
                if not outside and i <= self.tagmax:
                    return i
                if outside and i > self.tagmax:
                    return i
                break

    def matchelement(self, starttag, content=None, outside=0):
        """If the element with the given start tag matches the given content,
        return the index of the matching end tag.  See findendtag() for the
        meaning of the outside flag."""
        endtag = self.findendtag(starttag, outside)
        if endtag is not None:
            if content is None:
                # No content filter: no need to strip the element's text.
                return endtag
            start, end = self.tags[starttag][1], self.tags[endtag][0]
            stripped = striptags(self.document[start:end])
            if matchcontent(stripped, content):
                return endtag

    # Provide the "content" and "text" attributes to access the contents.
    content = property(lambda self: self.document[self.start:self.end])
    text = property(lambda self: striptags(self.content))

    def getparams(self):
        """Get a dictionary of default values for all the form parameters.
        Returns None if this Region is not a form.  Disabled controls and
        controls without a name are left out; an input with no type
        attribute is treated as a text field."""
        if self.tagname == 'form':
            params = {}
            for field in self.alltags('input'):
                if 'disabled' in field or not field['name']:
                    continue
                kind = field['type'].lower() or 'text'
                if kind in ['text', 'password', 'hidden'] or (
                    kind in ['checkbox', 'radio'] and 'checked' in field):
                    params[field['name']] = field['value']
            for select in self.all('select'):
                if 'disabled' in select or not select['name']:
                    continue
                selections = [option['value']
                              for option in select.alltags('option')
                              if 'selected' in option]
                if 'multiple' in select:
                    params[select['name']] = selections
                elif selections:
                    params[select['name']] = selections[0]
            for textarea in self.all('textarea'):
                if 'disabled' in textarea or not textarea['name']:
                    continue
                params[textarea['name']] = textarea.content
            return params

    def getbuttons(self):
        """Get a list of all the form submission buttons.  Returns None if
        this Region is not a form."""
        if self.tagname == 'form':
            return [tag for tag in self.alltags('input')
                        if tag['type'].lower() in ['submit', 'image']
                   ] + [tag for tag in self.alltags('button')
                        if tag['type'].lower() in ['submit', '']]

    params = property(getparams)
    buttons = property(getbuttons)

    # Provide a dictionary-like interface to the tag attributes.
    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        """Return the named attribute ('' if absent).  A slice returns the
        corresponding subregion (see __getslice__)."""
        if isinstance(name, slice):
            return self.__getslice__(name.start, name.stop)
        return self.attrs.get(name, '')

    # Provide subregions by slicing.
    def __getslice__(self, start, end):
        """Return the subregion [start:end], with indices relative to this
        Region.  Negative indices count from the end of the Region, omitted
        bounds default to its edges, and the result never extends beyond
        this Region."""
        length = self.end - self.start
        if start is None:
            start = 0
        if end is None:
            end = length
        if start < 0:
            start += length
        if end < 0:
            end += length
        # Clamp like ordinary slicing; this also copes with the huge end
        # index passed for an omitted upper bound.
        start = min(max(start, 0), length)
        end = min(max(end, start), length)
        return Region(self, self.start + start, self.start + end)

    # Search for text.
    def find(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        Region representing the match.  The optional group argument specifies
        which grouped subexpression should be returned as the match.
        Returns None if nothing matches."""
        if hasattr(target, 'search'):
            match = target.search(self.content)
            if match:
                return self[match.start(group):match.end(group)]
        else:
            start = self.content.find(target)
            if start > -1:
                return self[start:start+len(target)]

    def findall(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        sequence of Regions representing all the matches."""
        pos = 0
        content = self.content
        matches = []
        if hasattr(target, 'search'):
            while 1:
                match = target.search(content, pos)
                if not match:
                    break
                start, pos = match.span(group)
                matches.append(self[start:pos])
        else:
            while 1:
                start = content.find(target, pos)
                if start < 0:
                    break
                pos = start + len(target)
                matches.append(self[start:pos])
        return matches

    # Search for tags.
    def firsttag(self, tagname=None, **attrs):
        """Return the Region for the first tag entirely within this Region
        with the given tag name and attributes."""
        for i in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)

    def lasttag(self, tagname=None, **attrs):
        """Return the Region for the last tag entirely within this Region
        with the given tag name and attributes."""
        for i in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)

    def alltags(self, tagname=None, **attrs):
        """Return a list of Regions for all the tags entirely within this
        Region with the given tag name and attributes."""
        tags = []
        for i in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(i, tagname, attrs):
                tags.append(Region(self, 0, 0, i))
        return tags

    def nexttag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag after the end of this Region
        with the given tag name and attributes."""
        return Region(self, self.end).firsttag(tagname, **attrs)

    def previoustag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag before the start of this
        Region with the given tag name and attributes."""
        return Region(self, 0, self.start).lasttag(tagname, **attrs)

    # Search for elements.
    def first(self, tagname=None, content=None, **attrs):
        """Return the Region for the first properly balanced element entirely
        within this Region with the given tag name, content, and attributes.
        The element content is passed through striptags().  If the content
        argument has a match() method, the stripped content is passed into
        this method; otherwise it is compared directly as a string."""
        for starttag in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)

    def last(self, tagname=None, content=None, **attrs):
        """Return the Region for the last properly balanced element entirely
        within this Region with the given tag name, content, and attributes."""
        for starttag in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)

    def all(self, tagname=None, content=None, **attrs):
        """Return Regions for all non-overlapping balanced elements entirely
        within this Region with the given tag name, content, and attributes."""
        elements = []
        starttag = self.tagmin
        while starttag <= self.tagmax:
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    elements.append(Region(self, 0, 0, starttag, endtag))
                    # Continue after this element so results do not overlap.
                    starttag = endtag
            starttag += 1
        return elements

    def next(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element after the end of
        this Region with the given tag name, content, and attributes."""
        return Region(self, self.end).first(tagname, content, **attrs)

    def previous(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element before the start
        of this Region with the given tag name, content, and attributes."""
        return Region(self, 0, self.start).last(tagname, content, **attrs)

    def enclosing(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element that encloses
        this Region with the given tag name, content, and attributes."""
        if self.starttag is not None and self.endtag is not None:
            # This Region is an element: skip our own start tag (which may
            # be tag number 0).
            laststarttag = self.starttag - 1
        else:
            laststarttag = self.tagmin - 1
        for starttag in range(laststarttag, -1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content, outside=1)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)
609 |
def read(path):
    """Read and return the entire contents of the file at the given path.

    The file is closed before returning, even if reading fails.
    """
    handle = open(path)
    try:
        return handle.read()
    finally:
        handle.close()
613 |
def write(path, text):
    """Write the given text to a file at the given path.

    An existing file is overwritten.  The file is closed even if writing
    fails.
    """
    handle = open(path, 'w')
    try:
        handle.write(text)
    finally:
        handle.close()
619 |
def load(path):
    """Return the deserialized contents of the file at the given path.

    The file must contain data written by marshal (see dump()).  It is
    opened in binary mode, because marshal data is binary, and is closed
    before returning.
    """
    handle = open(path, 'rb')
    try:
        return marshal.load(handle)
    finally:
        handle.close()
623 |
def dump(path, data):
    """Serialize the given data and write it to a file at the given path.

    The data is written with marshal, so it must consist of types that
    marshal supports.  The file is opened in binary mode, because marshal
    data is binary, and is closed even if serialization fails.
    """
    handle = open(path, 'wb')
    try:
        marshal.dump(data, handle)
    finally:
        handle.close()
629 |
def getnumber(text):
    """Find and parse a floating-point or integer number in the given text,
    ignoring commas, percentage signs, and non-numeric words.

    Tags are stripped first.  The first whitespace-separated word that
    parses as an int, or failing that as a float, is returned.  Returns
    None if no word is numeric.
    """
    for word in striptags(text).replace(',', '').replace('%', '').split():
        for convert in (int, float):
            try:
                return convert(word)
            except ValueError:
                # Not a number of this kind; try the next conversion or word.
                pass