Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/StreamVision/scrape.py@ 510

Last change on this file since 510 was 188, checked in by Nicholas Riley, 19 years ago
StreamVision
File size: 25.0 KB

Line
1	"""Python module for web browsing and scraping.
2
3	Done:
4	- navigate to absolute and relative URLs
5	- follow links in page or region
6	- find first or all occurrences of string or RE in page or region
7	- find first, last, next, previous, or all tags with given name/attributes
8	- find first, last, next, previous, enclosing, or all elements with given
9	name/attributes/content
10	- set form fields
11	- submit forms
12	- strip tags from arbitrary strings of HTML
13
14	Todo:
15	- cookie-handling is dumb (sends all cookies to all sites)
16	- handle CDATA and RCDATA marked sections
17	- support for submitting forms with file upload
18	- use Regions in striptags instead of duplicating work
19	- map of enders
20	"""
21
22	__author__ = 'Ka-Ping Yee'
23	__date__ = '2005-03-29'
24	__version__ = '$Revision: 1.16 $'
25
26	import os, socket, re, marshal, subprocess
27	from tempfile import gettempdir
28	from urlparse import urljoin, urlsplit
29	from urllib import urlencode
30
def connect(server, port):
    """Return a TCP socket connected to the given server and port.

    Any error raised while connecting propagates to the caller; the socket
    is closed first so that a failed attempt does not leak a descriptor.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect((server, port))
    except Exception:
        sock.close()
        raise
    return sock
36
def receive(sock):
    """Read all the data from a socket until it closes.

    Calls sock.recv(4096) repeatedly and returns the received chunks joined
    into one string. The socket itself is left open for the caller to close.
    """
    chunks = []
    while True:
        chunk = sock.recv(4096)
        if not chunk:
            # An empty read means the peer has closed the connection.
            return ''.join(chunks)
        chunks.append(chunk)
44
def request(host, method, path, headers, entity=None):
    """Make an HTTP request and return (status, message, headers, document).

    host    - server name, optionally followed by ':port' (default port 80)
    method  - HTTP method name, e.g. 'GET' or 'POST'
    path    - request path, including any query string
    headers - dictionary mapping header names to values; names are sent
              with each dash-separated part capitalized
    entity  - optional request body, sent verbatim after the headers

    The request is sent as HTTP/1.0 and the reply is parsed by splitreply().
    If the reply cannot be parsed, (0, '', {}, data) is returned, where data
    is the raw reply. The socket is always closed before returning.
    """
    # Honour an explicit numeric port in the host instead of always using 80.
    server, port = host, 80
    colon = host.rfind(':')
    if colon > 0 and host[colon + 1:].isdigit():
        server, port = host[:colon], int(host[colon + 1:])

    lines = [method + ' ' + path + ' HTTP/1.0']
    for name in headers:
        capname = '-'.join([part.capitalize() for part in name.split('-')])
        lines.append(capname + ': ' + str(headers[name]))
    message = '\r\n'.join(lines) + '\r\n\r\n'
    if entity:
        message += entity

    sock = connect(server, port)
    try:
        sock.sendall(message)
        data = receive(sock)
    finally:
        sock.close()

    try:
        return splitreply(data)
    except Exception:
        # Best effort: hand back the unparsed reply with a zero status.
        return (0, '', {}, data)
59
def splitreply(reply):
    """Split an HTTP response into (status, message, headers, document).

    headers is a list of (lowercased name, value) pairs. When the document
    itself starts with another HTTP response head (as happens when several
    replies are concatenated, e.g. after a followed redirect), that head is
    parsed too: its headers are appended to the list and its status and
    message replace the earlier ones.

    Header lines that contain no colon are skipped instead of aborting the
    parse. A reply without a parsable status line still raises IndexError
    or ValueError.
    """
    if '\r\n\r\n' in reply:
        head, document = reply.split('\r\n\r\n', 1)
    else:
        head, document = reply, ''
    headers = []
    while True:
        lines = head.split('\r\n')
        response = lines[0]
        for line in lines[1:]:
            colon = line.find(':')
            if colon < 0:
                continue  # blank or malformed header line
            name, value = line[:colon], line[colon + 1:]
            headers.append((name.strip().lower(), value.strip()))
        fields = response.split()
        status = int(fields[1])
        message = ' '.join(fields[2:])
        if document.startswith('HTTP/1.') and '\r\n\r\n' in document:
            # Another response head follows; parse it as well.
            head, document = document.split('\r\n\r\n', 1)
        else:
            return status, message, headers, document
81
def shellquote(text):
    """Quote a string literal for sh.

    The text is wrapped in single quotes; each embedded single quote is
    written as '\\'' (close the quote, an escaped quote, reopen the quote).
    """
    escaped = text.replace("'", "'\\''")
    return "'" + escaped + "'"
85
def curl(url, entity=None, follow=1, cookies=(), referrer=None):
    """Invoke curl to perform an HTTP request.

    url      - the URL to fetch
    entity   - optional POST data: a string, or anything urlencode() accepts
    follow   - if true, let curl follow redirects (-L)
    cookies  - sequence of 'name=value' strings sent with -b
    referrer - optional Referer URL sent with -e

    Returns the (status, message, headers, document) produced by
    splitreply() from curl's output, which includes the response headers
    (-i).
    """
    command = ['curl', '-s', '-i']
    if referrer:
        command += ['-e', referrer]
    if entity:
        if not isinstance(entity, str):
            entity = urlencode(entity, doseq=1)
        command += ['-d', entity]
    if follow:
        command.append('-L')
    else:
        # Spelled out in full: in current curl releases the short option -Z
        # means --parallel, not a redirect limit.
        command += ['--max-redirs', '0']
    if cookies:
        command += ['-b', '; '.join(cookies)]
    command.append(url)
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    # communicate() reads all output and waits for curl to exit, so the
    # child process is reaped.
    reply = process.communicate()[0]
    return splitreply(reply)
104
def fetch(url, entity=None, follow=1):
    """Fetch one document in a one-shot session.

    Creates a fresh Session and returns the result of its fetch() method:
    (status, message, headers, document).
    """
    session = Session()
    return session.fetch(url, entity, follow)
108
class ScrapeError(Exception):
    """Base class for the errors raised by this module."""


class HTTPError(ScrapeError):
    """Raised by Session.go() when a fetch does not end with status 200."""


# Sentinel default for the referrer argument of Session.fetch() and
# Session.go(), meaning "use the session's current location". None cannot
# serve here because a false referrer already means "send no Referer header".
LAST_URL = object()
112
class Session:
    """A Web-browsing session.

    Exposed attributes:

        agent - set or get the User-Agent string
        location - get the current (i.e. last successfully fetched) URL
        status - get the status code of the last successful request
        message - get the status message of the last successful request
        headers - get the dictionary of headers from the last successful request
        document - get the document returned by the last successful request
        region - get a Region spanning the entire document
    """

    def __init__(self, agent=None):
        """Start a session with no cookies, no history and no current page."""
        self.cookies = []
        self.agent = agent
        self.location = self.status = self.message = None
        self.headers = self.document = self.region = None
        self.history = []

    def _storecookie(self, cookie):
        """Remember a 'name=value' cookie, replacing any older value that
        is stored under the same name."""
        name = cookie.split('=', 1)[0]
        for i, known in enumerate(self.cookies):
            if known.split('=', 1)[0] == name:
                self.cookies[i] = cookie
                return
        self.cookies.append(cookie)

    def fetch(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Fetch an absolute URL and return (status, message, headers,
        document), where headers is a dictionary keyed by lowercased name.

        https URLs are fetched with curl(); http URLs with request(). Any
        other scheme raises ValueError. If entity is given it is sent as
        POST data (urlencoded unless it is already a string). Cookies set
        by the reply are remembered for later requests. If follow is true,
        a 301 or 302 reply with a Location header is followed with a new
        request for that location. self.location is set to the URL being
        fetched before the request is made."""
        scheme, host, path, query, fragment = urlsplit(url)
        if scheme not in ('http', 'https'):
            # Reject unsupported schemes before touching any session state.
            raise ValueError(scheme + ' not supported')
        if referrer is LAST_URL:
            referrer = self.location
        self.location = url

        if scheme == 'https':
            status, message, headers, document = curl(
                url, entity, follow, self.cookies, referrer)
        else:
            # An empty path would produce an invalid request line.
            path = path or '/'
            if query:
                path += '?' + query
            headers = {'host': host, 'accept': '*/*'}
            if referrer:
                headers['referer'] = referrer
            if self.agent:
                headers['user-agent'] = self.agent
            if self.cookies:
                headers['cookie'] = '; '.join(self.cookies)
            method = 'GET'
            if entity:
                if not isinstance(entity, str):
                    entity = urlencode(entity, doseq=1)
                headers['content-type'] = 'application/x-www-form-urlencoded'
                headers['content-length'] = len(entity)
                method = 'POST'
            status, message, headers, document = request(
                host, method, path, headers, entity)

        headerdict = {}
        for name, value in headers:
            if name == 'set-cookie':
                # Keep only the name=value part; cookie attributes such as
                # path and expiry are ignored.
                self._storecookie(value.split(';')[0].strip())
            else:
                headerdict[name] = value
        if follow and status in [301, 302] and 'location' in headerdict:
            return self.fetch(urljoin(url, headerdict['location']))
        return status, message, headerdict, document

    def go(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Navigate to a given URL.  If the URL is relative, it is resolved
        with respect to the current location.  If the document is successfully
        fetched, return a Region spanning the entire document.

        A reply whose status is not 200 raises HTTPError(status, message)
        for that reply. On any failure the session keeps its previous
        location."""
        historyentry = (self.location, self.status, self.message,
                        self.headers, self.document, self.region)
        if self.location:
            url = urljoin(self.location, url)
        try:
            results = self.fetch(url, entity, follow, referrer)
        except Exception:
            # fetch() has already overwritten the location; put it back.
            self.location = historyentry[0]
            raise
        if results[0] != 200:
            self.location = historyentry[0]
            raise HTTPError(results[0], results[1])
        self.history.append(historyentry)
        self.status, self.message, self.headers, self.document = results
        self.region = Region(self.document)
        return self.region

    def back(self):
        """Return to the previous page.

        Restores the state saved by the last successful go() and returns
        the restored location. Raises IndexError if the history is empty."""
        (self.location, self.status, self.message,
         self.headers, self.document, self.region) = self.history.pop()
        return self.location

    def follow(self, anchor, region=None):
        """Follow the first link with the given anchor text.  The anchor may
        be given as a string or a compiled RE.  If a region is given, the
        link is sought within that region instead of the whole document."""
        link = (region or self.region).first('a', content=anchor)
        if not link:
            raise ScrapeError('link %r not found' % anchor)
        if not link['href']:
            raise ScrapeError('link %r has no href' % link)
        return self.go(link['href'])

    def submit(self, form, button=None, **params):
        """Submit a form, optionally by clicking a given button.

        form is a Region for a FORM element. Its default parameters are
        updated with the button's name and value (if a button is given) and
        then with the keyword arguments. The form is sent to its action by
        POST or GET according to its method attribute (default GET); any
        other method raises ScrapeError."""
        if form.tagname != 'form':
            raise ScrapeError('%r is not a form' % form)
        p = form.params
        if button:
            p[button['name']] = button['value']
        p.update(params)
        method = form['method'].lower() or 'get'
        if method == 'post':
            return self.go(form['action'], p)
        if method == 'get':
            return self.go(form['action'] + '?' + urlencode(p, doseq=1))
        raise ScrapeError('unknown form method %r' % method)
225
# Everything between a tag's name and its closing '>': single- or
# double-quoted strings (which may contain '>'), comments delimited by '--',
# lone hyphens, and any other character that is not a quote, '>' or '-'.
tagcontent_re = r'''(('[^']*'|"[^"]*"|--([^-]|-[^-])*--|-(?!-)|[^'">-])*)'''

def tag_re(tagname_re):
    """Return the source of a regular expression matching one whole tag
    whose name matches tagname_re."""
    return '<' + tagname_re + tagcontent_re + '>'

# In tagpat, group 1 is the tag name (with a leading '/' for end tags, or
# starting with '?' or '!' for processing instructions and declarations)
# and group 2 is the raw text between the name and the closing '>'.
anytag_re = tag_re(r'(\?|!\w*|/?[a-zA-Z_:][\w:.-]*)')
tagpat = re.compile(anytag_re)
233
# The character references htmldec() understands: decimal and hexadecimal
# numeric references plus a handful of named entities.
_entitypat = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|nbsp|lt|gt|quot|amp);')
_entitynames = {'nbsp': ' ', 'lt': '<', 'gt': '>', 'quot': '"', 'amp': '&'}

def htmldec(text):
    """Decode HTML entities in the given text.

    Handles &nbsp; &lt; &gt; &quot; &amp; and numeric references such as
    &#65; or &#x41;. Non-breaking spaces become ordinary spaces. Each
    reference is decoded exactly once, and anything that is not a
    well-formed, decodable reference is left untouched.
    """
    def decode(match):
        ref = match.group(1)
        if ref[0] != '#':
            return _entitynames[ref]
        try:
            if ref[1] in 'xX':
                return chr(int(ref[2:], 16))
            return chr(int(ref[1:]))
        except (ValueError, OverflowError):
            # Code point out of range for chr(): keep the reference as is.
            return match.group(0)
    return _entitypat.sub(decode, text).replace('\xa0', ' ')
248
def htmlenc(text):
    """Use HTML entities to encode special characters in the given text.

    Replaces & " < and > with &amp; &quot; &lt; and &gt;. The ampersand is
    handled first so that the entities produced for the other characters
    are not themselves re-encoded.
    """
    for char, entity in [('&', '&amp;'), ('"', '&quot;'),
                         ('<', '&lt;'), ('>', '&gt;')]:
        text = text.replace(char, entity)
    return text
256
def no_groups(re):
    """Return the given regular expression source with every capturing
    group turned into a non-capturing one.

    This is a plain textual substitution: each '(' becomes '(?:', and the
    '(?:?' that this produces from an existing '(?...' construct is
    restored to '(?'. Escaped parentheses are not treated specially.
    """
    # Note: the parameter name hides the re module inside this function.
    ungrouped = re.replace('(', '(?:')
    return ungrouped.replace('(?:?', '(?')
259
# Patterns used by striptags().  The splitters are built without capturing
# groups so that split() returns only the text between tags.
tagsplitter = re.compile(no_groups(anytag_re))
parasplitter = re.compile(no_groups(tag_re('(p|table|form)')), re.I)
linesplitter = re.compile(no_groups(tag_re('(div|br|tr)')), re.I)
scriptpat = re.compile(r'<script\b', re.I)
endscriptpat = re.compile(r'</script[^>]*>', re.I)
endcommentpat = re.compile(r'--\s*>')
266
def _aftersplit(pieces):
    """Return the text after the first split point, or '' if the separator
    was not found (the construct being skipped was never closed)."""
    if len(pieces) > 1:
        return pieces[1]
    return ''

def striptags(text):
    """Strip HTML tags from the given text, yielding line breaks for DIV,
    BR, or TR tags and blank lines for P, TABLE, or FORM tags.

    Scripts, comments and other '<!' declarations are removed entirely; one
    that is never closed swallows the rest of the text. Entities are
    decoded with htmldec(), runs of whitespace within a line collapse to a
    single space, and the result is stripped of surrounding whitespace."""
    # Drop everything from each <script up to its closing </script>.
    chunks = scriptpat.split(text)
    for i in range(1, len(chunks)):
        chunks[i] = _aftersplit(endscriptpat.split(chunks[i], 1))
    text = ''.join(chunks)

    # Drop comments (ended by '-->') and other declarations (ended by '>').
    chunks = text.split('<!')
    for i in range(1, len(chunks)):
        if '--' in chunks[i].split('>', 1)[0]:
            chunks[i] = _aftersplit(endcommentpat.split(chunks[i], 1))
        else:
            chunks[i] = _aftersplit(chunks[i].split('>', 1))
    text = ''.join(chunks)

    paragraphs = []
    for paragraph in parasplitter.split(text):
        lines = []
        for line in linesplitter.split(paragraph):
            line = ''.join(tagsplitter.split(line))
            line = htmldec(line)
            line = ' '.join(line.split())
            lines.append(line)
        paragraphs.append('\n'.join(lines))
    # Never leave more than one blank line between paragraphs.
    return re.sub('\n\n+', '\n\n', '\n\n'.join(paragraphs)).strip()
292
# One attribute: group 1 is the name; group 3 is the value, if any, still
# including its surrounding quotes when it was quoted.
attr_re = r'''\s*([\w:.-]+)(\s*=\s*('[^']*'|"[^"]*"|[^\s>]*))?'''
attrpat = re.compile(attr_re)

def parseattrs(text):
    """Turn a string of name=value pairs into an attribute dictionary.

    Names are lowercased. Values have one pair of matching surrounding
    quotes removed and are decoded with htmldec(); an attribute given
    without a value maps to ''. A later attribute replaces an earlier one
    of the same name."""
    attrs = {}
    for match in attrpat.finditer(text):
        name, value = match.group(1), match.group(3) or ''
        # Only strip quotes that actually come as a matching pair, so an
        # unterminated quote does not cost the value its last character.
        if len(value) >= 2 and value[0] in '\'"' and value[-1] == value[0]:
            value = value[1:-1]
        attrs[name.lower()] = htmldec(value)
    return attrs
309
def matchcontent(specimen, desired):
    """Test a string against a desired value and return the outcome.

    If desired has a match() method (e.g. a compiled RE), the result of
    desired.match(specimen) is returned; if it is any other callable, the
    result of desired(specimen); otherwise the result of specimen == desired.
    """
    if hasattr(desired, 'match'):
        return desired.match(specimen)
    if callable(desired):
        return desired(specimen)
    return specimen == desired
317
def matchattrs(specimen, desired):
    """Return 1 if the attribute dictionary specimen satisfies every entry
    of desired, and 0 otherwise.

    Each desired name has leading and trailing underscores removed and
    inner underscores turned into dashes before the lookup (so class_
    selects 'class' and http_equiv selects 'http-equiv'). The attribute
    must be present and its value must satisfy matchcontent()."""
    for name, value in desired.items():
        name = name.strip('_').replace('_', '-')
        if name not in specimen or not matchcontent(specimen[name], value):
            return 0
    return 1
324
class Region:
    """A Region object represents a contiguous region of a document together
    with an associated HTML or XML tag and its attributes."""

    def __init__(self, parent, start=0, end=None, starttag=None, endtag=None):
        """Create a Region.  The parent argument is a string or another
        Region.  The start and end arguments, if given, specify non-negative
        indices into the original string (not into a parent subregion).

        If starttag (an index into the list of scanned tags) is given, the
        Region takes its extent, tag name and attributes from that tag and
        start/end are ignored; if endtag is given as well, the Region spans
        the content between the two tags."""
        if isinstance(parent, Region):
            # Share the document and its scanned tags with the parent.
            self.document = parent.document
            self.tags = parent.tags
        else:
            self.document = parent
            self.tags = self.scantags(self.document)
        if end is None:
            end = len(self.document)
        self.start, self.end = start, end
        self.tagname, self.attrs = None, {}

        # If only starttag is specified, this Region is a tag.
        # If starttag and endtag are specified, this Region is an element.
        self.starttag, self.endtag = starttag, endtag
        if starttag is not None:
            self.start, self.end, self.tagname, self.attrs = self.tags[starttag]
            if not isinstance(self.attrs, dict):
                # Attributes are parsed lazily; make sure they are a
                # dictionary before anyone looks things up in them.
                self.attrs = self.tags[starttag][3] = parseattrs(self.attrs)
            if endtag is not None:
                self.start, self.end = self.tags[starttag][1], self.tags[endtag][0]

        # Find the minimum and maximum indices of tags within this Region.
        # The tests are against None because 0 is a valid tag index.
        if starttag is not None and endtag is not None:
            self.tagmin, self.tagmax = starttag + 1, endtag - 1
        else:
            self.tagmin, self.tagmax = len(self.tags), -1
            for i, tag in enumerate(self.tags):
                if tag[0] >= self.start and i < self.tagmin:
                    self.tagmin = i
                if tag[1] <= self.end and i > self.tagmax:
                    self.tagmax = i

    def __repr__(self):
        if self.tagname:
            attrs = ''.join([' %s=%r' % item for item in self.attrs.items()])
            return '<Region %d:%d %s%s>' % (
                self.start, self.end, self.tagname, attrs)
        else:
            return '<Region %d:%d>' % (self.start, self.end)

    # Utilities that operate on the array of scanned tags.
    def scantags(self, document):
        """Generate a list of all the tags in a document.

        Each entry is a list [start, end, tagname, attrs] in which tagname
        is lowercased and attrs is the unparsed attribute text. After a
        SCRIPT start tag, scanning resumes at the matching end tag, so the
        script body is never searched for tags; if there is no end tag,
        scanning stops there."""
        tags = []
        pos = 0
        while True:
            match = tagpat.search(document, pos)
            if not match:
                break
            start, end = match.span()
            tagname = match.group(1).lower()
            attrs = match.group(2)
            tags.append([start, end, tagname, attrs])
            if tagname == 'script':
                match = endscriptpat.search(document, end)
                if not match:
                    break
                start, end = match.span()
                tags.append([start, end, '/' + tagname, ''])
            pos = end
        return tags

    def matchtag(self, i, tagname, attrs):
        """Return 1 if the ith tag matches the given tagname and attributes.

        End tags, declarations and processing instructions never match.
        A tagname of None matches any tag name."""
        itagname, iattrs = self.tags[i][2], self.tags[i][3]
        if itagname[:1] not in ['', '?', '!', '/']:
            if itagname == tagname or tagname is None:
                if not isinstance(iattrs, dict):
                    # Parse the attribute text on first use and cache it.
                    self.tags[i][3] = iattrs = parseattrs(iattrs)
                return matchattrs(iattrs, attrs)

    def findendtag(self, starttag, outside=0):
        """Find the index of the matching end tag for the given start tag.
        If outside is 0, look for the end tag within the current region;
        if outside is 1, look beyond the end of the current region.
        Returns None if there is no such end tag."""
        tagname = self.tags[starttag][2]
        depth = 1
        for i in range(starttag + 1, len(self.tags)):
            if self.tags[i][2] == tagname:
                depth += 1
            if self.tags[i][2] == '/' + tagname:
                depth -= 1
            if depth == 0:
                if not outside and i <= self.tagmax:
                    return i
                if outside and i > self.tagmax:
                    return i
                break

    def matchelement(self, starttag, content=None, outside=0):
        """If the element with the given start tag matches the given content,
        return the index of the matching end tag.  See findendtag() for the
        meaning of the outside flag."""
        endtag = self.findendtag(starttag, outside)
        if endtag is not None:
            start, end = self.tags[starttag][1], self.tags[endtag][0]
            stripped = striptags(self.document[start:end])
            if content is None or matchcontent(stripped, content):
                return endtag

    # Provide the "content" and "text" attributes to access the contents.
    content = property(lambda self: self.document[self.start:self.end])
    text = property(lambda self: striptags(self.content))

    def getparams(self):
        """Get a dictionary of default values for all the form parameters.

        Returns None if this Region is not a FORM element. Disabled controls
        are skipped. An INPUT without a type attribute is treated as a text
        field."""
        if self.tagname == 'form':
            params = {}
            for field in self.alltags('input'):
                if 'disabled' not in field:
                    kind = field['type'].lower() or 'text'
                    if kind in ['text', 'password', 'hidden'] or (
                        kind in ['checkbox', 'radio'] and 'checked' in field):
                        params[field['name']] = field['value']
            for select in self.all('select'):
                if 'disabled' not in select:
                    selections = [option['value']
                                  for option in select.alltags('option')
                                  if 'selected' in option]
                    if 'multiple' in select:
                        params[select['name']] = selections
                    elif selections:
                        params[select['name']] = selections[0]
            for textarea in self.all('textarea'):
                if 'disabled' not in textarea:
                    params[textarea['name']] = textarea.content
            return params

    def getbuttons(self):
        """Get a list of all the form submission buttons.

        Returns None if this Region is not a FORM element."""
        if self.tagname == 'form':
            return [tag for tag in self.alltags('input')
                        if tag['type'].lower() in ['submit', 'image']
                   ] + [tag for tag in self.alltags('button')
                            if tag['type'].lower() in ['submit', '']]

    params = property(getparams)
    buttons = property(getbuttons)

    # Provide a dictionary-like interface to the tag attributes.
    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        """Return the value of the named attribute, or '' if it is absent.
        A slice returns the corresponding subregion (the step is ignored)."""
        if isinstance(name, slice):
            # Interpreters without __getslice__ deliver slices here.
            return self._subregion(name.start, name.stop)
        return self.attrs.get(name, '')

    # Provide subregions by slicing.
    def _subregion(self, start, end):
        """Return the Region for offsets start:end relative to this Region.
        Negative offsets count from the end, None means "from the start" or
        "to the end", and both offsets are clamped to this Region."""
        length = self.end - self.start
        if start is None:
            start = 0
        if end is None:
            end = length
        if start < 0:
            start = max(start + length, 0)
        else:
            start = min(start, length)
        if end < 0:
            end = max(end + length, 0)
        else:
            end = min(end, length)
        return Region(self, self.start + start, self.start + end)

    def __getslice__(self, start, end):
        return self._subregion(start, end)

    # Search for text.
    def find(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        Region representing the match.  The optional group argument specifies
        which grouped subexpression should be returned as the match.
        Returns None if nothing is found."""
        if hasattr(target, 'search'):
            match = target.search(self.content)
            if match and match.start(group) >= 0:
                return self._subregion(match.start(group), match.end(group))
        else:
            start = self.content.find(target)
            if start > -1:
                return self._subregion(start, start + len(target))

    def findall(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        sequence of Regions representing all the matches.  An empty target
        string yields no matches."""
        pos = 0
        content = self.content
        matches = []
        if hasattr(target, 'search'):
            while pos <= len(content):
                match = target.search(content, pos)
                if not match:
                    break
                start, end = match.span(group)
                if start >= 0:
                    matches.append(self._subregion(start, end))
                # Resume after the matched group, but always move forward
                # so that an empty match cannot loop forever.
                if end > pos:
                    pos = end
                elif match.end() > pos:
                    pos = match.end()
                else:
                    pos += 1
        elif target:
            while True:
                start = content.find(target, pos)
                if start < 0:
                    break
                pos = start + len(target)
                matches.append(self._subregion(start, pos))
        return matches

    # Search for tags.
    def firsttag(self, tagname=None, **attrs):
        """Return the Region for the first tag entirely within this Region
        with the given tag name and attributes."""
        for i in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)

    def lasttag(self, tagname=None, **attrs):
        """Return the Region for the last tag entirely within this Region
        with the given tag name and attributes."""
        for i in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)

    def alltags(self, tagname=None, **attrs):
        """Return a list of Regions for all the tags entirely within this
        Region with the given tag name and attributes."""
        return [Region(self, 0, 0, i)
                for i in range(self.tagmin, self.tagmax + 1)
                if self.matchtag(i, tagname, attrs)]

    def nexttag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag after the end of this Region
        with the given tag name and attributes."""
        return Region(self, self.end).firsttag(tagname, **attrs)

    def previoustag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag before the start of this
        Region with the given tag name and attributes."""
        return Region(self, 0, self.start).lasttag(tagname, **attrs)

    # Search for elements.
    def first(self, tagname=None, content=None, **attrs):
        """Return the Region for the first properly balanced element entirely
        within this Region with the given tag name, content, and attributes.
        The element content is passed through striptags().  If the content
        argument has a match() method, the stripped content is passed into
        this method; otherwise it is compared directly as a string."""
        for starttag in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)

    def last(self, tagname=None, content=None, **attrs):
        """Return the Region for the last properly balanced element entirely
        within this Region with the given tag name, content, and attributes."""
        for starttag in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)

    def all(self, tagname=None, content=None, **attrs):
        """Return Regions for all non-overlapping balanced elements entirely
        within this Region with the given tag name, content, and attributes."""
        elements = []
        starttag = self.tagmin
        while starttag <= self.tagmax:
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    elements.append(Region(self, 0, 0, starttag, endtag))
                    # Continue after this element so results never overlap.
                    starttag = endtag
            starttag += 1
        return elements

    def next(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element after the end of
        this Region with the given tag name, content, and attributes."""
        return Region(self, self.end).first(tagname, content, **attrs)

    def previous(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element before the start
        of this Region with the given tag name, content, and attributes."""
        return Region(self, 0, self.start).last(tagname, content, **attrs)

    def enclosing(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element that encloses
        this Region with the given tag name, content, and attributes."""
        # Tested against None: tag index 0 is a perfectly good start tag.
        if self.starttag is not None and self.endtag is not None:
            laststarttag = self.starttag - 1  # skip our own start tag
        else:
            laststarttag = self.tagmin - 1
        for starttag in range(laststarttag, -1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content, outside=1)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)
609
def read(path):
    """Read and return the entire contents of the file at the given path.

    The file is closed before returning, even if reading fails."""
    stream = open(path)
    try:
        return stream.read()
    finally:
        stream.close()
613
def write(path, text):
    """Write the given text to a file at the given path.

    Any existing file is replaced. The file is closed even if the write
    fails."""
    stream = open(path, 'w')
    try:
        stream.write(text)
    finally:
        stream.close()
619
def load(path):
    """Return the deserialized contents of the file at the given path.

    The file must hold data in marshal format, as written by dump(). It is
    opened in binary mode and closed before returning."""
    stream = open(path, 'rb')
    try:
        return marshal.load(stream)
    finally:
        stream.close()
623
def dump(path, data):
    """Serialize the given data and write it to a file at the given path.

    The data is written in marshal format, so it must consist of types the
    marshal module supports. The file is opened in binary mode and closed
    even if serialization fails."""
    stream = open(path, 'wb')
    try:
        marshal.dump(data, stream)
    finally:
        stream.close()
629
def getnumber(text):
    """Find and parse a floating-point or integer number in the given text,
    ignoring commas, percentage signs, and non-numeric words.

    Tags are stripped first. The first word that parses as an int or a
    float is returned; None is returned if there is no such word. Words
    without any digit are skipped, so that words float() would accept
    (such as "nan" or "inf") are not mistaken for numbers."""
    cleaned = striptags(text).replace(',', '').replace('%', '')
    for word in cleaned.split():
        if not [char for char in word if char.isdigit()]:
            continue
        try:
            return int(word)
        except ValueError:
            pass
        try:
            return float(word)
        except ValueError:
            continue

Note: See TracBrowser for help on using the repository browser.

Download in other formats: