Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/StreamVision/scrape.py@ 527

Last change on this file since 527 was 188, checked in by Nicholas Riley, 19 years ago
StreamVision
File size: 25.0 KB

Rev	Line
[188]	1	"""Python module for web browsing and scraping.
	2
	3	Done:
	4	- navigate to absolute and relative URLs
	5	- follow links in page or region
	6	- find first or all occurrences of string or RE in page or region
	7	- find first, last, next, previous, or all tags with given name/attributes
	8	- find first, last, next, previous, enclosing, or all elements with given
	9	name/attributes/content
	10	- set form fields
	11	- submit forms
	12	- strip tags from arbitrary strings of HTML
	13
	14	Todo:
	15	- cookie-handling is dumb (sends all cookies to all sites)
	16	- handle CDATA and RCDATA marked sections
	17	- support for submitting forms with file upload
	18	- use Regions in striptags instead of duplicating work
	19	- map of enders
	20	"""
	21
	22	__author__ = 'Ka-Ping Yee'
	23	__date__ = '2005-03-29'
	24	__version__ = '$Revision: 1.16 $'
	25
	26	import os, socket, re, marshal, subprocess
	27	from tempfile import gettempdir
	28	from urlparse import urljoin, urlsplit
	29	from urllib import urlencode
	30
	31	def connect(server, port):
	32	"""Return a TCP socket connected to the given server and port."""
	33	sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
	34	sock.connect((server, port))
	35	return sock
	36
	37	def receive(sock):
	38	"""Read all the data from a socket until it closes."""
	39	chunks = []
	40	while 1:
	41	chunk = sock.recv(4096)
	42	if chunk: chunks.append(chunk)
	43	else: return ''.join(chunks)
	44
	45	def request(host, method, path, headers, entity=None):
	46	"""Make an HTTP request and return (status, message, headers, document)."""
	47	sock = connect(host, 80)
	48	request = method + ' ' + path + ' HTTP/1.0\r\n'
	49	for name in headers:
	50	capname = '-'.join([part.capitalize() for part in name.split('-')])
	51	request += capname + ': ' + str(headers[name]) + '\r\n'
	52	request += '\r\n'
	53	if entity:
	54	request += entity
	55	sock.sendall(request)
	56	data = receive(sock)
	57	try: return splitreply(data)
	58	except: return (0, '', {}, data)
	59
	60	def splitreply(reply):
	61	"""Split an HTTP response into (status, message, headers, document)."""
	62	if '\r\n\r\n' in reply:
	63	head, document = reply.split('\r\n\r\n', 1)
	64	else:
	65	head, document = reply, ''
	66	headers = []
	67	while True:
	68	if '\r\n' in head:
	69	response, head = head.split('\r\n', 1)
	70	for line in head.split('\r\n'):
	71	name, value = line.split(': ', 1)
	72	headers.append((name.lower(), value))
	73	else:
	74	response, head = head, ''
	75	status = int(response.split()[1])
	76	message = ' '.join(response.split()[2:])
	77	if document.startswith('HTTP/1.') and '\r\n\r\n' in document:
	78	head, document = document.split('\r\n\r\n', 1)
	79	else:
	80	return status, message, headers, document
	81
	82	def shellquote(text):
	83	"""Quote a string literal for sh."""
	84	return "'" + text.replace("'", "'\\''") + "'"
	85
	86	def curl(url, entity=None, follow=1, cookies=[], referrer=None):
	87	"""Invoke curl to perform an HTTP request."""
	88	command = ['curl', '-s', '-i']
	89	if referrer:
	90	command += ['-e', referrer]
	91	if entity:
	92	if not isinstance(entity, str):
	93	entity = urlencode(entity, doseq=1)
	94	command += ['-d', entity]
	95	if not follow:
	96	command += ['-Z', '0']
	97	else:
	98	command += ['-L']
	99	if cookies:
	100	command += ['-b', '; '.join(cookies)]
	101	command.append(url)
	102	reply = subprocess.Popen(command, stdout=subprocess.PIPE).stdout.read()
	103	return splitreply(reply)
	104
	105	def fetch(url, entity=None, follow=1):
	106	"""Fetch one document in a one-shot session."""
	107	return Session().fetch(url, entity, follow)
	108
	109	class ScrapeError(Exception): pass
	110	class HTTPError(ScrapeError): pass
	111	LAST_URL = object()
	112
	113	class Session:
	114	"""A Web-browsing session.
	115
	116	Exposed attributes:
	117
	118	agent - set or get the User-Agent string
	119	location - get the current (i.e. last successfully fetched) URL
	120	status - get the status code of the last successful request
	121	message - get the status message of the last successful request
	122	headers - get the dictionary of headers from the last successful request
	123	document - get the document returned by the last successful request
	124	region - get a Region spanning the entire document
	125	"""
	126
	127	def __init__(self, agent=None):
	128	self.cookies = []
	129	self.agent = agent
	130	self.location = self.status = self.message = None
	131	self.headers = self.document = self.region = None
	132	self.history = []
	133
	134	def fetch(self, url, entity=None, follow=1, referrer=LAST_URL):
	135	scheme, host, path, query, fragment = urlsplit(url)
	136	if referrer is LAST_URL:
	137	referrer = self.location
	138	self.location = url
	139	if scheme == 'https':
	140	status, message, headers, document = \
	141	curl(url, entity, follow, self.cookies)
	142	elif scheme == 'http':
	143	if query:
	144	path += '?' + query
	145	headers = {}
	146	headers['host'] = host
	147	headers['accept'] = '/'
	148	if referrer:
	149	headers['referer'] = referrer
	150	self.location = url
	151	if self.agent:
	152	headers['user-agent'] = self.agent
	153	if self.cookies:
	154	headers['cookie'] = '; '.join(self.cookies)
	155	if entity:
	156	if not isinstance(entity, str):
	157	entity = urlencode(entity, doseq=1)
	158	headers['content-type'] = 'application/x-www-form-urlencoded'
	159	headers['content-length'] = len(entity)
	160	method = entity and 'POST' or 'GET'
	161	status, message, headers, document = \
	162	request(host, method, path, headers, entity)
	163	else:
	164	raise ValueError, scheme + ' not supported'
	165	headerdict = {}
	166	for name, value in headers:
	167	if name == 'set-cookie':
	168	cookie = value.split(';')[0]
	169	if cookie not in self.cookies:
	170	self.cookies.append(cookie)
	171	else:
	172	headerdict[name] = value
	173	if follow and status in [301, 302] and 'location' in headerdict:
	174	return self.fetch(urljoin(url, headerdict['location']))
	175	return status, message, headerdict, document
	176
	177	def go(self, url, entity=None, follow=1, referrer=LAST_URL):
	178	"""Navigate to a given URL. If the URL is relative, it is resolved
	179	with respect to the current location. If the document is successfully
	180	fetched, return a Region spanning the entire document."""
	181	historyentry = (self.location, self.status, self.message,
	182	self.headers, self.document, self.region)
	183	if self.location:
	184	url = urljoin(self.location, url)
	185	results = self.fetch(url, entity, follow, referrer)
	186	if results[0] == 200:
	187	self.history.append(historyentry)
	188	self.status, self.message, self.headers, self.document = results
	189	self.region = Region(self.document)
	190	return self.region
	191	raise HTTPError(self.status, self.message)
	192
	193	def back(self):
	194	"""Return to the previous page."""
	195	(self.location, self.status, self.message,
	196	self.headers, self.document, self.region) = self.history.pop()
	197	return self.location
	198
	199	def follow(self, anchor, region=None):
	200	"""Follow the first link with the given anchor text. The anchor may
	201	be given as a string or a compiled RE. If a region is given, the
	202	link is sought within that region instead of the whole document."""
	203	link = (region or self.region).first('a', content=anchor)
	204	if not link:
	205	raise ScrapeError('link %r not found' % anchor)
	206	if not link['href']:
	207	raise ScrapeError('link %r has no href' % link)
	208	return self.go(link['href'])
	209
	210	def submit(self, form, button=None, **params):
	211	"""Submit a form, optionally by clicking a given button."""
	212	if form.tagname != 'form':
	213	raise ScrapeError('%r is not a form' % form)
	214	p = form.params
	215	if button:
	216	p[button['name']] = button['value']
	217	p.update(params)
	218	method = form['method'].lower() or 'get'
	219	if method == 'post':
	220	return self.go(form['action'], p)
	221	elif method == 'get':
	222	return self.go(form['action'] + '?' + urlencode(p, doseq=1))
	223	else:
	224	raise ScrapeError('unknown form method %r' % method)
	225
	226	tagcontent_re = r'''(('[^']'\|"[^"]"\|--([^-]\|-[^-])--\|-(?!-)\|[^'">-]))'''
	227
	228	def tag_re(tagname_re):
	229	return '<' + tagname_re + tagcontent_re + '>'
	230
	231	anytag_re = tag_re(r'(\?\|!\w\|/?[a-zA-Z_:][\w:.-])')
	232	tagpat = re.compile(anytag_re)
	233
	234	def htmldec(text):
	235	"""Decode HTML entities in the given text."""
	236	chunks = text.split('&#')
	237	for i in range(1, len(chunks)):
	238	number, rest = chunks[i].split(';', 1)
	239	chunks[i] = chr(int(number)) + rest
	240	text = ''.join(chunks)
	241	text = text.replace('\xa0', ' ')
	242	text = text.replace(' ', ' ')
	243	text = text.replace('<', '<')
	244	text = text.replace('>', '>')
	245	text = text.replace('"', '"')
	246	text = text.replace('&', '&')
	247	return text
	248
	249	def htmlenc(text):
	250	"""Use HTML entities to encode special characters in the given text."""
	251	text = text.replace('&', '&')
	252	text = text.replace('"', '"')
	253	text = text.replace('<', '<')
	254	text = text.replace('>', '>')
	255	return text
	256
	257	def no_groups(re):
	258	return re.replace('(', '(?:').replace('(?:?', '(?')
	259
	260	tagsplitter = re.compile(no_groups(anytag_re))
	261	parasplitter = re.compile(no_groups(tag_re('(p\|table\|form)')), re.I)
	262	linesplitter = re.compile(no_groups(tag_re('(div\|br\|tr)')), re.I)
	263	scriptpat = re.compile(r'<script\b', re.I)
	264	endscriptpat = re.compile(r'</script[^>]*>', re.I)
	265	endcommentpat = re.compile(r'--\s*>')
	266
	267	def striptags(text):
	268	"""Strip HTML tags from the given text, yielding line breaks for DIV,
	269	BR, or TR tags and blank lines for P, TABLE, or FORM tags."""
	270	chunks = scriptpat.split(text)
	271	for i in range(1, len(chunks)):
	272	chunks[i] = endscriptpat.split(chunks[i], 1)[1]
	273	text = ''.join(chunks)
	274	chunks = text.split('<!')
	275	for i in range(1, len(chunks)):
	276	if chunks[i].split('>', 1)[0].find('--') >= 0:
	277	chunks[i] = endcommentpat.split(chunks[i], 1)[1]
	278	else:
	279	chunks[i] = chunks[i].split('>', 1)[1]
	280	text = ''.join(chunks)
	281
	282	paragraphs = []
	283	for paragraph in parasplitter.split(text):
	284	lines = []
	285	for line in linesplitter.split(paragraph):
	286	line = ''.join(tagsplitter.split(line))
	287	line = htmldec(line)
	288	line = ' '.join(line.split())
	289	lines.append(line)
	290	paragraphs.append('\n'.join(lines))
	291	return re.sub('\n\n+', '\n\n', '\n\n'.join(paragraphs)).strip()
	292
	293	attr_re = r'''\s([\w:.-]+)(\s=\s('[^']'\|"[^"]"\|[^\s>]))?'''
	294	attrpat = re.compile(attr_re)
	295
	296	def parseattrs(text):
	297	"""Turn a string of name=value pairs into an attribute dictionary."""
	298	attrs = {}
	299	pos = 0
	300	while 1:
	301	match = attrpat.search(text, pos)
	302	if not match: break
	303	pos = match.end()
	304	name, value = match.group(1), match.group(3) or ''
	305	if value[:1] in ["'", '"']:
	306	value = value[1:-1]
	307	attrs[name.lower()] = htmldec(value)
	308	return attrs
	309
	310	def matchcontent(specimen, desired):
	311	if hasattr(desired, 'match'):
	312	return desired.match(specimen)
	313	elif callable(desired):
	314	return desired(specimen)
	315	else:
	316	return specimen == desired
	317
	318	def matchattrs(specimen, desired):
	319	for name, value in desired.items():
	320	name = name.strip('_').replace('_', '-')
	321	if not (name in specimen and matchcontent(specimen[name], value)):
	322	return 0
	323	return 1
	324
	325	class Region:
	326	"""A Region object represents a contiguous region of a document together
	327	with an associated HTML or XML tag and its attributes."""
	328
	329	def __init__(self, parent, start=0, end=None, starttag=None, endtag=None):
	330	"""Create a Region. The parent argument is a string or another
	331	Region. The start and end arguments, if given, specify non-negative
	332	indices into the original string (not into a parent subregion)."""
	333	if isinstance(parent, Region):
	334	self.document = parent.document
	335	self.tags = parent.tags
	336	else:
	337	self.document = parent
	338	self.tags = self.scantags(self.document)
	339	if end is None:
	340	end = len(self.document)
	341	self.start, self.end = start, end
	342	self.tagname, self.attrs = None, {}
	343
	344	# If only starttag is specified, this Region is a tag.
	345	# If starttag and endtag are specified, this Region is an element.
	346	self.starttag, self.endtag = starttag, endtag
	347	if starttag is not None:
	348	self.start, self.end, self.tagname, self.attrs = self.tags[starttag]
	349	if endtag is not None:
	350	self.start, self.end = self.tags[starttag][1], self.tags[endtag][0]
	351
	352	# Find the minimum and maximum indices of tags within this Region.
	353	if starttag and endtag:
	354	self.tagmin, self.tagmax = starttag + 1, endtag - 1
	355	else:
	356	self.tagmin, self.tagmax = len(self.tags), -1
	357	for i, (start, end, tagname, attrs) in enumerate(self.tags):
	358	if start >= self.start and i < self.tagmin:
	359	self.tagmin = i
	360	if end <= self.end and i > self.tagmax:
	361	self.tagmax = i
	362
	363	def __repr__(self):
	364	if self.tagname:
	365	attrs = ''.join([' %s=%r' % item for item in self.attrs.items()])
	366	return '<Region %d:%d %s%s>' % (
	367	self.start, self.end, self.tagname, attrs)
	368	else:
	369	return '<Region %d:%d>' % (self.start, self.end)
	370
	371	# Utilities that operate on the array of scanned tags.
	372	def scantags(self, document):
	373	"""Generate a list of all the tags in a document."""
	374	tags = []
	375	pos = 0
	376	while 1:
	377	match = tagpat.search(document, pos)
	378	if not match: break
	379	start, end = match.span()
	380	tagname = match.group(1).lower()
	381	attrs = match.group(2)
	382	tags.append([start, end, tagname, attrs])
	383	if tagname == 'script':
	384	match = endscriptpat.search(document, end)
	385	if not match: break
	386	start, end = match.span()
	387	tags.append([start, end, '/' + tagname, ''])
	388	pos = end
	389	return tags
	390
	391	def matchtag(self, i, tagname, attrs):
	392	"""Return 1 if the ith tag matches the given tagname and attributes."""
	393	itagname, iattrs = self.tags[i][2], self.tags[i][3]
	394	if itagname[:1] not in ['', '?', '!', '/']:
	395	if itagname == tagname or tagname is None:
	396	if isinstance(iattrs, str):
	397	self.tags[i][3] = iattrs = parseattrs(iattrs)
	398	return matchattrs(iattrs, attrs)
	399
	400	def findendtag(self, starttag, outside=0):
	401	"""Find the index of the matching end tag for the given start tag.
	402	If outside is 0, look for the end tag within the current region;
	403	if outside is 1, look beyond the end of the current region."""
	404	tagname = self.tags[starttag][2]
	405	depth = 1
	406	for i in range(starttag + 1, len(self.tags)):
	407	if self.tags[i][2] == tagname:
	408	depth += 1
	409	if self.tags[i][2] == '/' + tagname:
	410	depth -= 1
	411	if depth == 0:
	412	if not outside and i <= self.tagmax:
	413	return i
	414	if outside and i > self.tagmax:
	415	return i
	416	break
	417
	418	def matchelement(self, starttag, content=None, outside=0):
	419	"""If the element with the given start tag matches the given content,
	420	return the index of the matching end tag. See findendtag() for the
	421	meaning of the outside flag."""
	422	endtag = self.findendtag(starttag, outside)
	423	if endtag is not None:
	424	start, end = self.tags[starttag][1], self.tags[endtag][0]
	425	stripped = striptags(self.document[start:end])
	426	if content is None or matchcontent(stripped, content):
	427	return endtag
	428
	429	# Provide the "content" and "text" attributes to access the contents.
	430	content = property(lambda self: self.document[self.start:self.end])
	431	text = property(lambda self: striptags(self.content))
	432
	433	def getparams(self):
	434	"""Get a dictionary of default values for all the form parameters."""
	435	if self.tagname == 'form':
	436	params = {}
	437	for input in self.alltags('input'):
	438	if 'disabled' not in input:
	439	type = input['type'].lower()
	440	if type in ['text', 'password', 'hidden'] or (
	441	type in ['checkbox', 'radio'] and 'checked' in input):
	442	params[input['name']] = input['value']
	443	for select in self.all('select'):
	444	if 'disabled' not in select:
	445	selections = [option['value']
	446	for option in select.alltags('option')
	447	if 'selected' in option]
	448	if 'multiple' in select:
	449	params[select['name']] = selections
	450	elif selections:
	451	params[select['name']] = selections[0]
	452	for textarea in self.all('textarea'):
	453	if 'disabled' not in textarea:
	454	params[textarea['name']] = textarea.content
	455	return params
	456
	457	def getbuttons(self):
	458	"""Get a list of all the form submission buttons."""
	459	if self.tagname == 'form':
	460	return [tag for tag in self.alltags('input')
	461	if tag['type'].lower() in ['submit', 'image']
	462	] + [tag for tag in self.alltags('button')
	463	if tag['type'].lower() in ['submit', '']]
	464
	465	params = property(getparams)
	466	buttons = property(getbuttons)
	467
	468	# Provide a dictionary-like interface to the tag attributes.
	469	def __contains__(self, name):
	470	return name in self.attrs
	471
	472	def __getitem__(self, name):
	473	return self.attrs.get(name, '')
	474
	475	# Provide subregions by slicing.
	476	def __getslice__(self, start, end):
	477	start += (start < 0) and self.end or self.start
	478	end += (end < 0) and self.end or self.start
	479	return Region(self, start, end)
	480
	481	# Search for text.
	482	def find(self, target, group=0):
	483	"""Search this Region for a string or a compiled RE and return a
	484	Region representing the match. The optional group argument specifies
	485	which grouped subexpression should be returned as the match."""
	486	if hasattr(target, 'search'):
	487	match = target.search(self.content)
	488	if match:
	489	return self[match.start(group):match.end(group)]
	490	else:
	491	start = self.content.find(target)
	492	if start > -1:
	493	return self[start:start+len(target)]
	494
	495	def findall(self, target, group=0):
	496	"""Search this Region for a string or a compiled RE and return a
	497	sequence of Regions representing all the matches."""
	498	pos = 0
	499	content = self.content
	500	matches = []
	501	if hasattr(target, 'search'):
	502	while 1:
	503	match = target.search(content, pos)
	504	if not match:
	505	break
	506	start, pos = match.span(group)
	507	matches.append(self[start:pos])
	508	else:
	509	while 1:
	510	start = content.find(target, pos)
	511	if start < 0:
	512	break
	513	pos = start + len(target)
	514	matches.append(self[start:pos])
	515	return matches
	516
	517	# Search for tags.
	518	def firsttag(self, tagname=None, **attrs):
	519	"""Return the Region for the first tag entirely within this Region
	520	with the given tag name and attributes."""
	521	for i in range(self.tagmin, self.tagmax + 1):
	522	if self.matchtag(i, tagname, attrs):
	523	return Region(self, 0, 0, i)
	524
	525	def lasttag(self, tagname=None, **attrs):
	526	"""Return the Region for the last tag entirely within this Region
	527	with the given tag name and attributes."""
	528	for i in range(self.tagmax, self.tagmin - 1, -1):
	529	if self.matchtag(i, tagname, attrs):
	530	return Region(self, 0, 0, i)
	531
	532	def alltags(self, tagname=None, **attrs):
	533	"""Return a list of Regions for all the tags entirely within this
	534	Region with the given tag name and attributes."""
	535	tags = []
	536	for i in range(self.tagmin, self.tagmax + 1):
	537	if self.matchtag(i, tagname, attrs):
	538	tags.append(Region(self, 0, 0, i))
	539	return tags
	540
	541	def nexttag(self, tagname=None, **attrs):
	542	"""Return the Region for the nearest tag after the end of this Region
	543	with the given tag name and attributes."""
	544	return Region(self, self.end).firsttag(tagname, **attrs)
	545
	546	def previoustag(self, tagname=None, **attrs):
	547	"""Return the Region for the nearest tag before the start of this
	548	Region with the given tag name and attributes."""
	549	return Region(self, 0, self.start).lasttag(tagname, **attrs)
	550
	551	# Search for elements.
	552	def first(self, tagname=None, content=None, **attrs):
	553	"""Return the Region for the first properly balanced element entirely
	554	within this Region with the given tag name, content, and attributes.
	555	The element content is passed through striptags(). If the content
	556	argument has a match() method, the stripped content is passed into
	557	this method; otherwise it is compared directly as a string."""
	558	for starttag in range(self.tagmin, self.tagmax + 1):
	559	if self.matchtag(starttag, tagname, attrs):
	560	endtag = self.matchelement(starttag, content)
	561	if endtag is not None:
	562	return Region(self, 0, 0, starttag, endtag)
	563
	564	def last(self, tagname=None, content=None, **attrs):
	565	"""Return the Region for the last properly balanced element entirely
	566	within this Region with the given tag name, content, and attributes."""
	567	for starttag in range(self.tagmax, self.tagmin - 1, -1):
	568	if self.matchtag(starttag, tagname, attrs):
	569	endtag = self.matchelement(starttag, content)
	570	if endtag is not None:
	571	return Region(self, 0, 0, starttag, endtag)
	572
	573	def all(self, tagname=None, content=None, **attrs):
	574	"""Return Regions for all non-overlapping balanced elements entirely
	575	within this Region with the given tag name, content, and attributes."""
	576	elements = []
	577	starttag = self.tagmin
	578	while starttag <= self.tagmax:
	579	if self.matchtag(starttag, tagname, attrs):
	580	endtag = self.matchelement(starttag, content)
	581	if endtag is not None:
	582	elements.append(Region(self, 0, 0, starttag, endtag))
	583	starttag = endtag
	584	starttag += 1
	585	return elements
	586
	587	def next(self, tagname=None, content=None, **attrs):
	588	"""Return the Region for the nearest balanced element after the end of
	589	this Region with the given tag name, content, and attributes."""
	590	return Region(self, self.end).first(tagname, content, **attrs)
	591
	592	def previous(self, tagname=None, content=None, **attrs):
	593	"""Return the Region for the nearest balanced element before the start
	594	of this Region with the given tag name, content, and attributes."""
	595	return Region(self, 0, self.start).last(tagname, content, **attrs)
	596
	597	def enclosing(self, tagname=None, content=None, **attrs):
	598	"""Return the Region for the nearest balanced element that encloses
	599	this Region with the given tag name, content, and attributes."""
	600	if self.starttag and self.endtag: # skip our own start tag
	601	laststarttag = self.starttag - 1
	602	else:
	603	laststarttag = self.tagmin - 1
	604	for starttag in range(laststarttag, -1, -1):
	605	if self.matchtag(starttag, tagname, attrs):
	606	endtag = self.matchelement(starttag, content, outside=1)
	607	if endtag is not None:
	608	return Region(self, 0, 0, starttag, endtag)
	609
	610	def read(path):
	611	"""Read and return the entire contents of the file at the given path."""
	612	return open(path).read()
	613
	614	def write(path, text):
	615	"""Write the given text to a file at the given path."""
	616	file = open(path, 'w')
	617	file.write(text)
	618	file.close()
	619
	620	def load(path):
	621	"""Return the deserialized contents of the file at the given path."""
	622	return marshal.load(open(path))
	623
	624	def dump(path, data):
	625	"""Serialize the given data and write it to a file at the given path."""
	626	file = open(path, 'w')
	627	marshal.dump(data, file)
	628	file.close()
	629
	630	def getnumber(text):
	631	"""Find and parse a floating-point or integer number in the given text,
	632	ignoring commas, percentage signs, and non-numeric words."""
	633	for word in striptags(text).replace(',', '').replace('%', '').split():
	634	try: return int(word)
	635	except:
	636	try: return float(word)
	637	except: continue

Note: See TracBrowser for help on using the repository browser.

Download in other formats: