[188] | 1 | """Python module for web browsing and scraping.
|
---|
| 2 |
|
---|
| 3 | Done:
|
---|
| 4 | - navigate to absolute and relative URLs
|
---|
| 5 | - follow links in page or region
|
---|
| 6 | - find first or all occurrences of string or RE in page or region
|
---|
| 7 | - find first, last, next, previous, or all tags with given name/attributes
|
---|
| 8 | - find first, last, next, previous, enclosing, or all elements with given
|
---|
| 9 | name/attributes/content
|
---|
| 10 | - set form fields
|
---|
| 11 | - submit forms
|
---|
| 12 | - strip tags from arbitrary strings of HTML
|
---|
| 13 |
|
---|
| 14 | Todo:
|
---|
| 15 | - cookie-handling is dumb (sends all cookies to all sites)
|
---|
| 16 | - handle CDATA and RCDATA marked sections
|
---|
| 17 | - support for submitting forms with file upload
|
---|
| 18 | - use Regions in striptags instead of duplicating work
|
---|
| 19 | - map of enders
|
---|
| 20 | """
|
---|
| 21 |
|
---|
| 22 | __author__ = 'Ka-Ping Yee'
|
---|
| 23 | __date__ = '2005-03-29'
|
---|
| 24 | __version__ = '$Revision: 1.16 $'
|
---|
| 25 |
|
---|
| 26 | import os, socket, re, marshal, subprocess
|
---|
| 27 | from tempfile import gettempdir
|
---|
| 28 | from urlparse import urljoin, urlsplit
|
---|
| 29 | from urllib import urlencode
|
---|
| 30 |
|
---|
def connect(server, port):
    """Return a TCP socket connected to the given server and port.

    The socket is an IPv4 (AF_INET) stream socket.  If the connection
    attempt fails, the socket is closed before the error propagates so
    that the file descriptor is not leaked.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect((server, port))
    except Exception:
        sock.close()
        raise
    return sock
| 36 |
|
---|
def receive(sock):
    """Collect everything the socket sends until the peer closes it.

    Data is read in pieces of up to 4096 bytes; an empty read marks the
    end of the stream.  The pieces are returned joined into one string.
    """
    pieces = []
    piece = sock.recv(4096)
    while piece:
        pieces.append(piece)
        piece = sock.recv(4096)
    return ''.join(pieces)
| 44 |
|
---|
def request(host, method, path, headers, entity=None):
    """Make an HTTP request and return (status, message, headers, document).

    host is the server name, optionally followed by ':port'; port 80 is
    used when no numeric port is given.  headers is a dictionary of header
    names to values; each dash-separated part of a name is capitalized
    before it is sent.  entity, if given and non-empty, is appended as the
    request body.

    The request is sent as HTTP/1.0 and the whole reply is read until the
    server closes the connection.  The reply is parsed with splitreply();
    if parsing fails, (0, '', {}, data) is returned with the raw reply as
    the document.
    """
    server, port = host, 80
    if ':' in host:
        name, portstr = host.rsplit(':', 1)
        if portstr.isdigit():
            server, port = name, int(portstr)
    sock = connect(server, port)
    try:
        lines = [method + ' ' + path + ' HTTP/1.0']
        for name in headers:
            capname = '-'.join([part.capitalize() for part in name.split('-')])
            lines.append(capname + ': ' + str(headers[name]))
        message = '\r\n'.join(lines) + '\r\n\r\n'
        if entity:
            message += entity
        sock.sendall(message)
        data = receive(sock)
    finally:
        # Always release the connection, even if sending or reading fails.
        sock.close()
    try:
        return splitreply(data)
    except Exception:
        # Best effort: hand back the unparsed reply rather than failing.
        return (0, '', {}, data)
| 59 |
|
---|
def splitreply(reply):
    """Split an HTTP response into (status, message, headers, document).

    headers is a list of (name, value) pairs with the names in lower case
    and surrounding whitespace removed from names and values.  Header
    lines that contain no colon are skipped.

    If the document itself begins with another HTTP/1.x response (as
    happens when several responses are concatenated, e.g. by a client
    that followed redirects), that response is split in turn: the status
    and message of the last response are returned, while the headers of
    all responses are accumulated in order.

    Raises IndexError or ValueError if a status line is malformed.
    """
    if '\r\n\r\n' in reply:
        head, document = reply.split('\r\n\r\n', 1)
    else:
        head, document = reply, ''
    headers = []
    while True:
        lines = head.split('\r\n')
        response = lines[0]
        for line in lines[1:]:
            if ':' not in line:
                continue
            name, value = line.split(':', 1)
            headers.append((name.strip().lower(), value.strip()))
        words = response.split()
        status = int(words[1])
        message = ' '.join(words[2:])
        if document.startswith('HTTP/1.') and '\r\n\r\n' in document:
            head, document = document.split('\r\n\r\n', 1)
        else:
            return status, message, headers, document
| 81 |
|
---|
def shellquote(text):
    """Wrap a string in single quotes so that sh reads it literally.

    Each embedded single quote becomes the sequence '\\'' (close the
    quoted string, emit an escaped quote, reopen the quoted string).
    """
    escaped = "'\\''".join(text.split("'"))
    return ''.join(["'", escaped, "'"])
| 85 |
|
---|
def curl(url, entity=None, follow=1, cookies=(), referrer=None):
    """Invoke curl to perform an HTTP request.

    entity, if given, is sent as POST data; anything that is not already
    a string is form-encoded with urlencode() first.  cookies is a
    sequence of 'name=value' strings sent in a single Cookie header.
    referrer, if given, is sent as the Referer.  If follow is true, curl
    is asked to follow redirects itself.

    Returns the result of splitreply() applied to curl's output, i.e.
    (status, message, headers, document).
    """
    command = ['curl', '-s', '-i']
    if referrer:
        command += ['-e', referrer]
    if entity:
        if not isinstance(entity, str):
            entity = urlencode(entity, doseq=1)
        command += ['-d', entity]
    if follow:
        command.append('-L')
    # When not following, no extra option is needed: curl only follows
    # redirects if -L is given.  (The "-Z 0" this used to pass means
    # "--parallel" plus an extra URL "0" in current curl releases.)
    if cookies:
        command += ['-b', '; '.join(cookies)]
    command.append(url)
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    # communicate() reads all output and waits for curl to exit, so the
    # child process is reaped instead of being left behind.
    reply = process.communicate()[0]
    return splitreply(reply)
| 104 |
|
---|
def fetch(url, entity=None, follow=1):
    """Fetch a single document using a throwaway Session.

    The arguments are handed to Session.fetch() on a freshly created
    session and its result is returned unchanged.
    """
    session = Session()
    return session.fetch(url, entity, follow)
| 108 |
|
---|
class ScrapeError(Exception):
    """Base class for the errors raised by this module."""


class HTTPError(ScrapeError):
    """Raised by Session.go() when the reply status is not 200.

    The exception arguments are (status, message) of the failed reply.
    """


# Sentinel default for "referrer" arguments meaning "use the session's
# current location".  A distinct object is used so that None can still be
# passed to mean "send no referrer".
LAST_URL = object()


class Session:
    """A Web-browsing session.

    Exposed attributes:

        agent - set or get the User-Agent string
        location - get the current (i.e. last successfully fetched) URL
        status - get the status code of the last successful request
        message - get the status message of the last successful request
        headers - get the dictionary of headers from the last successful request
        document - get the document returned by the last successful request
        region - get a Region spanning the entire document
    """

    def __init__(self, agent=None):
        """Start a session with no cookies, no history and no location."""
        self.cookies = []
        self.agent = agent
        self.location = self.status = self.message = None
        self.headers = self.document = self.region = None
        self.history = []

    def fetch(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Fetch an absolute URL; return (status, message, headers, document).

        headers is a dictionary keyed by lower-case header name (Set-Cookie
        headers are consumed and not included).  http URLs are requested
        with request(); https URLs are delegated to curl().  The
        'name=value' part of every Set-Cookie header is remembered and sent
        with all later requests of this session.

        entity, if given, is posted; anything that is not a string is
        form-encoded first.  referrer defaults to the session's current
        location; pass None to send no referrer.  If follow is true and
        the reply is a 301 or 302 with a Location header, the target of
        the redirect is fetched in turn.

        This sets self.location to the URL being fetched, but does not
        update status, message, headers, document, or region; see go().

        Raises ValueError if the URL scheme is neither http nor https.
        """
        scheme, host, path, query, fragment = urlsplit(url)
        # Reject unsupported schemes before touching any session state.
        if scheme not in ('http', 'https'):
            raise ValueError(scheme + ' not supported')
        if referrer is LAST_URL:
            referrer = self.location
        self.location = url
        if scheme == 'https':
            status, message, headers, document = \
                curl(url, entity, follow, self.cookies, referrer)
        else:
            # An empty path would produce an invalid request line.
            path = path or '/'
            if query:
                path += '?' + query
            headers = {}
            headers['host'] = host
            headers['accept'] = '*/*'
            if referrer:
                headers['referer'] = referrer
            if self.agent:
                headers['user-agent'] = self.agent
            if self.cookies:
                headers['cookie'] = '; '.join(self.cookies)
            if entity:
                if not isinstance(entity, str):
                    entity = urlencode(entity, doseq=1)
                headers['content-type'] = 'application/x-www-form-urlencoded'
                headers['content-length'] = len(entity)
            if entity:
                method = 'POST'
            else:
                method = 'GET'
            status, message, headers, document = \
                request(host, method, path, headers, entity)
        headerdict = {}
        for name, value in headers:
            if name == 'set-cookie':
                # Keep only "name=value"; cookie attributes are ignored.
                cookie = value.split(';')[0]
                if cookie not in self.cookies:
                    self.cookies.append(cookie)
            else:
                headerdict[name] = value
        if follow and status in [301, 302] and 'location' in headerdict:
            return self.fetch(urljoin(url, headerdict['location']))
        return status, message, headerdict, document

    def go(self, url, entity=None, follow=1, referrer=LAST_URL):
        """Navigate to a given URL.  If the URL is relative, it is resolved
        with respect to the current location.  If the document is
        successfully fetched (status 200), the previous page is pushed onto
        the history and a Region spanning the entire document is returned.

        Otherwise the session's location is restored to what it was before
        the call and HTTPError(status, message) is raised with the status
        and message of the failed reply.
        """
        previous = self.location
        historyentry = (self.location, self.status, self.message,
                        self.headers, self.document, self.region)
        if self.location:
            url = urljoin(self.location, url)
        try:
            results = self.fetch(url, entity, follow, referrer)
        except Exception:
            # fetch() already moved self.location; undo that on failure.
            self.location = previous
            raise
        if results[0] == 200:
            self.history.append(historyentry)
            self.status, self.message, self.headers, self.document = results
            self.region = Region(self.document)
            return self.region
        self.location = previous
        raise HTTPError(results[0], results[1])

    def back(self):
        """Return to the previous page and return its URL.

        Raises IndexError if the history is empty.
        """
        (self.location, self.status, self.message,
         self.headers, self.document, self.region) = self.history.pop()
        return self.location

    def follow(self, anchor, region=None):
        """Follow the first link with the given anchor text.  The anchor may
        be given as a string or a compiled RE.  If a region is given, the
        link is sought within that region instead of the whole document.

        Raises ScrapeError if no such link exists or it has no href.
        """
        link = (region or self.region).first('a', content=anchor)
        if not link:
            raise ScrapeError('link %r not found' % anchor)
        if not link['href']:
            raise ScrapeError('link %r has no href' % link)
        return self.go(link['href'])

    def submit(self, form, button=None, **params):
        """Submit a form, optionally by clicking a given button.

        The form's default parameters are used, then the button's
        name/value pair (if a button is given), then any keyword arguments,
        each overriding the previous ones.  The form's method attribute
        selects POST or GET; a missing method means GET.

        Raises ScrapeError if form is not a form element or its method is
        neither get nor post.
        """
        if form.tagname != 'form':
            raise ScrapeError('%r is not a form' % form)
        fields = form.params
        if button:
            fields[button['name']] = button['value']
        fields.update(params)
        method = form['method'].lower() or 'get'
        if method == 'post':
            return self.go(form['action'], fields)
        elif method == 'get':
            return self.go(form['action'] + '?' + urlencode(fields, doseq=1))
        else:
            raise ScrapeError('unknown form method %r' % method)
| 225 |
|
---|
# Everything between a tag's name and its closing '>': any mix of
# single-quoted strings, double-quoted strings, '--'-delimited comments,
# lone dashes, and other characters except quotes, '>' and '-'.
tagcontent_re = r'''(('[^']*'|"[^"]*"|--([^-]|-[^-])*--|-(?!-)|[^'">-])*)'''


def tag_re(tagname_re):
    """Build the regular expression source for a complete tag.

    tagname_re is the pattern for the tag name; the result matches '<',
    that name, the tag's remaining content, and the closing '>'.
    """
    return ''.join(['<', tagname_re, tagcontent_re, '>'])


# Any tag: processing instruction ('?'), declaration or comment ('!...'),
# or a start/end tag.  Group 1 is the tag name, group 2 the tag content.
anytag_re = tag_re(r'(\?|!\w*|/?[a-zA-Z_:][\w:.-]*)')
tagpat = re.compile(anytag_re)
| 233 |
|
---|
def htmldec(text):
    """Decode HTML entities in the given text.

    Handles decimal (&#65;) and hexadecimal (&#x41;) character references
    and the named entities &nbsp;, &lt;, &gt;, &quot; and &amp;.
    Non-breaking spaces are turned into ordinary spaces.  Numeric
    references that are malformed or cannot be converted to a character
    are left in the text unchanged.
    """
    chunks = text.split('&#')
    for i in range(1, len(chunks)):
        pieces = chunks[i].split(';', 1)
        char = None
        if len(pieces) == 2:
            number = pieces[0]
            try:
                if number[:1] in ('x', 'X'):
                    char = chr(int(number[1:], 16))
                else:
                    char = chr(int(number))
            except (ValueError, OverflowError):
                char = None
        if char is None:
            # Not a usable reference: put back the '&#' that split() ate.
            chunks[i] = '&#' + chunks[i]
        else:
            chunks[i] = char + pieces[1]
    text = ''.join(chunks)
    text = text.replace('\xa0', ' ')
    text = text.replace('&nbsp;', ' ')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&quot;', '"')
    # '&amp;' must be decoded last so that e.g. '&amp;lt;' yields '&lt;'.
    text = text.replace('&amp;', '&')
    return text
| 248 |
|
---|
def htmlenc(text):
    """Use HTML entities to encode special characters in the given text.

    The characters &, ", < and > are replaced by &amp;, &quot;, &lt; and
    &gt; respectively.
    """
    # '&' must be encoded first so the entities added below stay intact.
    text = text.replace('&', '&amp;')
    text = text.replace('"', '&quot;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    return text
| 256 |
|
---|
def no_groups(re):
    """Return the given regular expression source with every capturing
    group turned into a non-capturing one.

    Each '(' becomes '(?:'; parentheses that already began with '(?' are
    then restored to their original form.
    """
    noncapturing = re.replace('(', '(?:')
    return noncapturing.replace('(?:?', '(?')
| 259 |
|
---|
# Group-free versions of the tag patterns, for use with split(): with no
# capturing groups, split() returns only the text between the tags.
tagsplitter = re.compile(no_groups(anytag_re))
# Tags that separate paragraphs (blank line) and lines (line break) in
# striptags().
# NOTE(review): only the start of the tag name is anchored, so these also
# match longer names beginning with the same letters (e.g. '<pre>').
parasplitter = re.compile(no_groups(tag_re('(p|table|form)')), re.I)
linesplitter = re.compile(no_groups(tag_re('(div|br|tr)')), re.I)
# Start of a <script> tag and a complete </script> end tag.
scriptpat = re.compile(r'<script\b', re.I)
endscriptpat = re.compile(r'</script[^>]*>', re.I)
# End of a comment: '--', optional whitespace, '>'.
endcommentpat = re.compile(r'--\s*>')
| 266 |
|
---|
def _after_first(parts):
    """Return the text after the first separator of a split, or '' if the
    separator was not found (the construct runs to the end of the text)."""
    if len(parts) > 1:
        return parts[1]
    return ''


def striptags(text):
    """Strip HTML tags from the given text, yielding line breaks for DIV,
    BR, or TR tags and blank lines for P, TABLE, or FORM tags.

    Script elements, comments and other '<!' declarations are removed
    together with their contents; if one of them is not terminated,
    everything up to the end of the text is removed.  Entities are
    decoded, runs of whitespace within a line collapse to one space, and
    runs of blank lines collapse to a single blank line.
    """
    # Drop everything from each '<script' up to its '</script...>'.
    chunks = scriptpat.split(text)
    for i in range(1, len(chunks)):
        chunks[i] = _after_first(endscriptpat.split(chunks[i], 1))
    text = ''.join(chunks)

    # Drop comments (up to '-->') and other declarations (up to '>').
    chunks = text.split('<!')
    for i in range(1, len(chunks)):
        if chunks[i].split('>', 1)[0].find('--') >= 0:
            chunks[i] = _after_first(endcommentpat.split(chunks[i], 1))
        else:
            chunks[i] = _after_first(chunks[i].split('>', 1))
    text = ''.join(chunks)

    paragraphs = []
    for paragraph in parasplitter.split(text):
        lines = []
        for line in linesplitter.split(paragraph):
            line = ''.join(tagsplitter.split(line))
            line = htmldec(line)
            line = ' '.join(line.split())
            lines.append(line)
        paragraphs.append('\n'.join(lines))
    return re.sub('\n\n+', '\n\n', '\n\n'.join(paragraphs)).strip()
| 292 |
|
---|
# One attribute inside a tag: group 1 is the attribute name; group 3 is
# the value when an '=value' part is present (single-quoted, double-quoted
# with the quotes still included, or an unquoted run of characters other
# than whitespace and '>').
attr_re = r'''\s*([\w:.-]+)(\s*=\s*('[^']*'|"[^"]*"|[^\s>]*))?'''
attrpat = re.compile(attr_re)
| 295 |
|
---|
def parseattrs(text):
    """Turn a string of name=value pairs into an attribute dictionary.

    Attribute names are converted to lower case.  Values have one pair of
    matching surrounding quotes removed and their HTML entities decoded;
    attributes given without a value map to ''.  If a name occurs more
    than once, the last occurrence wins.
    """
    attrs = {}
    for match in attrpat.finditer(text):
        name, value = match.group(1), match.group(3) or ''
        # Strip quotes only when the value is properly enclosed; an
        # unterminated quote must not cost the value its last character.
        if len(value) >= 2 and value[0] in '\'"' and value[-1] == value[0]:
            value = value[1:-1]
        attrs[name.lower()] = htmldec(value)
    return attrs
| 309 |
|
---|
def matchcontent(specimen, desired):
    """Test a string against a desired value.

    If desired has a match() method (e.g. a compiled RE), the result of
    desired.match(specimen) is returned; if it is some other callable,
    the result of desired(specimen); otherwise the result of comparing
    specimen with desired for equality.
    """
    if hasattr(desired, 'match'):
        return desired.match(specimen)
    if callable(desired):
        return desired(specimen)
    return specimen == desired
| 317 |
|
---|
def matchattrs(specimen, desired):
    """Return 1 if the attribute dictionary specimen satisfies every entry
    of desired, and 0 otherwise.

    Keys of desired are turned into attribute names by removing leading
    and trailing underscores and replacing the remaining underscores with
    dashes.  Each such attribute must be present in specimen and its value
    must satisfy matchcontent().
    """
    for key, wanted in desired.items():
        attrname = key.strip('_').replace('_', '-')
        if attrname not in specimen:
            return 0
        if not matchcontent(specimen[attrname], wanted):
            return 0
    return 1
| 324 |
|
---|
class Region:
    """A Region object represents a contiguous region of a document together
    with an associated HTML or XML tag and its attributes.

    All Regions derived from one document share the same document string
    and the same list of scanned tags (self.tags), whose entries are
    [start, end, tagname, attrs] lists.  attrs is the raw attribute text
    until the tag is first examined, and a dictionary afterwards.
    """

    def __init__(self, parent, start=0, end=None, starttag=None, endtag=None):
        """Create a Region.  The parent argument is a string or another
        Region.  The start and end arguments, if given, specify non-negative
        indices into the original string (not into a parent subregion).

        If starttag is given, it is the index of a tag in the tag list and
        the Region spans that tag, taking its name and attributes.  If
        endtag is given as well, the Region is the element between the two
        tags (exclusive of the tags themselves)."""
        if isinstance(parent, Region):
            self.document = parent.document
            self.tags = parent.tags
        else:
            self.document = parent
            self.tags = self.scantags(self.document)
        if end is None:
            end = len(self.document)
        self.start, self.end = start, end
        self.tagname, self.attrs = None, {}

        # If only starttag is specified, this Region is a tag.
        # If starttag and endtag are specified, this Region is an element.
        self.starttag, self.endtag = starttag, endtag
        if starttag is not None:
            self.start, self.end, self.tagname, self.attrs = self.tags[starttag]
            if isinstance(self.attrs, str):
                # The tag has not been examined yet: parse its attribute
                # text now and cache the dictionary in the shared tag list.
                self.attrs = self.tags[starttag][3] = parseattrs(self.attrs)
            if endtag is not None:
                self.start, self.end = self.tags[starttag][1], self.tags[endtag][0]

        # Find the minimum and maximum indices of tags within this Region.
        # (Index 0 is a valid tag index, so compare against None.)
        if starttag is not None and endtag is not None:
            self.tagmin, self.tagmax = starttag + 1, endtag - 1
        else:
            self.tagmin, self.tagmax = len(self.tags), -1
            for i, (tagstart, tagend, tagname, attrs) in enumerate(self.tags):
                if tagstart >= self.start and i < self.tagmin:
                    self.tagmin = i
                if tagend <= self.end and i > self.tagmax:
                    self.tagmax = i

    def __repr__(self):
        if self.tagname:
            attrs = ''.join([' %s=%r' % item for item in self.attrs.items()])
            return '<Region %d:%d %s%s>' % (
                self.start, self.end, self.tagname, attrs)
        else:
            return '<Region %d:%d>' % (self.start, self.end)

    # Utilities that operate on the array of scanned tags.
    def scantags(self, document):
        """Generate a list of all the tags in a document.

        Each entry is [start, end, tagname, attrs] with the tag name in
        lower case and attrs as unparsed text.  The contents of a script
        element are not scanned: after a script start tag the scan resumes
        at the matching end tag, or stops if there is none."""
        tags = []
        pos = 0
        while 1:
            match = tagpat.search(document, pos)
            if not match:
                break
            start, end = match.span()
            tagname = match.group(1).lower()
            attrs = match.group(2)
            tags.append([start, end, tagname, attrs])
            if tagname == 'script':
                match = endscriptpat.search(document, end)
                if not match:
                    break
                start, end = match.span()
                tags.append([start, end, '/' + tagname, ''])
            pos = end
        return tags

    def matchtag(self, i, tagname, attrs):
        """Return 1 if the ith tag matches the given tagname and attributes.

        Only start tags can match (not end tags, '?' or '!' tags).  A
        tagname of None matches any start tag.  A false value is returned
        when the tag does not match."""
        itagname, iattrs = self.tags[i][2], self.tags[i][3]
        if itagname[:1] not in ['', '?', '!', '/']:
            if itagname == tagname or tagname is None:
                if isinstance(iattrs, str):
                    # Parse the attribute text on first use and cache it.
                    self.tags[i][3] = iattrs = parseattrs(iattrs)
                return matchattrs(iattrs, attrs)

    def findendtag(self, starttag, outside=0):
        """Find the index of the matching end tag for the given start tag.
        If outside is 0, look for the end tag within the current region;
        if outside is 1, look beyond the end of the current region.
        Returns None if there is no such end tag."""
        tagname = self.tags[starttag][2]
        depth = 1
        for i in range(starttag + 1, len(self.tags)):
            if self.tags[i][2] == tagname:
                depth += 1
            if self.tags[i][2] == '/' + tagname:
                depth -= 1
            if depth == 0:
                if not outside and i <= self.tagmax:
                    return i
                if outside and i > self.tagmax:
                    return i
                break

    def matchelement(self, starttag, content=None, outside=0):
        """If the element with the given start tag matches the given content,
        return the index of the matching end tag.  See findendtag() for the
        meaning of the outside flag.  Returns None if there is no matching
        end tag or the content does not match."""
        endtag = self.findendtag(starttag, outside)
        if endtag is not None:
            if content is None:
                # No content filter: skip the cost of stripping the tags.
                return endtag
            start, end = self.tags[starttag][1], self.tags[endtag][0]
            stripped = striptags(self.document[start:end])
            if matchcontent(stripped, content):
                return endtag

    # Provide the "content" and "text" attributes to access the contents.
    content = property(lambda self: self.document[self.start:self.end])
    text = property(lambda self: striptags(self.content))

    def getparams(self):
        """Get a dictionary of default values for all the form parameters.

        Disabled controls are skipped.  An input without a type attribute
        is treated as a text input.  Returns None if this Region is not a
        form."""
        if self.tagname == 'form':
            params = {}
            for field in self.alltags('input'):
                if 'disabled' not in field:
                    kind = field['type'].lower() or 'text'
                    if kind in ['text', 'password', 'hidden'] or (
                        kind in ['checkbox', 'radio'] and 'checked' in field):
                        params[field['name']] = field['value']
            for select in self.all('select'):
                if 'disabled' not in select:
                    selections = [option['value']
                                  for option in select.alltags('option')
                                  if 'selected' in option]
                    if 'multiple' in select:
                        params[select['name']] = selections
                    elif selections:
                        params[select['name']] = selections[0]
            for textarea in self.all('textarea'):
                if 'disabled' not in textarea:
                    params[textarea['name']] = textarea.content
            return params

    def getbuttons(self):
        """Get a list of all the form submission buttons.  Returns None if
        this Region is not a form."""
        if self.tagname == 'form':
            return [tag for tag in self.alltags('input')
                        if tag['type'].lower() in ['submit', 'image']
                   ] + [tag for tag in self.alltags('button')
                            if tag['type'].lower() in ['submit', '']]

    params = property(getparams)
    buttons = property(getbuttons)

    # Provide a dictionary-like interface to the tag attributes.
    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        """Return the value of the named attribute, or '' if it is absent.

        A slice is also accepted and yields a subregion, so that
        region[a:b] works where __getslice__ is not used for slicing.
        The step of an extended slice is ignored."""
        if isinstance(name, slice):
            start, end = name.start, name.stop
            if start is None:
                start = 0
            if end is None:
                end = self.end - self.start
            return self.__getslice__(start, end)
        return self.attrs.get(name, '')

    # Provide subregions by slicing.
    def __getslice__(self, start, end):
        """Return the subregion [start:end].  Non-negative indices count
        from the start of this Region, negative ones from its end."""
        if start < 0:
            start += self.end
        else:
            start += self.start
        if end < 0:
            end += self.end
        else:
            end += self.start
        return Region(self, start, end)

    # Search for text.
    def find(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        Region representing the match.  The optional group argument specifies
        which grouped subexpression should be returned as the match.
        Returns None if nothing is found or the group did not take part in
        the match."""
        if hasattr(target, 'search'):
            match = target.search(self.content)
            if match and match.start(group) >= 0:
                return self[match.start(group):match.end(group)]
        else:
            start = self.content.find(target)
            if start > -1:
                return self[start:start+len(target)]

    def findall(self, target, group=0):
        """Search this Region for a string or a compiled RE and return a
        sequence of Regions representing all the matches.  For an RE, each
        search resumes at the end of the previously returned group."""
        pos = 0
        content = self.content
        matches = []
        if hasattr(target, 'search'):
            while pos <= len(content):
                match = target.search(content, pos)
                if not match:
                    break
                start, end = match.span(group)
                if start >= 0:
                    matches.append(self[start:end])
                # Resume after the group, but always move forward: an
                # empty or non-participating group would otherwise make
                # the search find the same match forever.
                if end > start:
                    resume = end
                else:
                    resume = end + 1
                pos = max(resume, pos + 1)
        else:
            # Step over at least one character so that an empty target
            # cannot stall the scan.
            step = len(target) or 1
            while pos <= len(content):
                start = content.find(target, pos)
                if start < 0:
                    break
                matches.append(self[start:start+len(target)])
                pos = start + step
        return matches

    # Search for tags.
    def firsttag(self, tagname=None, **attrs):
        """Return the Region for the first tag entirely within this Region
        with the given tag name and attributes."""
        for i in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)

    def lasttag(self, tagname=None, **attrs):
        """Return the Region for the last tag entirely within this Region
        with the given tag name and attributes."""
        for i in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(i, tagname, attrs):
                return Region(self, 0, 0, i)

    def alltags(self, tagname=None, **attrs):
        """Return a list of Regions for all the tags entirely within this
        Region with the given tag name and attributes."""
        tags = []
        for i in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(i, tagname, attrs):
                tags.append(Region(self, 0, 0, i))
        return tags

    def nexttag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag after the end of this Region
        with the given tag name and attributes."""
        return Region(self, self.end).firsttag(tagname, **attrs)

    def previoustag(self, tagname=None, **attrs):
        """Return the Region for the nearest tag before the start of this
        Region with the given tag name and attributes."""
        return Region(self, 0, self.start).lasttag(tagname, **attrs)

    # Search for elements.
    def first(self, tagname=None, content=None, **attrs):
        """Return the Region for the first properly balanced element entirely
        within this Region with the given tag name, content, and attributes.
        The element content is passed through striptags().  If the content
        argument has a match() method, the stripped content is passed into
        this method; otherwise it is compared directly as a string."""
        for starttag in range(self.tagmin, self.tagmax + 1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)

    def last(self, tagname=None, content=None, **attrs):
        """Return the Region for the last properly balanced element entirely
        within this Region with the given tag name, content, and attributes."""
        for starttag in range(self.tagmax, self.tagmin - 1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)

    def all(self, tagname=None, content=None, **attrs):
        """Return Regions for all non-overlapping balanced elements entirely
        within this Region with the given tag name, content, and attributes."""
        elements = []
        starttag = self.tagmin
        while starttag <= self.tagmax:
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content)
                if endtag is not None:
                    elements.append(Region(self, 0, 0, starttag, endtag))
                    # Skip the element's contents to avoid overlaps.
                    starttag = endtag
            starttag += 1
        return elements

    def next(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element after the end of
        this Region with the given tag name, content, and attributes."""
        return Region(self, self.end).first(tagname, content, **attrs)

    def previous(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element before the start
        of this Region with the given tag name, content, and attributes."""
        return Region(self, 0, self.start).last(tagname, content, **attrs)

    def enclosing(self, tagname=None, content=None, **attrs):
        """Return the Region for the nearest balanced element that encloses
        this Region with the given tag name, content, and attributes."""
        # Tag index 0 is valid, so test against None rather than truth;
        # otherwise an element starting at the document's first tag would
        # be reported as enclosing itself.
        if self.starttag is not None and self.endtag is not None:
            laststarttag = self.starttag - 1  # skip our own start tag
        else:
            laststarttag = self.tagmin - 1
        for starttag in range(laststarttag, -1, -1):
            if self.matchtag(starttag, tagname, attrs):
                endtag = self.matchelement(starttag, content, outside=1)
                if endtag is not None:
                    return Region(self, 0, 0, starttag, endtag)
| 609 |
|
---|
def read(path):
    """Read and return the entire contents of the file at the given path.

    The file is opened in text mode and is closed before returning, even
    if reading fails.
    """
    with open(path) as stream:
        return stream.read()
| 613 |
|
---|
def write(path, text):
    """Write the given text to a file at the given path.

    An existing file is overwritten.  The file is closed even if the
    write fails.
    """
    with open(path, 'w') as stream:
        stream.write(text)
| 619 |
|
---|
def load(path):
    """Return the deserialized contents of the file at the given path.

    The file must contain data written with marshal (see dump()).  It is
    opened in binary mode, as marshal requires, and closed before
    returning.
    """
    # NOTE: marshal is not safe against malicious data; only load files
    # that come from a trusted source.
    with open(path, 'rb') as stream:
        return marshal.load(stream)
| 623 |
|
---|
def dump(path, data):
    """Serialize the given data and write it to a file at the given path.

    The data is written with marshal, so it must consist of types that
    marshal supports.  The file is opened in binary mode, as marshal
    requires, and is closed even if serialization fails.
    """
    with open(path, 'wb') as stream:
        marshal.dump(data, stream)
| 629 |
|
---|
def getnumber(text):
    """Find and parse a floating-point or integer number in the given text,
    ignoring commas, percentage signs, and non-numeric words.

    Tags are stripped from the text first.  The first word that parses as
    an int is returned as an int, otherwise as a float if it parses as
    one.  Returns None if no word is a number.
    """
    for word in striptags(text).replace(',', '').replace('%', '').split():
        for convert in (int, float):
            try:
                return convert(word)
            except ValueError:
                # Not a number in this form; try the next form or word.
                pass
    return None