source: trunk/RetroStatus/RSS.py@ 378

Last change on this file since 378 was 201, checked in by Nicholas Riley, 17 years ago

RetroStatus

File size: 22.9 KB
Line 
1#!/usr/bin/env python
2
3"""
4RSS.py
5
6Classes for working with RSS channels as arbitrary data structures.
7Requires Python 2.2 or newer and PyXML 0.7.1 or newer.
8
9 ChannelBase - Base class for RSS Channels.
10 CollectionChannel - RSS Channel modeled as an item-per-entry
11 dictionary.
12 TrackingChannel - RSS Channel modeled as a URI-per-entry
13 dictionary.
14 RSSParser - Multi-format RSS/XML Parser.
15
16Typically, the *Channel classes will be most useful to developers.
17
18This library provides tools for working with RSS feeds as data
19structures. The core is an RSS parser capable of understanding most
20RSS formats, and a serializer that produces RSS1.0. The RSS channel
21itself can be represented as any arbitrary data structure; two such
22structures are provided both as examples and to service common
23usage. This approach allows channels to be manipulated and stored in
24a fashion that suits both their semantics and the applications that
25access them.
26
27Both the parser and the serializer have the following limitations:
28 - RSS 1.0 "rich content" modules are not supported
29 - RSS 0.9x features that rely on attributes are not supported
30 - RDF is not understood; this library does not expose statements or
31 understand RDF syntax beyond that documented in RSS1.0 (taking
32 into account the previously listed limitations)
33
34The RSS format is made up of three metadata sections (channel,
35image, and textinput) and a list of items. Each individual metadata
36section and each item is passed around as an "item dictionary",
37which is a Python dictionary with (namespace, localname) tuples as
38keys. The values of the dictionaries are always strings; they may
39contain markup, which will be rendered into the RSS/XML when
40serialized.
41
42Individual items are found by using an "item identifier"; this is a
43channel-unique, string identifier for any given item. Item
44identifiers may be generated in a variety of ways, depending on the
45requirements of the channel.
46
47Certain types of channel metadata are automatically generated, and
48will not be returned or honored when accessed. They include the
49"items", "image" and "textinput" children of the channel element.
50
51
52TODO:
53 - any markup (and the content inside) in item or metadata children
54 (e.g., HTML in a <description>) will be silently ignored.
55 - test suite
56 - a function (XPath-based?) to detect a channel's type and return
57 the appropriate class.
58 - pay attention to <rss:items> when appropriate.
59"""
60
61__license__ = """
62Copyright (c) 2004 Mark Nottingham <mnot@pobox.com>
63
64Permission is hereby granted, free of charge, to any person obtaining a copy
65of this software and associated documentation files (the "Software"), to deal
66in the Software without restriction, including without limitation the rights
67to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
68copies of the Software, and to permit persons to whom the Software is
69furnished to do so, subject to the following conditions:
70
71The above copyright notice and this permission notice shall be included in all
72copies or substantial portions of the Software.
73
74THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
75IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
76FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
77AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
78LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
79OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
80SOFTWARE.
81"""
82
83__version__ = "0.46"
84
85import UserDict, sys, codecs, sha, types, signal
86import xml.sax as sax
87import xml.sax.saxutils as saxutils
88import cPickle as pickle
89import cStringIO as StringIO
90
91versionURI = 'http://www.mnot.net/python/RSS.py?version=%s' % __version__
92
93
class _NamespaceMap:
    """
    Prefix <-> Namespace map.

    Holds prefix->namespace mappings and generates new prefixes when
    necessary.  The prefix->URI map is exposed as instance attributes
    (e.g. ``ns.dc``); the URI->prefix direction is available through
    getPrefix(URI).
    """

    def __init__(self):
        self._nsID = 0        # seed for namespace prefix generation
        self._prefixMap = {}  # URI -> prefix, filled in by __setattr__
        self.rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
        self.rss10 = 'http://purl.org/rss/1.0/'
        self.rss09 = 'http://my.netscape.com/rdf/simple/0.9/'
        self.rss091 = 'http://purl.org/rss/1.0/modules/rss091/'
        self.dc = 'http://purl.org/dc/elements/1.1/'
        self.syn = 'http://purl.org/rss/modules/syndication/'
        self.content = 'http://purl.org/rss/1.0/modules/content/'
        self.admin = 'http://webns.net/mvcb/'
        self.ag = 'http://purl.org/rss/modules/aggregation/'
        self.annotate = 'http://purl.org/rss/1.0/modules/annotate/'
        self.cp = 'http://my.theinfo.org/changed/1.0/rss/'
        self.company = 'http://purl.org/rss/1.0/modules/company'
        self.event = 'http://purl.org/rss/1.0/modules/event/'
        self.slash = 'http://purl.org/rss/1.0/modules/slash/'
        self.html = 'http://www.w3.org/html4/'

    def __setattr__(self, attr, value):
        """
        Store the attribute.  Every attribute whose name does not start
        with an underscore is treated as a prefix and is also recorded
        in the URI->prefix map, with `value` as the URI.
        """
        self.__dict__[attr] = value
        if attr[0] != '_':
            self._prefixMap[value] = attr

    def _makePrefix(self, number):
        """
        Spell a non-negative integer in base 26 using the letters a-z
        (0 -> 'a', 25 -> 'z', 26 -> 'ba').
        """
        letters = []
        while 1:
            letters.insert(0, chr(number % 26 + 97))
            # Floor division: plain "/" yields a float under true
            # division, which never reaches zero cleanly and breaks chr().
            number = number // 26
            if not number:
                break
        return "".join(letters)

    def getPrefix(self, URI):
        """
        Get the prefix for a given URI; generate one if it
        doesn't exist.

        Returns None for the RSS 1.0 namespace (it is treated as the
        default namespace).  A generated prefix is registered on the
        map, so later calls with the same URI return the same prefix.
        """
        if URI == self.rss10:
            return None  # special case
        try:
            return self._prefixMap[URI]
        except KeyError:
            pass
        taken = self._prefixMap.values()
        # Walk the generated sequence until a prefix is found that is not
        # already in use (e.g. generated 'dc' would clash with Dublin Core).
        while 1:
            candidate = self._makePrefix(self._nsID)
            self._nsID = self._nsID + 1
            if candidate not in taken:
                break
        setattr(self, candidate, URI)
        return candidate
149
150
# Module-wide prefix/namespace registry used by the parser and serializer.
ns = _NamespaceMap()

# possible namespaces for RSS docs (None included for 0.9x)
rssNamespaces = [ns.rss09, ns.rss10, None]

# major sections of a RSS file.  The third section is "textinput" (the
# name used by RSSParser and ChannelBase.output), not "textarea".
rssSections = [
    (ns.rss10, 'channel'),
    (ns.rss10, 'image'),
    (ns.rss10, 'textinput'),
]

# RSS core element localnames; elements with these names found in one of
# rssNamespaces are normalized into the RSS 1.0 namespace by RSSParser.
rssElements = ['rss', 'channel', 'image', 'textinput', 'item', 'items',
               'title', 'link', 'description', 'url']

# RSS elements whose data is in an rdf:resource attribute
rdfResources = [
    (ns.rss10, 'image'),
    (ns.rss10, 'textinput'),
    (ns.admin, 'errorReportsTo'),
    (ns.admin, 'generatorAgent'),
    (ns.annotate, 'reference'),
    (ns.cp, 'server'),
]
174
175
class ChannelBase:
    """
    Base class for RSS Channels.

    A number of generic methods for accessing and setting channel
    data and metadata are exposed, for the benefit of subclasses.
    They may be used by applications as well, or the data structure
    of the subclass may be directly manipulated.

    Subclasses supply the storage by overriding listItems, addItem,
    getItem, getMD and setMD; parsing and serialization are implemented
    here on top of those five methods.
    """

    def __init__(self):
        self.encoding = 'utf-8'  # character encoding used by output()

    def listItems(self):
        """List the items in a channel, with a list of identifiers."""
        pass  # override me

    def addItem(self, item, index=0):
        """Add an item to the channel. Expects an item dictionary."""
        pass  # override me

    def getItem(self, identifier):
        """Get the appropriate item dictionary for a given identifier."""
        pass  # override me

    def getMD(self, name):
        """
        Get the [name] metadata as an item dictionary, where name is
        a tuple (typically, in the ns:rss10 namespace, with a localname of
        channel|image|textinput). MUST return an empty dictionary if the
        metadata isn't found.
        """
        pass  # override me

    def setMD(self, name, metadata):
        """
        Set the [name] metadata, where name is a tuple (typically,
        it will be in the ns:rss10 namespace, and have a localname of
        channel|image|textinput), and metadata is an item dictionary.
        """
        pass  # override me

    def parse(self, url, timeout=30):
        """
        Fetch a channel representation from a URL and populate
        the channel.

        Where the platform provides SIGALRM, an alarm is armed for
        `timeout` seconds around the fetch; if it fires, IOError
        ('timeout') is raised.  The alarm is always cancelled and the
        previously installed SIGALRM handler is put back afterwards.

        Returns the RSSParser instance that handled the document.
        """
        dh = RSSParser(self)
        p = sax.sax2exts.make_parser()
        p.setContentHandler(dh)
        p.setFeature(sax.handler.feature_namespaces, 1)
        canAlarm = hasattr(signal, 'SIGALRM')
        previous = None
        if canAlarm:
            previous = signal.signal(signal.SIGALRM, self._timeout)
            signal.alarm(timeout)
        try:
            p.parse(str(url))  # URIs are ascii
        finally:
            if canAlarm:
                signal.alarm(0)
                # signal.signal() reports None for a handler that was not
                # installed from Python; that value cannot be re-installed.
                if previous is not None:
                    signal.signal(signal.SIGALRM, previous)
        return dh

    def _timeout(self, *args, **kwargs):
        """
        SIGALRM handler used by parse().  Signal handlers are invoked
        with positional (signal number, frame) arguments, which are
        accepted and ignored here.
        """
        raise IOError('timeout')

    def parseFile(self, file):
        """
        Parse a file and populate the channel.

        Returns the RSSParser instance that handled the document.
        """
        dh = RSSParser(self)
        p = sax.sax2exts.make_parser()
        p.setContentHandler(dh)
        p.setFeature(sax.handler.feature_namespaces, 1)
        p.parseFile(file)
        return dh

    def __str__(self):
        return self.output(self.listItems())

    def _itemURI(self, item):
        """
        Return the URI that identifies an item dictionary in the output:
        its rss:link when present, otherwise a hash of its content.
        """
        link = (ns.rss10, 'link')
        if link in item:
            return item[link]
        return _make_hash(item)

    def _writeProperty(self, o, name, data):
        """
        Write one child element, indented by a space and followed by a
        newline.  Elements listed in rdfResources carry their data in an
        rdf:resource attribute; all others carry it as character data,
        flagged rdf:parseType="Literal" when the data contains a "<".
        """
        o.ignorableWhitespace(' ')
        if name in rdfResources:
            o.startElementNS(name, None, {(ns.rdf, 'resource'): data})
        else:
            if "<" in data:
                o.startElementNS(name, None,
                                 {(ns.rdf, "parseType"): "Literal"})
            else:
                o.startElementNS(name, None, {})
            o.characters(data)
        o.endElementNS(name, None)
        o.ignorableWhitespace('\n')

    def _writeReference(self, o, name, uri):
        """Write an empty element pointing at `uri` through rdf:resource."""
        o.startElementNS(name, None, {(ns.rdf, 'resource'): uri})
        o.endElementNS(name, None)
        o.ignorableWhitespace('\n')

    def _writeSection(self, o, name, metadata, aboutKey):
        """
        Write a top-level metadata section (image or textinput) with all
        of its properties.  Nothing is written when the metadata lacks
        `aboutKey`, the property that supplies the section's rdf:about.
        """
        if aboutKey not in metadata:
            return
        o.startElementNS(name, None, {(ns.rdf, 'about'): metadata[aboutKey]})
        for key, data in metadata.items():
            self._writeProperty(o, key, data)
        o.endElementNS(name, None)
        o.ignorableWhitespace('\n')

    def output(self, items):
        """
        Return the items referred to by a list of identifiers,
        serialized together with the channel metadata as an RSS 1.0
        document (a string).

        `items` must be a list of item identifiers understood by
        getItem().  Raises KeyError when the channel metadata has no
        rss:link, because the link supplies the channel's rdf:about.
        """
        assert isinstance(items, list), "items must be a list (%s)" % \
            type(items)
        out = StringIO.StringIO()
        o = _XMLGenerator(out, self.encoding, 'replace')
        # Copy the channel metadata so that stamping the generator agent
        # below does not modify what the channel has stored.
        channelMD = dict(self.getMD((ns.rss10, "channel")))
        imageMD = self.getMD((ns.rss10, "image"))
        textinputMD = self.getMD((ns.rss10, "textinput"))
        channelMD[(ns.admin, 'generatorAgent')] = versionURI
        records = [self.getItem(identifier) for identifier in items]

        # gather namespaces, map prefixes
        namespaces = {ns.rdf: 1}
        for mapping in [channelMD, imageMD, textinputMD] + records:
            for key in mapping.keys():
                namespaces[key[0]] = 1
        for namespace in namespaces.keys():
            o.startPrefixMapping(ns.getPrefix(namespace), namespace)

        # write the XML
        o.startDocument()
        o.startElementNS((ns.rdf, 'RDF'), None, {})
        o.ignorableWhitespace('\n')
        o.startElementNS(
            (ns.rss10, 'channel'), None,
            {(ns.rdf, 'about'): channelMD[(ns.rss10, 'link')]})
        o.ignorableWhitespace('\n')

        # /channel
        # items, image and textinput are generated below, never copied
        # from the stored metadata.
        generated = [(ns.rss10, 'items'), (ns.rss10, 'image'),
                     (ns.rss10, 'textinput')]
        for name, data in channelMD.items():
            if name in generated:
                continue
            self._writeProperty(o, name, data)

        # /channel/items
        o.ignorableWhitespace(' ')
        o.startElementNS((ns.rss10, 'items'), None, {})
        o.startElementNS((ns.rdf, 'Seq'), None, {})
        o.ignorableWhitespace('\n')
        for record in records:
            o.ignorableWhitespace('  ')
            o.startElementNS((ns.rdf, 'li'), None,
                             {(ns.rdf, 'resource'): self._itemURI(record)})
            o.endElementNS((ns.rdf, 'li'), None)
            o.ignorableWhitespace('\n')
        o.ignorableWhitespace(' ')
        o.endElementNS((ns.rdf, 'Seq'), None)
        o.endElementNS((ns.rss10, 'items'), None)
        o.ignorableWhitespace('\n')

        # /channel/image and /channel/textinput
        # Inside the channel these are references to the top-level
        # sections, so they use rdf:resource (both names are listed in
        # rdfResources); rdf:about is reserved for the sections themselves.
        if (ns.rss10, 'url') in imageMD:
            self._writeReference(o, (ns.rss10, 'image'),
                                 imageMD[(ns.rss10, 'url')])
        if (ns.rss10, 'link') in textinputMD:
            self._writeReference(o, (ns.rss10, 'textinput'),
                                 textinputMD[(ns.rss10, 'link')])
        o.endElementNS((ns.rss10, 'channel'), None)
        o.ignorableWhitespace('\n')

        # /image
        self._writeSection(o, (ns.rss10, 'image'), imageMD,
                           (ns.rss10, 'url'))

        # /textinput
        self._writeSection(o, (ns.rss10, 'textinput'), textinputMD,
                           (ns.rss10, 'link'))

        # /item
        for record in records:
            o.startElementNS(
                (ns.rss10, 'item'), None,
                {(ns.rdf, 'about'): self._itemURI(record)})
            o.ignorableWhitespace('\n')
            for name, data in record.items():
                self._writeProperty(o, name, data)
            o.endElementNS((ns.rss10, 'item'), None)
            o.ignorableWhitespace('\n')
        o.endElementNS((ns.rdf, 'RDF'), None)
        o.endDocument()
        return out.getvalue()
395
396
397
class TrackingChannel(ChannelBase, UserDict.UserDict):
    """
    RSS Channel modeled as a URI-per-entry dictionary.

    Item identifiers are (uri, index) tuples, where uri is
    the rdf:about or rss:link URI, and index indicates the
    (1-based) position in the list of versions recorded for
    that URI.

    This allows "tracking" channels that track the state of
    a group of resources, such as stock tickers, file state
    changes, etc.

    For example:

    {
        (ns.rss10, "channel"): {
            (ns.rss10, "title"): "the channel",
            (ns.rss10, "description"): "whatever",
        },
        (ns.rss10, "items"):
            [("http://example.com/foo", 2), ("http://example.com/foo", 1),
             ("http://example.com/bar", 1), ... ],
        "http://example.com/foo": [
            {
                (ns.rss10, "title"): "item 1",
                (ns.rss10, "link"): "http://example.com/foo",
                (ns.rss10, "description"): "foo",
            },
            {
                (ns.rss10, "title"): "item 1 revised",
                (ns.rss10, "link"): "http://example.com/foo",
                (ns.rss10, "description"): "foo revisited",
            },
        ],
        "http://example.com/bar": [
            ...
        ]
    }

    """

    def __init__(self, data=None):
        ChannelBase.__init__(self)
        UserDict.UserDict.__init__(self, data)
        # The item list always starts out empty, even if `data` had one.
        self.data[(ns.rss10, 'items')] = []

    def listItems(self):
        """Return the list of (uri, index) identifiers, in channel order."""
        return self.data[(ns.rss10, 'items')]

    def addItem(self, item, index=0):
        """
        Record an item dictionary as the newest version of its URI and
        insert its identifier at position `index` of the item list
        (0, the default, is the front; -1 means the end).
        """
        order = self.data[(ns.rss10, 'items')]
        if index == -1:
            index = len(order)
        link = (ns.rss10, "link")
        if link in item:
            uri = item[link]
        else:
            uri = _make_hash(item)  # shouldn't happen
        versions = self.data.setdefault(uri, [])
        versions.append(item)
        order.insert(index, (uri, len(versions)))

    def truncateToLength(self, length):
        """
        Drop identifiers from the end of the item list until at most
        `length` remain.

        The versions stored for a URI are deleted only once no remaining
        identifier refers to that URI: identifiers address versions by
        position, so removing a single version would invalidate the
        others.
        """
        order = self.listItems()
        while order and len(order) > length:
            uri = order.pop()[0]
            stillReferenced = 0
            for identifier in order:
                if identifier[0] == uri:
                    stillReferenced = 1
                    break
            if not stillReferenced and uri in self.data:
                del self.data[uri]

    def getItem(self, identifier):
        """
        Return the item dictionary for a (uri, index) identifier, or an
        empty dictionary when the URI or version is not stored.
        """
        (uri, index) = identifier
        try:
            return self.data[uri][index - 1]
        except (KeyError, IndexError):
            return {}

    def getMD(self, name):
        """Return the metadata stored under `name`, or an empty dictionary."""
        return self.data.get(name, {})

    def setMD(self, name, metadata):
        """Store `metadata` (an item dictionary) under `name`."""
        self.data[name] = metadata
477
478
479
class CollectionChannel(ChannelBase, UserDict.UserDict):
    """
    RSS Channel modeled as an item-per-entry dictionary.

    Each Item is hashed to create a unique entry in the
    dictionary, no matter how many times a particular
    URI is in the channel.

    This allows "collection" channels, which are typically
    used for news updates, etc.

    For example:

    {
        (ns.rss10, "channel"): {
            (ns.rss10, "title"): "the channel",
            (ns.rss10, "description"): "whatever",
        },
        (ns.rss10, "items"): ["ID1", "ID2", ... ],
        "ID1": {
            (ns.rss10, "title"): "item 1",
            (ns.rss10, "link"): "http://example.com/",
            (ns.rss10, "description"): "foo",
        },
        "ID2": {
            ...
        }
    }

    Note that:
     - items are keyed by a hash-data URI; metadata is keyed
       by a (namespace, localname) tuple.
     - (ns.rss10, items) is a property; it cannot be
       manipulated without manipulating the corresponding
       (sub-)items (delete, add)
     - likewise, all items are properties; adding, deleting,
       appending an item modifies (ns.rss10, items)
       correspondingly
    """

    def __init__(self, data=None):
        ChannelBase.__init__(self)
        UserDict.UserDict.__init__(self, data)
        # The item list always starts out empty, even if `data` had one.
        self.data[(ns.rss10, 'items')] = []

    def listItems(self):
        """Return the list of item identifiers, in channel order."""
        return self.data[(ns.rss10, 'items')]

    def addItem(self, item, index=0):
        """
        Store an item dictionary under the hash of its content and
        insert that identifier at position `index` of the item list
        (0, the default, is the front; -1 means the end).
        """
        order = self.data[(ns.rss10, 'items')]
        if index == -1:
            index = len(order)
        ID = _make_hash(item)
        self.data[ID] = item
        order.insert(index, ID)

    def getItem(self, identifier):
        """Return the item stored under `identifier`, or an empty dictionary."""
        return self.data.get(identifier, {})

    def getMD(self, name):
        """Return the metadata stored under `name`, or an empty dictionary."""
        return self.data.get(name, {})

    def setMD(self, name, metadata):
        """Store `metadata` (an item dictionary) under `name`."""
        self.data[name] = metadata
543
544
class _XMLGenerator(saxutils.XMLGenerator):
    """
    Modified XMLGenerator.

    Allows modification of encoding error handling, and recovers from
    byte strings that cannot be written directly by decoding them
    (as UTF-8, then as Latin-1) before writing.
    """

    def __init__(self, out=None, encoding='iso-8859-1', errors='strict'):
        """
        out      - stream to write to (sys.stdout when None)
        encoding - output character encoding
        errors   - codec error handling scheme for the output stream
        """
        saxutils.XMLGenerator.__init__(self, out=out, encoding=encoding)
        if out is None:
            out = sys.stdout
        # Replace the output stream with a writer that honours `errors`.
        self._out = codecs.getwriter(encoding)(out, errors)

    def characters(self, content):
        """Write `content` as escaped character data."""
        try:
            self._out.write(saxutils.escape(content))
        except UnicodeError:  # hack for broken content
            if isinstance(content, unicode):
                # Already text: the failure is an encoding error that
                # re-decoding cannot repair.
                raise
            # A byte string with non-ASCII bytes.  RSSParser stores text
            # as UTF-8 by default, so try that first; Latin-1 accepts any
            # byte sequence and is the last resort.
            try:
                text = unicode(content, 'utf-8')
            except UnicodeError:
                text = unicode(content, 'Latin-1')
            self._out.write(saxutils.escape(text))
565
566
class RSSParser(sax.handler.ContentHandler):
    """
    Multi-format RSS/XML Parser.

    Parse XML into RSS Channel objects. Must be passed a
    Channel() instance to append to.

    Formats understood include:
     - RSS 0.9
     - RSS 0.91
     - RSS 0.92
     - RSS 1.0 (EXCEPT "rich content" modules)

    "Core" RSS elements are normalized to the RSS1.0 namespace.

    This is a namespace-aware SAX content handler: element names
    arrive as (namespace, localname) tuples.
    """

    def __init__(self, channel, encoding='utf-8'):
        """
        channel  - channel object that receives items (addItem) and,
                   at the end of the document, metadata (setMD)
        encoding - encoding applied to character data before it is
                   stored in item dictionaries
        """
        sax.handler.ContentHandler.__init__(self)
        self.channel = channel
        self.encoding = encoding
        self._context = []    # stack of normalized names of open elements
        self._tmp_item = {}   # item dictionary being assembled
        self._tmp_md = {      # metadata sections being assembled
            (ns.rss10, "channel"): {},
            (ns.rss10, "image"): {},
            (ns.rss10, "textinput"): {},
        }
        self._tmp_buf = ''    # character data seen since the last end tag
        self.version = None   # sniffed from an <rss> root element, if any

    def _normalize(self, name):
        """
        Map a core RSS element name into the RSS 1.0 namespace, and any
        other un-namespaced name into the rss091 module namespace.
        Other names are returned unchanged.
        """
        if name[0] in rssNamespaces and name[1] in rssElements:
            return (ns.rss10, name[1])
        if name[0] is None:
            return (ns.rss091, name[1])
        return name

    def startElementNS(self, name, qname, attrs):
        if name[1] == 'rss':  # sniff version
            if name[0] is None:
                # Attribute names are (namespace, localname) tuples too.
                self.version = attrs.get((None, 'version'), None)
            else:
                self.version = name[0]
        name = self._normalize(name)
        self._context.append(name)
        if name == (ns.rss10, 'item'):
            self._tmp_item = {}
            self._tmp_buf = ''
        elif len(self._context) > 1 and \
                self._context[-2] == (ns.rss10, 'item') and \
                name in rdfResources:
            # An item's child whose value lives in rdf:resource; skip
            # it when the attribute is missing instead of failing.
            resource = attrs.get((ns.rdf, 'resource'), None)
            if resource is not None:
                self._tmp_item[name] = resource

    def endElementNS(self, name, qname):
        name = self._normalize(name)
        item = (ns.rss10, 'item')
        if item in self._context:
            if self._context[-1] == item:  # end of an item
                # Append: the insertion index is the current item count.
                self.channel.addItem(self._tmp_item,
                                     len(self.channel.listItems()))
                self._tmp_item = {}
            elif self._context[-2] == item:  # an item's child
                if name not in rdfResources:
                    self._tmp_item[name] = self._tmp_buf.strip()
            # deeper descendants of an item are ignored
        elif len(self._context) > 2 and self._context[-2] in self._tmp_md:
            # a child of channel, image or textinput: metadata
            self._tmp_md[self._context[-2]][name] = self._tmp_buf.strip()
        self._tmp_buf = ''
        self._context.pop()

    def endDocument(self):
        """Hand every collected metadata section to the channel."""
        for name, metadata in self._tmp_md.items():
            self.channel.setMD(name, metadata)

    def characters(self, content):
        """Accumulate character data, encoded with self.encoding."""
        self._tmp_buf = self._tmp_buf + content.encode(self.encoding)
644
645
def _make_hash(data):
    """
    Return a "hash-data:SHA:" identifier for `data`: the prefix followed
    by the first 20 hex digits of the SHA-1 digest of its pickle.
    """
    # hashlib supersedes the deprecated `sha` module and produces the same
    # SHA-1 digest; fall back to `sha` on interpreters without hashlib.
    try:
        from hashlib import sha1 as digest
    except ImportError:
        digest = sha.new
    return "hash-data:SHA:" + digest(pickle.dumps(data)).hexdigest()[:20]
648
649
if __name__ == "__main__":
    # a simple test: fetch the feed named on the command line and
    # print it back out as RSS 1.0
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <feed-url>\n" % sys.argv[0])
        sys.exit(2)
    c = TrackingChannel()
    c.parse(sys.argv[1])
    sys.stdout.write(str(c) + "\n")
Note: See TracBrowser for help on using the repository browser.