Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

RSS.py@ 378

Last change on this file since 378 was 201, checked in by Nicholas Riley, 18 years ago
RetroStatus
File size: 22.9 KB

Line
1	#!/usr/bin/env python
2
3	"""
4	RSS.py
5
6	Classes for working with RSS channels as arbitrary data structures.
7	Requires Python 2.2 or newer and PyXML 0.7.1 or newer.
8
9	ChannelBase - Base class for RSS Channels.
10	CollectionChannel - RSS Channel modeled as an item-per-entry
11	dictionary.
12	TrackingChannel - RSS Channel modeled as a URI-per-entry
13	dictionary.
14	RSSParser - Multi-format RSS/XML Parser.
15
16	Typically, the *Channel classes will be most useful to developers.
17
18	This library provides tools for working with RSS feeds as data
19	structures. The core is an RSS parser capable of understanding most
20	RSS formats, and a serializer that produces RSS1.0. The RSS channel
21	itself can be represented as any arbitrary data structure; two such
22	structures are provided both as examples and to service common
23	usage. This approach allows channels to be manipulated and stored in
24	a fashion that suits both their semantics and the applications that
25	access them.
26
27	Both the parser and the serializer have the following limitations:
28	- RSS 1.0 "rich content" modules are not supported
29	- RSS 0.9x features that rely on attributes are not supported
30	- RDF is not understood; this library does not expose statements or
31	understand RDF syntax beyond that documented in RSS1.0 (taking
32	into account the previously listed limitations)
33
34	The RSS format is made up of three metadata sections (channel,
35	image, and textinput) and a list of items. Each individual metadata
36	section and each item is passed around as an "item dictionary",
37	which is a Python dictionary with (namespace, localname) tuples as
38	keys. The values of the dictionaries are always strings; they may
39	contain markup, which will be rendered into the RSS/XML when
40	serialized.
41
42	Individual items are found by using an "item identifier"; this is a
43	channel-unique, string identifier for any given item. Item
44	identifiers may be generated in a variety of ways, depending on the
45	requirements of the channel.
46
47	Certain types of channel metadata are automatically generated, and
48	will not be returned or honored when accessed. They include the
49	"items", "image" and "textinput" children of the channel element.
50
51
52	TODO:
53	- any markup (and the content inside) in item or metadata children
54	(e.g., HTML in a <description>) will be silently ignored.
55	- test suite
56	- a function (XPath-based?) to detect a channel's type and return
57	the appropriate class.
58	- pay attention to <rss:items> when appropriate.
59	"""
60
61	__license__ = """
62	Copyright (c) 2004 Mark Nottingham <mnot@pobox.com>
63
64	Permission is hereby granted, free of charge, to any person obtaining a copy
65	of this software and associated documentation files (the "Software"), to deal
66	in the Software without restriction, including without limitation the rights
67	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
68	copies of the Software, and to permit persons to whom the Software is
69	furnished to do so, subject to the following conditions:
70
71	The above copyright notice and this permission notice shall be included in all
72	copies or substantial portions of the Software.
73
74	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
75	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
76	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
77	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
78	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
79	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
80	SOFTWARE.
81	"""
82
83	__version__ = "0.46"
84
85	import UserDict, sys, codecs, sha, types, signal
86	import xml.sax as sax
87	import xml.sax.saxutils as saxutils
88	import cPickle as pickle
89	import cStringIO as StringIO
90
91	versionURI = 'http://www.mnot.net/python/RSS.py?version=%s' % __version__
92
93
94	class _NamespaceMap:
95	"""
96	Prefix <-> Namespace map.
97
98	Hold prefix->namespace mappings, and generate new prefixes when
99	necessary. Exposes prefix->URI map as attributes, URI->prefix
100	through getPrefix(URI).
101	"""
102
103	def __init__(self):
104	self._nsID = 0 # seed for namespace prefix generation
105	self._prefixMap = {}
106	self.rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
107	self.rss10 = 'http://purl.org/rss/1.0/'
108	self.rss09 = 'http://my.netscape.com/rdf/simple/0.9/'
109	self.rss091 = 'http://purl.org/rss/1.0/modules/rss091/'
110	self.dc = 'http://purl.org/dc/elements/1.1/'
111	self.syn = 'http://purl.org/rss/modules/syndication/'
112	self.content = 'http://purl.org/rss/1.0/modules/content/'
113	self.admin = 'http://webns.net/mvcb/'
114	self.ag = 'http://purl.org/rss/modules/aggregation/'
115	self.annotate = 'http://purl.org/rss/1.0/modules/annotate/'
116	self.cp = 'http://my.theinfo.org/changed/1.0/rss/'
117	self.company = 'http://purl.org/rss/1.0/modules/company'
118	self.event = 'http://purl.org/rss/1.0/modules/event/'
119	self.slash = 'http://purl.org/rss/1.0/modules/slash/'
120	self.html = 'http://www.w3.org/html4/'
121
122	def __setattr__(self, attr, value):
123	self.__dict__[attr] = value
124	if attr[0] != '_':
125	self._prefixMap[value] = attr
126
127	def getPrefix(self, URI):
128	"""
129	Get the prefix for a given URI; generate one if it
130	doesn't exist.
131	"""
132	try:
133	if URI == self.rss10:
134	return None # special case
135	return self._prefixMap[URI]
136	except KeyError:
137	o = []
138	d = self._nsID
139	while 1:
140	o.insert(0, d % 26)
141	d = d / 26
142	if not d: break
143	candidate = "".join(map(lambda a: chr(a+97), o))
144	self._nsID = self._nsID + 1
145	if candidate in self._prefixMap.values():
146	candidate = self.getPrefix(URI)
147	setattr(self, candidate, URI)
148	return candidate
149
150
ns = _NamespaceMap()

# possible namespaces for RSS docs (None included for 0.9x)
rssNamespaces = [ns.rss09, ns.rss10, None]

# major sections of a RSS file. The third section is "textinput" (the
# element name RSS defines); it was previously misspelled "textarea",
# which never matched a real feed.
rssSections = [ (ns.rss10, 'channel'),
                (ns.rss10, 'image'),
                (ns.rss10, 'textinput')
              ]

# RSS core element localnames
rssElements = ['rss', 'channel', 'image', 'textinput', 'item', 'items',
               'title', 'link', 'description', 'url']

# RSS elements whose data is in an rdf:resource attribute
rdfResources = [ (ns.rss10, 'image'),
                 (ns.rss10, 'textinput'),
                 (ns.admin, 'errorReportsTo'),
                 (ns.admin, 'generatorAgent'),
                 (ns.annotate, 'reference'),
                 (ns.cp, 'server')
               ]
174
175
class ChannelBase:
    """
    Base class for RSS Channels.

    A number of generic methods for accessing and setting channel
    data and metadata are exposed, for the benefit of subclasses.
    They may be used by applications as well, or the data structure
    of the subclass may be directly manipulated.

    Subclasses must override listItems, addItem, getItem, getMD and
    setMD; parsing and serialization are implemented here in terms of
    those five methods.
    """

    def __init__(self):
        self.encoding = 'utf-8'

    def listItems(self):
        """List the items in a channel, with a list of identifiers."""
        pass # override me

    def addItem(self, item, index=0):
        """Add an item to the channel. Expects an item dictionary."""
        pass # override me

    def getItem(self, identifier):
        """Get the appropriate item dictionary for a given identifier."""
        pass # override me

    def getMD(self, name):
        """
        Get the [name] metadata as an item dictionary, where name is
        a tuple (typically, in the ns:rss10 namespace, with a localname of
        channel|image|textinput). MUST return an empty dictionary if the
        metadata isn't found.
        """
        pass # override me

    def setMD(self, name, metadata):
        """
        Set the [name] metadata, where name is a tuple (typically,
        it will be in the ns:rss10 namespace, and have a localname of
        channel|image|textinput), and metadata is an item dictionary.
        """
        pass # override me

    def _makeParser(self):
        """Return (parser, handler): a namespace-aware SAX parser feeding
        a new RSSParser bound to this channel."""
        dh = RSSParser(self)
        p = sax.sax2exts.make_parser()
        p.setContentHandler(dh)
        p.setFeature(sax.handler.feature_namespaces, 1)
        return p, dh

    def parse(self, url, timeout=30):
        """
        Fetch a channel representation from a URL and populate
        the channel.

        timeout is the number of seconds passed to signal.alarm();
        when it fires, IOError('timeout') is raised. Returns the
        RSSParser used for the parse.
        """
        p, dh = self._makeParser()
        # signal.signal returns the handler that was installed before,
        # so it can be put back once the parse is over.
        previous = signal.signal(signal.SIGALRM, self._timeout)
        signal.alarm(timeout)
        try:
            p.parse(str(url)) # URIs are ascii
        finally:
            signal.alarm(0)
            # None means the old handler was not installed from Python
            # and cannot be re-installed through signal.signal().
            if previous is not None:
                signal.signal(signal.SIGALRM, previous)
        return dh

    def _timeout(self, *args, **kwargs):
        """SIGALRM handler. Signal handlers are invoked with positional
        (signum, frame) arguments, so they must be accepted here."""
        raise IOError('timeout')

    def parseFile(self, file):
        """Parse a file and populate the channel. Returns the RSSParser."""
        p, dh = self._makeParser()
        p.parseFile(file)
        return dh

    def __str__(self):
        return self.output(self.listItems())

    def _itemURI(self, item):
        """Return the item's rss:link, or a hash identifier if it has none."""
        return item.get((ns.rss10, 'link'), _make_hash(item))

    def _writeChildren(self, o, metadata, skip=()):
        """Write each entry of an item dictionary as a child element,
        one per line, omitting any names listed in skip."""
        for name, data in metadata.items():
            if name in skip:
                continue
            o.ignorableWhitespace(' ')
            if name in rdfResources:
                o.startElementNS(name, None, {(ns.rdf, 'resource'): data})
            else:
                if "<" in data:
                    attrs = {(ns.rdf, "parseType"): "Literal"}
                else:
                    attrs = {}
                o.startElementNS(name, None, attrs)
                o.characters(data)
            o.endElementNS(name, None)
            o.ignorableWhitespace('\n')

    def _writeSection(self, o, name, about, metadata):
        """Write a top-level image/textinput element with its children."""
        o.startElementNS(name, None, {(ns.rdf, 'about'): about})
        self._writeChildren(o, metadata)
        o.endElementNS(name, None)
        o.ignorableWhitespace('\n')

    def output(self, items):
        """
        Return the items referred to by a list of identifiers,
        serialized (together with the channel metadata) as an RSS 1.0
        document string.

        Raises AssertionError if items is not a list, and KeyError if
        the channel metadata has no rss:link.
        """
        assert type(items) is types.ListType, "items must be a list (%s)" % \
            type(items)
        out = StringIO.StringIO()
        o = _XMLGenerator(out, self.encoding, 'replace')
        # Work on a copy so that stamping the generatorAgent does not
        # alter the channel's stored metadata.
        channelMD = dict(self.getMD((ns.rss10, "channel")))
        imageMD = self.getMD((ns.rss10, "image"))
        textinputMD = self.getMD((ns.rss10, "textinput"))
        channelMD[(ns.admin, 'generatorAgent')] = versionURI
        imageKey = (ns.rss10, 'url')
        textinputKey = (ns.rss10, 'link')

        # gather namespaces, map prefixes
        namespaces = {ns.rdf: 1}
        sources = [channelMD, imageMD, textinputMD] + \
            [self.getItem(ident) for ident in items]
        for source in sources:
            for key in source.keys():
                namespaces[key[0]] = 1
        for namespace in namespaces.keys():
            o.startPrefixMapping(ns.getPrefix(namespace), namespace)

        # write the XML
        o.startDocument()
        o.startElementNS((ns.rdf, 'RDF'), None, {})
        o.ignorableWhitespace('\n')
        o.startElementNS(
            (ns.rss10, 'channel'), None,
            {(ns.rdf, 'about'): channelMD[(ns.rss10, 'link')]})
        o.ignorableWhitespace('\n')

        # /channel (items, image and textinput are generated below)
        self._writeChildren(o, channelMD,
            skip=[(ns.rss10, 'items'), (ns.rss10, 'image'),
                  (ns.rss10, 'textinput')])

        # /channel/items
        o.ignorableWhitespace(' ')
        o.startElementNS((ns.rss10, 'items'), None, {})
        o.startElementNS((ns.rdf, 'Seq'), None, {})
        o.ignorableWhitespace('\n')
        for ident in items:
            o.ignorableWhitespace(' ')
            o.startElementNS((ns.rdf, 'li'), None,
                {(ns.rdf, 'resource'): self._itemURI(self.getItem(ident))})
            o.endElementNS((ns.rdf, 'li'), None)
            o.ignorableWhitespace('\n')
        o.ignorableWhitespace(' ')
        o.endElementNS((ns.rdf, 'Seq'), None)
        o.endElementNS((ns.rss10, 'items'), None)
        o.ignorableWhitespace('\n')

        # /channel/image
        if imageKey in imageMD:
            o.startElementNS((ns.rss10, 'image'), None,
                {(ns.rdf, 'about'): imageMD[imageKey]})
            o.endElementNS((ns.rss10, 'image'), None)
            o.ignorableWhitespace('\n')

        # /channel/textinput
        if textinputKey in textinputMD:
            o.startElementNS((ns.rss10, 'textinput'), None,
                {(ns.rdf, 'about'): textinputMD[textinputKey]})
            o.endElementNS((ns.rss10, 'textinput'), None)
            o.ignorableWhitespace('\n')
        o.endElementNS((ns.rss10, 'channel'), None)
        o.ignorableWhitespace('\n')

        # /image
        if imageKey in imageMD:
            self._writeSection(o, (ns.rss10, 'image'),
                imageMD[imageKey], imageMD)

        # /textinput
        if textinputKey in textinputMD:
            self._writeSection(o, (ns.rss10, 'textinput'),
                textinputMD[textinputKey], textinputMD)

        # /item
        for ident in items:
            item = self.getItem(ident)
            o.startElementNS(
                (ns.rss10, 'item'), None,
                {(ns.rdf, 'about'): self._itemURI(item)})
            o.ignorableWhitespace('\n')
            self._writeChildren(o, item)
            o.endElementNS((ns.rss10, 'item'), None)
            o.ignorableWhitespace('\n')
        o.endElementNS((ns.rdf, 'RDF'), None)
        o.endDocument()
        out.seek(0)
        return out.read()
395
396
397
class TrackingChannel(ChannelBase, UserDict.UserDict):
    """
    RSS Channel modeled as a URI-per-entry dictionary.

    Item identifiers are (uri, index) tuples, where uri is
    the rdf:about or rss:link URI, and index indicates the
    (1-based) position in the list of versions stored for that
    URI, i.e. which of the times that URI has appeared in the
    channel is meant.

    This allows "tracking" channels that track the state of
    a group of resources, such as stock tickers, file state
    changes, etc.

    For example:

    {
        (ns.rss10, "channel"): {
            (ns.rss10, "title"): "the channel",
            (ns.rss10, "description"): "whatever",
        },
        (ns.rss10, "items"):
            [("http://example.com/foo", 2), ("http://example.com/foo", 1),
             ("http://example.com/bar", 1), ... ],
        "http://example.com/foo": [
            {
                (ns.rss10, "title"): "item 1",
                (ns.rss10, "link"): "http://example.com/foo",
                (ns.rss10, "description"): "foo",
            },
            {
                (ns.rss10, "title"): "item 1 revised",
                (ns.rss10, "link"): "http://example.com/foo",
                (ns.rss10, "description"): "foo revisited",
            },
        ],
        "http://example.com/bar": [
            ...
        ]
    }

    """

    def __init__(self, data=None):
        """
        Create a channel, optionally seeded from the mapping data.
        The (ns.rss10, 'items') list always starts out empty, even if
        data supplies one.
        """
        ChannelBase.__init__(self)
        UserDict.UserDict.__init__(self, data)
        self.data[(ns.rss10, 'items')] = []

    def listItems(self):
        """Return the list of (uri, index) item identifiers."""
        return self[(ns.rss10, 'items')]

    def addItem(self, item, index=0):
        """
        Store an item dictionary under its URI and list its identifier
        at position index (0 = front, the default; -1 = end).
        """
        listing = self.data[(ns.rss10, 'items')]
        if index == -1: index = len(listing)
        uri = item.get((ns.rss10, "link"), _make_hash(item)) # shouldn't happen
        self.data.setdefault(uri, []).append(item)
        # the identifier's index is 1-based: the count of versions so far
        listing.insert(index, (uri, len(self.data[uri])))

    def truncateToLength(self, length):
        """
        Drop identifiers from the end of the item list until at most
        length remain.

        The versions stored for a URI are deleted only once no
        remaining identifier refers to that URI, so the indexes of the
        surviving identifiers stay valid and a URI that is listed
        several times no longer raises KeyError.
        """
        items = self.listItems()
        if length < 0:
            length = 0
        dropped = items[length:]
        del items[length:]
        stillListed = {}
        for ident in items:
            stillListed[ident[0]] = 1
        for ident in dropped:
            uri = ident[0]
            if uri not in stillListed and uri in self.data:
                del self.data[uri]

    def getItem(self, identifier):
        """Return the item dictionary for a (uri, index) identifier, or
        an empty dictionary if there is no such item."""
        (uri, index) = identifier
        try:
            return self.data[uri][index-1]
        except (KeyError, IndexError):
            return {}

    def getMD(self, name):
        """Return the metadata dictionary stored under name, or {}."""
        return self.data.get(name, {})

    def setMD(self, name, metadata):
        """Store the metadata dictionary under name."""
        self.data[name] = metadata
477
478
479
class CollectionChannel(ChannelBase, UserDict.UserDict):
    """
    RSS Channel modeled as an item-per-entry dictionary.

    Every item is hashed, and the hash is its key in the
    dictionary, regardless of how often a given URI shows up
    in the channel.

    This suits "collection" channels, which are typically
    used for news updates, etc.

    For example:

    {
        (ns.rss10, "channel"): {
            (ns.rss10, "title"): "the channel",
            (ns.rss10, "description"): "whatever",
        },
        (ns.rss10, "items"): ["ID1", "ID2", ... ],
        "ID1": {
            (ns.rss10, "title"): "item 1",
            (ns.rss10, "link"): "http://example.com/",
            (ns.rss10, "description"): "foo",
        },
        "ID2": {
            ...
        }
    }

    Note that:
    - items are keyed by a hash-data URI; metadata is keyed
      by a (namespace, localname) tuple.
    - (ns.rss10, items) is a property; it cannot be
      manipulated without manipulating the corresponding
      (sub-)items (delete, add)
    - likewise, all items are properties; adding, deleting,
      appending an item modifies (ns.rss10, items)
      correspondingly
    """

    def __init__(self, data={}):
        ChannelBase.__init__(self)
        UserDict.UserDict.__init__(self, data)
        # the identifier list always starts out empty
        self.data[(ns.rss10, 'items')] = []

    def listItems(self):
        """Return the list of item identifiers (hash-data URIs)."""
        identifiers = self.data[(ns.rss10, 'items')]
        return identifiers

    def addItem(self, item, index=0):
        """Store an item dictionary under its hash and list that hash at
        position index (-1 means the end of the list)."""
        identifiers = self.data[(ns.rss10, 'items')]
        if index == -1:
            index = len(identifiers)
        key = _make_hash(item)
        self.data[key] = item
        identifiers.insert(index, key)

    def getItem(self, identifier):
        """Return the item dictionary for identifier, or {} if unknown."""
        try:
            return self.data[identifier]
        except KeyError:
            return {}

    def getMD(self, name):
        """Return the metadata dictionary stored under name, or {}."""
        try:
            return self.data[name]
        except KeyError:
            return {}

    def setMD(self, name, metadata):
        """Store the metadata dictionary under name."""
        self.data[name] = metadata
543
544
545	class _XMLGenerator(saxutils.XMLGenerator):
546	"""
547	Modified XMLGenerator.
548
549	Allows modification of encoding error handling, and tries to
550	encode problematic characters as Latin-1 to work around some
551	implementations.
552	"""
553
554	def __init__(self, out=None, encoding='iso-8859-1', errors='strict'):
555	saxutils.XMLGenerator.__init__(self, out=out, encoding=encoding)
556	if out is None:
557	out = sys.stdout
558	self._out = codecs.lookup(encoding)[3](out, errors)
559
560	def characters(self, content):
561	try:
562	self._out.write(sax.saxutils.escape(content))
563	except UnicodeError: # hack for broken content
564	self._out.write(sax.saxutils.escape(unicode(content, 'Latin-1')))
565
566
class RSSParser(sax.handler.ContentHandler):
    """
    Multi-format RSS/XML Parser.

    Parse XML into RSS Channel objects. Must be passed the Channel
    instance to append to: items are handed to channel.addItem() as
    they are completed, and metadata to channel.setMD() at the end of
    the document.

    Formats understood include:
    - RSS 0.9
    - RSS 0.91
    - RSS 0.92
    - RSS 1.0 (EXCEPT "rich content" modules)

    "Core" RSS elements are normalized to the RSS1.0 namespace.
    """

    def __init__(self, channel, encoding='utf-8'):
        sax.handler.ContentHandler.__init__(self)
        self.channel = channel
        self.encoding = encoding    # encoding applied to character data
        self._context = []          # stack of normalized open element names
        self._tmp_item = {}         # item dictionary under construction
        self._tmp_md = { (ns.rss10, "channel"): {},
                         (ns.rss10, "image"): {},
                         (ns.rss10, "textinput"): {},
                       }
        self._tmp_buf = ''          # character data collected so far
        self.version = None         # sniffed from the <rss> element

    def _normalize(self, name):
        """Map a (namespace, localname) element name onto the RSS 1.0
        namespace (core elements) or the rss091 namespace (other
        un-namespaced elements); other names are returned unchanged."""
        if name[0] in rssNamespaces and name[1] in rssElements:
            return (ns.rss10, name[1])
        if name[0] is None:
            return (ns.rss091, name[1])
        return name

    def startElementNS(self, name, qname, attrs):
        if name[1] == 'rss': # sniff version
            if name[0] is None:
                # attribute names are (namespace, localname) tuples here
                self.version = attrs.get((None, 'version'), None)
            else:
                self.version = name[0]
        name = self._normalize(name)
        self._context.append(name)
        if name == (ns.rss10, 'item'):
            self._tmp_item = {}
            self._tmp_buf = ''
        elif len(self._context) > 1 and \
                self._context[-2] == (ns.rss10, 'item') and \
                name in rdfResources:
            # an item's child whose value is its rdf:resource attribute;
            # skipped (instead of failing the parse) if the attribute
            # is absent
            resource = attrs.get((ns.rdf, 'resource'), None)
            if resource is not None:
                self._tmp_item[name] = resource

    def endElementNS(self, name, qname):
        name = self._normalize(name)
        if (ns.rss10, 'item') in self._context:
            if self._context[-1] == (ns.rss10, 'item'): # end of an item
                # -1 asks the channel to append. The number of keys in
                # the channel is not a valid list position once a URI
                # or item repeats.
                self.channel.addItem(self._tmp_item, -1)
                self._tmp_item = {}
            elif self._context[-2] == (ns.rss10, 'item'): # an item's child
                if name not in rdfResources:
                    self._tmp_item[name] = self._tmp_buf.strip()
            else: # an item's grandchild
                pass ###
        elif len(self._context) > 2 and self._context[-2] in rssSections:
            # metadata
            section = self._tmp_md.setdefault(self._context[-2], {})
            section[name] = self._tmp_buf.strip()
        self._tmp_buf = ''
        self._context.pop()

    def endDocument(self):
        # hand every collected metadata section to the channel
        for name, metadata in self._tmp_md.items():
            self.channel.setMD(name, metadata)

    def characters(self, content):
        self._tmp_buf = self._tmp_buf + content.encode(self.encoding)
644
645
def _make_hash(data):
    """Return a "hash-data:SHA:" identifier for data, built from the
    first 20 hex digits of the SHA digest of its pickled form."""
    pickled = pickle.dumps(data)
    digest = sha.new(pickled).hexdigest()
    return "hash-data:SHA:" + digest[:20]
648
649
if __name__ == "__main__":
    # a simple test: parse the feed named on the command line and
    # print it back out as RSS 1.0
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <feed-url>\n" % sys.argv[0])
        sys.exit(2)
    c = TrackingChannel()
    c.parse(sys.argv[1])
    print(c)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/RetroStatus/RSS.py@ 378

Download in other formats: