[201] | 1 | #!/usr/bin/env python
|
---|
| 2 |
|
---|
| 3 | """
|
---|
| 4 | RSS.py
|
---|
| 5 |
|
---|
| 6 | Classes for working with RSS channels as arbitrary data structures.
|
---|
| 7 | Requires Python 2.2 or newer and PyXML 0.7.1 or newer.
|
---|
| 8 |
|
---|
| 9 | ChannelBase - Base class for RSS Channels.
|
---|
| 10 | CollectionChannel - RSS Channel modeled as an item-per-entry
|
---|
| 11 | dictionary.
|
---|
| 12 | TrackingChannel - RSS Channel modeled as a URI-per-entry
|
---|
| 13 | dictionary.
|
---|
| 14 | RSSParser - Multi-format RSS/XML Parser.
|
---|
| 15 |
|
---|
| 16 | Typically, the *Channel classes will be most useful to developers.
|
---|
| 17 |
|
---|
| 18 | This library provides tools for working with RSS feeds as data
|
---|
| 19 | structures. The core is an RSS parser capable of understanding most
|
---|
| 20 | RSS formats, and a serializer that produces RSS1.0. The RSS channel
|
---|
| 21 | itself can be represented as any arbitrary data structure; two such
|
---|
| 22 | structures are provided both as examples and to service common
|
---|
| 23 | usage. This approach allows channels to be manipulated and stored in
|
---|
| 24 | a fashion that suits both their semantics and the applications that
|
---|
| 25 | access them.
|
---|
| 26 |
|
---|
| 27 | Both the parser and the serializer have the following limitations:
|
---|
| 28 | - RSS 1.0 "rich content" modules are not supported
|
---|
| 29 | - RSS 0.9x features that rely on attributes are not supported
|
---|
| 30 | - RDF is not understood; this library does not expose statements or
|
---|
| 31 | understand RDF syntax beyond that documented in RSS1.0 (taking
|
---|
| 32 | into account the previously listed limitations)
|
---|
| 33 |
|
---|
| 34 | The RSS format is made up of three metadata sections (channel,
|
---|
| 35 | image, and textinput) and a list of items. Each individual metadata
|
---|
| 36 | section and each item is passed around as an "item dictionary",
|
---|
| 37 | which is a Python dictionary with (namespace, localname) tuples as
|
---|
| 38 | keys. The values of the dictionaries are always strings; they may
|
---|
| 39 | contain markup, which will be rendered into the RSS/XML when
|
---|
| 40 | serialized.
|
---|
| 41 |
|
---|
| 42 | Individual items are found by using an "item identifier"; this is a
|
---|
| 43 | channel-unique, string identifier for any given item. Item
|
---|
| 44 | identifiers may be generated in a variety of ways, depending on the
|
---|
| 45 | requirements of the channel.
|
---|
| 46 |
|
---|
| 47 | Certain types of channel metadata are automatically generated, and
|
---|
| 48 | will not be returned or honored when accessed. They include the
|
---|
| 49 | "items", "image" and "textinput" children of the channel element.
|
---|
| 50 |
|
---|
| 51 |
|
---|
| 52 | TODO:
|
---|
| 53 | - any markup (and the content inside) in item or metadata children
|
---|
| 54 | (e.g., HTML in a <description>) will be silently ignored.
|
---|
| 55 | - test suite
|
---|
| 56 | - a function (XPath-based?) to detect a channel's type and return
|
---|
| 57 | the appropriate class.
|
---|
| 58 | - pay attention to <rss:items> when appropriate.
|
---|
| 59 | """
|
---|
| 60 |
|
---|
| 61 | __license__ = """
|
---|
| 62 | Copyright (c) 2004 Mark Nottingham <mnot@pobox.com>
|
---|
| 63 |
|
---|
| 64 | Permission is hereby granted, free of charge, to any person obtaining a copy
|
---|
| 65 | of this software and associated documentation files (the "Software"), to deal
|
---|
| 66 | in the Software without restriction, including without limitation the rights
|
---|
| 67 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
---|
| 68 | copies of the Software, and to permit persons to whom the Software is
|
---|
| 69 | furnished to do so, subject to the following conditions:
|
---|
| 70 |
|
---|
| 71 | The above copyright notice and this permission notice shall be included in all
|
---|
| 72 | copies or substantial portions of the Software.
|
---|
| 73 |
|
---|
| 74 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
---|
| 75 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
---|
| 76 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
---|
| 77 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
---|
| 78 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
---|
| 79 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
---|
| 80 | SOFTWARE.
|
---|
| 81 | """
|
---|
| 82 |
|
---|
# Library version string; interpolated into versionURI.
__version__ = "0.46"
| 84 |
|
---|
| 85 | import UserDict, sys, codecs, sha, types, signal
|
---|
| 86 | import xml.sax as sax
|
---|
| 87 | import xml.sax.saxutils as saxutils
|
---|
| 88 | import cPickle as pickle
|
---|
| 89 | import cStringIO as StringIO
|
---|
| 90 |
|
---|
# URI identifying this library and its version; ChannelBase.output() writes
# it into the channel metadata as the admin:generatorAgent value.
versionURI = 'http://www.mnot.net/python/RSS.py?version=%s' % __version__
| 92 |
|
---|
| 93 |
|
---|
class _NamespaceMap:
    """
    Prefix <-> Namespace map.

    Hold prefix->namespace mappings, and generate new prefixes when
    necessary. Exposes prefix->URI map as attributes, URI->prefix
    through getPrefix(URI).

    Every attribute whose name does not start with an underscore is
    treated as a namespace prefix: assigning it also records the reverse
    (URI -> prefix) mapping.
    """

    def __init__(self):
        self._nsID = 0  # seed for namespace prefix generation
        # URI -> prefix; must exist before any prefix attribute is assigned,
        # because __setattr__ writes into it.
        self._prefixMap = {}
        self.rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
        self.rss10 = 'http://purl.org/rss/1.0/'
        self.rss09 = 'http://my.netscape.com/rdf/simple/0.9/'
        self.rss091 = 'http://purl.org/rss/1.0/modules/rss091/'
        self.dc = 'http://purl.org/dc/elements/1.1/'
        self.syn = 'http://purl.org/rss/modules/syndication/'
        self.content = 'http://purl.org/rss/1.0/modules/content/'
        self.admin = 'http://webns.net/mvcb/'
        self.ag = 'http://purl.org/rss/modules/aggregation/'
        self.annotate = 'http://purl.org/rss/1.0/modules/annotate/'
        self.cp = 'http://my.theinfo.org/changed/1.0/rss/'
        self.company = 'http://purl.org/rss/1.0/modules/company'
        self.event = 'http://purl.org/rss/1.0/modules/event/'
        self.slash = 'http://purl.org/rss/1.0/modules/slash/'
        self.html = 'http://www.w3.org/html4/'

    def __setattr__(self, attr, value):
        """Store the attribute; public names are also indexed by URI."""
        self.__dict__[attr] = value
        if attr[0] != '_':
            self._prefixMap[value] = attr

    def _makePrefix(self, number):
        """Render a non-negative integer in base 26 with 'a'..'z' as digits."""
        digits = []
        while 1:
            digits.insert(0, chr(number % 26 + 97))
            # Floor division: plain "/" yields a float under true division,
            # which would turn the digits into floats and break chr().
            number = number // 26
            if not number:
                break
        return "".join(digits)

    def getPrefix(self, URI):
        """
        Get the prefix for a given URI; generate one if it
        doesn't exist.

        Returns None for the RSS 1.0 namespace (special case). A newly
        generated prefix is registered on the map before it is returned,
        so later calls with the same URI return the same prefix.
        """
        if URI == self.rss10:
            return None  # special case
        try:
            return self._prefixMap[URI]
        except KeyError:
            pass
        in_use = self._prefixMap.values()
        # Walk the generated sequence until a prefix that is not already
        # taken (e.g. by one of the predefined ones, such as "dc") turns up.
        while 1:
            candidate = self._makePrefix(self._nsID)
            self._nsID = self._nsID + 1
            if candidate not in in_use:
                break
        setattr(self, candidate, URI)
        return candidate
| 149 |
|
---|
| 150 |
|
---|
# shared prefix <-> namespace map used by the parser and the serializer
ns = _NamespaceMap()

# possible namespaces for RSS docs (None included for 0.9x)
rssNamespaces = [ns.rss09, ns.rss10, None]

# major sections of a RSS file; these must match the metadata sections the
# parser collects and the serializer writes (channel, image, textinput)
rssSections = [
    (ns.rss10, 'channel'),
    (ns.rss10, 'image'),
    (ns.rss10, 'textinput'),
]

# RSS core element localnames ('textarea' is retained only for backward
# compatibility with earlier versions of this list)
rssElements = ['rss', 'channel', 'image', 'textinput', 'textarea', 'item',
               'items', 'title', 'link', 'description', 'url']

# RSS elements whose data is in an rdf:resource attribute ('textarea' is
# retained only for backward compatibility with earlier versions of this list)
rdfResources = [
    (ns.rss10, 'image'),
    (ns.rss10, 'textinput'),
    (ns.rss10, 'textarea'),
    (ns.admin, 'errorReportsTo'),
    (ns.admin, 'generatorAgent'),
    (ns.annotate, 'reference'),
    (ns.cp, 'server'),
]
| 174 |
|
---|
| 175 |
|
---|
class ChannelBase:
    """
    Base class for RSS Channels.

    A number of generic methods for accessing and setting channel
    data and metadata are exposed, for the benefit of subclasses.
    They may be used by applications as well, or the data structure
    of the subclass may be directly manipulated.

    Subclasses provide the storage by overriding listItems, addItem,
    getItem, getMD and setMD; parsing and RSS 1.0 serialization are
    implemented here in terms of those five methods.
    """

    def __init__(self):
        self.encoding = 'utf-8'  # character encoding used by output()

    def listItems(self):
        """List the items in a channel, with a list of identifiers."""
        pass  # override me

    def addItem(self, item, index=0):
        """Add an item to the channel. Expects an item dictionary."""
        pass  # override me

    def getItem(self, identifier):
        """Get the appropriate item dictionary for a given identifier."""
        pass  # override me

    def getMD(self, name):
        """
        Get the [name] metadata as an item dictionary, where name is
        a tuple (typically, in the ns:rss10 namespace, with a localname of
        channel|image|textinput). MUST return an empty dictionary if the
        metadata isn't found.
        """
        pass  # override me

    def setMD(self, name, metadata):
        """
        Set the [name] metadata, where name is a tuple (typically,
        it will be in the ns:rss10 namespace, and have a localname of
        channel|image|textinput), and metadata is an item dictionary.
        """
        pass  # override me

    def _newParser(self):
        """Build a namespace-aware SAX parser feeding an RSSParser for self."""
        dh = RSSParser(self)
        p = sax.sax2exts.make_parser()
        p.setContentHandler(dh)
        p.setFeature(sax.handler.feature_namespaces, 1)
        return dh, p

    def parse(self, url, timeout=30):
        """
        Fetch a channel representation from a URL and populate
        the channel.

        The fetch is guarded by a SIGALRM-based timeout (in seconds);
        IOError('timeout') is raised when it expires. On platforms without
        SIGALRM the document is parsed without a timeout. Returns the
        RSSParser content handler that was used.
        """
        dh, p = self._newParser()
        if not hasattr(signal, 'SIGALRM'):
            p.parse(str(url))  # URIs are ascii
            return dh
        previous = signal.signal(signal.SIGALRM, self._timeout)
        signal.alarm(timeout)
        try:
            p.parse(str(url))  # URIs are ascii
        finally:
            signal.alarm(0)
            # Put back whatever handler the application had installed;
            # signal.signal() reports None for handlers not set from Python,
            # and None cannot be passed back in.
            if previous is not None:
                signal.signal(signal.SIGALRM, previous)
        return dh

    def _timeout(self, *args, **kwargs):
        """SIGALRM handler; it is invoked with (signum, frame) positionally."""
        raise IOError('timeout')

    def parseFile(self, file):
        """Parse a file and populate the channel."""
        dh, p = self._newParser()
        p.parseFile(file)
        return dh

    def __str__(self):
        return self.output(self.listItems())

    def _itemURI(self, item):
        """Return the item's rss:link, or a content hash when it has none."""
        try:
            return item[(ns.rss10, 'link')]
        except KeyError:
            return _make_hash(item)

    def _writeChildren(self, o, metadata, skip=()):
        """Write each (name, data) pair of an item dictionary as an element."""
        for name, data in metadata.items():
            if name in skip:
                continue
            o.ignorableWhitespace(' ')
            if name in rdfResources:
                # the value travels in an rdf:resource attribute, not as text
                o.startElementNS(name, None, {(ns.rdf, 'resource'): data})
            else:
                if "<" in data:
                    attrs = {(ns.rdf, "parseType"): "Literal"}
                else:
                    attrs = {}
                o.startElementNS(name, None, attrs)
                o.characters(data)
            o.endElementNS(name, None)
            o.ignorableWhitespace('\n')

    def output(self, items):
        """
        Return the items referred to by a list of identifiers.

        The result is an RSS 1.0 (RDF/XML) document, as a string, holding
        the channel, image and textinput metadata plus the listed items.
        The channel metadata must contain an rss:link (it becomes the
        channel's rdf:about); a KeyError is raised otherwise. Note that the
        admin:generatorAgent entry is written into the dictionary returned
        by getMD for the channel.
        """
        assert isinstance(items, list), "items must be a list (%s)" % \
            type(items)
        out = StringIO.StringIO()
        o = _XMLGenerator(out, self.encoding, 'replace')
        channelMD = self.getMD((ns.rss10, "channel"))
        imageMD = self.getMD((ns.rss10, "image"))
        textinputMD = self.getMD((ns.rss10, "textinput"))
        channelMD[(ns.admin, 'generatorAgent')] = versionURI
        entries = [self.getItem(identifier) for identifier in items]
        hasImage = (ns.rss10, 'url') in imageMD
        hasTextinput = (ns.rss10, 'link') in textinputMD

        # gather namespaces, map prefixes
        namespaces = {ns.rdf: 1}
        for section in [channelMD, imageMD, textinputMD] + entries:
            for namespace, localname in section.keys():
                namespaces[namespace] = 1
        for namespace in namespaces.keys():
            o.startPrefixMapping(ns.getPrefix(namespace), namespace)

        # write the XML
        o.startDocument()
        o.startElementNS((ns.rdf, 'RDF'), None, {})
        o.ignorableWhitespace('\n')
        o.startElementNS(
            (ns.rss10, 'channel'), None,
            {(ns.rdf, 'about'): channelMD[(ns.rss10, 'link')]})
        o.ignorableWhitespace('\n')

        # /channel (items, image and textinput are generated below)
        self._writeChildren(o, channelMD, skip=(
            (ns.rss10, 'items'), (ns.rss10, 'image'),
            (ns.rss10, 'textinput')))

        # /channel/items
        o.ignorableWhitespace(' ')
        o.startElementNS((ns.rss10, 'items'), None, {})
        o.startElementNS((ns.rdf, 'Seq'), None, {})
        o.ignorableWhitespace('\n')
        for entry in entries:
            o.ignorableWhitespace(' ')
            o.startElementNS((ns.rdf, 'li'), None,
                             {(ns.rdf, 'resource'): self._itemURI(entry)})
            o.endElementNS((ns.rdf, 'li'), None)
            o.ignorableWhitespace('\n')
        o.ignorableWhitespace(' ')
        o.endElementNS((ns.rdf, 'Seq'), None)
        o.endElementNS((ns.rss10, 'items'), None)
        o.ignorableWhitespace('\n')

        # /channel/image -- inside the channel, RSS 1.0 points at the
        # top-level element with rdf:resource (rdf:about belongs on the
        # top-level element itself)
        if hasImage:
            o.startElementNS((ns.rss10, 'image'), None,
                             {(ns.rdf, 'resource'): imageMD[(ns.rss10, 'url')]})
            o.endElementNS((ns.rss10, 'image'), None)
            o.ignorableWhitespace('\n')

        # /channel/textinput -- likewise an rdf:resource reference
        if hasTextinput:
            o.startElementNS(
                (ns.rss10, 'textinput'), None,
                {(ns.rdf, 'resource'): textinputMD[(ns.rss10, 'link')]})
            o.endElementNS((ns.rss10, 'textinput'), None)
            o.ignorableWhitespace('\n')
        o.endElementNS((ns.rss10, 'channel'), None)
        o.ignorableWhitespace('\n')

        # /image
        if hasImage:
            o.startElementNS((ns.rss10, 'image'), None,
                             {(ns.rdf, 'about'): imageMD[(ns.rss10, 'url')]})
            self._writeChildren(o, imageMD)
            o.endElementNS((ns.rss10, 'image'), None)
            o.ignorableWhitespace('\n')

        # /textinput
        if hasTextinput:
            o.startElementNS(
                (ns.rss10, 'textinput'), None,
                {(ns.rdf, 'about'): textinputMD[(ns.rss10, 'link')]})
            self._writeChildren(o, textinputMD)
            o.endElementNS((ns.rss10, 'textinput'), None)
            o.ignorableWhitespace('\n')

        # /item
        for entry in entries:
            o.startElementNS((ns.rss10, 'item'), None,
                             {(ns.rdf, 'about'): self._itemURI(entry)})
            o.ignorableWhitespace('\n')
            self._writeChildren(o, entry)
            o.endElementNS((ns.rss10, 'item'), None)
            o.ignorableWhitespace('\n')
        o.endElementNS((ns.rdf, 'RDF'), None)
        o.endDocument()
        out.seek(0)
        return out.read()
| 395 |
|
---|
| 396 |
|
---|
| 397 |
|
---|
class TrackingChannel(ChannelBase, UserDict.UserDict):
    """
    RSS Channel modeled as a URI-per-entry dictionary.

    Item identifiers are (uri, index) tuples, where uri is
    the rdf:about or rss:link URI, and index indicates the
    (1-based) position in a list of a number of times that URI has
    appeared in the channel.

    This allows "tracking" channels that track the state of
    a group of resources, such as stock tickers, file state
    changes, etc.

    For example:

    {
        (ns.rss10, "channel"): {
            (ns.rss10, "title"): "the channel",
            (ns.rss10, "description"): "whatever",
        },
        (ns.rss10, "items"): [
            ("http://example.com/foo", 2),
            ("http://example.com/foo", 1),
            ("http://example.com/bar", 1),
            ...
        ],
        "http://example.com/foo": [
            {
                (ns.rss10, "title"): "item 1",
                (ns.rss10, "link"): "http://example.com/foo",
                (ns.rss10, "description"): "foo",
            },
            {
                (ns.rss10, "title"): "item 1 revised",
                (ns.rss10, "link"): "http://example.com/foo",
                (ns.rss10, "description"): "foo revisited",
            },
        ],
        "http://example.com/bar": [
            ...
        ],
    }

    """

    def __init__(self, data=None):
        """
        Create the channel, optionally seeded from a mapping.

        The (ns.rss10, 'items') entry always starts out empty, even when
        the supplied mapping carries one.
        """
        ChannelBase.__init__(self)
        UserDict.UserDict.__init__(self, data)
        self.data[(ns.rss10, 'items')] = []

    def listItems(self):
        """Return the live list of (uri, index) item identifiers."""
        return self.data[(ns.rss10, 'items')]

    def addItem(self, item, index=0):
        """
        Add an item dictionary to the channel.

        The item is appended to the version list of its URI and its
        identifier is inserted into the item list at position index
        (0 puts it first; -1 appends it).
        """
        items = self.data[(ns.rss10, 'items')]
        if index == -1:
            index = len(items)
        try:
            uri = item[(ns.rss10, "link")]
        except KeyError:
            uri = _make_hash(item)  # shouldn't happen
        versions = self.data.setdefault(uri, [])
        versions.append(item)
        items.insert(index, (uri, len(versions)))

    def truncateToLength(self, length):
        """
        Drop identifiers from the end of the item list until at most
        length remain.

        The stored versions of a URI are discarded only once no remaining
        identifier refers to that URI, so identifiers that are kept stay
        resolvable. A negative length is treated as zero.
        """
        items = self.data[(ns.rss10, 'items')]
        keep = max(length, 0)
        dropped = items[keep:]
        if not dropped:
            return
        del items[keep:]
        referenced = {}
        for uri, index in items:
            referenced[uri] = 1
        for uri, index in dropped:
            # several dropped identifiers may share one URI; delete it once
            if uri not in referenced and uri in self.data:
                del self.data[uri]

    def getItem(self, identifier):
        """
        Return the item dictionary for a (uri, index) identifier, or an
        empty dictionary when the URI or the index is unknown.
        """
        (uri, index) = identifier
        try:
            return self.data[uri][index - 1]
        except (KeyError, IndexError):
            return {}

    def getMD(self, name):
        """Return the metadata stored under name, or an empty dictionary."""
        return self.data.get(name, {})

    def setMD(self, name, metadata):
        """Store an item dictionary as the metadata called name."""
        self.data[name] = metadata
| 477 |
|
---|
| 478 |
|
---|
| 479 |
|
---|
class CollectionChannel(ChannelBase, UserDict.UserDict):
    """
    RSS Channel modeled as an item-per-entry dictionary.

    Every item is stored under an identifier computed by hashing the
    item itself, so items get their own dictionary entries regardless
    of how often a particular URI occurs in the channel.

    This suits "collection" channels, which are typically used for
    news updates, etc.

    For example:

    {
        (ns.rss10, "channel"): {
            (ns.rss10, "title"): "the channel",
            (ns.rss10, "description"): "whatever",
        },
        (ns.rss10, "items"): ["ID1", "ID2", ... ],
        "ID1": {
            (ns.rss10, "title"): "item 1",
            (ns.rss10, "link"): "http://example.com/",
            (ns.rss10, "description"): "foo",
        },
        "ID2": {
            ...
        },
    }

    Note that:
    - items are keyed by a hash-data URI; metadata is keyed
      by a (namespace, localname) tuple.
    - (ns.rss10, items) is a property; it cannot be
      manipulated without manipulating the corresponding
      (sub-)items (delete, add)
    - likewise, all items are properties; adding, deleting,
      appending an item modifies (ns.rss10, items)
      correspondingly
    """

    def __init__(self, data={}):
        ChannelBase.__init__(self)
        UserDict.UserDict.__init__(self, data)
        # the ordered identifier list always starts out empty
        self.data[(ns.rss10, 'items')] = []

    def listItems(self):
        """Return the ordered list of item identifiers."""
        order = self.data[(ns.rss10, 'items')]
        return order

    def addItem(self, item, index=0):
        """
        Store an item dictionary under its hash and insert that hash
        into the identifier list at position index (-1 means the end).
        """
        order = self.data[(ns.rss10, 'items')]
        position = index
        if position == -1:
            position = len(order)
        key = _make_hash(item)
        self.data[key] = item
        order.insert(position, key)

    def getItem(self, identifier):
        """Return the item stored under identifier, or {} if there is none."""
        try:
            return self.data[identifier]
        except KeyError:
            return {}

    def getMD(self, name):
        """Return the metadata stored under name, or {} if there is none."""
        try:
            return self.data[name]
        except KeyError:
            return {}

    def setMD(self, name, metadata):
        """Store an item dictionary as the metadata called name."""
        self.data[name] = metadata
| 543 |
|
---|
| 544 |
|
---|
class _XMLGenerator(saxutils.XMLGenerator):
    """
    XMLGenerator variant with configurable encoding-error handling.

    Output goes through a codec stream writer created with the given
    error policy, and character data that cannot be written as-is is
    retried after decoding it as Latin-1, to work around some
    implementations.
    """

    def __init__(self, out=None, encoding='iso-8859-1', errors='strict'):
        saxutils.XMLGenerator.__init__(self, out=out, encoding=encoding)
        target = out
        if target is None:
            target = sys.stdout
        # item 3 of the codec lookup result is the StreamWriter factory
        make_writer = codecs.lookup(encoding)[3]
        self._out = make_writer(target, errors)

    def characters(self, content):
        """Write escaped character data, retrying as Latin-1 on failure."""
        escape = sax.saxutils.escape
        try:
            self._out.write(escape(content))
        except UnicodeError:  # hack for broken content
            self._out.write(escape(unicode(content, 'Latin-1')))
| 565 |
|
---|
| 566 |
|
---|
class RSSParser(sax.handler.ContentHandler):
    """
    Multi-format RSS/XML Parser.

    Parse XML into RSS Channel objects. Must be passed the Channel
    instance to append to; items are added to it as they are completed,
    and the channel/image/textinput metadata is set when the document ends.

    Formats understood include:
      - RSS 0.9
      - RSS 0.91
      - RSS 0.92
      - RSS 1.0 (EXCEPT "rich content" modules)

    "Core" RSS elements are normalized to the RSS1.0 namespace.
    """

    def __init__(self, channel, encoding='utf-8'):
        sax.handler.ContentHandler.__init__(self)
        self.channel = channel
        self.encoding = encoding  # encoding applied to character data
        self._context = []  # stack of normalized names of open elements
        self._tmp_item = {}  # item dictionary currently being built
        self._tmp_md = {
            (ns.rss10, "channel"): {},
            (ns.rss10, "image"): {},
            (ns.rss10, "textinput"): {},
        }
        self._tmp_buf = ''  # character data collected for the open element
        self.version = None  # set when an <rss> root element is seen

    def _normalize(self, name):
        """Map a (namespace, localname) pair onto the names used internally."""
        if name[0] in rssNamespaces and name[1] in rssElements:
            return (ns.rss10, name[1])
        if name[0] is None:
            return (ns.rss091, name[1])
        return name

    def startElementNS(self, name, qname, attrs):
        """Track the element and pick up rdf:resource values of item children."""
        if name[1] == 'rss':  # sniff version
            if name[0] is None:
                # in namespace mode attributes are keyed by (uri, localname)
                self.version = attrs.get((None, 'version'), None)
            else:
                self.version = name[0]
        # normalize the rss namespace
        name = self._normalize(name)
        self._context.append(name)
        if name == (ns.rss10, 'item'):
            self._tmp_item = {}
            self._tmp_buf = ''
        elif len(self._context) > 1 and \
                self._context[-2] == (ns.rss10, 'item') and \
                name in rdfResources:
            resource = attrs.get((ns.rdf, 'resource'), None)
            if resource is not None:
                self._tmp_item[name] = resource

    def endElementNS(self, name, qname):
        """Record the finished element's text, or hand over a finished item."""
        # normalize the rss namespace
        name = self._normalize(name)
        context = self._context
        if (ns.rss10, 'item') in context:
            if context[-1] == (ns.rss10, 'item'):  # end of an item
                # Append in document order. The length of the identifier
                # list is the append position; the length of the channel
                # mapping is not, since several items may share one entry.
                self.channel.addItem(self._tmp_item,
                                     len(self.channel.listItems()))
                self._tmp_item = {}
            elif context[-2] == (ns.rss10, 'item'):  # an item's child
                # keep an rdf:resource value captured at the start tag;
                # otherwise the element's text is the value
                if name not in rdfResources or name not in self._tmp_item:
                    self._tmp_item[name] = self._tmp_buf.strip()
            else:  # an item's grandchild
                pass  ###
        elif len(context) > 2 and context[-2] in self._tmp_md:
            # metadata: child of a section this parser collects
            self._tmp_md[context[-2]][name] = self._tmp_buf.strip()
        self._tmp_buf = ''
        context.pop()

    def endDocument(self):
        """Hand the collected metadata sections to the channel."""
        for name, metadata in self._tmp_md.items():
            self.channel.setMD(name, metadata)

    def characters(self, content):
        """Accumulate character data, encoded with self.encoding."""
        self._tmp_buf = self._tmp_buf + content.encode(self.encoding)
| 644 |
|
---|
| 645 |
|
---|
def _make_hash(data):
    """
    Build a "hash-data:SHA:" identifier from the first 20 hex digits of
    the SHA digest of the pickled form of data.
    """
    serialized = pickle.dumps(data)
    digest = sha.new(serialized).hexdigest()
    return "hash-data:SHA:" + digest[:20]
| 648 |
|
---|
| 649 |
|
---|
if __name__ == "__main__":
    # a simple test: fetch the feed named on the command line and print
    # it back out as RSS 1.0
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <feed-url>\n" % sys.argv[0])
        sys.exit(1)
    c = TrackingChannel()
    c.parse(sys.argv[1])
    sys.stdout.write(str(c) + "\n")