"""RFC 3676 format=flowed text processing. This module provides an API to create and display text/plain; format=flowed mimetype text. """ # Copyright (C) 2005 Martijn Pieters # Written by Martijn Pieters # Development was sponsored by Logicalware (http://www.logicalware.org/) # Licensed as Open Source under the same terms as the Python 2.4.1 license, # as available at http://www.python.org/2.4.1/license.html __revision_id__ = '$Id: formatflowed.py 27 2005-09-17 19:13:48Z mj $' import re import textwrap __all__ = [ 'PARAGRAPH', 'FIXED', 'SIGNATURE_SEPARATOR', 'FormatFlowedDecoder', 'FormatFlowedEncoder', 'decode', 'encode', 'convertToWrapped', 'convertToFlowed' ] # Constants denoting the various text chunk types recognized by format=flowed PARAGRAPH, FIXED, SIGNATURE_SEPARATOR = range(3) # -- Public classes ---------------------------------------------------- class FormatFlowedDecoder: """Object for converting format=flowed text to other formats The following instance attributes influence the interpretation of format=flowed text: delete_space (default: False) Delete the trailing space before the CRLF on flowed lines before interpreting the line on flowed input, corresponds to the DelSp mime parameter character_set (default: us-ascii) The encoding of text passed in. Text is decoded to unicode using this encoding, using the default error handing scheme. """ def __init__(self, delete_space=False, character_set='us-ascii'): self.delete_space = delete_space self.character_set = character_set # -- Private methods ----------------------------------------------- def _stripquotes(self, line): """Remove quotemarks from the start of the line Returns the number of quotemarks stripped and the stripped line: >>> decoder = FormatFlowedDecoder() >>> decoder._stripquotes(u'>>> quoted line') (3, u' quoted line') Non-quoted lines are returned unchanged: >>> decoder._stripquotes(u'non-quoted line') (0, u'non-quoted line') """ stripped = line.lstrip('>') return len(line) - len(stripped), stripped def _stripstuffing(self, line): """Remove the optional leading space Returns the stripped line: >>> decoder = FormatFlowedDecoder() >>> decoder._stripstuffing(u' stuffed line') u'stuffed line' Non-stuffed lines are returned unchanged: >>> decoder._stripstuffing(u'non-stuffed line') u'non-stuffed line' Additional spacing is preserved: >>> decoder._stripstuffing(u' extra leading space') u' extra leading space' """ if line.startswith(u' '): return line[1:] return line def _stripflow(self, line): """Remove the trailing flow space is delete_space is set The instance attribute delete_space is False by default thus this method returns the line unchanged: >>> decoder = FormatFlowedDecoder() >>> decoder._stripflow(u'flowed line ') u'flowed line ' But if the delete_space attribute has been set to True the flow space is removed: >>> decoder = FormatFlowedDecoder(delete_space=True) >>> decoder._stripflow(u'flowed line ') u'flowed line' Only one flow space is removed: >>> decoder._stripflow(u'extra whitespace ') u'extra whitespace ' """ if self.delete_space and line.endswith(u' '): return line[:-1] return line # -- Public API ---------------------------------------------------- def decode(self, flowed): """Decode flowed text Returns an iterable serving a sequence of (information, chunk) tuples. information is a dictionary with the following fields: type One of PARAGRAPH, FIXED, SIGNATURE_SEPARATOR quotedepth Number of quotemarks found on the text chunk chunk is a unicode string. All text is unwrapped and without any quotemarks; when displaying these chunks, the appropriate quotemarks should be added again, and chunks of type PARAGRAPH should be displayed wrapped. Chunks of type FIXED should be displayed unwrapped. Examples -------- Here is a simple example: >>> CRLF = '\\r\\n' >>> decoder = FormatFlowedDecoder() >>> result = decoder.decode(CRLF.join(( ... ">> `Take some more tea,' the March Hare said to Alice, very ", ... ">> earnestly.", ... ">", ... "> `I've had nothing yet,' Alice replied in an offended ", ... "> tone, `so I can't take more.'", ... "", ... "`You mean you can't take less,' said the Hatter: `it's very ", ... "easy to take more than nothing.'", ... "", ... "-- ", ... "Lewis Carroll"))) >>> list(result) == [ ... ({'quotedepth': 2, 'type': PARAGRAPH}, ... u"`Take some more tea,' the March Hare said to Alice, " ... u"very earnestly."), ... ({'quotedepth': 1, 'type': FIXED}, u""), ... ({'quotedepth': 1, 'type': PARAGRAPH}, ... u"`I've had nothing yet,' Alice replied in an offended " ... u"tone, `so I can't take more.'"), ... ({'quotedepth': 0, 'type': FIXED}, u""), ... ({'quotedepth': 0, 'type': PARAGRAPH}, ... u"`You mean you can't take less,' said the Hatter: `it's " ... u"very easy to take more than nothing.'"), ... ({'quotedepth': 0, 'type': FIXED}, u""), ... ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- "), ... ({'quotedepth': 0, 'type': FIXED}, u"Lewis Carroll") ... ] True Improperly closed paragraphs ---------------------------- The decoder can deal with various cases of improperly format=flowed messages. Paragraphs normally end with a fixed line, but the following cases are also considered paragraph-closing cases: - A change in quotedepth: >>> result = decoder.decode(CRLF.join(( ... "> Depth one paragraph with flow space. ", ... ">> Depth two paragraph with flow space. ", ... "Depth zero paragraph with fixed line."))) >>> list(result) == [ ... ({'quotedepth': 1, 'type': PARAGRAPH}, ... u"Depth one paragraph with flow space. "), ... ({'quotedepth': 2, 'type': PARAGRAPH}, ... u"Depth two paragraph with flow space. "), ... ({'quotedepth': 0, 'type': FIXED}, ... u"Depth zero paragraph with fixed line.")] True - A signature separator: >>> result = decoder.decode(CRLF.join(( ... "A paragraph with flow space. ", ... "-- "))) >>> list(result) == [ ... ({'quotedepth': 0, 'type': PARAGRAPH}, ... u"A paragraph with flow space. "), ... ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- ")] True - The end of the message: >>> result = decoder.decode(CRLF.join(( ... "A paragraph with flow space. ",))) >>> list(result) == [ ... ({'quotedepth': 0, 'type': PARAGRAPH}, ... u"A paragraph with flow space. ")] True Decoder options --------------- The delete_space attribute of the FormatFlowedDecoder class can be used to control wether or not the trailing space on flowed lines should be retained; this is used to encode flowed text where spaces are rare: >>> decoder = FormatFlowedDecoder(delete_space=True) >>> result = decoder.decode(CRLF.join(( ... "Contrived example with a word- ", ... "break across the paragraph."))) >>> list(result) == [ ... ({'quotedepth': 0, 'type': PARAGRAPH}, ... u'Contrived example with a word-break across the ' ... u'paragraph.')] True Note that the characterset determines what how to interpret a space and a quote marker. The cp037 characterset does not encode these characters the same way, for example: >>> decoder = FormatFlowedDecoder(character_set='cp037') >>> result = decoder.decode(CRLF.join(( ... "n@\\xe3\\x88\\x89\\xa2@\\x89\\xa2@\\x81@\\x98\\xa4\\x96\\xa3" ... "\\x85\\x84@\\x97\\x81\\x99\\x81\\x87\\x99\\x81\\x97\\x88@", ... "n@\\x85\\x95\\x83\\x96\\x84\\x85\\x84@\\x89\\x95@\\x83\\x97" ... "\\xf0\\xf3\\xf7K"))) >>> list(result) == [ ... ({'quotedepth': 1, 'type': PARAGRAPH}, ... u'This is a quoted paragraph encoded in cp037.')] True """ para = u'' pinfo = {'type': PARAGRAPH} for line in flowed.split('\r\n'): line = line.decode(self.character_set) quotedepth, line = self._stripquotes(line) line = self._stripstuffing(line) if line == '-- ': # signature separator if para: # exception case: flowed line followed by sig-sep yield (pinfo, para) pinfo = {'type': PARAGRAPH} para = u'' yield ({'type': SIGNATURE_SEPARATOR, 'quotedepth': quotedepth}, line) continue if line.endswith(u' '): # flowed line; collect into a paragraph if quotedepth != pinfo.get('quotedepth', quotedepth): # exception case: flowed line followed by quotedepth change yield (pinfo, para) pinfo = {'type': PARAGRAPH} para = u'' para += self._stripflow(line) pinfo['quotedepth'] = quotedepth continue # fixed line if para: # completed paragraph if quotedepth != pinfo.get('quotedepth', quotedepth): # exception case: flowed line followed by quotedepth change yield (pinfo, para) pinfo = {'type': PARAGRAPH} para = u'' else: yield (pinfo, para + line) pinfo = {'type': PARAGRAPH} para = u'' continue yield ({'type': FIXED, 'quotedepth': quotedepth}, line) if para: # exception case: last line was a flowed line yield (pinfo, para) class FormatFlowedEncoder: """Object to generate format=flowed text The following attributes influence the flowed formatting of text: extra_space (default: False) Use an extra space to create flowed lines; this requires that the DelSpace flag will be set true on the Content-Type mime header. Use this flag on texts that have little or no spaces to break on. character_set (default: us-ascii) Encode the output to this character set. spacestuff_quoted (default: True) Always spacestuff quoted chunks, i.e. place a space between the quote markers and the text. width (default: 78) The maximum line width generated for flowed paragraphs; fixed lines can still exceed this width. This value does not include the CRLF line endings. """ def __init__(self, extra_space=False, character_set='us-ascii', spacestuff_quoted=True, width=78): self.extra_space = extra_space self.character_set = character_set self.spacestuff_quoted = spacestuff_quoted self.width = width def _spacestuff(self, line, force=False): """Prepend a space to lines starting with ' ', '>' or 'From' Returns the altered line. Set 'force' to True to skip the tests and always prepend the space regardless: >>> encoder = FormatFlowedEncoder() >>> encoder._spacestuff(u' leading space needs to be preserved') u' leading space needs to be preserved' >>> encoder._spacestuff(u'> can be confused for a quotemark') u' > can be confused for a quotemark' >>> encoder._spacestuff(u'From is often escaped by MTAs') u' From is often escaped by MTAs' >>> encoder._spacestuff(u'Padding is considered harmless') u'Padding is considered harmless' >>> encoder._spacestuff(u'So forcing it is fine', True) u' So forcing it is fine' Note that empty lines can never be spacestuffed: >>> encoder._spacestuff(u'') u'' """ if not line: return line # Although the RFC doesn't say so explicitly, in practice 'From' only # needs escaping when (1) not quoted and (2) actually encoded as # 'From' (so independent of the unicode sequence u'From'). # For simplicity's sake, we spacestuff it any time a line starts with # it before adding quotemarks and encoding the line. if force or line[0] in (' ', '>') or line.startswith('From'): return u' ' + line return line # -- Public API ---------------------------------------------------- def encode(self, chunks): """Encode chunks of text to format=flowed chunks An iterable sequence of (information, text) tuples, where information is a dictionary with 'type' and 'quotedepth' keys. The 'type' value is one of PARAGRAPH, FIXED or SIGNATURE-SEPARATOR, and the 'quotedepth' value a positive integer indicating the quoting depth. text should be the unicode text to be encoded. Example ------- To illustrate, an example: >>> chunks = ( ... ({'quotedepth': 2, 'type': PARAGRAPH}, ... u"`Take some more tea,' the March Hare said to Alice, " ... u"very earnestly."), ... ({'quotedepth': 1, 'type': FIXED}, u""), ... ({'quotedepth': 1, 'type': PARAGRAPH}, ... u"`I've had nothing yet,' Alice replied in an offended " ... u"tone, `so I can't take more.'"), ... ({'quotedepth': 0, 'type': FIXED}, u""), ... ({'quotedepth': 0, 'type': PARAGRAPH}, ... u"`You mean you can't take less,' said the Hatter: `it's " ... u"very easy to take more than nothing.'"), ... ({'quotedepth': 0, 'type': FIXED}, u""), ... ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- "), ... ({'quotedepth': 0, 'type': PARAGRAPH}, u"Carol Lewis"), ... ) >>> result = FormatFlowedEncoder(width=45).encode(chunks) >>> result.split('\\r\\n') == [ ... ">> `Take some more tea,' the March Hare said ", ... ">> to Alice, very earnestly.", ... ">", ... "> `I've had nothing yet,' Alice replied in ", ... "> an offended tone, `so I can't take more.'", ... "", ... "`You mean you can't take less,' said the ", ... "Hatter: `it's very easy to take more than ", ... "nothing.'", ... "", ... "-- ", ... "Carol Lewis", ... ""] True """ encoded = [] for info, text in chunks: encoded.append(self.encodeChunk(text, **info)) return ''.join(encoded) def encodeChunk(self, chunk, type=PARAGRAPH, quotedepth=0): """Encode a chunk of text to format=flowed The chunk is encoded to format=flowed text, controlled by the following arguments. chunk The unicode text to be encoded. Newlines are considered to be whitespace and will be converted to spaces. type (default: PARAGRAPH) Chunk type; one of PARAGRAPH, FIXED or SIGNATURE_SEPARATOR. When called with type SIGNATURE_SEPARATOR the chunk is ignored and '-- ' is written out. quotedepth (default: 0) The quote depth of the chunk. Examples -------- The encoder has to deal with three types of text chunks. To illustrate, we create a encoder instance geared: >>> encoder = FormatFlowedEncoder(width=45) We can then use this encoder to encode some examples of these different types: - fixed lines: >>> encoder.encodeChunk(u'A fixed line remains unaltered', FIXED) 'A fixed line remains unaltered\\r\\n' >>> encoder.encodeChunk(u'Although quoting is prepended', FIXED, 2) '>> Although quoting is prepended\\r\\n' >>> encoder.encodeChunk(u'Trailing spaces are removed ', FIXED) 'Trailing spaces are removed\\r\\n' >>> encoder.encodeChunk(u'> and special first chars are fluffed', ... FIXED) ' > and special first chars are fluffed\\r\\n' - a paragraph (the default type): >>> result = encoder.encodeChunk( ... u"`Take some more tea,' the March Hare said to Alice, " ... u"very earnestly.") >>> result == ("`Take some more tea,' the March Hare said \\r\\n" ... "to Alice, very earnestly.\\r\\n") True >>> result = encoder.encodeChunk( ... u"`I've had nothing yet,' Alice replied in an offended " ... u"tone, `so I can't take more.'", PARAGRAPH, 1) >>> result == ("> `I've had nothing yet,' Alice replied in \\r\\n" ... "> an offended tone, `so I can't take more.'\\r\\n") True >>> result = encoder.encodeChunk( ... u'The wrapping deals quite well with > eratic ' ... u'spacing and space fluffs characters where needed.') >>> result == ("The wrapping deals quite well with \\r\\n" ... " > eratic spacing and space fluffs \\r\\n" ... "characters where needed.\\r\\n") True - signature separators: >>> encoder.encodeChunk(u'-- ', SIGNATURE_SEPARATOR) '-- \\r\\n' >>> encoder.encodeChunk(u'-- ', SIGNATURE_SEPARATOR, 3) '>>> -- \\r\\n' Note that the actual chunk value is ignored for this type: >>> encoder.encodeChunk(u'foobar', SIGNATURE_SEPARATOR) '-- \\r\\n' Encoder options --------------- The encoding can be influenced by several instance attributes; the width attribute was used for the paragraph demonstrations. Others include 'extra_space', 'character_set' and 'spacestuff_quoted': - extra_space generates extra spaces on flowed lines so flowed lines can be broken on something other than whitespace: >>> encoder = FormatFlowedEncoder(extra_space=True, width=45) >>> result = encoder.encodeChunk( ... u'This is useful for texts with many word-breaks or few ' ... u'spaces') >>> result == ("This is useful for texts with many word- \\r\\n" ... "breaks or few spaces\\r\\n") True - character_set controls the output encoding: >>> encoder = FormatFlowedEncoder(character_set='cp037') >>> result = encoder.encodeChunk(u'Can you read me now?', ... quotedepth=1) >>> result == ('n@\\xc3\\x81\\x95@\\xa8\\x96\\xa4@\\x99\\x85\\x81' ... '\\x84@\\x94\\x85@\\x95\x96\\xa6o\\r\\n') True - spacestuff_quoted causes quoted lines to be spacestuffed by default; this makes for slightly more readable quoted text output. It is on by default, but can be switched off: >>> encoder = FormatFlowedEncoder(spacestuff_quoted=False) >>> encoder.encodeChunk(u'Look Ma! No space!', quotedepth=1) '>Look Ma! No space!\\r\\n' RFC 2822 compliance ------------------- Note that RFC 2822 requires that generated lines never exceed the hard limit of 998 characters without the CRLF at the end. The encoder has to enforce this by chopping the lines up into pieces not exceeding that length: >>> encoder = FormatFlowedEncoder() >>> result = encoder.encodeChunk(u'-' * 1500, FIXED) >>> result = result.split('\\r\\n') >>> len(result) 3 >>> len(result[0]) 998 >>> result == ['-' * 998, '-' * 502, ''] True """ # cleanup: replace newlines with spaces and remove trailing spaces chunk = ' '.join(chunk.rstrip().splitlines()) # Pre-encode quoting quotemarker = u'>' * quotedepth quotemarker = quotemarker.encode(self.character_set) forcestuff = self.spacestuff_quoted and quotedepth > 0 if type == SIGNATURE_SEPARATOR: chunk = u'-- ' if type == PARAGRAPH: # Maximum width is reduced by stuffing and quotemarkers width = self.width - len(quotemarker) - 2 if width <= 0: raise ValueError('Not enough width for both quoting and text') wrapper = _FlowedTextWrapper(width, self.extra_space) chunk = wrapper.wrap(chunk) else: chunk = [chunk] lines = [] for line in chunk: # add space to flowed lines (all but last); this is an extra space # if the wrapping of paragraphs included spaces at the end of the # lines. if line != chunk[-1]: line += ' ' line = self._spacestuff(line, forcestuff) line = quotemarker + line.encode(self.character_set) # Enforce a hard limit of 998 characters per line (excluding CRLF) # Unfortunately we can only enforce this *after* encoding, # otherwise we could flow lines that are too long. while len(line) > 998: lines.append(line[:998]) line = line[998:] lines.append(line) lines.append('') # ensure last ending CRLF return '\r\n'.join(lines) # -- Convenience functions --------------------------------------------- def decode(flowed, **kwargs): """Convert format=flowed text See the FormatFlowedDecoder.decode docstring for more information. All keyword arguments are passed to the FormatFlowedDecoder instance. """ decoder = FormatFlowedDecoder(**kwargs) return decoder.decode(flowed) def encode(chunks, **kwargs): """Convert chunks of text to format=flowed See the FormatFlowedEncoder.encode docstring for more information. All keyword arguments are passed to the FormatFlowedEncoder instance. """ encoder = FormatFlowedEncoder(**kwargs) return encoder.encode(chunks) def convertToWrapped(flowed, width=78, quote=u'>', wrap_fixed=True, **kwargs): """Covert flowed text to encoded and wrapped text Create text suitable for a proportional font, fixed with, plain text display. The argements are interpreted as follows: flowed The format=flowed formatted text to convert width (default: 78) The maximum line length at which to wrap paragraphs. quote (default: u'>') Character sequence to use to mark quote depths; it is multiplied with the quotedepth to quote a line. If this sequence does not end in a space a space is added between the quotemars and the line. wrap_fixed (default: True) If true, fixed text chunks are wrapped to the given width as well, including hard word breaks if a word exceeds the line width The remaining arguments are used as arguments to FormatFlowedDecoder. Here is a simple example: >>> CRLF = '\\r\\n' >>> result = convertToWrapped(CRLF.join(( ... ">> `Take some more tea,' the March Hare said to Alice, very ", ... ">> earnestly.", ... ">", ... "> `I've had nothing yet,' Alice replied in an offended ", ... "> tone, `so I can't take more.'", ... "", ... "`You mean you can't take less,' said the Hatter: `it's very ", ... "easy to take more than nothing.'", ... "", ... "-- ", ... "Lewis Caroll")), width=60) >>> result.split('\\n') == [ ... ">> `Take some more tea,' the March Hare said to Alice, very", ... ">> earnestly.", ... "> ", ... "> `I've had nothing yet,' Alice replied in an offended tone,", ... "> `so I can't take more.'", ... "", ... "`You mean you can't take less,' said the Hatter: `it's very", ... "easy to take more than nothing.'", ... "", ... "-- ", ... "Lewis Caroll"] True """ result = [] for info, chunk in decode(flowed, **kwargs): type = info['type'] quotedepth = info['quotedepth'] quotemarker = quotedepth and quote * quotedepth or u'' if quotemarker and quote[-1] != u' ': quotemarker += u' ' if type == FIXED and not wrap_fixed: result.append(quotemarker + chunk) elif not chunk or type == SIGNATURE_SEPARATOR: result.append(quotemarker + chunk) else: result.extend(textwrap.wrap(chunk, width, replace_whitespace=False, initial_indent=quotemarker, subsequent_indent=quotemarker)) return u'\n'.join(result) def convertToFlowed(text, quotechars=u'>|%', **kwargs): """Convert plain text to format=flowed Attempt to interpret the plain text as paragraphs and fixed lines, creating a format=flowed encoded text. The paragraph detection is fairly simple and probably not suitable for real-world email. text Unicode text to be converted. Paragraphs are detected based on whitelines between them, making all lines with extra linespace at the start fixed to preserve that whitespace. quotechars (default: u'>|%') A set of characters recognized as quote markers; used to detect quote depth. Additional kwargs are passed on to FormatFlowedEncoder. """ encoder = FormatFlowedEncoder(**kwargs) return encoder.encode(_parseFlowableChunks(text, quotechars)) # -- Private classes and methods --------------------------------------- class _FlowedTextWrapper(textwrap.TextWrapper): """Custom text wrapper for flowed text When not using extra spaces, only break on spaces; when we are using extra spaces, don't swallow whitespace at the start and end of lines, but do break long words (as they can be reconstructed with DelSpace on). """ def __init__(self, width=78, extra_space=False): textwrap.TextWrapper.__init__(self, width, break_long_words=extra_space) self.extra_space = extra_space if not extra_space: self.wordsep_re = re.compile(r'(\s+)') def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): # _handle_long_word taken from python 2.5 CVS speed optimisation # Can be removed if this is used with python 2.5 space_left = max(width - cur_len, 1) if self.break_long_words: cur_line.append(reversed_chunks[-1][:space_left]) reversed_chunks[-1] = reversed_chunks[-1][space_left:] elif not cur_line: cur_line.append(reversed_chunks.pop()) def _wrap(self, chunks): # Simplified and customized version of textwrap.TextWrapper # Based on textwrapper rev. 1.37 in python CVS, with speed optimisation lines = [] chunks.reverse() while chunks: cur_line = [] cur_len = 0 width = self.width # Don't strip space at the start of a line when using extra_space # because spaces are significant there. if chunks[-1].strip() == '' and lines and not self.extra_space: del chunks[-1] while chunks: l = len(chunks[-1]) if cur_len + l <= width: cur_line.append(chunks.pop()) cur_len += l else: break if chunks and len(chunks[-1]) > width: self._handle_long_word(chunks, cur_line, cur_len, width) # Don't drop space at end of line if using extra_space for # marking flowed lines because otherwise there is no space between # this line and the next when decoding the flowed text if cur_line and cur_line[-1].strip() == '' and not self.extra_space: del cur_line[-1] if cur_line: lines.append(''.join(cur_line)) return lines def _parseFlowableChunks(text, quotechars='>%|'): """Parse out encodeble chunks, determining chunk type First step is to remove and count quoting marks, determining the quotedepth of the text. Then the type of the lines is detected. Paragraphs are determined by terminating lines; terminating lines are changes in quoting (depth or quoting used, signatures or fixed lines (see below) Fixed lines are used for lines with nothing but whitespace and for lines with whitespace prepended (indented lines). Any line with only two dashes at the start and whitespace is a signature seperator. Example code: >>> result = _parseFlowableChunks(u'\\n'.join(( ... u'Normal text, as long as they are not delimited by empty ', ... u'lines will be considered paragraphs and will be parsed as ', ... u'such.', ... u'', ... u'> > Quoting will be detected as well, and as long as it is ', ... u'> > consistent text will be collected into one paragraph.', ... u'> Changes in depth trigger a new paragraph.', ... u'> Leading whitespace makes for fixed lines.', ... u'Signature separators are dealt with accordingly:', ... u'-- ' ... ))) >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 0}, ... u'Normal text, as long as they are not delimited by empty ' ... u'lines will be considered paragraphs and will be parsed as ' ... u'such.') True >>> result.next() == ({'type': FIXED, 'quotedepth': 0}, u'') True >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 2}, ... u'Quoting will be detected as well, and as long as it is ' ... u'consistent text will be collected into one paragraph.') True >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 1}, ... u'Changes in depth trigger a new paragraph.') True >>> result.next() == ({'type': FIXED, 'quotedepth': 1}, ... u' Leading whitespace makes for fixed lines.') True >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 0}, ... u'Signature separators are dealt with accordingly:') True >>> result.next() == ({'type': SIGNATURE_SEPARATOR, 'quotedepth': 0}, ... u'-- ') True >>> result.next() Traceback (most recent call last): ... StopIteration """ # Match quotemarks with limited whitespace around them qm_match = re.compile('(^\s{0,2}([%s]\s?)+)' % quotechars).match # Find all quotemarks qm_findall = re.compile('[%s]' % quotechars).findall quotedepth = 0 quotemarks = '' para = u'' for line in text.splitlines(): has_quotes = qm_match(line) same_quotes = quotemarks and line.startswith(quotemarks) if (has_quotes and not same_quotes) or (not has_quotes and quotedepth): # Change in quoting if para: yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para para = u'' quotemarks = has_quotes and has_quotes.group(0) or u'' quotedepth = len(qm_findall(quotemarks)) line = line[len(quotemarks):] if line.rstrip() == u'--': # signature separator if para: yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para para = u'' yield {'type': SIGNATURE_SEPARATOR, 'quotedepth': quotedepth}, line continue if line.strip() == u'' or line.lstrip() != line: # Fixed line if para: yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para para = u'' yield {'type': FIXED, 'quotedepth': quotedepth}, line continue # Paragraph line; store and loop to next line para += line if para: yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para def _test(verbose=False): import doctest return doctest.testmod(verbose=verbose) if __name__ == '__main__': import sys _test('-v' in sys.argv)