source: trunk/RetroStatus/formatflowed.py @ 201

Last change on this file since 201 was 201, checked in by Nicholas Riley, 15 years ago

RetroStatus?

File size: 33.4 KB
Line 
1"""RFC 3676 format=flowed text processing.
2
3This module provides an API to create and display text/plain; format=flowed
4mimetype text.
5
6"""
7
8# Copyright (C) 2005 Martijn Pieters
9# Written by Martijn Pieters <mj@zopatista.com>
10# Development was sponsored by Logicalware (http://www.logicalware.org/)
11# Licensed as Open Source under the same terms as the Python 2.4.1 license,
12# as available at http://www.python.org/2.4.1/license.html
13
14__revision_id__ = '$Id: formatflowed.py 27 2005-09-17 19:13:48Z mj $'
15
16import re
17import textwrap
18
# Public API of this module: the chunk type constants, the decoder and
# encoder classes, and the convenience functions built on top of them.
__all__ = [
    'PARAGRAPH',
    'FIXED',
    'SIGNATURE_SEPARATOR',
    'FormatFlowedDecoder',
    'FormatFlowedEncoder',
    'decode',
    'encode',
    'convertToWrapped',
    'convertToFlowed'
]
30
# Identifiers for the kinds of text chunk that format=flowed distinguishes;
# they are the values found under the 'type' key of a chunk's information.
PARAGRAPH = 0
FIXED = 1
SIGNATURE_SEPARATOR = 2
33
34
35# -- Public classes ----------------------------------------------------
36
37
class FormatFlowedDecoder:
    """Object for converting format=flowed text to other formats

    The following instance attributes influence the interpretation of
    format=flowed text:
      delete_space (default: False)
        Delete the trailing space before the CRLF on flowed lines before
        interpreting the line on flowed input, corresponds to the DelSp mime
        parameter
      character_set (default: us-ascii)
        The encoding of byte strings passed in. Such text is decoded to
        unicode using this encoding, using the default error handling scheme.
        Text that already is unicode is used as-is and is never decoded.

    """
    def __init__(self, delete_space=False, character_set='us-ascii'):
        self.delete_space = delete_space
        self.character_set = character_set

    # -- Private methods -----------------------------------------------

    def _textlines(self, flowed):
        """Split flowed text on CRLF and serve the lines as unicode

        Byte strings are split first and then decoded line by line using the
        character_set instance attribute:

            >>> decoder = FormatFlowedDecoder()
            >>> list(decoder._textlines('one \\r\\ntwo'))
            [u'one ', u'two']

        Unicode text is split but otherwise passed through untouched, so
        characters that character_set cannot represent do not raise an error:

            >>> list(decoder._textlines(u'caf\\xe9 \\r\\nau lait'))
            [u'caf\\xe9 ', u'au lait']

        """
        if isinstance(flowed, type(u'')):
            # Already decoded; calling .decode() here would either fail
            # outright or force a lossy round-trip through ASCII.
            for line in flowed.split(u'\r\n'):
                yield line
        else:
            # The separator has to be of the same (byte string) type as the
            # input. Lines are decoded one at a time so that decoding errors
            # surface only when the offending line is reached.
            crlf = u'\r\n'.encode('ascii')
            for line in flowed.split(crlf):
                yield line.decode(self.character_set)

    def _stripquotes(self, line):
        """Remove quotemarks from the start of the line

        Returns the number of quotemarks stripped and the stripped line:

            >>> decoder = FormatFlowedDecoder()
            >>> decoder._stripquotes(u'>>> quoted line')
            (3, u' quoted line')

        Non-quoted lines are returned unchanged:

            >>> decoder._stripquotes(u'non-quoted line')
            (0, u'non-quoted line')

        """
        stripped = line.lstrip('>')
        return len(line) - len(stripped), stripped

    def _stripstuffing(self, line):
        """Remove the optional leading space

        Returns the stripped line:

            >>> decoder = FormatFlowedDecoder()
            >>> decoder._stripstuffing(u' stuffed line')
            u'stuffed line'

        Non-stuffed lines are returned unchanged:

            >>> decoder._stripstuffing(u'non-stuffed line')
            u'non-stuffed line'

        Additional spacing is preserved:

            >>> decoder._stripstuffing(u'  extra leading space')
            u' extra leading space'

        """
        if line.startswith(u' '):
            return line[1:]
        return line

    def _stripflow(self, line):
        """Remove the trailing flow space if delete_space is set

        The instance attribute delete_space is False by default thus this
        method returns the line unchanged:

            >>> decoder = FormatFlowedDecoder()
            >>> decoder._stripflow(u'flowed line ')
            u'flowed line '

        But if the delete_space attribute has been set to True the flow space
        is removed:

            >>> decoder = FormatFlowedDecoder(delete_space=True)
            >>> decoder._stripflow(u'flowed line ')
            u'flowed line'

        Only one flow space is removed:

            >>> decoder._stripflow(u'extra whitespace  ')
            u'extra whitespace '

        """
        if self.delete_space and line.endswith(u' '):
            return line[:-1]
        return line

    # -- Public API ----------------------------------------------------

    def decode(self, flowed):
        """Decode flowed text

        flowed is the format=flowed text with CRLF line endings, either as a
        byte string (decoded line by line using the character_set attribute)
        or as unicode text (used as-is).

        Returns an iterable serving a sequence of (information, chunk)
        tuples. information is a dictionary with the following fields:
          type
            One of PARAGRAPH, FIXED, SIGNATURE_SEPARATOR
          quotedepth
            Number of quotemarks found on the text chunk

        chunk is a unicode string. All text is unwrapped and without any
        quotemarks; when displaying these chunks, the appropriate quotemarks
        should be added again, and chunks of type PARAGRAPH should be
        displayed wrapped. Chunks of type FIXED should be displayed
        unwrapped.


        Examples
        --------

        Here is a simple example:

            >>> CRLF = '\\r\\n'
            >>> decoder = FormatFlowedDecoder()
            >>> result = decoder.decode(CRLF.join((
            ... ">> `Take some more tea,' the March Hare said to Alice, very ",
            ... ">> earnestly.",
            ... ">",
            ... "> `I've had nothing yet,' Alice replied in an offended ",
            ... "> tone, `so I can't take more.'",
            ... "",
            ... "`You mean you can't take less,' said the Hatter: `it's very ",
            ... "easy to take more than nothing.'",
            ... "",
            ... "-- ",
            ... "Lewis Carroll")))
            >>> list(result) == [
            ...   ({'quotedepth': 2, 'type': PARAGRAPH},
            ...     u"`Take some more tea,' the March Hare said to Alice, "
            ...     u"very earnestly."),
            ...   ({'quotedepth': 1, 'type': FIXED}, u""),
            ...   ({'quotedepth': 1, 'type': PARAGRAPH},
            ...    u"`I've had nothing yet,' Alice replied in an offended "
            ...    u"tone, `so I can't take more.'"),
            ...   ({'quotedepth': 0, 'type': FIXED}, u""),
            ...   ({'quotedepth': 0, 'type': PARAGRAPH},
            ...    u"`You mean you can't take less,' said the Hatter: `it's "
            ...    u"very easy to take more than nothing.'"),
            ...   ({'quotedepth': 0, 'type': FIXED}, u""),
            ...   ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- "),
            ...   ({'quotedepth': 0, 'type': FIXED}, u"Lewis Carroll")
            ... ]
            True


        Improperly closed paragraphs
        ----------------------------

        The decoder can deal with various cases of improperly formatted
        format=flowed messages. Paragraphs normally end with a fixed line,
        but the following cases are also considered paragraph-closing cases:

        - A change in quotedepth:

            >>> result = decoder.decode(CRLF.join((
            ... "> Depth one paragraph with flow space. ",
            ... ">> Depth two paragraph with flow space. ",
            ... "Depth zero paragraph with fixed line.")))
            >>> list(result) == [
            ...   ({'quotedepth': 1, 'type': PARAGRAPH},
            ...    u"Depth one paragraph with flow space. "),
            ...   ({'quotedepth': 2, 'type': PARAGRAPH},
            ...    u"Depth two paragraph with flow space. "),
            ...   ({'quotedepth': 0, 'type': FIXED},
            ...    u"Depth zero paragraph with fixed line.")]
            True

        - A signature separator:

            >>> result = decoder.decode(CRLF.join((
            ... "A paragraph with flow space. ",
            ... "-- ")))
            >>> list(result) == [
            ...   ({'quotedepth': 0, 'type': PARAGRAPH},
            ...    u"A paragraph with flow space. "),
            ...   ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- ")]
            True

        - The end of the message:

            >>> result = decoder.decode(CRLF.join((
            ... "A paragraph with flow space. ",)))
            >>> list(result) == [
            ...   ({'quotedepth': 0, 'type': PARAGRAPH},
            ...    u"A paragraph with flow space. ")]
            True


        Decoder options
        ---------------

        The delete_space attribute of the FormatFlowedDecoder class can be used
        to control whether or not the trailing space on flowed lines should be
        retained; this is used to encode flowed text where spaces are rare:

            >>> decoder = FormatFlowedDecoder(delete_space=True)
            >>> result = decoder.decode(CRLF.join((
            ... "Contrived example with a word- ",
            ... "break across the paragraph.")))
            >>> list(result) == [
            ...   ({'quotedepth': 0, 'type': PARAGRAPH},
            ...    u'Contrived example with a word-break across the '
            ...    u'paragraph.')]
            True

        Note that the characterset determines how to interpret a space
        and a quote marker in a byte string. The cp037 characterset does not
        encode these characters the same way, for example:

            >>> decoder = FormatFlowedDecoder(character_set='cp037')
            >>> result = decoder.decode(CRLF.join((
            ... "n@\\xe3\\x88\\x89\\xa2@\\x89\\xa2@\\x81@\\x98\\xa4\\x96\\xa3"
            ... "\\x85\\x84@\\x97\\x81\\x99\\x81\\x87\\x99\\x81\\x97\\x88@",
            ... "n@\\x85\\x95\\x83\\x96\\x84\\x85\\x84@\\x89\\x95@\\x83\\x97"
            ... "\\xf0\\xf3\\xf7K")))
            >>> list(result) == [
            ...   ({'quotedepth': 1, 'type': PARAGRAPH},
            ...    u'This is a quoted paragraph encoded in cp037.')]
            True

        """
        # para collects the flowed lines of the paragraph being assembled;
        # pinfo gains a 'quotedepth' key as soon as the first flowed line of
        # that paragraph has been seen.
        para = u''
        pinfo = {'type': PARAGRAPH}
        for line in self._textlines(flowed):
            quotedepth, line = self._stripquotes(line)
            line = self._stripstuffing(line)
            if line == u'-- ':
                # signature separator
                if para:
                    # exception case: flowed line followed by sig-sep
                    yield (pinfo, para)
                    pinfo = {'type': PARAGRAPH}
                    para = u''
                yield ({'type': SIGNATURE_SEPARATOR,
                        'quotedepth': quotedepth}, line)
                continue
            if line.endswith(u' '):
                # flowed line; collect into a paragraph
                if quotedepth != pinfo.get('quotedepth', quotedepth):
                    # exception case: flowed line followed by quotedepth change
                    yield (pinfo, para)
                    pinfo = {'type': PARAGRAPH}
                    para = u''
                para += self._stripflow(line)
                pinfo['quotedepth'] = quotedepth
                continue
            # fixed line
            if para:
                # completed paragraph
                if quotedepth != pinfo.get('quotedepth', quotedepth):
                    # exception case: flowed line followed by quotedepth
                    # change; the pending paragraph is closed and this line
                    # is then served as a fixed line of its own below
                    yield (pinfo, para)
                    pinfo = {'type': PARAGRAPH}
                    para = u''
                else:
                    # the fixed line is the last line of the paragraph
                    yield (pinfo, para + line)
                    pinfo = {'type': PARAGRAPH}
                    para = u''
                    continue
            yield ({'type': FIXED, 'quotedepth': quotedepth}, line)

        if para:
            # exception case: last line was a flowed line
            yield (pinfo, para)
303
304
class FormatFlowedEncoder:
    """Object to generate format=flowed text

    The following attributes influence the flowed formatting of text:
      extra_space (default: False)
        Use an extra space to create flowed lines; this requires that the
        DelSp flag will be set true on the Content-Type mime header. Use
        this flag on texts that have little or no spaces to break on.
      character_set (default: us-ascii)
        Encode the output to this character set.
      spacestuff_quoted (default: True)
        Always spacestuff quoted chunks, i.e. place a space between the quote
        markers and the text.
      width (default: 78)
        The maximum line width generated for flowed paragraphs; fixed lines
        can still exceed this width. This value does not include the CRLF
        line endings.

    """
    # Byte string building blocks for the encoded output. They are spelled
    # as encoded unicode literals so that they are byte strings regardless
    # of what a plain string literal means to the running interpreter.
    _CRLF = u'\r\n'.encode('ascii')
    _EMPTY = u''.encode('ascii')

    def __init__(self, extra_space=False, character_set='us-ascii',
                 spacestuff_quoted=True, width=78):
        self.extra_space = extra_space
        self.character_set = character_set
        self.spacestuff_quoted = spacestuff_quoted
        self.width = width

    def _spacestuff(self, line, force=False):
        """Prepend a space to lines starting with ' ', '>' or 'From'

        Returns the altered line. Set 'force' to True to skip the tests and
        always prepend the space regardless:

            >>> encoder = FormatFlowedEncoder()
            >>> encoder._spacestuff(u' leading space needs to be preserved')
            u'  leading space needs to be preserved'
            >>> encoder._spacestuff(u'> can be confused for a quotemark')
            u' > can be confused for a quotemark'
            >>> encoder._spacestuff(u'From is often escaped by MTAs')
            u' From is often escaped by MTAs'
            >>> encoder._spacestuff(u'Padding is considered harmless')
            u'Padding is considered harmless'
            >>> encoder._spacestuff(u'So forcing it is fine', True)
            u' So forcing it is fine'

        Note that empty lines can never be spacestuffed:

            >>> encoder._spacestuff(u'')
            u''

        """
        if not line:
            return line
        # Although the RFC doesn't say so explicitly, in practice 'From' only
        # needs escaping when (1) not quoted and (2) actually encoded as
        # 'From' (so independent of the unicode sequence u'From').
        # For simplicity's sake, we spacestuff it any time a line starts with
        # it before adding quotemarks and encoding the line.
        if force or line[0] in (' ', '>') or line.startswith('From'):
            return u' ' + line
        return line

    # -- Public API ----------------------------------------------------

    def encode(self, chunks):
        """Encode chunks of text to format=flowed

        chunks
          An iterable sequence of (information, text) tuples, where information
          is a dictionary with 'type' and 'quotedepth' keys. The 'type' value
          is one of PARAGRAPH, FIXED or SIGNATURE_SEPARATOR, and the
          'quotedepth' value a non-negative integer indicating the quoting
          depth. text should be the unicode text to be encoded.

        Returns the concatenated result of calling encodeChunk for every
        chunk, as one encoded (byte) string.

        Example
        -------

        To illustrate, an example:

            >>> chunks = (
            ...   ({'quotedepth': 2, 'type': PARAGRAPH},
            ...     u"`Take some more tea,' the March Hare said to Alice, "
            ...     u"very earnestly."),
            ...   ({'quotedepth': 1, 'type': FIXED}, u""),
            ...   ({'quotedepth': 1, 'type': PARAGRAPH},
            ...    u"`I've had nothing yet,' Alice replied in an offended "
            ...    u"tone, `so I can't take more.'"),
            ...   ({'quotedepth': 0, 'type': FIXED}, u""),
            ...   ({'quotedepth': 0, 'type': PARAGRAPH},
            ...    u"`You mean you can't take less,' said the Hatter: `it's "
            ...    u"very easy to take more than nothing.'"),
            ...   ({'quotedepth': 0, 'type': FIXED}, u""),
            ...   ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- "),
            ...   ({'quotedepth': 0, 'type': PARAGRAPH}, u"Carol Lewis"),
            ... )
            >>> result = FormatFlowedEncoder(width=45).encode(chunks)
            >>> result.split('\\r\\n') == [
            ...   ">> `Take some more tea,' the March Hare said ",
            ...   ">> to Alice, very earnestly.",
            ...   ">",
            ...   "> `I've had nothing yet,' Alice replied in ",
            ...   "> an offended tone, `so I can't take more.'",
            ...   "",
            ...   "`You mean you can't take less,' said the ",
            ...   "Hatter: `it's very easy to take more than ",
            ...   "nothing.'",
            ...   "",
            ...   "-- ",
            ...   "Carol Lewis",
            ...   ""]
            True

        """
        encoded = []
        for info, text in chunks:
            encoded.append(self.encodeChunk(text, **info))
        return self._EMPTY.join(encoded)

    def encodeChunk(self, chunk, type=PARAGRAPH, quotedepth=0):
        """Encode a chunk of text to format=flowed

        The chunk is encoded to format=flowed text, controlled by the
        following arguments.
        chunk
          The unicode text to be encoded. Newlines are considered to be
          whitespace and will be converted to spaces.
        type (default: PARAGRAPH)
          Chunk type; one of PARAGRAPH, FIXED or SIGNATURE_SEPARATOR. When
          called with type SIGNATURE_SEPARATOR the chunk is ignored and '-- '
          is written out.
        quotedepth (default: 0)
          The quote depth of the chunk.

        Returns the encoded (byte) string, every generated line terminated by
        CRLF. A ValueError is raised for PARAGRAPH chunks when the width
        attribute leaves no room for text next to the quotemarks.


        Examples
        --------

        The encoder has to deal with three types of text chunks. To illustrate,
        we create an encoder instance with a narrow width:

            >>> encoder = FormatFlowedEncoder(width=45)

        We can then use this encoder to encode some examples of these different
        types:

        - fixed lines:

            >>> encoder.encodeChunk(u'A fixed line remains unaltered', FIXED)
            'A fixed line remains unaltered\\r\\n'
            >>> encoder.encodeChunk(u'Although quoting is prepended', FIXED, 2)
            '>> Although quoting is prepended\\r\\n'
            >>> encoder.encodeChunk(u'Trailing spaces are removed  ', FIXED)
            'Trailing spaces are removed\\r\\n'
            >>> encoder.encodeChunk(u'> and special first chars are fluffed',
            ...                     FIXED)
            ' > and special first chars are fluffed\\r\\n'

        - a paragraph (the default type):

            >>> result = encoder.encodeChunk(
            ...   u"`Take some more tea,' the March Hare said to Alice, "
            ...   u"very earnestly.")
            >>> result == ("`Take some more tea,' the March Hare said \\r\\n"
            ...            "to Alice, very earnestly.\\r\\n")
            True
            >>> result = encoder.encodeChunk(
            ...   u"`I've had nothing yet,' Alice replied in an offended "
            ...   u"tone, `so I can't take more.'", PARAGRAPH, 1)
            >>> result == ("> `I've had nothing yet,' Alice replied in \\r\\n"
            ...            "> an offended tone, `so I can't take more.'\\r\\n")
            True
            >>> result = encoder.encodeChunk(
            ...   u'The   wrapping   deals   quite   well  with > eratic '
            ...   u'spacing and space fluffs characters where needed.')
            >>> result == ("The   wrapping   deals   quite   well  with \\r\\n"
            ...            " > eratic spacing and space fluffs \\r\\n"
            ...            "characters where needed.\\r\\n")
            True

          Every line but the last one is flowed, also when a line happens to
          read the same as the last line of the paragraph:

            >>> FormatFlowedEncoder(width=8).encodeChunk(u'spam spam spam')
            'spam \\r\\nspam \\r\\nspam\\r\\n'

        - signature separators:

            >>> encoder.encodeChunk(u'-- ', SIGNATURE_SEPARATOR)
            '-- \\r\\n'
            >>> encoder.encodeChunk(u'-- ', SIGNATURE_SEPARATOR, 3)
            '>>> -- \\r\\n'

          Note that the actual chunk value is ignored for this type:

            >>> encoder.encodeChunk(u'foobar', SIGNATURE_SEPARATOR)
            '-- \\r\\n'


        Encoder options
        ---------------

        The encoding can be influenced by several instance attributes; the
        width attribute was used for the paragraph demonstrations. Others
        include 'extra_space', 'character_set' and 'spacestuff_quoted':

        - extra_space generates extra spaces on flowed lines so flowed lines
          can be broken on something other than whitespace:

            >>> encoder = FormatFlowedEncoder(extra_space=True, width=45)
            >>> result = encoder.encodeChunk(
            ...   u'This is useful for texts with many word-breaks or few '
            ...   u'spaces')
            >>> result == ("This is useful for texts with many word- \\r\\n"
            ...            "breaks or few spaces\\r\\n")
            True

        - character_set controls the output encoding:

            >>> encoder = FormatFlowedEncoder(character_set='cp037')
            >>> result = encoder.encodeChunk(u'Can you read me now?',
            ...                              quotedepth=1)
            >>> result == ('n@\\xc3\\x81\\x95@\\xa8\\x96\\xa4@\\x99\\x85\\x81'
            ...            '\\x84@\\x94\\x85@\\x95\\x96\\xa6o\\r\\n')
            True

        - spacestuff_quoted causes quoted lines to be spacestuffed by default;
          this makes for slightly more readable quoted text output. It is on
          by default, but can be switched off:

            >>> encoder = FormatFlowedEncoder(spacestuff_quoted=False)
            >>> encoder.encodeChunk(u'Look Ma! No space!', quotedepth=1)
            '>Look Ma! No space!\\r\\n'


        RFC 2822 compliance
        -------------------

        Note that RFC 2822 requires that generated lines never exceed the
        hard limit of 998 characters without the CRLF at the end. The encoder
        has to enforce this by chopping the lines up into pieces not exceeding
        that length:

            >>> encoder = FormatFlowedEncoder()
            >>> result = encoder.encodeChunk(u'-' * 1500, FIXED)
            >>> result = result.split('\\r\\n')
            >>> len(result)
            3
            >>> len(result[0])
            998
            >>> result == ['-' * 998, '-' * 502, '']
            True

        """
        # cleanup: replace newlines with spaces and remove trailing spaces
        chunk = ' '.join(chunk.rstrip().splitlines())

        # Pre-encode quoting
        quotemarker = (u'>' * quotedepth).encode(self.character_set)
        forcestuff = self.spacestuff_quoted and quotedepth > 0

        if type == SIGNATURE_SEPARATOR:
            chunk = u'-- '

        if type == PARAGRAPH:
            # Maximum width is reduced by stuffing and quotemarkers
            width = self.width - len(quotemarker) - 2
            if width <= 0:
                raise ValueError('Not enough width for both quoting and text')
            pieces = _FlowedTextWrapper(width, self.extra_space).wrap(chunk)
        else:
            pieces = [chunk]

        lines = []
        last = len(pieces) - 1
        for index, line in enumerate(pieces):
            # add space to flowed lines (all but last); this is an extra space
            # if the wrapping of paragraphs included spaces at the end of the
            # lines. The last line is recognized by position: comparing the
            # text would also match earlier lines that read the same, which
            # would then wrongly be written out as fixed lines.
            if index != last:
                line += ' '
            line = self._spacestuff(line, forcestuff)
            line = quotemarker + line.encode(self.character_set)

            # Enforce a hard limit of 998 characters per line (excluding CRLF)
            # Unfortunately we can only enforce this *after* encoding,
            # otherwise we could flow lines that are too long.
            # The chopped-off remainder is written out as-is, without
            # repeating the quotemarker.
            while len(line) > 998:
                lines.append(line[:998])
                line = line[998:]

            lines.append(line)

        lines.append(self._EMPTY) # ensure last ending CRLF
        return self._CRLF.join(lines)
593
594
595# -- Convenience functions ---------------------------------------------
596
597
def decode(flowed, **kwargs):
    """Decode format=flowed text with a one-off decoder

    A FormatFlowedDecoder is constructed from the keyword arguments and its
    decode method is applied to flowed; the result of that call is returned.
    The FormatFlowedDecoder.decode docstring describes the input and the
    chunks that are produced.

    """
    return FormatFlowedDecoder(**kwargs).decode(flowed)
607
def encode(chunks, **kwargs):
    """Encode chunks of text to format=flowed with a one-off encoder

    A FormatFlowedEncoder is constructed from the keyword arguments and its
    encode method is applied to chunks; the result of that call is returned.
    The FormatFlowedEncoder.encode docstring describes the expected chunks.

    """
    return FormatFlowedEncoder(**kwargs).encode(chunks)
617
def convertToWrapped(flowed, width=78, quote=u'>', wrap_fixed=True, **kwargs):
    """Convert flowed text to decoded and wrapped text

    Create text suitable for a fixed-width, plain text display. The
    arguments are interpreted as follows:
      flowed
        The format=flowed formatted text to convert
      width (default: 78)
        The maximum line length at which to wrap paragraphs.
      quote (default: u'>')
        Character sequence to use to mark quote depths; it is multiplied with
        the quotedepth to quote a line. If this sequence does not end in a
        space a space is added between the quotemarks and the line.
      wrap_fixed (default: True)
        If true, fixed text chunks are wrapped to the given width as well,
        including hard word breaks if a word exceeds the line width

      The remaining arguments are used as arguments to FormatFlowedDecoder.

      The result is one unicode string with the lines joined by newlines.
      Every chunk results in at least one line; empty chunks and chunks that
      hold nothing but whitespace are shown as a blank line, carrying the
      quotemarks when quoted.

      Here is a simple example:

        >>> CRLF = '\\r\\n'
        >>> result = convertToWrapped(CRLF.join((
        ... ">> `Take some more tea,' the March Hare said to Alice, very ",
        ... ">> earnestly.",
        ... ">",
        ... "> `I've had nothing yet,' Alice replied in an offended ",
        ... "> tone, `so I can't take more.'",
        ... "",
        ... "`You mean you can't take less,' said the Hatter: `it's very ",
        ... "easy to take more than nothing.'",
        ... "",
        ... "-- ",
        ... "Lewis Caroll")), width=60)
        >>> result.split('\\n') == [
        ...   ">> `Take some more tea,' the March Hare said to Alice, very",
        ...   ">> earnestly.",
        ...   "> ",
        ...   "> `I've had nothing yet,' Alice replied in an offended tone,",
        ...   "> `so I can't take more.'",
        ...   "",
        ...   "`You mean you can't take less,' said the Hatter: `it's very",
        ...   "easy to take more than nothing.'",
        ...   "",
        ...   "-- ",
        ...   "Lewis Caroll"]
        True

    """
    result = []
    for info, chunk in decode(flowed, **kwargs):
        chunk_type = info['type']
        # Multiplying by a quotedepth of 0 gives an empty marker
        quotemarker = quote * info['quotedepth']
        if quotemarker and not quote.endswith(u' '):
            quotemarker += u' '

        # Chunks that are written out verbatim: blank lines, signature
        # separators and, when so requested, fixed lines.
        verbatim = (not chunk or chunk_type == SIGNATURE_SEPARATOR or
                    (chunk_type == FIXED and not wrap_fixed))
        if verbatim:
            result.append(quotemarker + chunk)
            continue

        wrapped = textwrap.wrap(chunk, width,
                                replace_whitespace=False,
                                initial_indent=quotemarker,
                                subsequent_indent=quotemarker)
        # textwrap.wrap returns an empty list for text that holds nothing
        # but whitespace; show a blank line instead of dropping the chunk.
        result.extend(wrapped or [quotemarker])
    return u'\n'.join(result)
684
def convertToFlowed(text, quotechars=u'>|%', **kwargs):
    """Convert plain text to format=flowed

    The plain text is split up into paragraph and fixed line chunks by
    _parseFlowableChunks and these chunks are then encoded as format=flowed
    text. The paragraph detection is fairly simple and probably not suitable
    for real-world email.

    text
      Unicode text to be converted. Paragraphs are detected based on the
      blank lines between them; lines that start with extra whitespace are
      made fixed lines so that this whitespace is preserved.
    quotechars (default: u'>|%')
      A set of characters recognized as quote markers; used to detect quote
      depth.

    Additional kwargs are passed on to FormatFlowedEncoder.

    """
    return FormatFlowedEncoder(**kwargs).encode(
        _parseFlowableChunks(text, quotechars))
705
706
707# -- Private classes and methods ---------------------------------------
708
709
class _FlowedTextWrapper(textwrap.TextWrapper):
    """Custom text wrapper for flowed text

    When not using extra spaces, only break on spaces; when we are using
    extra spaces, don't swallow whitespace at the start and end of lines, but
    do break long words (as they can be reconstructed with DelSpace on).

    width (default: 78)
      The maximum length of the wrapped lines.
    extra_space (default: False)
      Whether or not the lines will be flowed with an extra space.

    Without extra_space the whitespace at a line break is dropped:

        >>> _FlowedTextWrapper(10).wrap(u'hello world again')
        [u'hello', u'world', u'again']

    With extra_space it is retained, as the decoder removes only the space
    that is added to mark the line as flowed:

        >>> _FlowedTextWrapper(10, True).wrap(u'hello world again')
        [u'hello ', u'world ', u'again']

    """
    def __init__(self, width=78, extra_space=False):
        textwrap.TextWrapper.__init__(self, width,
                                      break_long_words=extra_space)
        self.extra_space = extra_space
        if not extra_space:
            # Split on whitespace only, never inside hyphenated words
            self.wordsep_re = re.compile(r'(\s+)')

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        # _handle_long_word taken from python 2.5 CVS speed optimisation
        # Can be removed if this is used with python 2.5
        space_left = max(width - cur_len, 1)
        if self.break_long_words:
            # Fill up the current line and leave the rest for the next line
            cur_line.append(reversed_chunks[-1][:space_left])
            reversed_chunks[-1] = reversed_chunks[-1][space_left:]
        elif not cur_line:
            # An unbreakable word gets an (overlong) line of its own
            cur_line.append(reversed_chunks.pop())

    def _wrap_chunks(self, chunks):
        # Simplified and customized version of textwrap.TextWrapper
        # Based on textwrapper rev. 1.37 in python CVS, with speed optimisation
        #
        # This has to be named _wrap_chunks: that is the method that
        # TextWrapper.wrap calls to turn the chunks into lines, so under any
        # other name the customizations below are never used.
        lines = []
        chunks.reverse()
        while chunks:
            cur_line = []
            cur_len = 0
            width = self.width

            # Don't strip space at the start of a line when using extra_space
            # because spaces are significant there.
            if chunks[-1].strip() == '' and lines and not self.extra_space:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len
                else:
                    break

            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # Don't drop space at end of line if using extra_space for
            # marking flowed lines because otherwise there is no space between
            # this line and the next when decoding the flowed text
            if cur_line and cur_line[-1].strip() == '' and not self.extra_space:
                del cur_line[-1]

            if cur_line:
                lines.append(''.join(cur_line))
        return lines

    # The name this method used to go by, kept for code calling it directly
    _wrap = _wrap_chunks
770
771
def _parseFlowableChunks(text, quotechars='>%|'):
    """Parse out encodable chunks, determining chunk type

    First step is to remove and count quoting marks, determining the quotedepth
    of the text. Then the type of the lines is detected.

    Paragraphs are determined by terminating lines; terminating lines are
    changes in quoting (depth or quoting used), signatures or fixed lines
    (see below). The quoting of a line is the exact prefix matched as quote
    marks, so a line quoted deeper than the previous one starts a new
    paragraph as well. Paragraph lines are concatenated as-is; no separator
    is inserted between them.

    Fixed lines are used for lines with nothing but whitespace and for lines
    with whitespace prepended (indented lines).

    Any line with only two dashes at the start and whitespace is a signature
    separator.

    quotechars lists the characters recognised as quote marks; they are
    matched literally.

    Yields (info, text) tuples, where info is a dictionary with the keys
    'type' (PARAGRAPH, FIXED or SIGNATURE_SEPARATOR) and 'quotedepth'.

    Example code:

        >>> result = _parseFlowableChunks(u'\\n'.join((
        ...     u'Normal text, as long as they are not delimited by empty ',
        ...     u'lines will be considered paragraphs and will be parsed as ',
        ...     u'such.',
        ...     u'',
        ...     u'> > Quoting will be detected as well, and as long as it is ',
        ...     u'> > consistent text will be collected into one paragraph.',
        ...     u'> Changes in depth trigger a new paragraph.',
        ...     u'>      Leading whitespace makes for fixed lines.',
        ...     u'Signature separators are dealt with accordingly:',
        ...     u'-- '
        ... )))
        >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 0},
        ...     u'Normal text, as long as they are not delimited by empty '
        ...     u'lines will be considered paragraphs and will be parsed as '
        ...     u'such.')
        True
        >>> result.next() == ({'type': FIXED, 'quotedepth': 0}, u'')
        True
        >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 2},
        ...     u'Quoting will be detected as well, and as long as it is '
        ...     u'consistent text will be collected into one paragraph.')
        True
        >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 1},
        ...     u'Changes in depth trigger a new paragraph.')
        True
        >>> result.next() == ({'type': FIXED, 'quotedepth': 1},
        ...     u'     Leading whitespace makes for fixed lines.')
        True
        >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 0},
        ...     u'Signature separators are dealt with accordingly:')
        True
        >>> result.next() == ({'type': SIGNATURE_SEPARATOR, 'quotedepth': 0},
        ...     u'-- ')
        True
        >>> result.next()
        Traceback (most recent call last):
            ...
        StopIteration

    Deeper quoting ends the current paragraph as well:

        >>> result = _parseFlowableChunks(u'> one \\n> > two')
        >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 1}, u'one ')
        True
        >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 2}, u'two')
        True

    """
    # Escape the quote characters so that characters with a special meaning
    # inside a character class can be used as quote marks too
    charclass = '[%s]' % re.escape(quotechars)
    # Match quotemarks with limited whitespace around them
    qm_match = re.compile(r'(^\s{0,2}(%s\s?)+)' % charclass).match
    # Find all quotemarks
    qm_findall = re.compile(charclass).findall

    quotedepth = 0
    quotemarks = u''
    para = []

    for line in text.splitlines():
        has_quotes = qm_match(line)
        if has_quotes:
            line_marks = has_quotes.group(0)
        else:
            line_marks = u''

        # Compare the complete quote prefix; testing only whether the line
        # starts with the previous prefix would miss deeper quoting.
        if line_marks != quotemarks:
            # Change in quoting
            if para:
                chunk = u''.join(para)
                yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, chunk
                para = []

            quotemarks = line_marks
            quotedepth = len(qm_findall(quotemarks))

        line = line[len(quotemarks):]

        if line.rstrip() == u'--':
            # signature separator
            chunktype = SIGNATURE_SEPARATOR
        elif line.strip() == u'' or line.lstrip() != line:
            # Fixed line
            chunktype = FIXED
        else:
            # Paragraph line; store and loop to next line
            para.append(line)
            continue

        # Signature separators and fixed lines terminate the paragraph
        if para:
            chunk = u''.join(para)
            yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, chunk
            para = []

        yield {'type': chunktype, 'quotedepth': quotedepth}, line

    if para:
        chunk = u''.join(para)
        yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, chunk
877
def _test(verbose=False):
    """Run the doctests through doctest.testmod and return its result.

    verbose is passed on to doctest.testmod unchanged.

    """
    from doctest import testmod
    return testmod(verbose=verbose)
881
if __name__ == '__main__':
    # Executed as a script: run the doctests, verbosely when -v is given
    import sys
    _test(verbose='-v' in sys.argv)
Note: See TracBrowser for help on using the repository browser.