source: trunk/RetroStatus/formatflowed.py@ 201

Last change on this file since 201 was 201, checked in by Nicholas Riley, 18 years ago

RetroStatus

File size: 33.4 KB
Line 
1"""RFC 3676 format=flowed text processing.
2
3This module provides an API to create and display text/plain; format=flowed
4mimetype text.
5
6"""
7
8# Copyright (C) 2005 Martijn Pieters
9# Written by Martijn Pieters <mj@zopatista.com>
10# Development was sponsored by Logicalware (http://www.logicalware.org/)
11# Licensed as Open Source under the same terms as the Python 2.4.1 license,
12# as available at http://www.python.org/2.4.1/license.html
13
14__revision_id__ = '$Id: formatflowed.py 27 2005-09-17 19:13:48Z mj $'
15
16import re
17import textwrap
18
# Public API of this module.
__all__ = [
    # chunk type constants
    'PARAGRAPH',
    'FIXED',
    'SIGNATURE_SEPARATOR',
    # decoder / encoder classes
    'FormatFlowedDecoder',
    'FormatFlowedEncoder',
    # convenience functions
    'decode',
    'encode',
    'convertToWrapped',
    'convertToFlowed',
]
30
# Chunk type markers; they appear as the 'type' value of the information
# dictionaries produced by the decoder and consumed by the encoder.
PARAGRAPH = 0
FIXED = 1
SIGNATURE_SEPARATOR = 2
33
34
35# -- Public classes ----------------------------------------------------
36
37
class FormatFlowedDecoder:
    """Convert format=flowed text into a sequence of text chunks.

    Two instance attributes steer how the flowed text is interpreted:

    delete_space (default: False)
        When true, the single trailing space that marks a flowed line is
        removed before the line is joined to its paragraph.  Corresponds
        to the DelSp mime parameter.
    character_set (default: us-ascii)
        Encoding of the text handed to decode(); every line is decoded to
        unicode with it, using the default error handling scheme.

    """

    def __init__(self, delete_space=False, character_set='us-ascii'):
        self.character_set = character_set
        self.delete_space = delete_space

    # -- Private methods -----------------------------------------------

    def _stripquotes(self, line):
        """Split the leading quotemarks off a line.

        Returns a (quotedepth, remainder) tuple:

          >>> decoder = FormatFlowedDecoder()
          >>> decoder._stripquotes(u'>>> quoted line')
          (3, u' quoted line')

        A line without quotemarks comes back untouched with depth 0:

          >>> decoder._stripquotes(u'non-quoted line')
          (0, u'non-quoted line')

        """
        unquoted = line.lstrip('>')
        depth = len(line) - len(unquoted)
        return depth, unquoted

    def _stripstuffing(self, line):
        """Drop the optional space-stuffing character.

        At most one leading space is removed:

          >>> decoder = FormatFlowedDecoder()
          >>> decoder._stripstuffing(u' stuffed line')
          u'stuffed line'
          >>> decoder._stripstuffing(u'non-stuffed line')
          u'non-stuffed line'
          >>> decoder._stripstuffing(u'  extra leading space')
          u' extra leading space'

        """
        if line[:1] == u' ':
            line = line[1:]
        return line

    def _stripflow(self, line):
        """Remove the trailing flow space if delete_space is set.

        With the default delete_space=False the line is returned as is:

          >>> decoder = FormatFlowedDecoder()
          >>> decoder._stripflow(u'flowed line ')
          u'flowed line '

        With delete_space=True exactly one trailing space is removed:

          >>> decoder = FormatFlowedDecoder(delete_space=True)
          >>> decoder._stripflow(u'flowed line ')
          u'flowed line'
          >>> decoder._stripflow(u'extra whitespace  ')
          u'extra whitespace '

        """
        if not self.delete_space:
            return line
        if line[-1:] == u' ':
            return line[:-1]
        return line

    # -- Public API ----------------------------------------------------

    def decode(self, flowed):
        """Decode flowed text.

        flowed is split on CRLF and each line is decoded with the
        character_set attribute.  The method is a generator yielding
        (information, chunk) tuples, where information is a dictionary
        with the keys:

        type
            One of PARAGRAPH, FIXED, SIGNATURE_SEPARATOR
        quotedepth
            Number of quotemarks found on the text chunk

        and chunk is a unicode string without quotemarks.  Flowed lines
        are joined into a single PARAGRAPH chunk; it is up to the caller
        to re-add quotemarks and to wrap PARAGRAPH chunks for display.
        FIXED chunks should be displayed unwrapped.


        Examples
        --------

        A simple example:

          >>> CRLF = '\\r\\n'
          >>> decoder = FormatFlowedDecoder()
          >>> result = decoder.decode(CRLF.join((
          ...     ">> `Take some more tea,' the March Hare said to Alice, very ",
          ...     ">> earnestly.",
          ...     ">",
          ...     "> `I've had nothing yet,' Alice replied in an offended ",
          ...     "> tone, `so I can't take more.'",
          ...     "",
          ...     "`You mean you can't take less,' said the Hatter: `it's very ",
          ...     "easy to take more than nothing.'",
          ...     "",
          ...     "-- ",
          ...     "Lewis Carroll")))
          >>> list(result) == [
          ...     ({'quotedepth': 2, 'type': PARAGRAPH},
          ...       u"`Take some more tea,' the March Hare said to Alice, "
          ...       u"very earnestly."),
          ...     ({'quotedepth': 1, 'type': FIXED}, u""),
          ...     ({'quotedepth': 1, 'type': PARAGRAPH},
          ...       u"`I've had nothing yet,' Alice replied in an offended "
          ...       u"tone, `so I can't take more.'"),
          ...     ({'quotedepth': 0, 'type': FIXED}, u""),
          ...     ({'quotedepth': 0, 'type': PARAGRAPH},
          ...       u"`You mean you can't take less,' said the Hatter: `it's "
          ...       u"very easy to take more than nothing.'"),
          ...     ({'quotedepth': 0, 'type': FIXED}, u""),
          ...     ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- "),
          ...     ({'quotedepth': 0, 'type': FIXED}, u"Lewis Carroll")
          ... ]
          True


        Improperly closed paragraphs
        ----------------------------

        A paragraph normally ends with a fixed line.  The decoder also
        closes a paragraph in these cases:

        - A change in quotedepth:

          >>> result = decoder.decode(CRLF.join((
          ...     "> Depth one paragraph with flow space. ",
          ...     ">> Depth two paragraph with flow space. ",
          ...     "Depth zero paragraph with fixed line.")))
          >>> list(result) == [
          ...     ({'quotedepth': 1, 'type': PARAGRAPH},
          ...       u"Depth one paragraph with flow space. "),
          ...     ({'quotedepth': 2, 'type': PARAGRAPH},
          ...       u"Depth two paragraph with flow space. "),
          ...     ({'quotedepth': 0, 'type': FIXED},
          ...       u"Depth zero paragraph with fixed line.")]
          True

        - A signature separator:

          >>> result = decoder.decode(CRLF.join((
          ...     "A paragraph with flow space. ",
          ...     "-- ")))
          >>> list(result) == [
          ...     ({'quotedepth': 0, 'type': PARAGRAPH},
          ...       u"A paragraph with flow space. "),
          ...     ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- ")]
          True

        - The end of the message:

          >>> result = decoder.decode(CRLF.join((
          ...     "A paragraph with flow space. ",)))
          >>> list(result) == [
          ...     ({'quotedepth': 0, 'type': PARAGRAPH},
          ...       u"A paragraph with flow space. ")]
          True


        Decoder options
        ---------------

        delete_space controls whether the trailing space on flowed lines
        is retained; it is meant for flowed text in which spaces are rare:

          >>> decoder = FormatFlowedDecoder(delete_space=True)
          >>> result = decoder.decode(CRLF.join((
          ...     "Contrived example with a word- ",
          ...     "break across the paragraph.")))
          >>> list(result) == [
          ...     ({'quotedepth': 0, 'type': PARAGRAPH},
          ...       u'Contrived example with a word-break across the '
          ...       u'paragraph.')]
          True

        Lines are decoded before quotemarks and spaces are looked for, so
        the character set determines how those are recognised.  cp037
        encodes them differently from ASCII, for example:

          >>> decoder = FormatFlowedDecoder(character_set='cp037')
          >>> result = decoder.decode(CRLF.join((
          ...     "n@\\xe3\\x88\\x89\\xa2@\\x89\\xa2@\\x81@\\x98\\xa4\\x96\\xa3"
          ...     "\\x85\\x84@\\x97\\x81\\x99\\x81\\x87\\x99\\x81\\x97\\x88@",
          ...     "n@\\x85\\x95\\x83\\x96\\x84\\x85\\x84@\\x89\\x95@\\x83\\x97"
          ...     "\\xf0\\xf3\\xf7K")))
          >>> list(result) == [
          ...     ({'quotedepth': 1, 'type': PARAGRAPH},
          ...       u'This is a quoted paragraph encoded in cp037.')]
          True

        """
        pending = u''                  # text collected for the open paragraph
        meta = {'type': PARAGRAPH}     # information dict of that paragraph

        for raw in flowed.split('\r\n'):
            text = raw.decode(self.character_set)
            depth, text = self._stripquotes(text)
            text = self._stripstuffing(text)

            if text == '-- ':
                # Signature separator; it also closes an open paragraph.
                if pending:
                    yield (meta, pending)
                    meta, pending = {'type': PARAGRAPH}, u''
                yield ({'type': SIGNATURE_SEPARATOR,
                        'quotedepth': depth}, text)
            elif text.endswith(u' '):
                # Flowed line: add it to the open paragraph, first closing
                # that paragraph if the quote depth changed.
                if meta.get('quotedepth', depth) != depth:
                    yield (meta, pending)
                    meta, pending = {'type': PARAGRAPH}, u''
                pending += self._stripflow(text)
                meta['quotedepth'] = depth
            elif pending and meta.get('quotedepth', depth) == depth:
                # Fixed line at the same depth completes the paragraph.
                yield (meta, pending + text)
                meta, pending = {'type': PARAGRAPH}, u''
            else:
                # Stand-alone fixed line.  An open paragraph can only be at
                # another quote depth here; it is closed first.
                if pending:
                    yield (meta, pending)
                    meta, pending = {'type': PARAGRAPH}, u''
                yield ({'type': FIXED, 'quotedepth': depth}, text)

        if pending:
            # The message ended on a flowed line.
            yield (meta, pending)
303
304
class FormatFlowedEncoder:
    """Object to generate format=flowed text

    The following attributes influence the flowed formatting of text:

    extra_space (default: False)
        Use an extra space to create flowed lines; this requires that the
        DelSp parameter is set to yes on the Content-Type mime header.
        Use this on texts that have few or no spaces to break on.
    character_set (default: us-ascii)
        Encode the output to this character set.
    spacestuff_quoted (default: True)
        Always spacestuff quoted chunks, i.e. place a space between the
        quote markers and the text.
    width (default: 78)
        The maximum line width generated for flowed paragraphs; fixed
        lines can still exceed this width.  This value does not include
        the CRLF line endings.

    """

    def __init__(self, extra_space=False, character_set='us-ascii',
                 spacestuff_quoted=True, width=78):
        self.extra_space = extra_space
        self.character_set = character_set
        self.spacestuff_quoted = spacestuff_quoted
        self.width = width

    def _spacestuff(self, line, force=False):
        """Prepend a space to lines starting with ' ', '>' or 'From'

        Returns the altered line.  Set 'force' to True to skip the tests
        and always prepend the space regardless:

          >>> encoder = FormatFlowedEncoder()
          >>> encoder._spacestuff(u' leading space needs to be preserved')
          u'  leading space needs to be preserved'
          >>> encoder._spacestuff(u'> can be confused for a quotemark')
          u' > can be confused for a quotemark'
          >>> encoder._spacestuff(u'From is often escaped by MTAs')
          u' From is often escaped by MTAs'
          >>> encoder._spacestuff(u'Padding is considered harmless')
          u'Padding is considered harmless'
          >>> encoder._spacestuff(u'So forcing it is fine', True)
          u' So forcing it is fine'

        Note that empty lines can never be spacestuffed:

          >>> encoder._spacestuff(u'')
          u''

        """
        if not line:
            return line
        # Although the RFC doesn't say so explicitly, in practice 'From'
        # only needs escaping when (1) not quoted and (2) actually encoded
        # as 'From' (so independent of the unicode sequence u'From').
        # For simplicity's sake, we spacestuff it any time a line starts
        # with it before adding quotemarks and encoding the line.
        if force or line[0] in (' ', '>') or line.startswith('From'):
            return u' ' + line
        return line

    # -- Public API ----------------------------------------------------

    def encode(self, chunks):
        """Encode chunks of text to format=flowed

        chunks
            An iterable sequence of (information, text) tuples, where
            information is a dictionary with 'type' and 'quotedepth' keys.
            The 'type' value is one of PARAGRAPH, FIXED or
            SIGNATURE_SEPARATOR, and the 'quotedepth' value a non-negative
            integer indicating the quoting depth.  text should be the
            unicode text to be encoded.

        Each information dictionary is passed to encodeChunk as keyword
        arguments; the encoded chunks are concatenated and returned.

        Example
        -------

          >>> chunks = (
          ...     ({'quotedepth': 2, 'type': PARAGRAPH},
          ...       u"`Take some more tea,' the March Hare said to Alice, "
          ...       u"very earnestly."),
          ...     ({'quotedepth': 1, 'type': FIXED}, u""),
          ...     ({'quotedepth': 1, 'type': PARAGRAPH},
          ...       u"`I've had nothing yet,' Alice replied in an offended "
          ...       u"tone, `so I can't take more.'"),
          ...     ({'quotedepth': 0, 'type': FIXED}, u""),
          ...     ({'quotedepth': 0, 'type': PARAGRAPH},
          ...       u"`You mean you can't take less,' said the Hatter: `it's "
          ...       u"very easy to take more than nothing.'"),
          ...     ({'quotedepth': 0, 'type': FIXED}, u""),
          ...     ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- "),
          ...     ({'quotedepth': 0, 'type': PARAGRAPH}, u"Carol Lewis"),
          ... )
          >>> result = FormatFlowedEncoder(width=45).encode(chunks)
          >>> result.split('\\r\\n') == [
          ...     ">> `Take some more tea,' the March Hare said ",
          ...     ">> to Alice, very earnestly.",
          ...     ">",
          ...     "> `I've had nothing yet,' Alice replied in ",
          ...     "> an offended tone, `so I can't take more.'",
          ...     "",
          ...     "`You mean you can't take less,' said the ",
          ...     "Hatter: `it's very easy to take more than ",
          ...     "nothing.'",
          ...     "",
          ...     "-- ",
          ...     "Carol Lewis",
          ...     ""]
          True

        """
        return ''.join([self.encodeChunk(text, **info)
                        for info, text in chunks])

    def encodeChunk(self, chunk, type=PARAGRAPH, quotedepth=0):
        """Encode a chunk of text to format=flowed

        chunk
            The unicode text to be encoded.  Newlines are considered to be
            whitespace and will be converted to spaces; trailing
            whitespace is removed.
        type (default: PARAGRAPH)
            Chunk type; one of PARAGRAPH, FIXED or SIGNATURE_SEPARATOR.
            When called with type SIGNATURE_SEPARATOR the chunk is ignored
            and '-- ' is written out.
        quotedepth (default: 0)
            The quote depth of the chunk.

        Returns the encoded text, every line terminated by CRLF.  Raises
        ValueError for a PARAGRAPH when the width attribute leaves no room
        for text next to the quote markers.


        Examples
        --------

        The encoder has to deal with three types of text chunks.  To
        illustrate, we create an encoder instance with a narrow width:

          >>> encoder = FormatFlowedEncoder(width=45)

        - fixed lines:

          >>> encoder.encodeChunk(u'A fixed line remains unaltered', FIXED)
          'A fixed line remains unaltered\\r\\n'
          >>> encoder.encodeChunk(u'Although quoting is prepended', FIXED, 2)
          '>> Although quoting is prepended\\r\\n'
          >>> encoder.encodeChunk(u'Trailing spaces are removed    ', FIXED)
          'Trailing spaces are removed\\r\\n'
          >>> encoder.encodeChunk(u'> and special first chars are fluffed',
          ...     FIXED)
          ' > and special first chars are fluffed\\r\\n'

        - a paragraph (the default type):

          >>> result = encoder.encodeChunk(
          ...     u"`Take some more tea,' the March Hare said to Alice, "
          ...     u"very earnestly.")
          >>> result == ("`Take some more tea,' the March Hare said \\r\\n"
          ...            "to Alice, very earnestly.\\r\\n")
          True
          >>> result = encoder.encodeChunk(
          ...     u"`I've had nothing yet,' Alice replied in an offended "
          ...     u"tone, `so I can't take more.'", PARAGRAPH, 1)
          >>> result == ("> `I've had nothing yet,' Alice replied in \\r\\n"
          ...            "> an offended tone, `so I can't take more.'\\r\\n")
          True
          >>> result = encoder.encodeChunk(
          ...     u'The wrapping deals quite well with          > erratic   '
          ...     u'spacing and space fluffs characters where needed.')
          >>> result == ("The wrapping deals quite well with \\r\\n"
          ...            " > erratic   spacing and space fluffs \\r\\n"
          ...            "characters where needed.\\r\\n")
          True

          Every line but the last one of a paragraph is flowed, also when
          two lines happen to carry the same text:

          >>> FormatFlowedEncoder(width=5).encodeChunk(u'abc abc')
          'abc \\r\\nabc\\r\\n'

        - signature separators:

          >>> encoder.encodeChunk(u'-- ', SIGNATURE_SEPARATOR)
          '-- \\r\\n'
          >>> encoder.encodeChunk(u'-- ', SIGNATURE_SEPARATOR, 3)
          '>>> -- \\r\\n'

          Note that the actual chunk value is ignored for this type:

          >>> encoder.encodeChunk(u'foobar', SIGNATURE_SEPARATOR)
          '-- \\r\\n'


        Encoder options
        ---------------

        - extra_space generates extra spaces on flowed lines so flowed
          lines can be broken on something other than whitespace:

          >>> encoder = FormatFlowedEncoder(extra_space=True, width=45)
          >>> result = encoder.encodeChunk(
          ...     u'This is useful for texts with many word-breaks or few '
          ...     u'spaces')
          >>> result == ("This is useful for texts with many word- \\r\\n"
          ...            "breaks or few spaces\\r\\n")
          True

        - character_set controls the output encoding:

          >>> encoder = FormatFlowedEncoder(character_set='cp037')
          >>> result = encoder.encodeChunk(u'Can you read me now?',
          ...     quotedepth=1)
          >>> result == ('n@\\xc3\\x81\\x95@\\xa8\\x96\\xa4@\\x99\\x85\\x81'
          ...            '\\x84@\\x94\\x85@\\x95\\x96\\xa6o\\r\\n')
          True

        - spacestuff_quoted causes quoted lines to be spacestuffed; this
          makes for slightly more readable quoted text output.  It is on
          by default, but can be switched off:

          >>> encoder = FormatFlowedEncoder(spacestuff_quoted=False)
          >>> encoder.encodeChunk(u'Look Ma! No space!', quotedepth=1)
          '>Look Ma! No space!\\r\\n'


        RFC 2822 compliance
        -------------------

        RFC 2822 requires that generated lines never exceed the hard limit
        of 998 characters without the CRLF at the end.  The encoder
        enforces this by chopping encoded lines into pieces not exceeding
        that length:

          >>> encoder = FormatFlowedEncoder()
          >>> result = encoder.encodeChunk(u'-' * 1500, FIXED)
          >>> result = result.split('\\r\\n')
          >>> len(result)
          3
          >>> len(result[0])
          998
          >>> result == ['-' * 998, '-' * 502, '']
          True

        """
        # cleanup: replace newlines with spaces and remove trailing spaces
        chunk = ' '.join(chunk.rstrip().splitlines())

        # Pre-encode quoting
        quotemarker = (u'>' * quotedepth).encode(self.character_set)
        forcestuff = self.spacestuff_quoted and quotedepth > 0

        if type == SIGNATURE_SEPARATOR:
            chunk = u'-- '

        if type == PARAGRAPH:
            # Maximum width is reduced by stuffing and quotemarkers
            width = self.width - len(quotemarker) - 2
            if width <= 0:
                raise ValueError('Not enough width for both quoting and text')
            wrapped = _FlowedTextWrapper(width, self.extra_space).wrap(chunk)
        else:
            wrapped = [chunk]

        lines = []
        last = len(wrapped) - 1
        for index, line in enumerate(wrapped):
            # Add a space to flowed lines (all but the last); this is an
            # extra space if the wrapping of paragraphs kept spaces at the
            # end of the lines.  The decision is made by position: comparing
            # line text would leave a line that happens to equal the last
            # one unflowed and so break the paragraph in two.
            if index != last:
                line += ' '
            line = self._spacestuff(line, forcestuff)
            line = quotemarker + line.encode(self.character_set)

            # Enforce a hard limit of 998 characters per line (excluding
            # CRLF).  Unfortunately we can only enforce this *after*
            # encoding, otherwise we could flow lines that are too long.
            while len(line) > 998:
                lines.append(line[:998])
                line = line[998:]

            lines.append(line)

        lines.append('')  # ensure last ending CRLF
        return '\r\n'.join(lines)
593
594
595# -- Convenience functions ---------------------------------------------
596
597
def decode(flowed, **kwargs):
    """Decode format=flowed text with a throw-away decoder.

    Every keyword argument is handed to the FormatFlowedDecoder
    constructor; the result of its decode method for ``flowed`` is
    returned.  See the FormatFlowedDecoder.decode docstring for details.

    """
    return FormatFlowedDecoder(**kwargs).decode(flowed)
607
def encode(chunks, **kwargs):
    """Encode chunks of text to format=flowed with a throw-away encoder.

    Every keyword argument is handed to the FormatFlowedEncoder
    constructor; the result of its encode method for ``chunks`` is
    returned.  See the FormatFlowedEncoder.encode docstring for details.

    """
    return FormatFlowedEncoder(**kwargs).encode(chunks)
617
def convertToWrapped(flowed, width=78, quote=u'>', wrap_fixed=True, **kwargs):
    """Convert flowed text to wrapped plain text

    Create text suitable for a fixed-width, plain text display.  The
    arguments are interpreted as follows:

    flowed
        The format=flowed formatted text to convert
    width (default: 78)
        The maximum line length at which to wrap paragraphs.
    quote (default: u'>')
        Character sequence to use to mark quote depths; it is multiplied
        with the quotedepth to quote a line.  If this sequence does not
        end in a space, a space is added between the quotemarks and the
        line.
    wrap_fixed (default: True)
        If true, fixed text chunks are wrapped to the given width as well.

    The remaining keyword arguments are passed to FormatFlowedDecoder.
    Empty chunks and signature separators are never wrapped.  The
    resulting lines are joined with a newline.

    Here is a simple example:

      >>> CRLF = '\\r\\n'
      >>> result = convertToWrapped(CRLF.join((
      ...     ">> `Take some more tea,' the March Hare said to Alice, very ",
      ...     ">> earnestly.",
      ...     ">",
      ...     "> `I've had nothing yet,' Alice replied in an offended ",
      ...     "> tone, `so I can't take more.'",
      ...     "",
      ...     "`You mean you can't take less,' said the Hatter: `it's very ",
      ...     "easy to take more than nothing.'",
      ...     "",
      ...     "-- ",
      ...     "Lewis Caroll")), width=60)
      >>> result.split('\\n') == [
      ...     ">> `Take some more tea,' the March Hare said to Alice, very",
      ...     ">> earnestly.",
      ...     "> ",
      ...     "> `I've had nothing yet,' Alice replied in an offended tone,",
      ...     "> `so I can't take more.'",
      ...     "",
      ...     "`You mean you can't take less,' said the Hatter: `it's very",
      ...     "easy to take more than nothing.'",
      ...     "",
      ...     "-- ",
      ...     "Lewis Caroll"]
      True

    """
    output = []
    for info, chunk in decode(flowed, **kwargs):
        kind = info['type']
        depth = info['quotedepth']

        # Build the quote prefix for this chunk.
        prefix = u''
        if depth:
            prefix = quote * depth or u''
        if prefix and quote[-1] != u' ':
            prefix += u' '

        # Chunks emitted as a single line, without wrapping.
        verbatim = (not chunk
                    or kind == SIGNATURE_SEPARATOR
                    or (kind == FIXED and not wrap_fixed))
        if verbatim:
            output.append(prefix + chunk)
            continue

        output.extend(textwrap.wrap(chunk, width,
                                    replace_whitespace=False,
                                    initial_indent=prefix,
                                    subsequent_indent=prefix))
    return u'\n'.join(output)
684
def convertToFlowed(text, quotechars=u'>|%', **kwargs):
    """Convert plain text to format=flowed

    The plain text is split into paragraphs and fixed lines by
    _parseFlowableChunks and the resulting chunks are encoded with a
    FormatFlowedEncoder.  The paragraph detection is fairly simple and
    probably not suitable for real-world email.

    text
        Unicode text to be converted.  Paragraphs are detected based on
        the blank lines between them; lines with extra whitespace at the
        start are made fixed to preserve that whitespace.
    quotechars (default: u'>|%')
        A set of characters recognized as quote markers; used to detect
        quote depth.

    Additional kwargs are passed on to FormatFlowedEncoder.

    """
    chunks = _parseFlowableChunks(text, quotechars)
    return FormatFlowedEncoder(**kwargs).encode(chunks)
705
706
707# -- Private classes and methods ---------------------------------------
708
709
710class _FlowedTextWrapper(textwrap.TextWrapper):
711 """Custom text wrapper for flowed text
712
713 When not using extra spaces, only break on spaces; when we are using
714 extra spaces, don't swallow whitespace at the start and end of lines, but
715 do break long words (as they can be reconstructed with DelSpace on).
716
717 """
718 def __init__(self, width=78, extra_space=False):
719 textwrap.TextWrapper.__init__(self, width,
720 break_long_words=extra_space)
721 self.extra_space = extra_space
722 if not extra_space:
723 self.wordsep_re = re.compile(r'(\s+)')
724
725 def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
726 # _handle_long_word taken from python 2.5 CVS speed optimisation
727 # Can be removed if this is used with python 2.5
728 space_left = max(width - cur_len, 1)
729 if self.break_long_words:
730 cur_line.append(reversed_chunks[-1][:space_left])
731 reversed_chunks[-1] = reversed_chunks[-1][space_left:]
732 elif not cur_line:
733 cur_line.append(reversed_chunks.pop())
734
735 def _wrap(self, chunks):
736 # Simplified and customized version of textwrap.TextWrapper
737 # Based on textwrapper rev. 1.37 in python CVS, with speed optimisation
738 lines = []
739 chunks.reverse()
740 while chunks:
741 cur_line = []
742 cur_len = 0
743 width = self.width
744
745 # Don't strip space at the start of a line when using extra_space
746 # because spaces are significant there.
747 if chunks[-1].strip() == '' and lines and not self.extra_space:
748 del chunks[-1]
749
750 while chunks:
751 l = len(chunks[-1])
752 if cur_len + l <= width:
753 cur_line.append(chunks.pop())
754 cur_len += l
755 else:
756 break
757
758 if chunks and len(chunks[-1]) > width:
759 self._handle_long_word(chunks, cur_line, cur_len, width)
760
761 # Don't drop space at end of line if using extra_space for
762 # marking flowed lines because otherwise there is no space between
763 # this line and the next when decoding the flowed text
764 if cur_line and cur_line[-1].strip() == '' and not self.extra_space:
765 del cur_line[-1]
766
767 if cur_line:
768 lines.append(''.join(cur_line))
769 return lines
770
771
def _parseFlowableChunks(text, quotechars='>%|'):
    """Parse out encodable chunks, determining chunk type

    This is a generator yielding (information, text) tuples suitable for
    FormatFlowedEncoder.encode.

    text
        The plain text to parse; it is processed line by line.
    quotechars (default: '>%|')
        The characters recognized as quote markers.

    First the quoting marks at the start of each line are removed and
    counted, determining the quotedepth of the line.  Then the type of
    the line is detected.

    Consecutive paragraph lines are collected into one paragraph; they
    are joined with a single space unless the text collected so far
    already ends in whitespace.  A paragraph is terminated by a change in
    quoting (depth or the exact quote prefix used), by a signature
    separator or by a fixed line.

    Fixed lines are used for lines with nothing but whitespace and for
    lines with whitespace prepended (indented lines).

    Any line with only two dashes at the start followed by optional
    whitespace is a signature separator.

    Example code:

      >>> result = _parseFlowableChunks(u'\\n'.join((
      ...     u'Normal text, as long as they are not delimited by empty ',
      ...     u'lines will be considered paragraphs and will be parsed as ',
      ...     u'such.',
      ...     u'',
      ...     u'> > Quoting will be detected as well, and as long as it is ',
      ...     u'> > consistent text will be collected into one paragraph.',
      ...     u'> Changes in depth trigger a new paragraph.',
      ...     u'>  Leading whitespace makes for fixed lines.',
      ...     u'Signature separators are dealt with accordingly:',
      ...     u'-- '
      ... )))
      >>> list(result) == [
      ...     ({'type': PARAGRAPH, 'quotedepth': 0},
      ...      u'Normal text, as long as they are not delimited by empty '
      ...      u'lines will be considered paragraphs and will be parsed as '
      ...      u'such.'),
      ...     ({'type': FIXED, 'quotedepth': 0}, u''),
      ...     ({'type': PARAGRAPH, 'quotedepth': 2},
      ...      u'Quoting will be detected as well, and as long as it is '
      ...      u'consistent text will be collected into one paragraph.'),
      ...     ({'type': PARAGRAPH, 'quotedepth': 1},
      ...      u'Changes in depth trigger a new paragraph.'),
      ...     ({'type': FIXED, 'quotedepth': 1},
      ...      u' Leading whitespace makes for fixed lines.'),
      ...     ({'type': PARAGRAPH, 'quotedepth': 0},
      ...      u'Signature separators are dealt with accordingly:'),
      ...     ({'type': SIGNATURE_SEPARATOR, 'quotedepth': 0}, u'-- ')]
      True

    Lines without trailing whitespace are joined with a space:

      >>> list(_parseFlowableChunks(u'first line\\nsecond line')) == [
      ...     ({'type': PARAGRAPH, 'quotedepth': 0},
      ...      u'first line second line')]
      True

    An increase in quote depth starts a new paragraph as well:

      >>> list(_parseFlowableChunks(u'> one\\n> > two')) == [
      ...     ({'type': PARAGRAPH, 'quotedepth': 1}, u'one'),
      ...     ({'type': PARAGRAPH, 'quotedepth': 2}, u'two')]
      True

    """
    # Escape the quote characters so that none of them is special inside
    # the character class.
    quoteclass = '[%s]' % re.escape(quotechars)
    # Match quotemarks with limited whitespace around them
    qm_match = re.compile(r'(^\s{0,2}(%s\s?)+)' % quoteclass).match
    # Find all quotemarks
    qm_findall = re.compile(quoteclass).findall

    quotedepth = 0
    quotemarks = u''
    para = u''

    for line in text.splitlines():
        match = qm_match(line)
        prefix = u''
        if match:
            prefix = match.group(0)

        # Compare the complete quote prefix of this line with the previous
        # one.  A mere startswith() test would miss a deeper prefix that
        # extends the previous one ('> ' followed by '> > ').
        if prefix != quotemarks:
            # Change in quoting
            if para:
                yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
                para = u''

            quotemarks = prefix
            quotedepth = len(qm_findall(quotemarks))

        line = line[len(quotemarks):]

        if line.rstrip() == u'--':
            # signature separator
            if para:
                yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
                para = u''

            yield {'type': SIGNATURE_SEPARATOR, 'quotedepth': quotedepth}, line
            continue

        if line.strip() == u'' or line.lstrip() != line:
            # Fixed line
            if para:
                yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
                para = u''

            yield {'type': FIXED, 'quotedepth': quotedepth}, line
            continue

        # Paragraph line; store and loop to next line.  The line break
        # counts as whitespace, so make sure the words on either side of
        # it do not run together.
        if para and not para[-1].isspace():
            para += u' '
        para += line

    if para:
        yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
877
def _test(verbose=False):
    """Run the doctests and return the result of doctest.testmod.

    verbose is passed through to doctest.testmod.
    """
    from doctest import testmod
    return testmod(verbose=verbose)
881
if __name__ == '__main__':
    # Run the doctests when executed as a script; -v on the command line
    # requests verbose output.
    from sys import argv
    _test('-v' in argv)
Note: See TracBrowser for help on using the repository browser.