Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/RetroStatus/formatflowed.py@ 202

Last change on this file since 202 was 201, checked in by Nicholas Riley, 18 years ago
RetroStatus
File size: 33.4 KB

Line
1	"""RFC 3676 format=flowed text processing.
2
3	This module provides an API to create and display text/plain; format=flowed
4	mimetype text.
5
6	"""
7
8	# Copyright (C) 2005 Martijn Pieters
9	# Written by Martijn Pieters <mj@zopatista.com>
10	# Development was sponsored by Logicalware (http://www.logicalware.org/)
11	# Licensed as Open Source under the same terms as the Python 2.4.1 license,
12	# as available at http://www.python.org/2.4.1/license.html
13
14	__revision_id__ = '$Id: formatflowed.py 27 2005-09-17 19:13:48Z mj $'
15
16	import re
17	import textwrap
18
# Public names of this module.
__all__ = [
    'PARAGRAPH',
    'FIXED',
    'SIGNATURE_SEPARATOR',
    'FormatFlowedDecoder',
    'FormatFlowedEncoder',
    'decode',
    'encode',
    'convertToWrapped',
    'convertToFlowed'
]

# Constants denoting the various text chunk types recognized by format=flowed.
# They are the integers 0, 1 and 2 and are used as the 'type' value of the
# information dictionary that accompanies every text chunk.
PARAGRAPH, FIXED, SIGNATURE_SEPARATOR = range(3)
33
34
35	# -- Public classes ----------------------------------------------------
36
37
class FormatFlowedDecoder:
    """Decode format=flowed text into typed chunks of unicode text

    Two instance attributes steer the interpretation of the input:
      delete_space (default: False)
        When true, the trailing space that marks a line as flowed is removed
        before the line is added to its paragraph; corresponds to the DelSp
        mime parameter.
      character_set (default: us-ascii)
        The encoding of the text passed in. Every line is decoded to unicode
        with this encoding, using the default error handling scheme.

    """
    def __init__(self, delete_space=False, character_set='us-ascii'):
        self.character_set = character_set
        self.delete_space = delete_space

    # -- Private methods -----------------------------------------------

    def _stripquotes(self, line):
        """Split the leading quotemarks ('>') off a line

        Returns a (number of quotemarks, remainder of the line) tuple:

          >>> decoder = FormatFlowedDecoder()
          >>> decoder._stripquotes(u'>>> quoted line')
          (3, u' quoted line')

        A line without quotemarks has depth 0 and is returned as is:

          >>> decoder._stripquotes(u'non-quoted line')
          (0, u'non-quoted line')

        """
        depth = 0
        # Walk over the leading run of quotemarks; slicing (instead of
        # indexing) keeps this safe for empty lines.
        while line[depth:depth + 1] == '>':
            depth += 1
        return depth, line[depth:]

    def _stripstuffing(self, line):
        """Drop the single space-stuffing character, if there is one

          >>> decoder = FormatFlowedDecoder()
          >>> decoder._stripstuffing(u' stuffed line')
          u'stuffed line'
          >>> decoder._stripstuffing(u'non-stuffed line')
          u'non-stuffed line'

        Only the first space is stuffing; any further spacing is content:

          >>> decoder._stripstuffing(u'  extra leading space')
          u' extra leading space'

        """
        if line[:1] != u' ':
            return line
        return line[1:]

    def _stripflow(self, line):
        """Drop the trailing flow space, but only if delete_space is set

        With the default delete_space=False the line is returned untouched:

          >>> decoder = FormatFlowedDecoder()
          >>> decoder._stripflow(u'flowed line ')
          u'flowed line '

        With delete_space=True exactly one trailing space is removed:

          >>> decoder = FormatFlowedDecoder(delete_space=True)
          >>> decoder._stripflow(u'flowed line ')
          u'flowed line'
          >>> decoder._stripflow(u'extra whitespace  ')
          u'extra whitespace '

        """
        if not self.delete_space:
            return line
        if line[-1:] == u' ':
            return line[:-1]
        return line

    # -- Public API ----------------------------------------------------

    def decode(self, flowed):
        """Decode flowed text

        flowed is split on CRLF and every line is decoded with the
        character_set attribute. The result is a generator of
        (information, chunk) tuples. information is a dictionary with the
        keys:
          type
            One of PARAGRAPH, FIXED, SIGNATURE_SEPARATOR
          quotedepth
            Number of quotemarks found on the text chunk

        chunk is a unicode string without quotemarks and without space
        stuffing. The flowed lines of a paragraph are joined into a single
        PARAGRAPH chunk; it is up to the caller to re-wrap paragraphs and to
        add quotemarks again when displaying.

        A simple example:

          >>> CRLF = '\\r\\n'
          >>> decoder = FormatFlowedDecoder()
          >>> result = decoder.decode(CRLF.join((
          ...     ">> `Take some more tea,' the March Hare said to Alice, very ",
          ...     ">> earnestly.",
          ...     ">",
          ...     "> `I've had nothing yet,' Alice replied in an offended ",
          ...     "> tone, `so I can't take more.'",
          ...     "",
          ...     "`You mean you can't take less,' said the Hatter: `it's very ",
          ...     "easy to take more than nothing.'",
          ...     "",
          ...     "-- ",
          ...     "Lewis Carroll")))
          >>> list(result) == [
          ...     ({'quotedepth': 2, 'type': PARAGRAPH},
          ...      u"`Take some more tea,' the March Hare said to Alice, "
          ...      u"very earnestly."),
          ...     ({'quotedepth': 1, 'type': FIXED}, u""),
          ...     ({'quotedepth': 1, 'type': PARAGRAPH},
          ...      u"`I've had nothing yet,' Alice replied in an offended "
          ...      u"tone, `so I can't take more.'"),
          ...     ({'quotedepth': 0, 'type': FIXED}, u""),
          ...     ({'quotedepth': 0, 'type': PARAGRAPH},
          ...      u"`You mean you can't take less,' said the Hatter: `it's "
          ...      u"very easy to take more than nothing.'"),
          ...     ({'quotedepth': 0, 'type': FIXED}, u""),
          ...     ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- "),
          ...     ({'quotedepth': 0, 'type': FIXED}, u"Lewis Carroll")
          ... ]
          True

        A paragraph is normally closed by a fixed line at the same quote
        depth. Three other situations close an open paragraph as well:

        - a change in quotedepth:

          >>> result = decoder.decode(CRLF.join((
          ...     "> Depth one paragraph with flow space. ",
          ...     ">> Depth two paragraph with flow space. ",
          ...     "Depth zero paragraph with fixed line.")))
          >>> list(result) == [
          ...     ({'quotedepth': 1, 'type': PARAGRAPH},
          ...      u"Depth one paragraph with flow space. "),
          ...     ({'quotedepth': 2, 'type': PARAGRAPH},
          ...      u"Depth two paragraph with flow space. "),
          ...     ({'quotedepth': 0, 'type': FIXED},
          ...      u"Depth zero paragraph with fixed line.")]
          True

        - a signature separator:

          >>> result = decoder.decode(CRLF.join((
          ...     "A paragraph with flow space. ",
          ...     "-- ")))
          >>> list(result) == [
          ...     ({'quotedepth': 0, 'type': PARAGRAPH},
          ...      u"A paragraph with flow space. "),
          ...     ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- ")]
          True

        - the end of the message:

          >>> result = decoder.decode(CRLF.join((
          ...     "A paragraph with flow space. ",)))
          >>> list(result) == [
          ...     ({'quotedepth': 0, 'type': PARAGRAPH},
          ...      u"A paragraph with flow space. ")]
          True

        With delete_space set the flow space is not part of the text:

          >>> decoder = FormatFlowedDecoder(delete_space=True)
          >>> result = decoder.decode(CRLF.join((
          ...     "Contrived example with a word- ",
          ...     "break across the paragraph.")))
          >>> list(result) == [
          ...     ({'quotedepth': 0, 'type': PARAGRAPH},
          ...      u'Contrived example with a word-break across the '
          ...      u'paragraph.')]
          True

        Quotemarks and spaces are recognized after decoding, so character
        sets that encode them differently (such as cp037) work too:

          >>> decoder = FormatFlowedDecoder(character_set='cp037')
          >>> result = decoder.decode(CRLF.join((
          ...    "n@\\xe3\\x88\\x89\\xa2@\\x89\\xa2@\\x81@\\x98\\xa4\\x96\\xa3"
          ...    "\\x85\\x84@\\x97\\x81\\x99\\x81\\x87\\x99\\x81\\x97\\x88@",
          ...    "n@\\x85\\x95\\x83\\x96\\x84\\x85\\x84@\\x89\\x95@\\x83\\x97"
          ...    "\\xf0\\xf3\\xf7K")))
          >>> list(result) == [
          ...     ({'quotedepth': 1, 'type': PARAGRAPH},
          ...      u'This is a quoted paragraph encoded in cp037.')]
          True

        """
        # Text and information of the paragraph currently being collected
        pending = u''
        pending_info = {'type': PARAGRAPH}

        for raw in flowed.split('\r\n'):
            depth, text = self._stripquotes(raw.decode(self.character_set))
            text = self._stripstuffing(text)
            # An open paragraph without a recorded depth never counts as a
            # change, hence the default for the lookup.
            depth_changed = depth != pending_info.get('quotedepth', depth)

            if text == '-- ':
                # Signature separator; closes a paragraph left open by a
                # preceding flowed line.
                if pending:
                    yield (pending_info, pending)
                    pending_info = {'type': PARAGRAPH}
                    pending = u''
                yield ({'type': SIGNATURE_SEPARATOR,
                        'quotedepth': depth}, text)
            elif text.endswith(u' '):
                # Flowed line; a different quote depth starts a new paragraph
                if depth_changed:
                    yield (pending_info, pending)
                    pending_info = {'type': PARAGRAPH}
                    pending = u''
                pending += self._stripflow(text)
                pending_info['quotedepth'] = depth
            elif pending and not depth_changed:
                # Fixed line at the same depth completes the open paragraph
                yield (pending_info, pending + text)
                pending_info = {'type': PARAGRAPH}
                pending = u''
            else:
                # Stand-alone fixed line; flush a paragraph that was left
                # open at another quote depth first.
                if pending:
                    yield (pending_info, pending)
                    pending_info = {'type': PARAGRAPH}
                    pending = u''
                yield ({'type': FIXED, 'quotedepth': depth}, text)

        if pending:
            # The last line of the message was a flowed line
            yield (pending_info, pending)
303
304
class FormatFlowedEncoder:
    """Object to generate format=flowed text

    The following attributes influence the flowed formatting of text:
      extra_space (default: False)
        Use an extra space to create flowed lines; this requires that the
        DelSp flag is set on the Content-Type mime header. Use this flag on
        texts that have few or no spaces to break on.
      character_set (default: us-ascii)
        Encode the output to this character set.
      spacestuff_quoted (default: True)
        Always spacestuff quoted chunks, i.e. place a space between the
        quotemarks and the text.
      width (default: 78)
        The maximum line width generated for flowed paragraphs; fixed lines
        can still exceed this width. This value does not include the CRLF
        line endings.

    """
    def __init__(self, extra_space=False, character_set='us-ascii',
                 spacestuff_quoted=True, width=78):
        self.extra_space = extra_space
        self.character_set = character_set
        self.spacestuff_quoted = spacestuff_quoted
        self.width = width

    def _spacestuff(self, line, force=False):
        """Prepend a space to lines starting with ' ', '>' or 'From'

        Returns the (possibly) altered line. Set 'force' to True to skip the
        tests and prepend the space regardless:

          >>> encoder = FormatFlowedEncoder()
          >>> encoder._spacestuff(u' leading space needs to be preserved')
          u'  leading space needs to be preserved'
          >>> encoder._spacestuff(u'> can be confused for a quotemark')
          u' > can be confused for a quotemark'
          >>> encoder._spacestuff(u'From is often escaped by MTAs')
          u' From is often escaped by MTAs'
          >>> encoder._spacestuff(u'Padding is considered harmless')
          u'Padding is considered harmless'
          >>> encoder._spacestuff(u'So forcing it is fine', True)
          u' So forcing it is fine'

        Empty lines are never spacestuffed, not even when forced:

          >>> encoder._spacestuff(u'')
          u''
          >>> encoder._spacestuff(u'', True)
          u''

        """
        if not line:
            return line
        # Although the RFC doesn't say so explicitly, in practice 'From' only
        # needs escaping when (1) not quoted and (2) actually encoded as
        # 'From' (so independent of the unicode sequence u'From').
        # For simplicity's sake, we spacestuff it any time a line starts with
        # it before adding quotemarks and encoding the line.
        if force or line[0] in (' ', '>') or line.startswith('From'):
            return u' ' + line
        return line

    # -- Public API ----------------------------------------------------

    def encode(self, chunks):
        """Encode chunks of text to format=flowed

        chunks
          An iterable sequence of (information, text) tuples, where
          information is a dictionary with 'type' and 'quotedepth' keys. The
          'type' value is one of PARAGRAPH, FIXED or SIGNATURE_SEPARATOR, and
          the 'quotedepth' value a non-negative integer indicating the
          quoting depth. text should be the unicode text to be encoded.

        Every tuple is handed to encodeChunk (the information dictionary is
        passed as keyword arguments) and the results are concatenated.

        Example:

          >>> chunks = (
          ...     ({'quotedepth': 2, 'type': PARAGRAPH},
          ...      u"`Take some more tea,' the March Hare said to Alice, "
          ...      u"very earnestly."),
          ...     ({'quotedepth': 1, 'type': FIXED}, u""),
          ...     ({'quotedepth': 1, 'type': PARAGRAPH},
          ...      u"`I've had nothing yet,' Alice replied in an offended "
          ...      u"tone, `so I can't take more.'"),
          ...     ({'quotedepth': 0, 'type': FIXED}, u""),
          ...     ({'quotedepth': 0, 'type': PARAGRAPH},
          ...      u"`You mean you can't take less,' said the Hatter: `it's "
          ...      u"very easy to take more than nothing.'"),
          ...     ({'quotedepth': 0, 'type': FIXED}, u""),
          ...     ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- "),
          ...     ({'quotedepth': 0, 'type': PARAGRAPH}, u"Carol Lewis"),
          ... )
          >>> result = FormatFlowedEncoder(width=45).encode(chunks)
          >>> result.split('\\r\\n') == [
          ...     ">> `Take some more tea,' the March Hare said ",
          ...     ">> to Alice, very earnestly.",
          ...     ">",
          ...     "> `I've had nothing yet,' Alice replied in ",
          ...     "> an offended tone, `so I can't take more.'",
          ...     "",
          ...     "`You mean you can't take less,' said the ",
          ...     "Hatter: `it's very easy to take more than ",
          ...     "nothing.'",
          ...     "",
          ...     "-- ",
          ...     "Carol Lewis",
          ...     ""]
          True

        """
        return ''.join([self.encodeChunk(text, **info)
                        for info, text in chunks])

    def encodeChunk(self, chunk, type=PARAGRAPH, quotedepth=0):
        """Encode a chunk of text to format=flowed

        Arguments:
          chunk
            The unicode text to be encoded. Trailing whitespace is removed;
            newlines are considered to be whitespace and are converted to
            spaces.
          type (default: PARAGRAPH)
            Chunk type; one of PARAGRAPH, FIXED or SIGNATURE_SEPARATOR. When
            called with type SIGNATURE_SEPARATOR the chunk is ignored and
            '-- ' is written out.
          quotedepth (default: 0)
            The quote depth of the chunk.

        Returns the encoded text; every generated line, including the last
        one, is terminated with CRLF. Raises ValueError for a PARAGRAPH when
        the quotemarks leave no room for text within the configured width.

        Fixed lines are written out as one line:

          >>> encoder = FormatFlowedEncoder(width=45)
          >>> encoder.encodeChunk(u'A fixed line remains unaltered', FIXED)
          'A fixed line remains unaltered\\r\\n'
          >>> encoder.encodeChunk(u'Although quoting is prepended', FIXED, 2)
          '>> Although quoting is prepended\\r\\n'
          >>> encoder.encodeChunk(u'Trailing spaces are removed   ', FIXED)
          'Trailing spaces are removed\\r\\n'
          >>> encoder.encodeChunk(u'> and special first chars are fluffed',
          ...     FIXED)
          ' > and special first chars are fluffed\\r\\n'

        Paragraphs (the default type) are wrapped; all lines but the last
        one end in a space to mark them as flowed:

          >>> result = encoder.encodeChunk(
          ...     u"`Take some more tea,' the March Hare said to Alice, "
          ...     u"very earnestly.")
          >>> result == ("`Take some more tea,' the March Hare said \\r\\n"
          ...            "to Alice, very earnestly.\\r\\n")
          True
          >>> result = encoder.encodeChunk(
          ...     u"`I've had nothing yet,' Alice replied in an offended "
          ...     u"tone, `so I can't take more.'", PARAGRAPH, 1)
          >>> result == ("> `I've had nothing yet,' Alice replied in \\r\\n"
          ...            "> an offended tone, `so I can't take more.'\\r\\n")
          True

        This also holds when a wrapped line happens to have the same text
        as the last line of the paragraph:

          >>> FormatFlowedEncoder(width=5).encodeChunk(u'ab ab ab')
          'ab \\r\\nab \\r\\nab\\r\\n'

        Signature separators ignore the chunk value:

          >>> encoder.encodeChunk(u'-- ', SIGNATURE_SEPARATOR)
          '-- \\r\\n'
          >>> encoder.encodeChunk(u'-- ', SIGNATURE_SEPARATOR, 3)
          '>>> -- \\r\\n'
          >>> encoder.encodeChunk(u'foobar', SIGNATURE_SEPARATOR)
          '-- \\r\\n'

        The instance attributes influence the encoding:

        - extra_space allows flowed lines to be broken on something other
          than whitespace:

          >>> encoder = FormatFlowedEncoder(extra_space=True, width=45)
          >>> result = encoder.encodeChunk(
          ...     u'This is useful for texts with many word-breaks or few '
          ...     u'spaces')
          >>> result == ("This is useful for texts with many word- \\r\\n"
          ...            "breaks or few spaces\\r\\n")
          True

        - character_set controls the output encoding:

          >>> encoder = FormatFlowedEncoder(character_set='cp037')
          >>> result = encoder.encodeChunk(u'Can you read me now?',
          ...     quotedepth=1)
          >>> result == ('n@\\xc3\\x81\\x95@\\xa8\\x96\\xa4@\\x99\\x85\\x81'
          ...            '\\x84@\\x94\\x85@\\x95\\x96\\xa6o\\r\\n')
          True

        - spacestuff_quoted can be switched off:

          >>> encoder = FormatFlowedEncoder(spacestuff_quoted=False)
          >>> encoder.encodeChunk(u'Look Ma! No space!', quotedepth=1)
          '>Look Ma! No space!\\r\\n'

        RFC 2822 requires that generated lines never exceed the hard limit
        of 998 characters without the CRLF at the end; longer encoded lines
        are chopped up into pieces not exceeding that length:

          >>> encoder = FormatFlowedEncoder()
          >>> result = encoder.encodeChunk(u'-' * 1500, FIXED)
          >>> result = result.split('\\r\\n')
          >>> len(result)
          3
          >>> len(result[0])
          998
          >>> result == ['-' * 998, '-' * 502, '']
          True

        """
        # cleanup: replace newlines with spaces and remove trailing spaces
        chunk = ' '.join(chunk.rstrip().splitlines())

        # Pre-encode quoting
        quotemarker = (u'>' * quotedepth).encode(self.character_set)
        forcestuff = self.spacestuff_quoted and quotedepth > 0

        if type == SIGNATURE_SEPARATOR:
            chunk = u'-- '

        if type == PARAGRAPH:
            # Maximum width is reduced by stuffing and quotemarkers
            width = self.width - len(quotemarker) - 2
            if width <= 0:
                raise ValueError('Not enough width for both quoting and text')
            wrapper = _FlowedTextWrapper(width, self.extra_space)
            chunk = wrapper.wrap(chunk)
        else:
            chunk = [chunk]

        lines = []
        last_index = len(chunk) - 1
        for index, line in enumerate(chunk):
            # add space to flowed lines (all but last); this is an extra space
            # if the wrapping of paragraphs included spaces at the end of the
            # lines. The last line is identified by position: comparing line
            # text would leave earlier lines that equal the last line
            # without their flow space, turning them into fixed lines.
            if index != last_index:
                line += ' '
            line = self._spacestuff(line, forcestuff)
            line = quotemarker + line.encode(self.character_set)

            # Enforce a hard limit of 998 characters per line (excluding CRLF)
            # Unfortunately we can only enforce this after encoding,
            # otherwise we could flow lines that are too long.
            while len(line) > 998:
                lines.append(line[:998])
                line = line[998:]

            lines.append(line)

        lines.append('')  # ensure last ending CRLF
        return '\r\n'.join(lines)
593
594
595	# -- Convenience functions ---------------------------------------------
596
597
def decode(flowed, **kwargs):
    """Decode format=flowed text with a throw-away FormatFlowedDecoder

    All keyword arguments are used to construct the FormatFlowedDecoder
    instance; flowed is then handed to its decode method and the result of
    that call is returned. See FormatFlowedDecoder.decode for details.

    """
    return FormatFlowedDecoder(**kwargs).decode(flowed)
607
def encode(chunks, **kwargs):
    """Encode chunks of text with a throw-away FormatFlowedEncoder

    All keyword arguments are used to construct the FormatFlowedEncoder
    instance; chunks is then handed to its encode method and the result of
    that call is returned. See FormatFlowedEncoder.encode for details.

    """
    return FormatFlowedEncoder(**kwargs).encode(chunks)
617
def convertToWrapped(flowed, width=78, quote=u'>', wrap_fixed=True, **kwargs):
    """Convert flowed text to wrapped plain text

    The flowed text is decoded and every chunk is written out again as one
    or more lines, joined with newlines. The arguments are interpreted as
    follows:
      flowed
        The format=flowed formatted text to convert
      width (default: 78)
        The maximum line length at which to wrap paragraphs.
      quote (default: u'>')
        Character sequence to use to mark quote depths; it is multiplied with
        the quotedepth to quote a line. If this sequence does not end in a
        space, a space is added between the quotemarks and the line.
      wrap_fixed (default: True)
        If true, fixed text chunks are wrapped to the given width as well.

    The remaining arguments are used as arguments to FormatFlowedDecoder.

    Empty chunks and signature separators are never wrapped; they are
    written out as a single (quoted) line.

    Here is a simple example:

      >>> CRLF = '\\r\\n'
      >>> result = convertToWrapped(CRLF.join((
      ...     ">> `Take some more tea,' the March Hare said to Alice, very ",
      ...     ">> earnestly.",
      ...     ">",
      ...     "> `I've had nothing yet,' Alice replied in an offended ",
      ...     "> tone, `so I can't take more.'",
      ...     "",
      ...     "`You mean you can't take less,' said the Hatter: `it's very ",
      ...     "easy to take more than nothing.'",
      ...     "",
      ...     "-- ",
      ...     "Lewis Caroll")), width=60)
      >>> result.split('\\n') == [
      ...     ">> `Take some more tea,' the March Hare said to Alice, very",
      ...     ">> earnestly.",
      ...     "> ",
      ...     "> `I've had nothing yet,' Alice replied in an offended tone,",
      ...     "> `so I can't take more.'",
      ...     "",
      ...     "`You mean you can't take less,' said the Hatter: `it's very",
      ...     "easy to take more than nothing.'",
      ...     "",
      ...     "-- ",
      ...     "Lewis Caroll"]
      True

    """
    lines = []
    for info, chunk in decode(flowed, **kwargs):
        kind = info['type']
        depth = info['quotedepth']

        # Build the prefix for this chunk; unquoted chunks get none
        prefix = quote * depth or u''
        if prefix and quote[-1] != u' ':
            prefix += u' '

        verbatim = ((kind == FIXED and not wrap_fixed) or not chunk
                    or kind == SIGNATURE_SEPARATOR)
        if verbatim:
            lines.append(prefix + chunk)
            continue

        lines.extend(textwrap.wrap(chunk, width,
                                   replace_whitespace=False,
                                   initial_indent=prefix,
                                   subsequent_indent=prefix))
    return u'\n'.join(lines)
684
def convertToFlowed(text, quotechars=u'>\\|%', **kwargs):
    """Convert plain text to format=flowed

    Attempt to interpret the plain text as paragraphs and fixed lines,
    creating a format=flowed encoded text. The paragraph detection is fairly
    simple and probably not suitable for real-world email.

      text
        Unicode text to be converted. Paragraphs are detected based on
        empty lines between them; lines with extra whitespace at the start
        are made fixed lines to preserve that whitespace.
      quotechars (default: the characters '>', '|' and '%')
        A set of characters recognized as quote markers; used to detect
        quote depth. The value is placed inside a regular expression
        character class by _parseFlowableChunks, which is why the default
        spells the pipe character with a preceding backslash.

    Additional kwargs are passed on to FormatFlowedEncoder.

    """
    encoder = FormatFlowedEncoder(**kwargs)
    return encoder.encode(_parseFlowableChunks(text, quotechars))
705
706
707	# -- Private classes and methods ---------------------------------------
708
709
710	class _FlowedTextWrapper(textwrap.TextWrapper):
711	"""Custom text wrapper for flowed text
712
713	When not using extra spaces, only break on spaces; when we are using
714	extra spaces, don't swallow whitespace at the start and end of lines, but
715	do break long words (as they can be reconstructed with DelSpace on).
716
717	"""
718	def __init__(self, width=78, extra_space=False):
719	textwrap.TextWrapper.__init__(self, width,
720	break_long_words=extra_space)
721	self.extra_space = extra_space
722	if not extra_space:
723	self.wordsep_re = re.compile(r'(\s+)')
724
725	def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
726	# _handle_long_word taken from python 2.5 CVS speed optimisation
727	# Can be removed if this is used with python 2.5
728	space_left = max(width - cur_len, 1)
729	if self.break_long_words:
730	cur_line.append(reversed_chunks[-1][:space_left])
731	reversed_chunks[-1] = reversed_chunks[-1][space_left:]
732	elif not cur_line:
733	cur_line.append(reversed_chunks.pop())
734
735	def _wrap(self, chunks):
736	# Simplified and customized version of textwrap.TextWrapper
737	# Based on textwrapper rev. 1.37 in python CVS, with speed optimisation
738	lines = []
739	chunks.reverse()
740	while chunks:
741	cur_line = []
742	cur_len = 0
743	width = self.width
744
745	# Don't strip space at the start of a line when using extra_space
746	# because spaces are significant there.
747	if chunks[-1].strip() == '' and lines and not self.extra_space:
748	del chunks[-1]
749
750	while chunks:
751	l = len(chunks[-1])
752	if cur_len + l <= width:
753	cur_line.append(chunks.pop())
754	cur_len += l
755	else:
756	break
757
758	if chunks and len(chunks[-1]) > width:
759	self._handle_long_word(chunks, cur_line, cur_len, width)
760
761	# Don't drop space at end of line if using extra_space for
762	# marking flowed lines because otherwise there is no space between
763	# this line and the next when decoding the flowed text
764	if cur_line and cur_line[-1].strip() == '' and not self.extra_space:
765	del cur_line[-1]
766
767	if cur_line:
768	lines.append(''.join(cur_line))
769	return lines
770
771
def _parseFlowableChunks(text, quotechars='>%\\|'):
    """Parse out encodable chunks, determining chunk type

    Generates (information, chunk) tuples as accepted by
    FormatFlowedEncoder.encode. text is processed line by line:

    First step is to remove and count quoting marks, determining the
    quotedepth of the line. quotechars is placed inside a regular expression
    character class to recognize the quoting marks. Then the type of the
    line is detected.

    Paragraphs are determined by terminating lines; terminating lines are
    changes in quoting (depth or quoting used), signatures or fixed lines
    (see below). The lines of a paragraph are joined with a single space,
    unless the paragraph text already ends in whitespace.

    Fixed lines are used for lines with nothing but whitespace and for lines
    with whitespace prepended (indented lines).

    Any line with only two dashes at the start and whitespace is a signature
    separator.

    Example code:

      >>> result = _parseFlowableChunks(u'\\n'.join((
      ...     u'Normal text, as long as they are not delimited by empty ',
      ...     u'lines will be considered paragraphs and will be parsed as ',
      ...     u'such.',
      ...     u'',
      ...     u'> > Quoting will be detected as well, and as long as it is ',
      ...     u'> > consistent text will be collected into one paragraph.',
      ...     u'> Changes in depth trigger a new paragraph.',
      ...     u'>  Leading whitespace makes for fixed lines.',
      ...     u'Signature separators are dealt with accordingly:',
      ...     u'-- '
      ... )))
      >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 0},
      ...     u'Normal text, as long as they are not delimited by empty '
      ...     u'lines will be considered paragraphs and will be parsed as '
      ...     u'such.')
      True
      >>> result.next() == ({'type': FIXED, 'quotedepth': 0}, u'')
      True
      >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 2},
      ...     u'Quoting will be detected as well, and as long as it is '
      ...     u'consistent text will be collected into one paragraph.')
      True
      >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 1},
      ...     u'Changes in depth trigger a new paragraph.')
      True
      >>> result.next() == ({'type': FIXED, 'quotedepth': 1},
      ...     u' Leading whitespace makes for fixed lines.')
      True
      >>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 0},
      ...     u'Signature separators are dealt with accordingly:')
      True
      >>> result.next() == ({'type': SIGNATURE_SEPARATOR, 'quotedepth': 0},
      ...     u'-- ')
      True
      >>> result.next()
      Traceback (most recent call last):
          ...
      StopIteration

    An increase in quote depth is a change in quoting as well:

      >>> list(_parseFlowableChunks(u'> one\\n> > two')) == [
      ...     ({'type': PARAGRAPH, 'quotedepth': 1}, u'one'),
      ...     ({'type': PARAGRAPH, 'quotedepth': 2}, u'two')]
      True

    Paragraph lines without trailing whitespace do not run together:

      >>> list(_parseFlowableChunks(u'first\\nsecond')) == [
      ...     ({'type': PARAGRAPH, 'quotedepth': 0}, u'first second')]
      True

    """
    # Match quotemarks with limited whitespace around them
    qm_match = re.compile(r'(^\s{0,2}([%s]\s?)+)' % quotechars).match
    # Find all quotemarks
    qm_findall = re.compile(r'[%s]' % quotechars).findall

    quotedepth = 0
    quotemarks = u''
    para = u''

    for line in text.splitlines():
        has_quotes = qm_match(line)
        line_marks = u''
        if has_quotes:
            line_marks = has_quotes.group(0)

        # The quoting is unchanged only if this line carries exactly the
        # same quote prefix as the previous one. A prefix test is not
        # enough: '> > deeper' starts with '> ' but is quoted more deeply,
        # and '> text' starts with a bare '>' without being indented text.
        if line_marks != quotemarks:
            # Change in quoting
            if para:
                yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
                para = u''

            quotemarks = line_marks
            quotedepth = len(qm_findall(quotemarks))

        line = line[len(quotemarks):]

        if line.rstrip() == u'--':
            # signature separator
            if para:
                yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
                para = u''

            yield {'type': SIGNATURE_SEPARATOR, 'quotedepth': quotedepth}, line
            continue

        if line.strip() == u'' or line.lstrip() != line:
            # Fixed line
            if para:
                yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
                para = u''

            yield {'type': FIXED, 'quotedepth': quotedepth}, line
            continue

        # Paragraph line; store and loop to next line. The line break that
        # separated this line from the previous one counts as whitespace, so
        # make sure the two lines do not run together.
        if para and not para[-1:].isspace():
            para += u' '
        para += line

    if para:
        yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
877
878	def _test(verbose=False):
879	import doctest
880	return doctest.testmod(verbose=verbose)
881
if __name__ == '__main__':
    # Executed as a script: run the doctests, verbosely when -v is among the
    # command line arguments.
    import sys
    _test('-v' in sys.argv)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: