Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/RetroStatus/formatflowed.py@ 201

Last change on this file since 201 was 201, checked in by Nicholas Riley, 18 years ago
RetroStatus
File size: 33.4 KB

Rev	Line
[201]	1	"""RFC 3676 format=flowed text processing.
	2
	3	This module provides an API to create and display text/plain; format=flowed
	4	mimetype text.
	5
	6	"""
	7
	8	# Copyright (C) 2005 Martijn Pieters
	9	# Written by Martijn Pieters <mj@zopatista.com>
	10	# Development was sponsored by Logicalware (http://www.logicalware.org/)
	11	# Licensed as Open Source under the same terms as the Python 2.4.1 license,
	12	# as available at http://www.python.org/2.4.1/license.html
	13
	14	__revision_id__ = '$Id: formatflowed.py 27 2005-09-17 19:13:48Z mj $'
	15
	16	import re
	17	import textwrap
	18
	19	__all__ = [
	20	'PARAGRAPH',
	21	'FIXED',
	22	'SIGNATURE_SEPARATOR',
	23	'FormatFlowedDecoder',
	24	'FormatFlowedEncoder',
	25	'decode',
	26	'encode',
	27	'convertToWrapped',
	28	'convertToFlowed'
	29	]
	30
	31	# Constants denoting the various text chunk types recognized by format=flowed
	32	PARAGRAPH, FIXED, SIGNATURE_SEPARATOR = range(3)
	33
	34
	35	# -- Public classes ----------------------------------------------------
	36
	37
	38	class FormatFlowedDecoder:
	39	"""Object for converting format=flowed text to other formats
	40
	41	The following instance attributes influence the interpretation of
	42	format=flowed text:
	43	delete_space (default: False)
	44	Delete the trailing space before the CRLF on flowed lines before
	45	interpreting the line on flowed input, corresponds to the DelSp mime
	46	parameter
	47	character_set (default: us-ascii)
	48	The encoding of text passed in. Text is decoded to unicode using this
	49	encoding, using the default error handing scheme.
	50
	51	"""
	52	def __init__(self, delete_space=False, character_set='us-ascii'):
	53	self.delete_space = delete_space
	54	self.character_set = character_set
	55
	56	# -- Private methods -----------------------------------------------
	57
	58	def _stripquotes(self, line):
	59	"""Remove quotemarks from the start of the line
	60
	61	Returns the number of quotemarks stripped and the stripped line:
	62
	63	>>> decoder = FormatFlowedDecoder()
	64	>>> decoder._stripquotes(u'>>> quoted line')
	65	(3, u' quoted line')
	66
	67	Non-quoted lines are returned unchanged:
	68
	69	>>> decoder._stripquotes(u'non-quoted line')
	70	(0, u'non-quoted line')
	71
	72	"""
	73	stripped = line.lstrip('>')
	74	return len(line) - len(stripped), stripped
	75
	76	def _stripstuffing(self, line):
	77	"""Remove the optional leading space
	78
	79	Returns the stripped line:
	80
	81	>>> decoder = FormatFlowedDecoder()
	82	>>> decoder._stripstuffing(u' stuffed line')
	83	u'stuffed line'
	84
	85	Non-stuffed lines are returned unchanged:
	86
	87	>>> decoder._stripstuffing(u'non-stuffed line')
	88	u'non-stuffed line'
	89
	90	Additional spacing is preserved:
	91
	92	>>> decoder._stripstuffing(u' extra leading space')
	93	u' extra leading space'
	94
	95	"""
	96	if line.startswith(u' '):
	97	return line[1:]
	98	return line
	99
	100	def _stripflow(self, line):
	101	"""Remove the trailing flow space is delete_space is set
	102
	103	The instance attribute delete_space is False by default thus this
	104	method returns the line unchanged:
	105
	106	>>> decoder = FormatFlowedDecoder()
	107	>>> decoder._stripflow(u'flowed line ')
	108	u'flowed line '
	109
	110	But if the delete_space attribute has been set to True the flow space
	111	is removed:
	112
	113	>>> decoder = FormatFlowedDecoder(delete_space=True)
	114	>>> decoder._stripflow(u'flowed line ')
	115	u'flowed line'
	116
	117	Only one flow space is removed:
	118	>>> decoder._stripflow(u'extra whitespace ')
	119	u'extra whitespace '
	120
	121	"""
	122	if self.delete_space and line.endswith(u' '):
	123	return line[:-1]
	124	return line
	125
	126	# -- Public API ----------------------------------------------------
	127
	128	def decode(self, flowed):
	129	"""Decode flowed text
	130
	131	Returns an iterable serving a sequence of (information, chunk)
	132	tuples. information is a dictionary with the following fields:
	133	type
	134	One of PARAGRAPH, FIXED, SIGNATURE_SEPARATOR
	135	quotedepth
	136	Number of quotemarks found on the text chunk
	137
	138	chunk is a unicode string. All text is unwrapped and without any
	139	quotemarks; when displaying these chunks, the appropriate quotemarks
	140	should be added again, and chunks of type PARAGRAPH should be
	141	displayed wrapped. Chunks of type FIXED should be displayed
	142	unwrapped.
	143
	144
	145	Examples
	146	--------
	147
	148	Here is a simple example:
	149
	150	>>> CRLF = '\\r\\n'
	151	>>> decoder = FormatFlowedDecoder()
	152	>>> result = decoder.decode(CRLF.join((
	153	... ">> `Take some more tea,' the March Hare said to Alice, very ",
	154	... ">> earnestly.",
	155	... ">",
	156	... "> `I've had nothing yet,' Alice replied in an offended ",
	157	... "> tone, `so I can't take more.'",
	158	... "",
	159	... "`You mean you can't take less,' said the Hatter: `it's very ",
	160	... "easy to take more than nothing.'",
	161	... "",
	162	... "-- ",
	163	... "Lewis Carroll")))
	164	>>> list(result) == [
	165	... ({'quotedepth': 2, 'type': PARAGRAPH},
	166	... u"`Take some more tea,' the March Hare said to Alice, "
	167	... u"very earnestly."),
	168	... ({'quotedepth': 1, 'type': FIXED}, u""),
	169	... ({'quotedepth': 1, 'type': PARAGRAPH},
	170	... u"`I've had nothing yet,' Alice replied in an offended "
	171	... u"tone, `so I can't take more.'"),
	172	... ({'quotedepth': 0, 'type': FIXED}, u""),
	173	... ({'quotedepth': 0, 'type': PARAGRAPH},
	174	... u"`You mean you can't take less,' said the Hatter: `it's "
	175	... u"very easy to take more than nothing.'"),
	176	... ({'quotedepth': 0, 'type': FIXED}, u""),
	177	... ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- "),
	178	... ({'quotedepth': 0, 'type': FIXED}, u"Lewis Carroll")
	179	... ]
	180	True
	181
	182
	183	Improperly closed paragraphs
	184	----------------------------
	185
	186	The decoder can deal with various cases of improperly format=flowed
	187	messages. Paragraphs normally end with a fixed line, but the following
	188	cases are also considered paragraph-closing cases:
	189
	190	- A change in quotedepth:
	191
	192	>>> result = decoder.decode(CRLF.join((
	193	... "> Depth one paragraph with flow space. ",
	194	... ">> Depth two paragraph with flow space. ",
	195	... "Depth zero paragraph with fixed line.")))
	196	>>> list(result) == [
	197	... ({'quotedepth': 1, 'type': PARAGRAPH},
	198	... u"Depth one paragraph with flow space. "),
	199	... ({'quotedepth': 2, 'type': PARAGRAPH},
	200	... u"Depth two paragraph with flow space. "),
	201	... ({'quotedepth': 0, 'type': FIXED},
	202	... u"Depth zero paragraph with fixed line.")]
	203	True
	204
	205	- A signature separator:
	206
	207	>>> result = decoder.decode(CRLF.join((
	208	... "A paragraph with flow space. ",
	209	... "-- ")))
	210	>>> list(result) == [
	211	... ({'quotedepth': 0, 'type': PARAGRAPH},
	212	... u"A paragraph with flow space. "),
	213	... ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- ")]
	214	True
	215
	216	- The end of the message:
	217
	218	>>> result = decoder.decode(CRLF.join((
	219	... "A paragraph with flow space. ",)))
	220	>>> list(result) == [
	221	... ({'quotedepth': 0, 'type': PARAGRAPH},
	222	... u"A paragraph with flow space. ")]
	223	True
	224
	225
	226	Decoder options
	227	---------------
	228
	229	The delete_space attribute of the FormatFlowedDecoder class can be used
	230	to control wether or not the trailing space on flowed lines should be
	231	retained; this is used to encode flowed text where spaces are rare:
	232
	233	>>> decoder = FormatFlowedDecoder(delete_space=True)
	234	>>> result = decoder.decode(CRLF.join((
	235	... "Contrived example with a word- ",
	236	... "break across the paragraph.")))
	237	>>> list(result) == [
	238	... ({'quotedepth': 0, 'type': PARAGRAPH},
	239	... u'Contrived example with a word-break across the '
	240	... u'paragraph.')]
	241	True
	242
	243	Note that the characterset determines what how to interpret a space
	244	and a quote marker. The cp037 characterset does not encode these
	245	characters the same way, for example:
	246
	247	>>> decoder = FormatFlowedDecoder(character_set='cp037')
	248	>>> result = decoder.decode(CRLF.join((
	249	... "n@\\xe3\\x88\\x89\\xa2@\\x89\\xa2@\\x81@\\x98\\xa4\\x96\\xa3"
	250	... "\\x85\\x84@\\x97\\x81\\x99\\x81\\x87\\x99\\x81\\x97\\x88@",
	251	... "n@\\x85\\x95\\x83\\x96\\x84\\x85\\x84@\\x89\\x95@\\x83\\x97"
	252	... "\\xf0\\xf3\\xf7K")))
	253	>>> list(result) == [
	254	... ({'quotedepth': 1, 'type': PARAGRAPH},
	255	... u'This is a quoted paragraph encoded in cp037.')]
	256	True
	257
	258	"""
	259	para = u''
	260	pinfo = {'type': PARAGRAPH}
	261	for line in flowed.split('\r\n'):
	262	line = line.decode(self.character_set)
	263	quotedepth, line = self._stripquotes(line)
	264	line = self._stripstuffing(line)
	265	if line == '-- ':
	266	# signature separator
	267	if para:
	268	# exception case: flowed line followed by sig-sep
	269	yield (pinfo, para)
	270	pinfo = {'type': PARAGRAPH}
	271	para = u''
	272	yield ({'type': SIGNATURE_SEPARATOR,
	273	'quotedepth': quotedepth}, line)
	274	continue
	275	if line.endswith(u' '):
	276	# flowed line; collect into a paragraph
	277	if quotedepth != pinfo.get('quotedepth', quotedepth):
	278	# exception case: flowed line followed by quotedepth change
	279	yield (pinfo, para)
	280	pinfo = {'type': PARAGRAPH}
	281	para = u''
	282	para += self._stripflow(line)
	283	pinfo['quotedepth'] = quotedepth
	284	continue
	285	# fixed line
	286	if para:
	287	# completed paragraph
	288	if quotedepth != pinfo.get('quotedepth', quotedepth):
	289	# exception case: flowed line followed by quotedepth change
	290	yield (pinfo, para)
	291	pinfo = {'type': PARAGRAPH}
	292	para = u''
	293	else:
	294	yield (pinfo, para + line)
	295	pinfo = {'type': PARAGRAPH}
	296	para = u''
	297	continue
	298	yield ({'type': FIXED, 'quotedepth': quotedepth}, line)
	299
	300	if para:
	301	# exception case: last line was a flowed line
	302	yield (pinfo, para)
	303
	304
	305	class FormatFlowedEncoder:
	306	"""Object to generate format=flowed text
	307
	308	The following attributes influence the flowed formatting of text:
	309	extra_space (default: False)
	310	Use an extra space to create flowed lines; this requires that the
	311	DelSpace flag will be set true on the Content-Type mime header. Use
	312	this flag on texts that have little or no spaces to break on.
	313	character_set (default: us-ascii)
	314	Encode the output to this character set.
	315	spacestuff_quoted (default: True)
	316	Always spacestuff quoted chunks, i.e. place a space between the quote
	317	markers and the text.
	318	width (default: 78)
	319	The maximum line width generated for flowed paragraphs; fixed lines
	320	can still exceed this width. This value does not include the CRLF
	321	line endings.
	322
	323	"""
	324	def __init__(self, extra_space=False, character_set='us-ascii',
	325	spacestuff_quoted=True, width=78):
	326	self.extra_space = extra_space
	327	self.character_set = character_set
	328	self.spacestuff_quoted = spacestuff_quoted
	329	self.width = width
	330
	331	def _spacestuff(self, line, force=False):
	332	"""Prepend a space to lines starting with ' ', '>' or 'From'
	333
	334	Returns the altered line. Set 'force' to True to skip the tests and
	335	always prepend the space regardless:
	336
	337	>>> encoder = FormatFlowedEncoder()
	338	>>> encoder._spacestuff(u' leading space needs to be preserved')
	339	u' leading space needs to be preserved'
	340	>>> encoder._spacestuff(u'> can be confused for a quotemark')
	341	u' > can be confused for a quotemark'
	342	>>> encoder._spacestuff(u'From is often escaped by MTAs')
	343	u' From is often escaped by MTAs'
	344	>>> encoder._spacestuff(u'Padding is considered harmless')
	345	u'Padding is considered harmless'
	346	>>> encoder._spacestuff(u'So forcing it is fine', True)
	347	u' So forcing it is fine'
	348
	349	Note that empty lines can never be spacestuffed:
	350
	351	>>> encoder._spacestuff(u'')
	352	u''
	353
	354	"""
	355	if not line:
	356	return line
	357	# Although the RFC doesn't say so explicitly, in practice 'From' only
	358	# needs escaping when (1) not quoted and (2) actually encoded as
	359	# 'From' (so independent of the unicode sequence u'From').
	360	# For simplicity's sake, we spacestuff it any time a line starts with
	361	# it before adding quotemarks and encoding the line.
	362	if force or line[0] in (' ', '>') or line.startswith('From'):
	363	return u' ' + line
	364	return line
	365
	366	# -- Public API ----------------------------------------------------
	367
	368	def encode(self, chunks):
	369	"""Encode chunks of text to format=flowed
	370
	371	chunks
	372	An iterable sequence of (information, text) tuples, where information
	373	is a dictionary with 'type' and 'quotedepth' keys. The 'type' value
	374	is one of PARAGRAPH, FIXED or SIGNATURE-SEPARATOR, and the
	375	'quotedepth' value a positive integer indicating the quoting depth.
	376	text should be the unicode text to be encoded.
	377
	378	Example
	379	-------
	380
	381	To illustrate, an example:
	382
	383	>>> chunks = (
	384	... ({'quotedepth': 2, 'type': PARAGRAPH},
	385	... u"`Take some more tea,' the March Hare said to Alice, "
	386	... u"very earnestly."),
	387	... ({'quotedepth': 1, 'type': FIXED}, u""),
	388	... ({'quotedepth': 1, 'type': PARAGRAPH},
	389	... u"`I've had nothing yet,' Alice replied in an offended "
	390	... u"tone, `so I can't take more.'"),
	391	... ({'quotedepth': 0, 'type': FIXED}, u""),
	392	... ({'quotedepth': 0, 'type': PARAGRAPH},
	393	... u"`You mean you can't take less,' said the Hatter: `it's "
	394	... u"very easy to take more than nothing.'"),
	395	... ({'quotedepth': 0, 'type': FIXED}, u""),
	396	... ({'quotedepth': 0, 'type': SIGNATURE_SEPARATOR}, u"-- "),
	397	... ({'quotedepth': 0, 'type': PARAGRAPH}, u"Carol Lewis"),
	398	... )
	399	>>> result = FormatFlowedEncoder(width=45).encode(chunks)
	400	>>> result.split('\\r\\n') == [
	401	... ">> `Take some more tea,' the March Hare said ",
	402	... ">> to Alice, very earnestly.",
	403	... ">",
	404	... "> `I've had nothing yet,' Alice replied in ",
	405	... "> an offended tone, `so I can't take more.'",
	406	... "",
	407	... "`You mean you can't take less,' said the ",
	408	... "Hatter: `it's very easy to take more than ",
	409	... "nothing.'",
	410	... "",
	411	... "-- ",
	412	... "Carol Lewis",
	413	... ""]
	414	True
	415
	416	"""
	417	encoded = []
	418	for info, text in chunks:
	419	encoded.append(self.encodeChunk(text, **info))
	420	return ''.join(encoded)
	421
	422	def encodeChunk(self, chunk, type=PARAGRAPH, quotedepth=0):
	423	"""Encode a chunk of text to format=flowed
	424
	425	The chunk is encoded to format=flowed text, controlled by the
	426	following arguments.
	427	chunk
	428	The unicode text to be encoded. Newlines are considered to be
	429	whitespace and will be converted to spaces.
	430	type (default: PARAGRAPH)
	431	Chunk type; one of PARAGRAPH, FIXED or SIGNATURE_SEPARATOR. When
	432	called with type SIGNATURE_SEPARATOR the chunk is ignored and '-- '
	433	is written out.
	434	quotedepth (default: 0)
	435	The quote depth of the chunk.
	436
	437
	438	Examples
	439	--------
	440
	441	The encoder has to deal with three types of text chunks. To illustrate,
	442	we create a encoder instance geared:
	443
	444	>>> encoder = FormatFlowedEncoder(width=45)
	445
	446	We can then use this encoder to encode some examples of these different
	447	types:
	448
	449	- fixed lines:
	450
	451	>>> encoder.encodeChunk(u'A fixed line remains unaltered', FIXED)
	452	'A fixed line remains unaltered\\r\\n'
	453	>>> encoder.encodeChunk(u'Although quoting is prepended', FIXED, 2)
	454	'>> Although quoting is prepended\\r\\n'
	455	>>> encoder.encodeChunk(u'Trailing spaces are removed ', FIXED)
	456	'Trailing spaces are removed\\r\\n'
	457	>>> encoder.encodeChunk(u'> and special first chars are fluffed',
	458	... FIXED)
	459	' > and special first chars are fluffed\\r\\n'
	460
	461	- a paragraph (the default type):
	462
	463	>>> result = encoder.encodeChunk(
	464	... u"`Take some more tea,' the March Hare said to Alice, "
	465	... u"very earnestly.")
	466	>>> result == ("`Take some more tea,' the March Hare said \\r\\n"
	467	... "to Alice, very earnestly.\\r\\n")
	468	True
	469	>>> result = encoder.encodeChunk(
	470	... u"`I've had nothing yet,' Alice replied in an offended "
	471	... u"tone, `so I can't take more.'", PARAGRAPH, 1)
	472	>>> result == ("> `I've had nothing yet,' Alice replied in \\r\\n"
	473	... "> an offended tone, `so I can't take more.'\\r\\n")
	474	True
	475	>>> result = encoder.encodeChunk(
	476	... u'The wrapping deals quite well with > eratic '
	477	... u'spacing and space fluffs characters where needed.')
	478	>>> result == ("The wrapping deals quite well with \\r\\n"
	479	... " > eratic spacing and space fluffs \\r\\n"
	480	... "characters where needed.\\r\\n")
	481	True
	482
	483	- signature separators:
	484
	485	>>> encoder.encodeChunk(u'-- ', SIGNATURE_SEPARATOR)
	486	'-- \\r\\n'
	487	>>> encoder.encodeChunk(u'-- ', SIGNATURE_SEPARATOR, 3)
	488	'>>> -- \\r\\n'
	489
	490	Note that the actual chunk value is ignored for this type:
	491
	492	>>> encoder.encodeChunk(u'foobar', SIGNATURE_SEPARATOR)
	493	'-- \\r\\n'
	494
	495
	496	Encoder options
	497	---------------
	498
	499	The encoding can be influenced by several instance attributes; the
	500	width attribute was used for the paragraph demonstrations. Others
	501	include 'extra_space', 'character_set' and 'spacestuff_quoted':
	502
	503	- extra_space generates extra spaces on flowed lines so flowed lines
	504	can be broken on something other than whitespace:
	505
	506	>>> encoder = FormatFlowedEncoder(extra_space=True, width=45)
	507	>>> result = encoder.encodeChunk(
	508	... u'This is useful for texts with many word-breaks or few '
	509	... u'spaces')
	510	>>> result == ("This is useful for texts with many word- \\r\\n"
	511	... "breaks or few spaces\\r\\n")
	512	True
	513
	514	- character_set controls the output encoding:
	515
	516	>>> encoder = FormatFlowedEncoder(character_set='cp037')
	517	>>> result = encoder.encodeChunk(u'Can you read me now?',
	518	... quotedepth=1)
	519	>>> result == ('n@\\xc3\\x81\\x95@\\xa8\\x96\\xa4@\\x99\\x85\\x81'
	520	... '\\x84@\\x94\\x85@\\x95\x96\\xa6o\\r\\n')
	521	True
	522
	523	- spacestuff_quoted causes quoted lines to be spacestuffed by default;
	524	this makes for slightly more readable quoted text output. It is on
	525	by default, but can be switched off:
	526
	527	>>> encoder = FormatFlowedEncoder(spacestuff_quoted=False)
	528	>>> encoder.encodeChunk(u'Look Ma! No space!', quotedepth=1)
	529	'>Look Ma! No space!\\r\\n'
	530
	531
	532	RFC 2822 compliance
	533	-------------------
	534
	535	Note that RFC 2822 requires that generated lines never exceed the
	536	hard limit of 998 characters without the CRLF at the end. The encoder
	537	has to enforce this by chopping the lines up into pieces not exceeding
	538	that length:
	539
	540	>>> encoder = FormatFlowedEncoder()
	541	>>> result = encoder.encodeChunk(u'-' * 1500, FIXED)
	542	>>> result = result.split('\\r\\n')
	543	>>> len(result)
	544	3
	545	>>> len(result[0])
	546	998
	547	>>> result == ['-' * 998, '-' * 502, '']
	548	True
	549
	550	"""
	551	# cleanup: replace newlines with spaces and remove trailing spaces
	552	chunk = ' '.join(chunk.rstrip().splitlines())
	553
	554	# Pre-encode quoting
	555	quotemarker = u'>' * quotedepth
	556	quotemarker = quotemarker.encode(self.character_set)
	557	forcestuff = self.spacestuff_quoted and quotedepth > 0
	558
	559	if type == SIGNATURE_SEPARATOR:
	560	chunk = u'-- '
	561
	562	if type == PARAGRAPH:
	563	# Maximum width is reduced by stuffing and quotemarkers
	564	width = self.width - len(quotemarker) - 2
	565	if width <= 0:
	566	raise ValueError('Not enough width for both quoting and text')
	567	wrapper = _FlowedTextWrapper(width, self.extra_space)
	568	chunk = wrapper.wrap(chunk)
	569	else:
	570	chunk = [chunk]
	571
	572	lines = []
	573	for line in chunk:
	574	# add space to flowed lines (all but last); this is an extra space
	575	# if the wrapping of paragraphs included spaces at the end of the
	576	# lines.
	577	if line != chunk[-1]:
	578	line += ' '
	579	line = self._spacestuff(line, forcestuff)
	580	line = quotemarker + line.encode(self.character_set)
	581
	582	# Enforce a hard limit of 998 characters per line (excluding CRLF)
	583	# Unfortunately we can only enforce this after encoding,
	584	# otherwise we could flow lines that are too long.
	585	while len(line) > 998:
	586	lines.append(line[:998])
	587	line = line[998:]
	588
	589	lines.append(line)
	590
	591	lines.append('') # ensure last ending CRLF
	592	return '\r\n'.join(lines)
	593
	594
	595	# -- Convenience functions ---------------------------------------------
	596
	597
	598	def decode(flowed, **kwargs):
	599	"""Convert format=flowed text
	600
	601	See the FormatFlowedDecoder.decode docstring for more information. All
	602	keyword arguments are passed to the FormatFlowedDecoder instance.
	603
	604	"""
	605	decoder = FormatFlowedDecoder(**kwargs)
	606	return decoder.decode(flowed)
	607
	608	def encode(chunks, **kwargs):
	609	"""Convert chunks of text to format=flowed
	610
	611	See the FormatFlowedEncoder.encode docstring for more information. All
	612	keyword arguments are passed to the FormatFlowedEncoder instance.
	613
	614	"""
	615	encoder = FormatFlowedEncoder(**kwargs)
	616	return encoder.encode(chunks)
	617
	618	def convertToWrapped(flowed, width=78, quote=u'>', wrap_fixed=True, **kwargs):
	619	"""Covert flowed text to encoded and wrapped text
	620
	621	Create text suitable for a proportional font, fixed with, plain text
	622	display. The argements are interpreted as follows:
	623	flowed
	624	The format=flowed formatted text to convert
	625	width (default: 78)
	626	The maximum line length at which to wrap paragraphs.
	627	quote (default: u'>')
	628	Character sequence to use to mark quote depths; it is multiplied with
	629	the quotedepth to quote a line. If this sequence does not end in a
	630	space a space is added between the quotemars and the line.
	631	wrap_fixed (default: True)
	632	If true, fixed text chunks are wrapped to the given width as well,
	633	including hard word breaks if a word exceeds the line width
	634
	635	The remaining arguments are used as arguments to FormatFlowedDecoder.
	636
	637	Here is a simple example:
	638
	639	>>> CRLF = '\\r\\n'
	640	>>> result = convertToWrapped(CRLF.join((
	641	... ">> `Take some more tea,' the March Hare said to Alice, very ",
	642	... ">> earnestly.",
	643	... ">",
	644	... "> `I've had nothing yet,' Alice replied in an offended ",
	645	... "> tone, `so I can't take more.'",
	646	... "",
	647	... "`You mean you can't take less,' said the Hatter: `it's very ",
	648	... "easy to take more than nothing.'",
	649	... "",
	650	... "-- ",
	651	... "Lewis Caroll")), width=60)
	652	>>> result.split('\\n') == [
	653	... ">> `Take some more tea,' the March Hare said to Alice, very",
	654	... ">> earnestly.",
	655	... "> ",
	656	... "> `I've had nothing yet,' Alice replied in an offended tone,",
	657	... "> `so I can't take more.'",
	658	... "",
	659	... "`You mean you can't take less,' said the Hatter: `it's very",
	660	... "easy to take more than nothing.'",
	661	... "",
	662	... "-- ",
	663	... "Lewis Caroll"]
	664	True
	665
	666	"""
	667	result = []
	668	for info, chunk in decode(flowed, **kwargs):
	669	type = info['type']
	670	quotedepth = info['quotedepth']
	671	quotemarker = quotedepth and quote * quotedepth or u''
	672	if quotemarker and quote[-1] != u' ':
	673	quotemarker += u' '
	674	if type == FIXED and not wrap_fixed:
	675	result.append(quotemarker + chunk)
	676	elif not chunk or type == SIGNATURE_SEPARATOR:
	677	result.append(quotemarker + chunk)
	678	else:
	679	result.extend(textwrap.wrap(chunk, width,
	680	replace_whitespace=False,
	681	initial_indent=quotemarker,
	682	subsequent_indent=quotemarker))
	683	return u'\n'.join(result)
	684
	685	def convertToFlowed(text, quotechars=u'>\|%', **kwargs):
	686	"""Convert plain text to format=flowed
	687
	688	Attempt to interpret the plain text as paragraphs and fixed lines,
	689	creating a format=flowed encoded text. The paragraph detection is fairly
	690	simple and probably not suitable for real-world email.
	691
	692	text
	693	Unicode text to be converted. Paragraphs are detected based on
	694	whitelines between them, making all lines with extra linespace at the
	695	start fixed to preserve that whitespace.
	696	quotechars (default: u'>\|%')
	697	A set of characters recognized as quote markers; used to detect quote
	698	depth.
	699
	700	Additional kwargs are passed on to FormatFlowedEncoder.
	701
	702	"""
	703	encoder = FormatFlowedEncoder(**kwargs)
	704	return encoder.encode(_parseFlowableChunks(text, quotechars))
	705
	706
	707	# -- Private classes and methods ---------------------------------------
	708
	709
	710	class _FlowedTextWrapper(textwrap.TextWrapper):
	711	"""Custom text wrapper for flowed text
	712
	713	When not using extra spaces, only break on spaces; when we are using
	714	extra spaces, don't swallow whitespace at the start and end of lines, but
	715	do break long words (as they can be reconstructed with DelSpace on).
	716
	717	"""
	718	def __init__(self, width=78, extra_space=False):
	719	textwrap.TextWrapper.__init__(self, width,
	720	break_long_words=extra_space)
	721	self.extra_space = extra_space
	722	if not extra_space:
	723	self.wordsep_re = re.compile(r'(\s+)')
	724
	725	def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
	726	# _handle_long_word taken from python 2.5 CVS speed optimisation
	727	# Can be removed if this is used with python 2.5
	728	space_left = max(width - cur_len, 1)
	729	if self.break_long_words:
	730	cur_line.append(reversed_chunks[-1][:space_left])
	731	reversed_chunks[-1] = reversed_chunks[-1][space_left:]
	732	elif not cur_line:
	733	cur_line.append(reversed_chunks.pop())
	734
	735	def _wrap(self, chunks):
	736	# Simplified and customized version of textwrap.TextWrapper
	737	# Based on textwrapper rev. 1.37 in python CVS, with speed optimisation
	738	lines = []
	739	chunks.reverse()
	740	while chunks:
	741	cur_line = []
	742	cur_len = 0
	743	width = self.width
	744
	745	# Don't strip space at the start of a line when using extra_space
	746	# because spaces are significant there.
	747	if chunks[-1].strip() == '' and lines and not self.extra_space:
	748	del chunks[-1]
	749
	750	while chunks:
	751	l = len(chunks[-1])
	752	if cur_len + l <= width:
	753	cur_line.append(chunks.pop())
	754	cur_len += l
	755	else:
	756	break
	757
	758	if chunks and len(chunks[-1]) > width:
	759	self._handle_long_word(chunks, cur_line, cur_len, width)
	760
	761	# Don't drop space at end of line if using extra_space for
	762	# marking flowed lines because otherwise there is no space between
	763	# this line and the next when decoding the flowed text
	764	if cur_line and cur_line[-1].strip() == '' and not self.extra_space:
	765	del cur_line[-1]
	766
	767	if cur_line:
	768	lines.append(''.join(cur_line))
	769	return lines
	770
	771
	772	def _parseFlowableChunks(text, quotechars='>%\|'):
	773	"""Parse out encodeble chunks, determining chunk type
	774
	775	First step is to remove and count quoting marks, determining the quotedepth
	776	of the text. Then the type of the lines is detected.
	777
	778	Paragraphs are determined by terminating lines; terminating lines are
	779	changes in quoting (depth or quoting used, signatures or fixed lines (see
	780	below)
	781
	782	Fixed lines are used for lines with nothing but whitespace and for lines
	783	with whitespace prepended (indented lines).
	784
	785	Any line with only two dashes at the start and whitespace is a signature
	786	seperator.
	787
	788	Example code:
	789
	790	>>> result = _parseFlowableChunks(u'\\n'.join((
	791	... u'Normal text, as long as they are not delimited by empty ',
	792	... u'lines will be considered paragraphs and will be parsed as ',
	793	... u'such.',
	794	... u'',
	795	... u'> > Quoting will be detected as well, and as long as it is ',
	796	... u'> > consistent text will be collected into one paragraph.',
	797	... u'> Changes in depth trigger a new paragraph.',
	798	... u'> Leading whitespace makes for fixed lines.',
	799	... u'Signature separators are dealt with accordingly:',
	800	... u'-- '
	801	... )))
	802	>>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 0},
	803	... u'Normal text, as long as they are not delimited by empty '
	804	... u'lines will be considered paragraphs and will be parsed as '
	805	... u'such.')
	806	True
	807	>>> result.next() == ({'type': FIXED, 'quotedepth': 0}, u'')
	808	True
	809	>>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 2},
	810	... u'Quoting will be detected as well, and as long as it is '
	811	... u'consistent text will be collected into one paragraph.')
	812	True
	813	>>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 1},
	814	... u'Changes in depth trigger a new paragraph.')
	815	True
	816	>>> result.next() == ({'type': FIXED, 'quotedepth': 1},
	817	... u' Leading whitespace makes for fixed lines.')
	818	True
	819	>>> result.next() == ({'type': PARAGRAPH, 'quotedepth': 0},
	820	... u'Signature separators are dealt with accordingly:')
	821	True
	822	>>> result.next() == ({'type': SIGNATURE_SEPARATOR, 'quotedepth': 0},
	823	... u'-- ')
	824	True
	825	>>> result.next()
	826	Traceback (most recent call last):
	827	...
	828	StopIteration
	829
	830	"""
	831	# Match quotemarks with limited whitespace around them
	832	qm_match = re.compile('(^\s{0,2}([%s]\s?)+)' % quotechars).match
	833	# Find all quotemarks
	834	qm_findall = re.compile('[%s]' % quotechars).findall
	835
	836	quotedepth = 0
	837	quotemarks = ''
	838	para = u''
	839
	840	for line in text.splitlines():
	841	has_quotes = qm_match(line)
	842	same_quotes = quotemarks and line.startswith(quotemarks)
	843	if (has_quotes and not same_quotes) or (not has_quotes and quotedepth):
	844	# Change in quoting
	845	if para:
	846	yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
	847	para = u''
	848
	849	quotemarks = has_quotes and has_quotes.group(0) or u''
	850	quotedepth = len(qm_findall(quotemarks))
	851
	852	line = line[len(quotemarks):]
	853
	854	if line.rstrip() == u'--':
	855	# signature separator
	856	if para:
	857	yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
	858	para = u''
	859
	860	yield {'type': SIGNATURE_SEPARATOR, 'quotedepth': quotedepth}, line
	861	continue
	862
	863	if line.strip() == u'' or line.lstrip() != line:
	864	# Fixed line
	865	if para:
	866	yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
	867	para = u''
	868
	869	yield {'type': FIXED, 'quotedepth': quotedepth}, line
	870	continue
	871
	872	# Paragraph line; store and loop to next line
	873	para += line
	874
	875	if para:
	876	yield {'type': PARAGRAPH, 'quotedepth': quotedepth}, para
	877
	878	def _test(verbose=False):
	879	import doctest
	880	return doctest.testmod(verbose=verbose)
	881
	882	if __name__ == '__main__':
	883	import sys
	884	_test('-v' in sys.argv)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: