pyzmail.parse

#
# pyzmail/parse.py
# (c) Alain Spineux <alain.spineux@gmail.com>
# http://www.magiksys.net/pyzmail
# Released under LGPL

"""
Useful functions to parse emails

@var email_address_re: a regex that matches well formed email addresses (from perlfaq9)
@undocumented: atom_rfc2822
@undocumented: atom_posfix_restricted
@undocumented: atom
@undocumented: dot_atom
@undocumented: local
@undocumented: domain_lit
@undocumented: domain
@undocumented: addr_spec
"""

import re
import StringIO
import sys
import email
import email.errors
import email.header
import email.message
import email.utils
import mimetypes

from utils import *

# sys and email.utils are imported explicitly because functions of this module
# use them; they must not depend on what other imports happen to expose.

# email address REGEX matching the RFC 2822 spec from perlfaq9
#    my $atom       = qr{[a-zA-Z0-9_!#\$\%&'*+/=?\^`{}~|\-]+};
#    my $dot_atom   = qr{$atom(?:\.$atom)*};
#    my $quoted     = qr{"(?:\\[^\r\n]|[^\\"])*"};
#    my $local      = qr{(?:$dot_atom|$quoted)};
#    my $domain_lit = qr{\[(?:\\\S|[\x21-\x5a\x5e-\x7e])*\]};
#    my $domain     = qr{(?:$dot_atom|$domain_lit)};
#    my $addr_spec  = qr{$local\@$domain};
#
# Python's translation
atom_rfc2822=r"[a-zA-Z0-9_!#\$\%&'*+/=?\^`{}~|\-]+"
atom_posfix_restricted=r"[a-zA-Z0-9_#\$&'*+/=?\^`{}~|\-]+" # without '!' and '%'
atom=atom_rfc2822
dot_atom=atom + r"(?:\." + atom + ")*"
quoted=r'"(?:\\[^\r\n]|[^\\"])*"'
local="(?:" + dot_atom + "|" + quoted + ")"
domain_lit=r"\[(?:\\\S|[\x21-\x5a\x5e-\x7e])*\]"
domain="(?:" + dot_atom + "|" + domain_lit + ")"
# raw string: same two characters as before, without the invalid "\@" escape
addr_spec=local + r"\@" + domain
# and the result
email_address_re=re.compile('^'+addr_spec+'$')

class MailPart:
    """
    Data related to a mail part (aka message content, attachment or
    embedded content in an email)

    @type charset: str or None
    @ivar charset: the encoding of the I{get_payload()} content if I{type} is 'text/*'
        and charset has been specified in the message
    @type content_id: str or None
    @ivar content_id: the MIME Content-ID if specified in the message
        (stored without the surrounding '<' and '>').
    @type description: str or None
    @ivar description: the MIME Content-Description if specified in the message.
    @type disposition: str or None
    @ivar disposition: C{None}, C{'inline'} or C{'attachment'} depending on
        the MIME Content-Disposition value
    @type filename: unicode or None
    @ivar filename: the name of the file, if specified in the message.
    @type part: inherit from email.mime.base.MIMEBase
    @ivar part: the related part inside the message.
    @type is_body: str or None
    @ivar is_body: None if this part is not the mail content itself (an
        attachment or embedded content), C{'text/plain'} if this part is the
        text content or C{'text/html'} if this part is the HTML version.
    @type sanitized_filename: str or None
    @ivar sanitized_filename: This field is filled by L{PyzMessage} to store
        a valid unique filename related or not with the original filename.
    @type type: str
    @ivar type: the MIME type, like 'text/plain', 'image/png', 'application/msword' ...
    """

    def __init__(self, part, filename=None, type=None, charset=None, content_id=None, description=None, disposition=None, sanitized_filename=None, is_body=None):
        """
        Create a mail part and initialize all attributes
        """
        self.part = part                    # original python part
        self.filename = filename            # filename in unicode (if any)
        self.type = type                    # the mime-type
        self.charset = charset              # the charset (if any)
        self.description = description      # if any
        self.disposition = disposition      # 'inline', 'attachment' or None
        self.sanitized_filename = sanitized_filename  # cleanup your filename here (TODO)
        self.is_body = is_body              # usually in (None, 'text/plain' or 'text/html')
        self.content_id = content_id        # if any
        if self.content_id:
            # strip '<>' to ease search and replace in "root" content (TODO)
            if self.content_id.startswith('<') and self.content_id.endswith('>'):
                self.content_id = self.content_id[1:-1]

    def get_payload(self):
        """
        decode and return part payload. if I{type} is 'text/*' and I{charset}
        not C{None}, be careful to take care of the text encoding. Use
        something like C{part.get_payload().decode(part.charset)}

        For 'message/*' parts the whole part is flattened back to its source
        form. For other parts this returns whatever
        C{part.get_payload(decode=True)} returns, which can be C{None}.
        """
        if not self.type.startswith('message/'):
            return self.part.get_payload(decode=True)

        # I don't use msg.as_string() because I want to use mangle_from_=False
        if sys.version_info < (3, 0):
            # python 2.x
            from email.generator import Generator
            fp = StringIO.StringIO()
            g = Generator(fp, mangle_from_=False)
        else:
            # support only for python >= 3.2
            from email.generator import BytesGenerator
            import io
            fp = io.BytesIO()
            g = BytesGenerator(fp, mangle_from_=False)
        g.flatten(self.part, unixfrom=False)
        return fp.getvalue()

    def __repr__(self):
        """
        Short description of the part; a leading '*' marks the mail body.
        """
        pieces = [u'MailPart<']
        if self.is_body:
            pieces.append(u'*')
        pieces.append(self.type)
        if self.charset:
            pieces.append(' charset=' + self.charset)
        if self.filename:
            pieces.append(' filename=' + self.filename)
        if self.content_id:
            pieces.append(' content_id=' + self.content_id)
        payload = self.get_payload()
        # get_payload() can return None; report a length of 0 instead of
        # letting repr() raise a TypeError.
        pieces.append(' len=%d' % (0 if payload is None else len(payload), ))
        pieces.append(u'>')
        return u''.join(pieces)

145 146 147 148 _line_end_re=re.compile('\r\n|\n\r|\n|\r')

149 150 -def _friendly_header(header):

151 """ 152 Convert header returned by C{email.message.Message.get()} into a 153 user friendly string. 154 155 Py3k C{email.message.Message.get()} return C{header.Header()} with charset 156 set to C{charset.UNKNOWN8BIT} when the header contains invalid characters, 157 else it return I{str} as Python 2.X does 158 159 @type header: str or email.header.Header 160 @param header: the header to convert into a user friendly string 161 162 @rtype: str 163 @returns: the converter header 164 """ 165 166 save=header 167 if isinstance(header, email.header.Header): 168 header=str(header) 169 170 return re.sub(_line_end_re, ' ', header)

171

def decode_mail_header(value, default_charset='us-ascii'):
    """
    Decode a header value into a unicode string.
    Works like a smarter version of
    C{u"".join(email.header.decode_header())}

    @type value: str
    @param value: the value of the header.
    @type default_charset: str
    @keyword default_charset: if one charset used in the header (multiple charset
        can be mixed) is unknown, then use this charset instead.

    >>> decode_mail_header('=?iso-8859-1?q?Courrier_=E8lectronique_en_Fran=E7ais?=')
    u'Courrier \\xe8lectronique en Fran\\xe7ais'
    """
    try:
        chunks = email.header.decode_header(value)
    except email.errors.HeaderParseError:
        # the header cannot be parsed (this can happen in the base64 decoder):
        # return a sanitized ascii version of the raw value instead
        return value.encode('us-ascii', 'replace').decode('us-ascii')

    fragments = []
    for raw, codec in chunks:
        # On Python 3 a chunk without charset may be a str instead of bytes;
        # force it to an ascii byte string so the decode below always applies.
        if codec is None and sys.version_info >= (3, 0) and isinstance(raw, str):
            raw = raw.encode('us-ascii', 'replace')
        try:
            decoded = raw.decode(codec or 'us-ascii', 'replace')
        except LookupError:
            # unknown charset: fall back on the default one
            decoded = raw.decode(default_charset, 'replace')
        fragments.append(decoded)

    return u"".join(fragments)

222

def get_mail_addresses(message, header_name):
    """
    Collect every email address found in one header of a message.

    An address that is not pure us-ascii or does not match
    L{email_address_re} is replaced by an empty string. When an entry has
    no display name, the address string is used as the name too.

    @type message: email.message.Message
    @param message: the email message
    @type header_name: str
    @param header_name: the name of the header, can be 'from', 'to', 'cc' or
        any other header containing one or more email addresses
    @rtype: list
    @returns: a list of the addresses in the form of tuples
        C{[(u'Name', 'addresse@domain.com'), ...]}

    >>> import email
    >>> import email.utils
    >>> import email.mime.text
    >>> msg=email.mime.text.MIMEText('The text.', 'plain', 'us-ascii')
    >>> msg['From']=email.utils.formataddr(('Me', 'me@foo.com'))
    >>> msg['To']=email.utils.formataddr(('A', 'a@foo.com'))+', '+email.utils.formataddr(('B', 'b@foo.com'))
    >>> get_mail_addresses(msg, 'from')
    [(u'Me', 'me@foo.com')]
    >>> get_mail_addresses(msg, 'to')
    [(u'A', 'a@foo.com'), (u'B', 'b@foo.com')]
    """
    header_values = [_friendly_header(h) for h in message.get_all(header_name, [])]
    result = []
    for display_name, address in email.utils.getaddresses(header_values):
        if address and not display_name:
            # only one string! Is it the address or the address name ?
            # use the same for both and see later
            display_name = address

        # address must be ascii only and must match address regex
        if not (is_usascii(address) and email_address_re.match(address)):
            address = ''
        result.append((decode_mail_header(display_name), address))
    return result

269

def get_filename(part):
    """
    Find the filename of a mail part. Many MUA send attachments with the
    filename in the I{name} parameter of the I{Content-type} header instead
    of in the I{filename} parameter of the I{Content-Disposition} header,
    so both places are looked at, I{Content-Disposition} first.

    @type part: inherit from email.mime.base.MIMEBase
    @param part: the mail part
    @rtype: None or unicode
    @returns: the filename or None if not found

    >>> import email.mime.image
    >>> attach=email.mime.image.MIMEImage('data', 'png')
    >>> attach.add_header('Content-Disposition', 'attachment', filename='image.png')
    >>> get_filename(attach)
    u'image.png'
    >>> import email.mime.text
    >>> attach=email.mime.text.MIMEText('The text.', 'plain', 'us-ascii')
    >>> attach.add_header('Content-Disposition', 'attachment', filename=('iso-8859-1', 'fr', u'Fran\\xe7ais.txt'.encode('iso-8859-1')))
    >>> get_filename(attach)
    u'Fran\\xe7ais.txt'
    """
    # second lookup uses get_param()'s default header, 'content-type'
    name = (part.get_param('filename', None, 'content-disposition')
            or part.get_param('name', None))
    if not name:
        return name

    if isinstance(name, tuple):
        # RFC 2231 must be used to encode parameters inside MIME header
        return email.utils.collapse_rfc2231_value(name).strip()

    # But a lot of MUA erroneously use RFC 2047 instead of RFC 2231
    return decode_mail_header(name)

320

321 -def _search_message_content(contents, part):

322 """ 323 recursive search of message content (text or HTML) inside 324 the structure of the email. Used by L{search_message_content()} 325 326 @type contents: dict 327 @param contents: contents already found in parents or brothers I{parts}. 328 The dictionary will be completed as and when. key is the MIME type of the part. 329 @type part: inherit email.mime.base.MIMEBase 330 @param part: the part of the mail to look inside recursively. 331 """ 332 333 type=part.get_content_type() 334 if type.startswith('multipart/'): 335 # explore only True 'multipart/*' 336 # because 'messages/rfc822' are 'multipart/*' too but 337 # must not be explored here 338 if type=='multipart/related': 339 # the first part or the one pointed by start 340 start=part.get_param('start', None) 341 related_type=part.get_param('type', None) 342 for i, subpart in enumerate(part.get_payload()): 343 if (not start and i==0) or (start and start==subpart.get('Content-Id')): 344 _search_message_content(contents, subpart) 345 return 346 elif type=='multipart/alternative': 347 # all parts are candidates and latest are the best 348 for subpart in part.get_payload(): 349 _search_message_content(contents, subpart) 350 elif type in ('multipart/report', 'multipart/signed'): 351 # only the first part is candidate 352 try: 353 subpart=part.get_payload()[0] 354 except IndexError: 355 return 356 else: 357 _search_message_content(contents, subpart) 358 return 359 360 elif type=='multipart/encrypted': 361 # the second part is the good one, but we need to de-crypt it 362 # using the first part. 
Do nothing 363 return 364 365 else: 366 # unknown types must be handled as 'multipart/mixed' 367 # This is the peace of code that could probably be improved, 368 # I use a heuristic : if not already found, use first valid non 369 # 'attachment' parts found 370 for subpart in part.get_payload(): 371 tmp_contents=dict() 372 _search_message_content(tmp_contents, subpart) 373 for k, v in tmp_contents.iteritems(): 374 if not subpart.get_param('attachment', None, 'content-disposition')=='': 375 # if not an attachment, initiate value if not already found 376 contents.setdefault(k, v) 377 return 378 else: 379 contents[part.get_content_type().lower()]=part 380 return 381 382 return

383

def search_message_content(mail):
    """
    Look for the message content (text or HTML) inside the structure of
    the mail. This function is used by L{get_mail_parts()} to set the
    C{is_body} attribute of the L{MailPart}s

    @type mail: inherit from email.message.Message
    @param mail: the message to search in.
    @rtype: dict
    @returns: a dictionary of the form C{{'text/plain': text_part, 'text/html': html_part}}
        mapping a MIME type to the part holding that version of the content.
        One part can be missing. The dictionary can even be empty if none of
        the parts match the requirements to be considered as the content.
    """
    found = {}
    # the recursive helper fills the dictionary in place
    _search_message_content(found, mail)
    return found

402

def get_mail_parts(msg):
    """
    return a list of all parts of the message as a list of L{MailPart}.
    Retrieve parts attributes to fill in L{MailPart} object.

    Multipart containers are expanded in depth-first order; 'message/*'
    parts are not explored and are stored as a single part named
    'message.eml'.

    @type msg: inherit email.message.Message
    @param msg: the message
    @rtype: list
    @returns: list of mail parts

    >>> import email.mime.multipart
    >>> msg=email.mime.multipart.MIMEMultipart(boundary='===limit1==')
    >>> import email.mime.text
    >>> txt=email.mime.text.MIMEText('The text.', 'plain', 'us-ascii')
    >>> msg.attach(txt)
    >>> import email.mime.image
    >>> image=email.mime.image.MIMEImage('data', 'png')
    >>> image.add_header('Content-Disposition', 'attachment', filename='image.png')
    >>> msg.attach(image)
    >>> print msg.as_string(unixfrom=False)
    Content-Type: multipart/mixed; boundary="===limit1=="
    MIME-Version: 1.0
    <BLANKLINE>
    --===limit1==
    Content-Type: text/plain; charset="us-ascii"
    MIME-Version: 1.0
    Content-Transfer-Encoding: 7bit
    <BLANKLINE>
    The text.
    --===limit1==
    Content-Type: image/png
    MIME-Version: 1.0
    Content-Transfer-Encoding: base64
    Content-Disposition: attachment; filename="image.png"
    <BLANKLINE>
    ZGF0YQ==
    --===limit1==--
    >>> parts=get_mail_parts(msg)
    >>> parts
    [MailPart<*text/plain charset=us-ascii len=9>, MailPart<image/png filename=image.png len=4>]
    >>> # the star "*" means this is the mail content, not an attachment
    >>> parts[0].get_payload().decode(parts[0].charset)
    u'The text.'
    >>> parts[1].filename, len(parts[1].get_payload())
    (u'image.png', 4)

    """
    mailparts = []

    # retrieve messages of the email and reverse the dict so a part can be
    # looked up to find the MIME type it is the body for
    contents = search_message_content(msg)
    body_types = dict((v, k) for k, v in contents.items())

    # Depth-first traversal: the next part to visit is at the END of the
    # list, so children are pushed in reverse order to keep document order.
    stack = [msg]
    while stack:
        part = stack.pop()
        content_type = part.get_content_type()
        if content_type.startswith('message/'):
            # ('message/delivery-status', 'message/rfc822', 'message/disposition-notification'):
            # don't explore the tree deeper here, keep the part as a whole
            mailparts.append(MailPart(part, filename='message.eml', type=content_type, charset=part.get_param('charset'), description=part.get('Content-Description')))
        elif part.is_multipart():
            stack.extend(reversed(part.get_payload()))
        else:
            # a bare token in Content-Disposition is reported as ''
            disposition = None
            if part.get_param('inline', None, 'content-disposition') == '':
                disposition = 'inline'
            elif part.get_param('attachment', None, 'content-disposition') == '':
                disposition = 'attachment'

            mailparts.append(MailPart(part, filename=get_filename(part), type=content_type, charset=part.get_param('charset'), content_id=part.get('Content-Id'), description=part.get('Content-Description'), disposition=disposition, is_body=body_types.get(part, False)))

    return mailparts

484

def decode_text(payload, charset, default_charset):
    """
    Try to decode text content by trying multiple charset until success.
    First try I{charset}, else try I{default_charset} finally
    try popular charsets in order : ascii, utf-8, utf-16, windows-1252, cp850
    If all fail then use I{default_charset} and replace wrong characters.
    A charset name unknown to Python is skipped like a charset that cannot
    decode the payload.

    @type payload: str
    @param payload: the content to decode
    @type charset: str or None
    @param charset: the first charset to try if != C{None}
    @type default_charset: str or None
    @param default_charset: the second charset to try if != C{None}

    @rtype: tuple
    @returns: a tuple of the form C{(payload, charset)}
        - I{payload}: this is the decoded payload if charset is not None and
            payload is a unicode string
        - I{charset}: the charset that was used to decode I{payload} If charset is
            C{None} then something goes wrong: if I{payload} is unicode then
            invalid characters have been replaced and the used charset is I{default_charset}
            else, if I{payload} is still byte string then nothing has been done.
    """
    candidates = (charset, default_charset, 'ascii', 'utf-8', 'utf-16', 'windows-1252', 'cp850')
    for chset in candidates:
        if not chset:
            continue
        try:
            return payload.decode(chset), chset
        except (UnicodeError, LookupError):
            # LookupError: the charset name is unknown to Python
            pass

    if default_charset:
        try:
            # decode with default_charset itself, not the last candidate tried
            return payload.decode(default_charset, 'replace'), None
        except LookupError:
            pass

    return payload, None

522

class PyzMessage(email.message.Message):
    """
    Inherit from email.message.Message. Combine L{get_mail_parts()},
    L{get_mail_addresses()} and L{decode_mail_header()} into a
    B{convenient} object to access mail contents and attributes.
    This class also B{sanitize} part filenames.

    @type mailparts: list of L{MailPart}
    @ivar mailparts: list of L{MailPart} objects composing the email, I{text_part}
        and I{html_part} are part of this list as are other attachments and embedded
        contents.
    @type text_part: L{MailPart} or None
    @ivar text_part: the L{MailPart} object that contains the I{text}
        version of the message, None if the mail has not I{text} content.
    @type html_part: L{MailPart} or None
    @ivar html_part: the L{MailPart} object that contains the I{HTML}
        version of the message, None if the mail has not I{HTML} content.

    @note: Sample:

    >>> raw='''Content-Type: text/plain; charset="us-ascii"
    ... MIME-Version: 1.0
    ... Content-Transfer-Encoding: 7bit
    ... Subject: The subject
    ... From: Me <me@foo.com>
    ... To: A <a@foo.com>, B <b@foo.com>
    ...
    ... The text.
    ... '''
    >>> msg=PyzMessage.factory(raw)
    >>> print 'Subject: %r' % (msg.get_subject(), )
    Subject: u'The subject'
    >>> print 'From: %r' % (msg.get_address('from'), )
    From: (u'Me', 'me@foo.com')
    >>> print 'To: %r' % (msg.get_addresses('to'), )
    To: [(u'A', 'a@foo.com'), (u'B', 'b@foo.com')]
    >>> print 'Cc: %r' % (msg.get_addresses('cc'), )
    Cc: []
    >>> for mailpart in msg.mailparts:
    ...   print ' %sfilename=%r sanitized_filename=%r type=%s charset=%s desc=%s size=%d' % ('*'if mailpart.is_body else ' ', mailpart.filename, mailpart.sanitized_filename, mailpart.type, mailpart.charset, mailpart.part.get('Content-Description'), 0 if mailpart.get_payload()==None else len(mailpart.get_payload()))
    ...   if mailpart.is_body=='text/plain':
    ...     payload, used_charset=decode_text(mailpart.get_payload(), mailpart.charset, None)
    ...     print ' >', payload.split('\\n')[0]
    ...
     *filename=None sanitized_filename='text.txt' type=text/plain charset=us-ascii desc=None size=10
     > The text.
    """

    @staticmethod
    def smart_parser(input):
        """
        Use the appropriate parser and return a email.message.Message object
        (this is not a L{PyzMessage} object)

        @type input: string, file, bytes, binary_file or email.message.Message
        @param input: the source of the message
        @rtype: email.message.Message
        @returns: the message
        @raise ValueError: if I{input} is none of the supported types
        """
        if isinstance(input, email.message.Message):
            return input

        is_file_like = hasattr(input, 'read') and hasattr(input, 'readline')

        if sys.version_info < (3, 0):
            # python 2.x
            if isinstance(input, basestring):
                return email.message_from_string(input)
            if is_file_like:
                return email.message_from_file(input)
            # call syntax: valid on both Python 2 and Python 3
            raise ValueError('input must be a string, a file or a Message')

        # python 3.x
        if isinstance(input, str):
            return email.message_from_string(input)
        if isinstance(input, bytes):
            # python >= 3.2 only
            return email.message_from_bytes(input)
        if is_file_like:
            # an 'encoding' attribute is taken as the sign of a text file
            if hasattr(input, 'encoding'):
                return email.message_from_file(input)
            # python >= 3.2 only
            return email.message_from_binary_file(input)
        raise ValueError('input must be a string a bytes, a file or a Message')

    @staticmethod
    def factory(input):
        """
        Use the appropriate parser and return a L{PyzMessage} object
        see L{smart_parser}

        @type input: string, file, bytes, binary_file or email.message.Message
        @param input: the source of the message
        @rtype: L{PyzMessage}
        @returns: the L{PyzMessage} message
        """
        return PyzMessage(PyzMessage.smart_parser(input))

    def __init__(self, message):
        """
        Initialize the object with data coming from I{message}.

        @type message: inherit email.message.Message
        @param message: The message
        """
        # take over the state of the parsed message instead of parsing again
        self.__dict__.update(message.__dict__)

        self.mailparts = get_mail_parts(self)
        self.text_part = None
        self.html_part = None

        # lowercased names already given, used to detect collisions
        filenames = []
        for part in self.mailparts:
            ext = mimetypes.guess_extension(part.type)
            if not ext:
                # default to .bin
                ext = '.bin'
            elif ext == '.ksh':
                # guess_extension() is not very accurate, .txt is more
                # appropriate than .ksh
                ext = '.txt'

            sanitized_filename = sanitize_filename(part.filename, part.type.split('/', 1)[0], ext)
            sanitized_filename = handle_filename_collision(sanitized_filename, filenames)
            filenames.append(sanitized_filename.lower())
            part.sanitized_filename = sanitized_filename

            if part.is_body == 'text/plain':
                self.text_part = part

            if part.is_body == 'text/html':
                self.html_part = part

    def get_addresses(self, name):
        """
        return the I{name} header value as a list of address tuples as
        returned by L{get_mail_addresses()}

        @type name: str
        @param name: the name of the header to read value from: 'to', 'cc' are
            valid I{name} here.
        @rtype: list of tuple
        @returns: a list of tuple of the form C{[('Recipient Name', 'recipient.address@domain.com'), ...]}
            or an empty list if no header match that I{name}.
        """
        return get_mail_addresses(self, name)

    def get_address(self, name):
        """
        return the first address of the I{name} header as an address tuple
        as returned by L{get_mail_addresses()}

        @type name: str
        @param name: the name of the header to read value from: C{'from'} can
            be used to return the sender address.
        @rtype: tuple
        @returns: a tuple of the form C{('Sender Name', 'sender.address@domain.com')}
            or C{('', '')} if no header match that I{name}.
        """
        value = get_mail_addresses(self, name)
        if value:
            return value[0]
        return ('', '')

    def get_subject(self, default=''):
        """
        return the RFC2047 decoded subject.

        @type default: any
        @param default: The value to return if the message has no I{Subject}
        @rtype: unicode
        @returns: the subject or C{default}
        """
        return self.get_decoded_header('subject', default)

    def get_decoded_header(self, name, default=''):
        """
        return decoded header I{name} using RFC2047. Always use this function
        to access header, because any header can contain invalid characters
        and this function sanitize the string and avoid unicode exception later
        in your program.
        EVEN for date, I already saw a "Center box bar horizontal" instead
        of a minus character.

        @type name: str
        @param name: the name of the header to read value from.
        @type default: any
        @param default: The value to return if the I{name} field don't exist
            in this message.
        @rtype: unicode
        @returns: the value of the header having that I{name} or C{default} if no
            header have that name.
        """
        value = self.get(name)
        if value is None:
            return default
        return decode_mail_header(value)

725

class PzMessage(PyzMessage):
    """
    Old name and interface for PyzMessage.
    B{Deprecated}
    """

    def __init__(self, input):
        """
        Initialize the object with data coming from I{input}.

        @type input: str or file or email.message.Message
        @param input: used as the raw content for the email, can be a string,
            a file object or an email.message.Message object.
        """
        # smart_parser is a static method of PyzMessage, not a module-level
        # function, so it must be reached through the class.
        PyzMessage.__init__(self, PyzMessage.smart_parser(input))

741

def message_from_string(s, *args, **kws):
    """
    Build a L{PyzMessage} object model from a string.

    Extra positional and keyword arguments are forwarded to
    C{email.message_from_string()}.

    @type s: str
    @param s: the input string
    @rtype: L{PyzMessage}
    @return: the L{PyzMessage} object
    """
    parsed = email.message_from_string(s, *args, **kws)
    return PyzMessage(parsed)

752

def message_from_file(fp, *args, **kws):
    """
    Build a L{PyzMessage} object model from the contents of a file.

    Extra positional and keyword arguments are forwarded to
    C{email.message_from_file()}.

    @type fp: text_file
    @param fp: the input file (must be open in text mode if Python >= 3.0)
    @rtype: L{PyzMessage}
    @return: the L{PyzMessage} object
    """
    parsed = email.message_from_file(fp, *args, **kws)
    return PyzMessage(parsed)

762

def message_from_bytes(s, *args, **kws):
    """
    Build a L{PyzMessage} object model from a bytes string.
    B{(Python >= 3.2)}

    Extra positional and keyword arguments are forwarded to
    C{email.message_from_bytes()}.

    @type s: bytes
    @param s: the input bytes string
    @rtype: L{PyzMessage}
    @return: the L{PyzMessage} object
    """
    parsed = email.message_from_bytes(s, *args, **kws)
    return PyzMessage(parsed)

773

def message_from_binary_file(fp, *args, **kws):
    """
    Build a L{PyzMessage} object model from the contents of a binary file.
    B{(Python >= 3.2)}

    Extra positional and keyword arguments are forwarded to
    C{email.message_from_binary_file()}.

    @type fp: binary_file
    @param fp: the input file, must be open in binary mode
    @rtype: L{PyzMessage}
    @return: the L{PyzMessage} object
    """
    parsed = email.message_from_binary_file(fp, *args, **kws)
    return PyzMessage(parsed)

# Command line entry point: read an email from a file and print a summary.
if __name__ == "__main__":
    import sys

    if len(sys.argv) <= 1:
        print('usage : %s filename' % sys.argv[0])
        print('read an email from file and display a resume of its content')
        sys.exit(1)

    # PyzMessage() itself expects an already parsed Message; factory() is the
    # entry point that accepts a file. Binary mode lets smart_parser() select
    # a parser on both Python 2 and Python 3, and "with" closes the file.
    with open(sys.argv[1], 'rb') as fp:
        msg = PyzMessage.factory(fp)

    print('Subject: %r' % (msg.get_subject(), ))
    print('From: %r' % (msg.get_address('from'), ))
    print('To: %r' % (msg.get_addresses('to'), ))
    print('Cc: %r' % (msg.get_addresses('cc'), ))
    print('Date: %r' % (msg.get_decoded_header('date', ''), ))
    print('Message-Id: %r' % (msg.get_decoded_header('message-id', ''), ))

    for mailpart in msg.mailparts:
        # dont forget to be careful to sanitize 'filename' and be careful
        # for filename collision, before saving :
        payload = mailpart.get_payload()
        print(' %sfilename=%r type=%s charset=%s desc=%s size=%d' % ('*' if mailpart.is_body else ' ', mailpart.filename, mailpart.type, mailpart.charset, mailpart.part.get('Content-Description'), 0 if payload is None else len(payload)))

        if mailpart.is_body == 'text/plain':
            # print first 3 lines
            text, used_charset = decode_text(payload, mailpart.charset, None)
            for line in text.split('\n')[:3]:
                # be careful console can be unable to display unicode characters
                if line:
                    print(' > %s' % (line, ))

Source Code for Module pyzmail.parse