PageRenderTime 96ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/mrjob/examples/mr_postfix_bounce/mr_postfix_bounce.py

https://bitbucket.org/wangqiang8511/mrjob
Python | 163 lines | 117 code | 20 blank | 26 comment | 23 complexity | af6f00967ad8f58d3b2419b8d4396453 MD5 | raw file
  1. # Copyright 2011 Yelp
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. mr_postfix_bounce is a mrjob that parses a Postfix log file looking for
  16. messages that have bounced and yielding the (email address, date ordinal).
  17. The emitted email addresses can then be unconfirmed or handled in some other
  18. way.
  19. """
  20. from __future__ import with_statement
  21. __author__ = 'Adam Derewecki <derewecki@gmail.com>'
  22. import datetime
  23. import re
  24. import simplejson
  25. import time
  26. from mrjob.job import MRJob
  27. PROCESS_TYPE_PATTERN = re.compile(
  28. r'postfix-(?P<queue>[^/]+)/(?P<process>[^[]+)\[\d+\]:')
  29. MESSAGE_ID_PATTERN = re.compile(
  30. r'^(?P<message_id>[A-Z0-9]+): (?P<postfix_message>.*)')
  31. VAR_PATTERN = re.compile(r'(?P<name>\w+)=(?P<value>[^ ,]+)')
  32. HOST_PATTERN = re.compile(
  33. r'(?P<before>.*?)[\(]host (?P<host>\S+) (?P<action>[^:]+):'
  34. r' (?P<message>.*)[\)]')
  35. KEY_VALUE_PATTERN = re.compile(r'(?:^|, )(?P<key>\w+)=(?P<value>[^, ]+)')
  36. DOMAIN_PATTERN = re.compile(r'(?<=@)[^.]+\.\w+')
  37. def process_log_line(line):
  38. # log lines don't have year, so make that up
  39. # Note: not safe over year transitions
  40. (date_year, date_month, date_day, date_time, host, process,
  41. postfix_message) = [str(datetime.date.today().year)] + line.split(None, 5)
  42. timetuple = time.strptime(
  43. ' '.join((date_month, date_day, date_time, date_year)),
  44. '%b %d %H:%M:%S %Y')
  45. timestamp = time.mktime(timetuple)
  46. date_ordinal = datetime.date(*timetuple[:3]).toordinal()
  47. process_type_match = PROCESS_TYPE_PATTERN.search(process)
  48. message_id_match = MESSAGE_ID_PATTERN.search(postfix_message)
  49. if process_type_match and message_id_match:
  50. if process_type_match.group('process') == 'smtp':
  51. after_message_id = message_id_match.group('postfix_message')
  52. postfix_log_dict = {}
  53. # match all key=value pairs but not ones in the (message) afterward
  54. # sometimes there isn't a (message)
  55. if '(' in after_message_id:
  56. # split on ( and grab first element in tuple
  57. key_value_section = after_message_id.split('(')[0]
  58. else:
  59. key_value_section = after_message_id
  60. postfix_log_dict = dict(
  61. KEY_VALUE_PATTERN.findall(key_value_section))
  62. # find where key=value ends and save the rest of the string
  63. if postfix_log_dict:
  64. after_vars_idx = max(
  65. after_message_id.index(value) + len(value) + 1
  66. for value in postfix_log_dict)
  67. after_vars = after_message_id[after_vars_idx:]
  68. else:
  69. after_vars = after_message_id
  70. host_match = HOST_PATTERN.search(after_vars)
  71. if host_match:
  72. postfix_log_dict.update({
  73. 'remote_smtp_string': host_match.group('message'),
  74. 'remote_smtp_string_type': host_match.group('action'),
  75. 'remote_host': host_match.group('host')
  76. })
  77. elif len(after_vars.strip()) > 0:
  78. postfix_log_dict['smtp_string'] = after_vars.strip()
  79. postfix_log_dict.update({
  80. 'time': timestamp,
  81. 'date_ordinal': date_ordinal,
  82. 'message_id': message_id_match.group('message_id'),
  83. 'queue': process_type_match.group('queue'),
  84. 'process': process_type_match.group('process')
  85. })
  86. try:
  87. if 'to' in postfix_log_dict:
  88. postfix_log_dict['domain'] = (
  89. DOMAIN_PATTERN.search(
  90. postfix_log_dict['to']).group().lower())
  91. except:
  92. pass
  93. return postfix_log_dict
  94. def domain_startswith(postfix_log_dict, needle):
  95. return postfix_log_dict.get('domain').startswith(needle)
  96. def process_postfix_log_dict(decoded, bounce_rules):
  97. if decoded and 'to' in decoded and decoded.get('status') == 'bounced':
  98. to = decoded.get('to', '').strip('<>')
  99. # check to see if Postfix couldn't deliver the message
  100. if decoded.get('dsn') == '5.4.4':
  101. if 'Host not found' in decoded.get('smtp_string'):
  102. return to
  103. # run over our per-domain bounce processing error conditions
  104. for domain_prefixes, failure_conditions in bounce_rules:
  105. if any(domain_startswith(decoded, domain)
  106. for domain in domain_prefixes):
  107. for point_of_failure, failure_strings in (
  108. failure_conditions.iteritems()):
  109. for failure_string in failure_strings:
  110. if failure_string in decoded.get(point_of_failure, ''):
  111. return to
  112. class MRPostfixBounce(MRJob):
  113. def configure_options(self):
  114. super(MRPostfixBounce, self).configure_options()
  115. self.add_file_option(
  116. '--bounce-processing-rules',
  117. dest='bounce_processing_rules',
  118. default='bounce_processing_rules.json',
  119. help='JSON file of bounce processing rules.'
  120. )
  121. def load_options(self, args):
  122. super(MRPostfixBounce, self).load_options(args=args)
  123. if self.is_mapper_or_reducer():
  124. with open(self.options.bounce_processing_rules) as bounce_rules_f:
  125. self.bounce_processing_rules = simplejson.load(bounce_rules_f)
  126. def mapper(self, _, line):
  127. postfix_log_dict = process_log_line(line)
  128. if postfix_log_dict:
  129. email_address = process_postfix_log_dict(
  130. postfix_log_dict, self.bounce_processing_rules)
  131. if email_address:
  132. yield email_address, postfix_log_dict['date_ordinal']
  133. def reducer(self, email_address, dateordinals):
  134. yield email_address, tuple(dateordinals)
# Script entry point: instantiate the job and run it when this file is
# executed directly rather than imported.
if __name__ == '__main__':
    MRPostfixBounce().run()