PageRenderTime 79ms CodeModel.GetById 48ms RepoModel.GetById 0ms app.codeStats 0ms

/civicrm/custom/php/CRM/NYSS/IMAP/Message.php

https://github.com/nysenate/Bluebird-CRM
PHP | 622 lines | 474 code | 64 blank | 84 comment | 56 complexity | 4f1c3050314d9126438b5ecf3da226a4 MD5 | raw file
Possible License(s): JSON, BSD-3-Clause, MPL-2.0-no-copyleft-exception, AGPL-1.0, GPL-2.0, AGPL-3.0, Apache-2.0, MIT, GPL-3.0, CC-BY-4.0, LGPL-2.1, BSD-2-Clause, LGPL-3.0
  1. <?php
  2. class CRM_NYSS_IMAP_Message
  3. {
  // Subject length passed to imap_headerinfo() when the headers are loaded.
  const MAX_SUBJ_LEN = 255;

  // Lookup from IMAP body-type constant to the label used to key $_content.
  private static $_body_type_labels = [
    TYPETEXT => 'text',
    TYPEMULTIPART => 'multipart',
    TYPEMESSAGE => 'message',
    TYPEAPPLICATION => 'application',
    TYPEAUDIO => 'audio',
    TYPEIMAGE => 'image',
    TYPEVIDEO => 'video',
    TYPEMODEL => 'model',
    TYPEOTHER => 'other',
  ];

  /* Email address pattern, credit to
     http://www.regular-expressions.info/email.html
     The discussion at that link covers how thorough the pattern is. It WILL
     NOT CATCH ALL EMAIL ADDRESSES, but it does match 99% of
     RFC5322-compliant addresses. A match is not necessarily a VALID address.
     Capture group 1 is the mailbox and capture group 2 is the host.
  */
  private static $_email_address_regex =
    /* mailbox */
    '/([a-z0-9!#$%&\'+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+)*)' .
    /* at */
    '@' .
    /* host */
    '((?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)/i';

  // Session object handed to the constructor; supplies the IMAP connection.
  private $_session = null;
  // Message number handed to the constructor.
  private $_msgnum = 0;
  // Result of imap_uid() for this message.
  private $_uid = 0;
  // Result of imap_headerinfo() for this message.
  private $_headers = null;
  // Object built by _genMetaData() from the headers.
  private $_metadata = null;
  // Result of imap_fetchstructure(); its parts are flattened if multipart.
  private $_structure = null;
  // Decoded content keyed by label; 'attachment' is filled on first fetch.
  private $_content = ['text' => null, 'attachment' => null];
  // Whether any part has an "attachment" disposition; set by the constructor.
  private $_has_attachments = null;
  /**
   * Load one message from an IMAP session and cache everything needed
   * to parse it: UID, headers, metadata, structure and text content.
   *
   * @param object $imapSession Object exposing getConnection().
   * @param int    $msgnum      IMAP message number to load.
   */
  public function __construct($imapSession, $msgnum = 0)
  {
    $this->_session = $imapSession;
    $this->_msgnum = $msgnum;
    $this->_uid = imap_uid($this->getConnection(), $msgnum);

    // Headers must be in place before the metadata can be derived from them;
    // _genMetaData() also reads the UID and message number set above.
    $this->_headers = imap_headerinfo($this->getConnection(), $msgnum, 0, self::MAX_SUBJ_LEN);
    $this->_metadata = $this->_genMetaData($this->_headers);
    $this->_structure = imap_fetchstructure($this->getConnection(), $this->_msgnum);

    // Only a multipart message can carry attachments; for those, replace the
    // nested part tree with a flat list keyed by part number first.
    $this->_has_attachments = false;
    if ($this->isMultipart()) {
      $this->_structure->parts = $this->_flattenParts($this->_structure->parts);
      $this->_has_attachments = $this->_hasAttachments();
    }

    // The text content is required by the parsing methods, so load it now.
    $this->_loadTextContent();
  } // __construct()
  /**
   * Return the connection provided by the session given to the constructor.
   */
  public function getConnection()
  {
    $session = $this->_session;
    return $session->getConnection();
  } // getConnection()
  /**
   * Tell whether the cached message structure has a "parts" property.
   *
   * @return bool
   */
  public function isMultipart()
  {
    $has_parts = isset($this->_structure->parts);
    return $has_parts;
  } // isMultipart()
  /**
   * Return the attachment flag that was computed by the constructor.
   *
   * @return bool
   */
  public function hasAttachments()
  {
    $flag = $this->_has_attachments;
    return $flag;
  } // hasAttachments()
  /**
   * Return the message attachments, retrieving them on the first call and
   * serving the cached copy afterwards.
   *
   * The array key for each attachment is its part number. Each attachment
   * is an object with 'name', 'type', 'size', and 'data' properties. Parts
   * whose filename cannot be determined are skipped.
   *
   * @return array
   */
  public function fetchAttachments()
  {
    if ($this->_content['attachment'] !== null) {
      return $this->_content['attachment'];
    }

    $this->_content['attachment'] = array();
    if (!$this->hasAttachments()) {
      return $this->_content['attachment'];
    }

    foreach ($this->getParts() as $partnum => $part) {
      // Disposition types are case-insensitive, so "ATTACHMENT" counts too.
      if (!$part->ifdisposition || strcasecmp($part->disposition, 'attachment') != 0) {
        continue;
      }

      // Prefer the filename from "dparameters", then fall back to
      // "parameters". Each list is consulted only when its "if" flag says
      // it is present.
      $filename = null;
      if (!empty($part->ifdparameters)) {
        $filename = $this->_getFilename($part->dparameters);
      }
      if (!$filename && !empty($part->ifparameters)) {
        $filename = $this->_getFilename($part->parameters);
      }
      if (!$filename) {
        // Skip any attachment whose filename cannot be determined.
        continue;
      }

      // A MIME-encoded filename may decode into several segments; the full
      // name is all of them joined, not just the last one.
      $segments = imap_mime_header_decode($filename);
      if (is_array($segments) && count($segments) > 0) {
        $filename = '';
        foreach ($segments as $segment) {
          $filename .= $segment->text;
        }
      }

      // Download the body only once the part is known to be kept.
      $content = $this->fetchPart($partnum);
      $content = $this->_decodeContent($content, $part->encoding, $part->subtype);

      $this->_content['attachment'][$partnum] = (object) [
        'name' => $filename,
        'type' => $part->type,
        'size' => $part->bytes,
        'data' => $content,
      ];
    }

    return $this->_content['attachment'];
  } // fetchAttachments()
  /**
   * Fetch a body section of this message via imap_fetchbody() with FT_PEEK.
   *
   * @param string $section Section specifier handed to imap_fetchbody().
   */
  public function fetchBody($section = '')
  {
    $conn = $this->getConnection();
    return imap_fetchbody($conn, $this->_msgnum, $section, FT_PEEK);
  } // fetchBody()
  /**
   * Fetch one message part; identical to fetchBody() except that the
   * section defaults to '1'.
   *
   * @param string $section Part number to fetch.
   */
  public function fetchPart($section = '1')
  {
    $body = $this->fetchBody($section);
    return $body;
  } // fetchPart()
  /**
   * Return the header object cached by the constructor.
   */
  public function getHeaders()
  {
    $headers = $this->_headers;
    return $headers;
  } // getHeaders()
  /**
   * Return the metadata object built from the headers by the constructor.
   */
  public function getMetaData()
  {
    $metadata = $this->_metadata;
    return $metadata;
  } // getMetaData()
  /**
   * Return the message structure cached by the constructor.
   */
  public function getStructure()
  {
    $structure = $this->_structure;
    return $structure;
  } // getStructure()
  /**
   * Return the parts of a multipart message, or null when the message is
   * not multipart.
   */
  public function getParts()
  {
    return $this->isMultipart() ? $this->_structure->parts : null;
  } // getParts()
  /**
   * Return cached content.
   *
   * With a type label (such as 'text'), return the content stored under
   * that label, or null if nothing is stored there. Without a type, return
   * the whole content array.
   *
   * @param string|null $type Content label to look up.
   */
  public function getContent($type = null)
  {
    if (!$type) {
      return $this->_content;
    }
    return isset($this->_content[$type]) ? $this->_content[$type] : null;
  } // getContent()
  /**
   * Shortcut for getContent('text').
   */
  public function getTextContent()
  {
    $text = $this->getContent('text');
    return $text;
  } // getTextContent()
  /**
   * Translate a body type into its string label.
   *
   * @param int $bodyType Body type to look up in the label table.
   * @return string The label, or 'unknown' if the type is not in the table.
   */
  public function getBodyTypeLabel($bodyType)
  {
    $known = isset(self::$_body_type_labels[$bodyType]);
    return $known ? self::$_body_type_labels[$bodyType] : 'unknown';
  } // getBodyTypeLabel()
  /**
   * Find the various sender addresses in the email.
   *
   * Returns an array with 3 levels of addresses: primary, secondary, other.
   *  - "primary" holds the address and name of the official sender of the
   *    email, based on the headers.
   *  - "secondary" holds addresses extracted from apparent "From:" or
   *    "Reply-To:" lines in the body. This should include the original
   *    sender if the message is a forwarded message. Reply-To addresses are
   *    placed ahead of the others.
   *  - "other" holds every email address that could be found in the body,
   *    whether or not it was part of a "From:" line, except those equal to
   *    the primary address.
   *
   * @return array
   */
  public function findFromAddresses()
  {
    $addr = array(
      'primary' => array(
        'address'=>$this->_headers->from[0]->mailbox.'@'.$this->_headers->from[0]->host,
        'name'=>isset($this->_headers->from[0]->personal) ? $this->_headers->from[0]->personal : '',
      ),
      'secondary' => array(),
      'other' => array(),
    );

    // Group 4 is the header name and group 5 is everything after the colon.
    $header_regex = '/(^|\n)([\ \t]*([>*][\ \t]*)?)?(From|Reply-To):[\*\s]*(("(\\\"|[^"])*")?[^@]{2,100}@(.*))/i';

    foreach ($this->getTextContent() as $content) {
      $matches = array();
      if (preg_match_all($header_regex, $content, $matches)) {
        foreach ($matches[5] as $k => $v) {
          $v = str_replace(array("\n","\r"), array(' ',''), $v);
          if (preg_match('#CN=|OU?=|/senate#', $v)) {
            // Looks like an LDAP-style address; resolve it to a real one.
            $v = $this->_resolveLDAPAddress($v);
          }

          // The parser can return false or an empty list; only count a
          // real array.
          $ta = imap_rfc822_parse_adrlist($v, '');
          if (!is_array($ta) || count($ta) == 0) {
            continue;
          }
          $first = $ta[0];
          if (empty($first->host) || empty($first->mailbox) || $first->host == '.SYNTAX-ERROR.') {
            continue;
          }

          $newta = array(
            'address' => $first->mailbox.'@'.$first->host,
            'name' => isset($first->personal) ? $first->personal : null,
          );

          // The header name is capture group 4. Reply-To addresses take
          // priority, so they go to the front of the list.
          if (strcasecmp($matches[4][$k], 'Reply-To') == 0) {
            array_unshift($addr['secondary'], $newta);
          }
          else {
            $addr['secondary'][] = $newta;
          }
        }
      }

      $matches = array();
      if (preg_match_all(self::$_email_address_regex, $content, $matches)) {
        foreach ($matches[0] as $v) {
          $tv = filter_var(filter_var($v, FILTER_SANITIZE_EMAIL), FILTER_VALIDATE_EMAIL);
          // filter_var() yields false for anything that fails validation;
          // such values are not addresses and must not be recorded.
          if ($tv !== false && $tv != $addr['primary']['address']) {
            $addr['other'][] = $tv;
          }
        }
      }
    }

    return $addr;
  } // findFromAddresses()
  /*
  ** Render the primary text content as simple HTML. The rules below are
  ** applied in the order listed:
  **  - Drop the angle brackets around email addresses so they are not
  **    rendered as HTML.
  **  - Turn tabs and non-breaking spaces into single spaces.
  **  - Turn "&", "<", and ">" into their HTML entities.
  **  - Put <br/> before each newline, then remove non-printing characters.
  */
  public function renderAsHtml()
  {
    // Each key is a pattern and each value is its replacement.
    $rules = [
      '/<((mailto:)?[-\w.]+@[-\w.]+)>/' => '$1',
      '/[\x09\xA0]/' => ' ',
      '/&/' => '&amp;',
      '/</' => '&lt;',
      '/>/' => '&gt;',
      '/(\n)/' => '<br/>$1',
      '/[^\n\x20-\x7F]+/' => '',
    ];

    return preg_replace(array_keys($rules), array_values($rules), $this->_getPrimaryContent());
  } // renderAsHtml()
  /*
  ** This is a heavy-cost email content parser to find groups of headers
  ** within a forwarded email. It is assumed that all header groups will be
  ** in reverse date order, with the most recent forward found first and the
  ** original message headers found last. This should only be used if the
  ** "standard" method (see method findFromAddresses()) is unable to locate
  ** an original sender. This scenario generally arises because the
  ** forwarded headers have been mangled by the sending server (quoted
  ** printable conversions, or other issues).
  **
  ** Returns an array of header blocks numbered from 1. Each block is a list
  ** of entries in which index 1 is the header name and index 2 is its value.
  ** A non-header line that follows a header is appended to that header's
  ** value. A blank line ends the current block.
  */
  public function searchFullHeaders()
  {
    /* Forwarding clients may mangle the headers away from spec. The pattern
    ** to detect a header line is best-guess based on known mangling patterns.
    ** The tracked_headers array needs to include only the mangled versions of
    ** desired headers - any still adhering to spec will be picked up normally
    */
    $tracked_headers = array('reply to', 'sent by');
    $header_regex = '/^('.implode('|', $tracked_headers).'|[!-9;-~]+):[[:space:]]*(.*)$/i';

    $headers = array();
    $header_block = 0;
    $in_header = false;

    // read each line of the primary content and parse for a header
    foreach (explode("\n", $this->_getPrimaryContent()) as $line) {
      $trimv = trim($line);

      // Compare against the empty string so that a line consisting of "0"
      // is not mistaken for a blank line.
      if ($trimv === '') {
        $in_header = false;
        continue;
      }

      $matches = array();
      if (preg_match($header_regex, $trimv, $matches)) {
        if (!$in_header) {
          $in_header = true;
          $headers[++$header_block] = array();
        }
        $headers[$header_block][] = array(1 => $matches[1], 2 => $matches[2]);
      }
      elseif ($in_header) {
        // Continuation line: fold it into the most recent header's value.
        $last = count($headers[$header_block]) - 1;
        $headers[$header_block][$last][2] .= " $trimv";
      }
    }

    return $headers;
  } // searchFullHeaders()
  /**
   * Using the message headers, generate a meta data object.
   *
   * The object carries: subject, fromName, fromEmail, uid, msgnum, date
   * (formatted "Y-m-d H:i:s") and flags. The flags string is the Recent,
   * Unseen, Flagged, Answered, Deleted and Draft header values joined
   * together, with each space replaced by "-".
   *
   * The uid and msgnum are read from this object, so both must already be
   * set when this is called.
   *
   * @param object $headers Header object to read from.
   * @return stdClass
   */
  private function _genMetaData($headers)
  {
    // deal with various special characters that create problems
    $fsubj = $headers->fetchsubject;
    // The charset label of an encoded word is case-insensitive, so "utf-8"
    // must be recognized as well as "UTF-8".
    if (stripos($fsubj, '?UTF-8?') !== false) {
      // NOTE(review): the 'HTML-ENTITIES' target is believed to be
      // deprecated in recent PHP releases - confirm before upgrading.
      $fsubj = mb_convert_encoding(mb_decode_mimeheader($fsubj), 'HTML-ENTITIES', 'UTF-8');
      // convert some special characters manually
      $fsubj = str_replace(array('&rsquo;'), array("'"), $fsubj);
    }

    // build return object
    $meta = new stdClass();
    $meta->subject = $fsubj;
    $meta->fromName = isset($headers->from[0]->personal) ? $headers->from[0]->personal : '';
    $meta->fromEmail = $headers->from[0]->mailbox.'@'.$headers->from[0]->host;
    $meta->uid = $this->_uid;
    $meta->msgnum = $this->_msgnum;
    $meta->date = date("Y-m-d H:i:s", strtotime($headers->date));
    $meta->flags = strtr($headers->Recent.$headers->Unseen.$headers->Flagged.$headers->Answered.$headers->Deleted.$headers->Draft, ' ', '-');
    return $meta;
  } // genMetaData()
  /* Return the first entry of the cached text content. In simple messages,
  ** this is the same as BODY[1]. In multipart messages, it is the *first*
  ** text content found. If a message has more than one text section
  ** (e.g., text/plain and text/html), only the first will be returned.
  ** An empty string is returned when no text content is available.
  */
  private function _getPrimaryContent()
  {
    if (!is_array($this->_content['text'])) {
      return '';
    }

    // reset() yields the first element, or false for an empty array.
    $first = reset($this->_content['text']);
    return ($first === false) ? '' : $first;
  } // _getPrimaryContent()
  /*
  ** Replace every matching HTML tag in a source string with a replacement
  ** string. Vertical whitespace on either side of a tag is replaced with it.
  **
  ** With no tags given (null or ''), anything that looks like a tag matches.
  ** Otherwise $tags is best given as an array of tag names; the names are
  ** joined with "|" and placed into the pattern as-is. Any other value is
  ** cast to a string and used as the tag pattern directly.
  */
  private function _html_replace($str, $tags = null, $repl = '')
  {
    // Default: match anything that looks like an HTML tag.
    $tag_pattern = '\w+';
    if ($tags !== null && $tags != '') {
      // Match only the provided tag names.
      $tag_pattern = is_array($tags) ? implode('|', $tags) : (string)$tags;
    }

    // The pattern text is kept flush left so the literal is unchanged.
    $regex = '%
\v* # Preceding vertical whitespace is eliminated
(?: # Match either open/empty tag or close tag
< # Open/Empty tag initial "<" delimiter
(?: # Group for HTML 4.01 tags
'.$tag_pattern.'
)\b # End group of tag name alternatives
(?: # Non-capture group for optional attribute(s)
\s+ # Attributes must be separated by whitespace
[\w\-.:]+ # Attribute name is required for attr=value pair
(?: # Non-capture group for optional attribute value
\s*=\s* # Name and value separated by "=" and optional ws
(?: # Non-capture group for attrib value alternatives
"[^"]*" # Double quoted string
| \'[^\']*\' # Single quoted string
| [\w\-.:]+ # Non-quoted attrib value can be A-Z0-9_-.:
) # End of attribute value alternatives
)? # Attribute value is optional
)* # Allow zero or more attribute=value pairs
\s* # Whitespace is allowed before ending delimiter
/? # Empty tag indicator (such as <br />)
> # Open/Empty tag ending ">" delimiter
|
</ # Close tag initial "</" delimiter
(?:
'.$tag_pattern.'
)\b
\s* # Whitespace is allowed before ending delimiter
> # Close tag ending ">" delimiter
) # End of open/empty or close tag alternatives
\v* # Trailing vertical whitespace is eliminated
%six';

    return preg_replace($regex, $repl, (string)$str);
  } // _html_replace()
  /**
   * Swap common HTML block level tags for newlines.
   *
   * @param string $str HTML source.
   * @return string
   */
  private function _block2nl($str)
  {
    return $this->_html_replace(
      $str,
      [ 'blockquote', 'br', 'div', 'h[1-6]', 'hr', 'li', 'p' ],
      "\n"
    );
  } // _block2nl()
  /**
   * Resolve an LDAP-style address found in a message body into a
   * "Name <email>" string by looking up its display name in the directory.
   *
   * Addresses are sometimes embedded as LDAP info; see NYSS #5748 for more
   * details. The directory host and search base are hard-coded for now.
   *
   * @param string $addr Raw address text.
   * @return string "Given Surname <mail>", or '' when the text does not
   *                refer to the senate directory or no usable entry is found.
   */
  private function _resolveLDAPAddress($addr = '')
  {
    $patterns = array(
      '#/senate@senate#i', /* standardize reference to senate */
      /* SBB DEVCHANGE: This next line was in the original code, but I have found that removing
         the /CENTER part of the name makes the search fail. Keep as standard, or remove?
         If we remove it, remember to remove the appropriate entry in the $replace array below */
      '#/CENTER/senate#i', /* standardize reference to senate */
      '/CN=|O=|OU=/i', /* remove LDAP-specific addressing */
      '/mailto|\(|\)|:/i', /* remove link remnants, parenthesis */
      '/"|\'/i', /* remove quotes */
      '/\[|\]/i', /* remove square brackets */
    );
    // Only the first two patterns have a replacement; matches of the
    // remaining patterns are removed.
    $replace = array('/senate', '/senate');
    $str = preg_replace($patterns, $replace, trim($addr));

    $ret = '';
    if (strpos($str, '/senate') === false) {
      return $ret;
    }

    $info = array('count' => 0);
    $ldapcon = ldap_connect("senmail.senate.state.ny.us", 389);
    if ($ldapcon) {
      // The name comes from message text, so escape it before placing it
      // in the filter; otherwise "*" or "\" in the body could alter the
      // search.
      $filter = '(displayname='.ldap_escape($str, '', LDAP_ESCAPE_FILTER).')';
      $retrieve = array('sn', 'givenname', 'mail');
      $search = ldap_search($ldapcon, 'o=senate', $filter, $retrieve);
      if ($search !== false) {
        $entries = ldap_get_entries($ldapcon, $search);
        if (is_array($entries)) {
          $info = $entries;
        }
      }
      // Release the connection instead of leaving it open after each lookup.
      ldap_unbind($ldapcon);
    } else {
      error_log("Failed to create connection to LDAP server (testing msg#={$this->_msgnum}, addr=$str)");
    }

    // An entry without a mail attribute cannot produce an address, so it is
    // treated the same as no result.
    if (isset($info[0]['mail'][0])) {
      $given = isset($info[0]['givenname'][0]) ? $info[0]['givenname'][0] : '';
      $surname = isset($info[0]['sn'][0]) ? $info[0]['sn'][0] : '';
      $name = $given.' '.$surname;
      $ret = "$name <{$info[0]['mail'][0]}>";
    } else {
      error_log("LDAP search returned no results (testing msg#={$this->_msgnum}, addr=$str)");
    }

    return $ret;
  } // _resolveLDAPAddress()
  /**
   * Recursively flatten the multipart hierarchy into a single array whose
   * keys are the standard IMAP part numbers.
   *
   * @param array  $msgParts   Parts at the current level.
   * @param array  $flatParts  Parts collected so far.
   * @param string $prefix     Part-number prefix for this level.
   * @param int    $index      Number given to the first part at this level.
   * @param bool   $fullPrefix Whether nested non-message parts extend the
   *                           prefix with their parent's number.
   * @return array The collected parts, each with its own "parts" removed.
   */
  private function _flattenParts($msgParts, $flatParts = array(), $prefix = '',
                                 $index = 1, $fullPrefix = true)
  {
    foreach ($msgParts as $part) {
      $key = $prefix.$index;
      $flatParts[$key] = $part;

      if (isset($part->parts)) {
        if ($part->type == TYPEMESSAGE) {
          // Children of a message part are numbered from 0 under this part
          // and do not extend the prefix any further.
          $flatParts = $this->_flattenParts($part->parts, $flatParts, $key.'.', 0, false);
        }
        else {
          $child_prefix = $fullPrefix ? $key.'.' : $prefix;
          $flatParts = $this->_flattenParts($part->parts, $flatParts, $child_prefix);
        }
        // The children now live in the flat list; drop the nested copy.
        unset($flatParts[$key]->parts);
      }

      $index++;
    }

    return $flatParts;
  } // _flattenParts()
  /**
   * Decode content according to its transfer encoding and subtype.
   *
   * Base64 and quoted-printable content is decoded; every other encoding is
   * passed through untouched. HTML content is then reduced to plain text.
   * The result is trimmed of leading and trailing whitespace.
   *
   * @param string $content Raw content.
   * @param int    $enc     Encoding constant (ENCBASE64, ...).
   * @param string $subtype Subtype, compared case-insensitively to 'HTML'.
   * @return string
   */
  private function _decodeContent($content = '', $enc = ENC7BIT, $subtype = '')
  {
    $ret = (string)$content;
    $encoding = (int)$enc;

    if ($encoding == ENCBASE64) {
      /* base-64 encoding */
      $ret = base64_decode($ret);
    }
    elseif ($encoding == ENCQUOTEDPRINTABLE) {
      /* quoted printable encoding */
      $ret = quoted_printable_decode($ret);
    }
    /* anything else (7BIT/8BIT/BINARY/OTHER) is essentially a pass-thru */

    // For HTML: block level tags become newlines, the remaining tags are
    // stripped, and entities are converted.
    if (strcasecmp($subtype, 'HTML') == 0) {
      $ret = html_entity_decode(strip_tags($this->_block2nl($ret)), ENT_QUOTES);
    }

    return trim($ret);
  } // _decodeContent()
  /**
   * Load, decode and cache the content of every part whose body type equals
   * the given one. The content is stored in the _content array under the
   * label of that body type, keyed by part number.
   *
   * By default only TEXT body parts, such as text/plain and text/html, are
   * loaded.
   *
   * @param int $bodyType Body type constant to load.
   */
  private function _loadContent($bodyType = TYPETEXT)
  {
    $label = $this->getBodyTypeLabel($bodyType);
    $this->_content[$label] = array();

    if (!$this->isMultipart()) {
      // Simple message: fetchPart() with no args calls fetchBody('1'),
      // which is what we want.
      $struct = $this->getStructure();
      if ($struct->type === $bodyType) {
        $raw = $this->fetchPart();
        $this->_content[$label]['1'] = $this->_decodeContent($raw, $struct->encoding, $struct->subtype);
      }
      return;
    }

    foreach ($this->getParts() as $partnum => $part) {
      if ($part->type !== $bodyType) {
        continue;
      }
      $raw = $this->fetchPart($partnum);
      $this->_content[$label][$partnum] = $this->_decodeContent($raw, $part->encoding, $part->subtype);
    }
  } // _loadContent()
  /**
   * Load the TEXT body parts into the content cache.
   */
  private function _loadTextContent()
  {
    $result = $this->_loadContent(TYPETEXT);
    return $result;
  } // _loadTextContent()
  /**
   * Internal function for iterating over the message parts looking for at
   * least one part whose disposition is "attachment".
   *
   * @return bool True as soon as one such part is found; false otherwise,
   *              including when the message has no parts.
   */
  private function _hasAttachments()
  {
    $parts = $this->getParts();
    if (!is_array($parts)) {
      // getParts() returns null for a message that is not multipart.
      return false;
    }

    foreach ($parts as $part) {
      // Disposition types are case-insensitive, so "ATTACHMENT" counts too.
      if (!empty($part->ifdisposition) && strcasecmp($part->disposition, 'attachment') == 0) {
        return true;
      }
    }

    return false;
  } // _hasAttachments()
  /**
   * Get the filename from a list of parameter objects.
   *
   * The first parameter whose attribute is "name" or "filename" (in any
   * letter case) supplies the value.
   *
   * @param mixed $params List of objects with "attribute" and "value"
   *                      properties. Anything that is not an array or object
   *                      is treated as an empty list.
   * @return string|null The filename, or null when none is present.
   */
  private function _getFilename($params)
  {
    // The parameter list may be null or an empty object rather than an
    // array; neither can be passed to count(), so test the type and let
    // foreach handle the empty case.
    if (!is_array($params) && !is_object($params)) {
      return null;
    }

    foreach ($params as $param) {
      if (!isset($param->attribute, $param->value)) {
        continue;
      }
      $attr = strtolower($param->attribute);
      if ($attr == 'name' || $attr == 'filename') {
        return $param->value;
      }
    }

    return null;
  } // _getFilename()
  546. }