PageRenderTime 24ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/PlancakeEmailParser.php

http://github.com/plancake/official-library-php-email-parser
PHP | 338 lines | 193 code | 32 blank | 113 comment | 20 complexity | cf85d176d0744d2bd33e2aaf658087a6 MD5 | raw file
  1. <?php
  2. /*************************************************************************************
  3. * ===================================================================================*
  4. * Software by: Danyuki Software Limited *
  5. * This file is part of Plancake. *
  6. * *
  7. * Copyright 2009-2010-2011 by: Danyuki Software Limited *
  8. * Support, News, Updates at: http://www.plancake.com *
  9. * Licensed under the LGPL version 3 license. * *
  10. * Danyuki Software Limited is registered in England and Wales (Company No. 07554549) *
  11. **************************************************************************************
  12. * Plancake is distributed in the hope that it will be useful, *
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  15. * GNU Lesser General Public License v3.0 for more details. *
  16. * *
  17. * You should have received a copy of the GNU Lesser General Public License *
  18. * along with this program. If not, see <http://www.gnu.org/licenses/>. *
  19. * *
  20. **************************************************************************************
  21. *
  22. * Valuable contributions by:
  23. * - Chris
  24. *
  25. * **************************************************************************************/
  /**
   * Extracts the headers and the body of a raw (RFC 822 style) email message.
   *
   * The Bcc header cannot be extracted because it does not appear in the
   * content of a delivered email.
   *
   * N.B.: if you deal with non-English languages, we recommend you install the
   * IMAP PHP extension: the parser detects it (via function_exists('imap_open'))
   * and uses it automatically to decode the subject.
   *
   * For more info, check:
   * https://github.com/plancake/official-library-php-email-parser
   *
   * @author dan
   */
  class PlancakeEmailParser {

      const PLAINTEXT = 1;
      const HTML = 2;

      /**
       * True when the IMAP extension was found at construction time.
       *
       * @var boolean
       */
      private $isImapExtensionAvailable = false;

      /**
       * The complete, unmodified message passed to the constructor.
       *
       * @var string
       */
      private $emailRawContent;

      /**
       * Header values keyed by lower-cased header name. When a header is
       * repeated, the last occurrence wins.
       *
       * @var array associative array
       */
      protected $rawFields = array();

      /**
       * Every line from the blank header/body separator (inclusive) onwards.
       *
       * @var array of string (each element is a line)
       */
      protected $rawBodyLines = array();

      /**
       * Parses the headers immediately; the body is decoded lazily by getBody().
       *
       * @param string $emailRawContent
       */
      public function __construct($emailRawContent) {
          $this->emailRawContent = (string) $emailRawContent;
          $this->extractHeadersAndRawBody();

          if (function_exists('imap_open')) {
              $this->isImapExtensionAvailable = true;
          }
      }

      /**
       * Splits the raw message into $rawFields (headers) and $rawBodyLines.
       *
       * Header parsing stops at the first empty line. Lines that do not start
       * with a letter are treated as continuations of the previous header and
       * are appended to it without their first character.
       */
      private function extractHeadersAndRawBody()
      {
          $lines = preg_split("/(\r?\n|\r)/", $this->emailRawContent);

          // Always start from arrays so that getBody() and isset() checks are
          // safe even when the message has no headers or no blank separator.
          $this->rawFields = array();
          $this->rawBodyLines = array();

          $currentHeader = '';

          foreach ($lines as $i => $line)
          {
              if (self::isNewLine($line))
              {
                  // end of headers; the separator line itself is kept at index 0
                  $this->rawBodyLines = array_slice($lines, $i);
                  break;
              }

              if ($this->isLineStartingWithPrintableChar($line)) // start of new header
              {
                  if (!preg_match('/([^:]+): ?(.*)$/', $line, $matches))
                  {
                      // Not a "Name: value" line: skip it, and make sure the
                      // following continuation lines are not glued to the
                      // previous, unrelated header.
                      $currentHeader = '';
                      continue;
                  }

                  $currentHeader = strtolower($matches[1]);
                  $this->rawFields[$currentHeader] = $matches[2];
              }
              elseif ($currentHeader !== '') // more lines related to the current header
              {
                  // (string) because substr() returns false for 1-char lines on PHP < 8
                  $this->rawFields[$currentHeader] .= (string) substr($line, 1);
              }
          }
      }

      /**
       * Returns the decoded subject.
       *
       * @return string (in UTF-8 format)
       * @throws Exception if a subject header is not found
       */
      public function getSubject()
      {
          if (!isset($this->rawFields['subject']))
          {
              throw new Exception("Couldn't find the subject of the email");
          }

          $rawSubject = $this->rawFields['subject'];

          if ($this->isImapExtensionAvailable) {
              $parts = imap_mime_header_decode($rawSubject);

              if (is_array($parts)) {
                  $ret = '';
                  foreach ($parts as $h) { // subject can span into several lines
                      $charset = ($h->charset == 'default') ? 'US-ASCII' : $h->charset;
                      $converted = iconv($charset, "UTF-8//TRANSLIT", $h->text);
                      // keep the fragment (read as ISO-8859-1) rather than
                      // silently dropping it when the conversion fails
                      $ret .= ($converted === false) ? self::latin1ToUtf8($h->text) : $converted;
                  }
                  return $ret;
              }
          }

          // Ask iconv for UTF-8 explicitly: relying on the ini default and then
          // re-encoding the result would double-encode non-ASCII characters.
          $decoded = iconv_mime_decode($rawSubject, ICONV_MIME_DECODE_CONTINUE_ON_ERROR, 'UTF-8');

          if ($decoded === false) {
              return self::latin1ToUtf8($rawSubject);
          }

          return $decoded;
      }

      /**
       * Returns the Cc header split on commas (elements are not trimmed).
       *
       * @return array empty when there is no Cc header
       */
      public function getCc()
      {
          if (!isset($this->rawFields['cc']))
          {
              return array();
          }

          return explode(',', $this->rawFields['cc']);
      }

      /**
       * Returns the To header split on commas (elements are not trimmed).
       *
       * @return array
       * @throws Exception if a to header is not found or if there are no recipient
       */
      public function getTo()
      {
          // The header value is a string: test it for emptiness instead of
          // calling count() on it (which is a TypeError on PHP 8).
          if (!isset($this->rawFields['to']) || trim($this->rawFields['to']) === '')
          {
              throw new Exception("Couldn't find the recipients of the email");
          }

          return explode(',', $this->rawFields['to']);
      }

      /**
       * Returns the first body part of the requested type, decoded to UTF-8.
       *
       * The body lines are scanned for a "Content-Type: text/plain" (or
       * text/html) line; the part's own header lines are then read for charset
       * and Content-Transfer-Encoding, and the content is collected up to the
       * next MIME delimiter. If no such Content-Type line exists in the body
       * (e.g. a single-part message), the whole body is returned and the
       * charset / transfer encoding are taken from the top-level headers.
       *
       * Example of an email body
       *
       *   --0016e65b5ec22721580487cb20fd
       *   Content-Type: text/plain; charset=ISO-8859-1
       *
       *   Hi all. I am new to Android development.
       *   Please help me.
       *
       *   --
       *   My signature
       *   email: myemail@gmail.com
       *   web: http://www.example.com
       *
       *   --0016e65b5ec22721580487cb20fd
       *   Content-Type: text/html; charset=ISO-8859-1
       *
       * @param int $returnType self::PLAINTEXT (default) or self::HTML
       * @return string - UTF8 encoded
       */
      public function getBody($returnType=self::PLAINTEXT)
      {
          $body = '';
          $detectedContentType = false;
          $contentTransferEncoding = null;
          $charset = null;
          $waitingForContentStart = true;

          if ($returnType == self::HTML) {
              $contentTypeRegex = '/^Content-Type: ?text\/html/i';
          } else {
              $contentTypeRegex = '/^Content-Type: ?text\/plain/i';
          }

          // there could be more than one boundary
          $boundaries = $this->extractBoundaries();

          foreach ($this->rawBodyLines as $line) {
              if (!$detectedContentType) {
                  if (preg_match($contentTypeRegex, $line)) {
                      $detectedContentType = true;
                  }

                  $found = self::extractCharset($line);
                  if ($found !== null) {
                      $charset = $found;
                  }
              } elseif ($waitingForContentStart) {
                  // still inside the header lines of the part we are after
                  $found = self::extractCharset($line);
                  if ($found !== null) {
                      $charset = $found;
                  }

                  if ($contentTransferEncoding === null && preg_match('/^Content-Transfer-Encoding: ?(.*)/i', $line, $matches)) {
                      $contentTransferEncoding = strtolower(trim($matches[1]));
                  }

                  if (self::isNewLine($line)) {
                      $waitingForContentStart = false;
                  }
              } else {
                  // collecting the actual content until we find the delimiter
                  if (self::isBoundaryLine($line, $boundaries)) {
                      break;
                  }

                  $body .= $line . "\n";
              }
          }

          if (!$detectedContentType)
          {
              // if here, we missed the text/plain content-type (probably it was
              // in the header), thus we assume the whole body is what we are after
              $bodyLines = $this->rawBodyLines;
              if (count($bodyLines) > 0 && self::isNewLine($bodyLines[0])) {
                  // drop the header/body separator so it does not become a
                  // leading newline of the returned body
                  array_shift($bodyLines);
              }
              $body = implode("\n", $bodyLines);

              // the top-level headers describe this content
              $headerCharset = self::extractCharset($this->getHeader('content-type'));
              if ($headerCharset !== null) {
                  $charset = $headerCharset;
              }
              $contentTransferEncoding = strtolower(trim($this->getHeader('content-transfer-encoding')));
          }

          // removing trailing new lines
          $body = rtrim($body, "\r\n");

          if ($contentTransferEncoding === 'base64') {
              $body = base64_decode($body);
          } elseif ($contentTransferEncoding === 'quoted-printable') {
              $body = quoted_printable_decode($body);
          }

          if ($charset === null) {
              $charset = 'ASCII';
          }

          if ($charset != 'UTF-8') {
              $converted = iconv($charset, 'UTF-8//TRANSLIT', $body);
              // iconv returns FALSE on failure: fall back to reading the bytes
              // as ISO-8859-1, which can never fail
              $body = ($converted === false) ? self::latin1ToUtf8($body) : $converted;
          }

          return $body;
      }

      /**
       * @return string - UTF8 encoded
       *
       */
      public function getPlainBody()
      {
          return $this->getBody(self::PLAINTEXT);
      }

      /**
       * @return string - UTF8 encoded
       */
      public function getHTMLBody()
      {
          return $this->getBody(self::HTML);
      }

      /**
       * N.B.: if the header doesn't exist an empty string is returned
       *
       * @param string $headerName - the header we want to retrieve (case-insensitive)
       * @return string - the value of the header
       */
      public function getHeader($headerName)
      {
          $headerName = strtolower($headerName);

          if (isset($this->rawFields[$headerName]))
          {
              return $this->rawFields[$headerName];
          }

          return '';
      }

      /**
       * Tells whether a line is empty once CR and LF characters are removed.
       *
       * @param string $line
       * @return boolean
       */
      public static function isNewLine($line)
      {
          return str_replace(array("\r", "\n"), '', (string) $line) === '';
      }

      /**
       * Tells whether a line starts with an ASCII letter, i.e. looks like the
       * start of a new header rather than a folded continuation line.
       *
       * @param string $line
       * @return boolean
       */
      private function isLineStartingWithPrintableChar($line)
      {
          return preg_match('/^[A-Za-z]/', $line) === 1;
      }

      /**
       * Collects every boundary="..." parameter value found in the raw message.
       *
       * The value stops at a double quote, a semicolon or the end of the line,
       * so neither a trailing CR nor following parameters end up in it.
       *
       * @return array of string (without the leading "--")
       */
      private function extractBoundaries()
      {
          $boundaries = array();

          if (preg_match_all('/boundary\s*=\s*"?([^"\r\n;]+)/i', $this->emailRawContent, $matches)) {
              foreach ($matches[1] as $value) {
                  // sometimes boundaries are delimited by single quotes too
                  $value = trim($value, " \t'");
                  if ($value !== '' && !in_array($value, $boundaries, true)) {
                      $boundaries[] = $value;
                  }
              }
          }

          return $boundaries;
      }

      /**
       * Tells whether a body line is a MIME delimiter for one of the given
       * boundaries: either "--BOUNDARY" or the closing form "--BOUNDARY--".
       *
       * @param string $line
       * @param array $boundaries as returned by extractBoundaries()
       * @return boolean
       */
      private static function isBoundaryLine($line, array $boundaries)
      {
          if (strncmp($line, '--', 2) !== 0) {
              return false;
          }

          $candidate = rtrim((string) substr($line, 2));
          if ($candidate === '') {
              return false; // e.g. the "-- " signature separator
          }

          if (in_array($candidate, $boundaries, true)) {
              return true;
          }

          if (substr($candidate, -2) === '--') {
              return in_array((string) substr($candidate, 0, -2), $boundaries, true);
          }

          return false;
      }

      /**
       * Extracts the upper-cased value of a charset=... parameter from a line.
       *
       * The value ends at a quote, a semicolon or whitespace, so trailing
       * parameters such as "; format=flowed" are not included.
       *
       * @param string $line
       * @return string|null null when the line has no charset parameter
       */
      private static function extractCharset($line)
      {
          if (preg_match('/charset\s*=\s*["\']?([^"\';\s]+)/i', $line, $matches)) {
              return strtoupper($matches[1]);
          }

          return null;
      }

      /**
       * Converts ISO-8859-1 bytes to UTF-8 (same mapping as utf8_encode(),
       * which is deprecated since PHP 8.2).
       *
       * @param string $text
       * @return string the input is returned unchanged if iconv fails
       */
      private static function latin1ToUtf8($text)
      {
          $converted = iconv('ISO-8859-1', 'UTF-8', $text);

          return ($converted === false) ? $text : $converted;
      }
  }
  288. ?>