PageRenderTime 46ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/solr/contrib/dataimporthandler/src/extras/main/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java

https://github.com/simplegeo/lucene-solr
Java | 600 lines | 498 code | 55 blank | 47 comment | 150 complexity | 95c4156faad7f60d50a24dc30872dc98 MD5 | raw file
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.solr.handler.dataimport;
  18. import com.sun.mail.imap.IMAPMessage;
  19. import org.apache.tika.config.TikaConfig;
  20. import org.apache.tika.utils.ParseUtils;
  21. import org.slf4j.Logger;
  22. import org.slf4j.LoggerFactory;
  23. import javax.mail.*;
  24. import javax.mail.internet.AddressException;
  25. import javax.mail.internet.ContentType;
  26. import javax.mail.internet.InternetAddress;
  27. import javax.mail.internet.MimeMessage;
  28. import javax.mail.search.AndTerm;
  29. import javax.mail.search.ComparisonTerm;
  30. import javax.mail.search.ReceivedDateTerm;
  31. import javax.mail.search.SearchTerm;
  32. import java.io.InputStream;
  33. import java.text.ParseException;
  34. import java.text.SimpleDateFormat;
  35. import java.util.*;
  36. /**
  37. * An EntityProcessor instance which can index emails along with their attachments from POP3 or IMAP sources. Refer to
  38. * <a href="http://wiki.apache.org/solr/DataImportHandler">http://wiki.apache.org/solr/DataImportHandler</a> for more
  39. * details. <b>This API is experimental and subject to change</b>
  40. *
  41. * @version $Id$
  42. * @since solr 1.4
  43. */
  44. public class MailEntityProcessor extends EntityProcessorBase {
  45. public static interface CustomFilter {
  46. public SearchTerm getCustomSearch(Folder folder);
  47. }
  48. public void init(Context context) {
  49. super.init(context);
  50. // set attributes using XXX getXXXFromContext(attribute, defualtValue);
  51. // applies variable resolver and return default if value is not found or null
  52. // REQUIRED : connection and folder info
  53. user = getStringFromContext("user", null);
  54. password = getStringFromContext("password", null);
  55. host = getStringFromContext("host", null);
  56. protocol = getStringFromContext("protocol", null);
  57. folderNames = getStringFromContext("folders", null);
  58. // validate
  59. if (host == null || protocol == null || user == null || password == null
  60. || folderNames == null)
  61. throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
  62. "'user|password|protocol|host|folders' are required attributes");
  63. //OPTIONAL : have defaults and are optional
  64. recurse = getBoolFromContext("recurse", true);
  65. String excludes = getStringFromContext("exclude", "");
  66. if (excludes != null && !excludes.trim().equals("")) {
  67. exclude = Arrays.asList(excludes.split(","));
  68. }
  69. String includes = getStringFromContext("include", "");
  70. if (includes != null && !includes.trim().equals("")) {
  71. include = Arrays.asList(includes.split(","));
  72. }
  73. batchSize = getIntFromContext("batchSize", 20);
  74. customFilter = getStringFromContext("customFilter", "");
  75. String s = getStringFromContext("fetchMailsSince", "");
  76. if (s != null)
  77. try {
  78. fetchMailsSince = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(s);
  79. } catch (ParseException e) {
  80. throw new DataImportHandlerException(DataImportHandlerException.SEVERE, "Invalid value for fetchMailSince: " + s, e);
  81. }
  82. fetchSize = getIntFromContext("fetchSize", 32 * 1024);
  83. cTimeout = getIntFromContext("connectTimeout", 30 * 1000);
  84. rTimeout = getIntFromContext("readTimeout", 60 * 1000);
  85. processAttachment = getBoolFromContext("processAttachement", true);
  86. logConfig();
  87. }
  88. public Map<String, Object> nextRow() {
  89. Message mail;
  90. Map<String, Object> row = null;
  91. do {
  92. // try till there is a valid document or folders get exhausted.
  93. // when mail == NULL, it means end of processing
  94. mail = getNextMail();
  95. if (mail != null)
  96. row = getDocumentFromMail(mail);
  97. } while (row == null && mail != null);
  98. return row;
  99. }
  100. private Message getNextMail() {
  101. if (!connected) {
  102. if (!connectToMailBox())
  103. return null;
  104. connected = true;
  105. }
  106. if (folderIter == null) {
  107. createFilters();
  108. folderIter = new FolderIterator(mailbox);
  109. }
  110. // get next message from the folder
  111. // if folder is exhausted get next folder
  112. // loop till a valid mail or all folders exhausted.
  113. while (msgIter == null || !msgIter.hasNext()) {
  114. Folder next = folderIter.hasNext() ? folderIter.next() : null;
  115. if (next == null) {
  116. return null;
  117. }
  118. msgIter = new MessageIterator(next, batchSize);
  119. }
  120. return msgIter.next();
  121. }
  122. private Map<String, Object> getDocumentFromMail(Message mail) {
  123. Map<String, Object> row = new HashMap<String, Object>();
  124. try {
  125. addPartToDocument(mail, row, true);
  126. return row;
  127. } catch (Exception e) {
  128. return null;
  129. }
  130. }
  131. public void addPartToDocument(Part part, Map<String, Object> row, boolean outerMost) throws Exception {
  132. if (part instanceof Message) {
  133. addEnvelopToDocument(part, row);
  134. }
  135. String ct = part.getContentType();
  136. ContentType ctype = new ContentType(ct);
  137. if (part.isMimeType("multipart/*")) {
  138. Multipart mp = (Multipart) part.getContent();
  139. int count = mp.getCount();
  140. if (part.isMimeType("multipart/alternative"))
  141. count = 1;
  142. for (int i = 0; i < count; i++)
  143. addPartToDocument(mp.getBodyPart(i), row, false);
  144. } else if (part.isMimeType("message/rfc822")) {
  145. addPartToDocument((Part) part.getContent(), row, false);
  146. } else {
  147. String disp = part.getDisposition();
  148. if (!processAttachment || (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT))) return;
  149. InputStream is = part.getInputStream();
  150. String fileName = part.getFileName();
  151. String content = ParseUtils.getStringContent(is, TikaConfig.getDefaultConfig(), ctype.getBaseType().toLowerCase(Locale.ENGLISH));
  152. if (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT)) {
  153. if (row.get(ATTACHMENT) == null)
  154. row.put(ATTACHMENT, new ArrayList<String>());
  155. List<String> contents = (List<String>) row.get(ATTACHMENT);
  156. contents.add(content);
  157. row.put(ATTACHMENT, contents);
  158. if (row.get(ATTACHMENT_NAMES) == null)
  159. row.put(ATTACHMENT_NAMES, new ArrayList<String>());
  160. List<String> names = (List<String>) row.get(ATTACHMENT_NAMES);
  161. names.add(fileName);
  162. row.put(ATTACHMENT_NAMES, names);
  163. } else {
  164. if (row.get(CONTENT) == null)
  165. row.put(CONTENT, new ArrayList<String>());
  166. List<String> contents = (List<String>) row.get(CONTENT);
  167. contents.add(content);
  168. row.put(CONTENT, contents);
  169. }
  170. }
  171. }
  172. private void addEnvelopToDocument(Part part, Map<String, Object> row) throws MessagingException {
  173. MimeMessage mail = (MimeMessage) part;
  174. Address[] adresses;
  175. if ((adresses = mail.getFrom()) != null && adresses.length > 0)
  176. row.put(FROM, adresses[0].toString());
  177. List<String> to = new ArrayList<String>();
  178. if ((adresses = mail.getRecipients(Message.RecipientType.TO)) != null)
  179. addAddressToList(adresses, to);
  180. if ((adresses = mail.getRecipients(Message.RecipientType.CC)) != null)
  181. addAddressToList(adresses, to);
  182. if ((adresses = mail.getRecipients(Message.RecipientType.BCC)) != null)
  183. addAddressToList(adresses, to);
  184. if (to.size() > 0)
  185. row.put(TO_CC_BCC, to);
  186. row.put(MESSAGE_ID, mail.getMessageID());
  187. row.put(SUBJECT, mail.getSubject());
  188. Date d = mail.getSentDate();
  189. if (d != null) {
  190. row.put(SENT_DATE, d);
  191. }
  192. List<String> flags = new ArrayList<String>();
  193. for (Flags.Flag flag : mail.getFlags().getSystemFlags()) {
  194. if (flag == Flags.Flag.ANSWERED)
  195. flags.add(FLAG_ANSWERED);
  196. else if (flag == Flags.Flag.DELETED)
  197. flags.add(FLAG_DELETED);
  198. else if (flag == Flags.Flag.DRAFT)
  199. flags.add(FLAG_DRAFT);
  200. else if (flag == Flags.Flag.FLAGGED)
  201. flags.add(FLAG_FLAGGED);
  202. else if (flag == Flags.Flag.RECENT)
  203. flags.add(FLAG_RECENT);
  204. else if (flag == Flags.Flag.SEEN)
  205. flags.add(FLAG_SEEN);
  206. }
  207. flags.addAll(Arrays.asList(mail.getFlags().getUserFlags()));
  208. row.put(FLAGS, flags);
  209. String[] hdrs = mail.getHeader("X-Mailer");
  210. if (hdrs != null)
  211. row.put(XMAILER, hdrs[0]);
  212. }
  213. private void addAddressToList(Address[] adresses, List<String> to) throws AddressException {
  214. for (Address address : adresses) {
  215. to.add(address.toString());
  216. InternetAddress ia = (InternetAddress) address;
  217. if (ia.isGroup()) {
  218. InternetAddress[] group = ia.getGroup(false);
  219. for (InternetAddress member : group)
  220. to.add(member.toString());
  221. }
  222. }
  223. }
  224. private boolean connectToMailBox() {
  225. try {
  226. Properties props = new Properties();
  227. props.setProperty("mail.store.protocol", protocol);
  228. props.setProperty("mail.imap.fetchsize", "" + fetchSize);
  229. props.setProperty("mail.imap.timeout", "" + rTimeout);
  230. props.setProperty("mail.imap.connectiontimeout", "" + cTimeout);
  231. Session session = Session.getDefaultInstance(props, null);
  232. mailbox = session.getStore(protocol);
  233. mailbox.connect(host, user, password);
  234. LOG.info("Connected to mailbox");
  235. return true;
  236. } catch (MessagingException e) {
  237. throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
  238. "Connection failed", e);
  239. }
  240. }
  241. private void createFilters() {
  242. if (fetchMailsSince != null) {
  243. filters.add(new MailsSinceLastCheckFilter(fetchMailsSince));
  244. }
  245. if (customFilter != null && !customFilter.equals("")) {
  246. try {
  247. Class cf = Class.forName(customFilter);
  248. Object obj = cf.newInstance();
  249. if (obj instanceof CustomFilter) {
  250. filters.add((CustomFilter) obj);
  251. }
  252. } catch (Exception e) {
  253. throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
  254. "Custom filter could not be created", e);
  255. }
  256. }
  257. }
  258. private void logConfig() {
  259. if (!LOG.isInfoEnabled()) return;
  260. StringBuffer config = new StringBuffer();
  261. config.append("user : ").append(user).append(System.getProperty("line.separator"));
  262. config.append("pwd : ").append(password).append(System.getProperty("line.separator"));
  263. config.append("protocol : ").append(protocol).append(System.getProperty("line.separator"));
  264. config.append("host : ").append(host).append(System.getProperty("line.separator"));
  265. config.append("folders : ").append(folderNames).append(System.getProperty("line.separator"));
  266. config.append("recurse : ").append(recurse).append(System.getProperty("line.separator"));
  267. config.append("exclude : ").append(exclude.toString()).append(System.getProperty("line.separator"));
  268. config.append("include : ").append(include.toString()).append(System.getProperty("line.separator"));
  269. config.append("batchSize : ").append(batchSize).append(System.getProperty("line.separator"));
  270. config.append("fetchSize : ").append(fetchSize).append(System.getProperty("line.separator"));
  271. config.append("read timeout : ").append(rTimeout).append(System.getProperty("line.separator"));
  272. config.append("conection timeout : ").append(cTimeout).append(System.getProperty("line.separator"));
  273. config.append("custom filter : ").append(customFilter).append(System.getProperty("line.separator"));
  274. config.append("fetch mail since : ").append(fetchMailsSince).append(System.getProperty("line.separator"));
  275. LOG.info(config.toString());
  276. }
  277. class FolderIterator implements Iterator<Folder> {
  278. private Store mailbox;
  279. private List<String> topLevelFolders;
  280. private List<Folder> folders = null;
  281. private Folder lastFolder = null;
  282. public FolderIterator(Store mailBox) {
  283. this.mailbox = mailBox;
  284. folders = new ArrayList<Folder>();
  285. getTopLevelFolders(mailBox);
  286. }
  287. public boolean hasNext() {
  288. return !folders.isEmpty();
  289. }
  290. public Folder next() {
  291. try {
  292. boolean hasMessages = false;
  293. Folder next;
  294. do {
  295. if (lastFolder != null) {
  296. lastFolder.close(false);
  297. lastFolder = null;
  298. }
  299. if (folders.isEmpty()) {
  300. mailbox.close();
  301. return null;
  302. }
  303. next = folders.remove(0);
  304. if (next != null) {
  305. String fullName = next.getFullName();
  306. if (!excludeFolder(fullName)) {
  307. hasMessages = (next.getType() & Folder.HOLDS_MESSAGES) != 0;
  308. next.open(Folder.READ_ONLY);
  309. lastFolder = next;
  310. LOG.info("Opened folder : " + fullName);
  311. }
  312. if (recurse && ((next.getType() & Folder.HOLDS_FOLDERS) != 0)) {
  313. Folder[] children = next.list();
  314. LOG.info("Added its children to list : ");
  315. for (int i = children.length - 1; i >= 0; i--) {
  316. folders.add(0, children[i]);
  317. LOG.info("child name : " + children[i].getFullName());
  318. }
  319. if (children.length == 0)
  320. LOG.info("NO children : ");
  321. }
  322. }
  323. }
  324. while (!hasMessages);
  325. return next;
  326. } catch (MessagingException e) {
  327. //throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
  328. // "Folder open failed", e);
  329. }
  330. return null;
  331. }
  332. public void remove() {
  333. throw new UnsupportedOperationException("Its read only mode...");
  334. }
  335. private void getTopLevelFolders(Store mailBox) {
  336. if (folderNames != null)
  337. topLevelFolders = Arrays.asList(folderNames.split(","));
  338. for (int i = 0; topLevelFolders != null && i < topLevelFolders.size(); i++) {
  339. try {
  340. folders.add(mailbox.getFolder(topLevelFolders.get(i)));
  341. } catch (MessagingException e) {
  342. // skip bad ones unless its the last one and still no good folder
  343. if (folders.size() == 0 && i == topLevelFolders.size() - 1)
  344. throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
  345. "Folder retreival failed");
  346. }
  347. }
  348. if (topLevelFolders == null || topLevelFolders.size() == 0) {
  349. try {
  350. folders.add(mailBox.getDefaultFolder());
  351. } catch (MessagingException e) {
  352. throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
  353. "Folder retreival failed");
  354. }
  355. }
  356. }
  357. private boolean excludeFolder(String name) {
  358. for (String s : exclude) {
  359. if (name.matches(s))
  360. return true;
  361. }
  362. for (String s : include) {
  363. if (name.matches(s))
  364. return false;
  365. }
  366. return include.size() > 0;
  367. }
  368. }
  369. class MessageIterator implements Iterator<Message> {
  370. private Folder folder;
  371. private Message[] messagesInCurBatch;
  372. private int current = 0;
  373. private int currentBatch = 0;
  374. private int batchSize = 0;
  375. private int totalInFolder = 0;
  376. private boolean doBatching = true;
  377. public MessageIterator(Folder folder, int batchSize) {
  378. try {
  379. this.folder = folder;
  380. this.batchSize = batchSize;
  381. SearchTerm st = getSearchTerm();
  382. if (st != null) {
  383. doBatching = false;
  384. messagesInCurBatch = folder.search(st);
  385. totalInFolder = messagesInCurBatch.length;
  386. folder.fetch(messagesInCurBatch, fp);
  387. current = 0;
  388. LOG.info("Total messages : " + totalInFolder);
  389. LOG.info("Search criteria applied. Batching disabled");
  390. } else {
  391. totalInFolder = folder.getMessageCount();
  392. LOG.info("Total messages : " + totalInFolder);
  393. getNextBatch(batchSize, folder);
  394. }
  395. } catch (MessagingException e) {
  396. throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
  397. "Message retreival failed", e);
  398. }
  399. }
  400. private void getNextBatch(int batchSize, Folder folder) throws MessagingException {
  401. // after each batch invalidate cache
  402. if (messagesInCurBatch != null) {
  403. for (Message m : messagesInCurBatch) {
  404. if (m instanceof IMAPMessage)
  405. ((IMAPMessage) m).invalidateHeaders();
  406. }
  407. }
  408. int lastMsg = (currentBatch + 1) * batchSize;
  409. lastMsg = lastMsg > totalInFolder ? totalInFolder : lastMsg;
  410. messagesInCurBatch = folder.getMessages(currentBatch * batchSize + 1, lastMsg);
  411. folder.fetch(messagesInCurBatch, fp);
  412. current = 0;
  413. currentBatch++;
  414. LOG.info("Current Batch : " + currentBatch);
  415. LOG.info("Messages in this batch : " + messagesInCurBatch.length);
  416. }
  417. public boolean hasNext() {
  418. boolean hasMore = current < messagesInCurBatch.length;
  419. if (!hasMore && doBatching
  420. && currentBatch * batchSize < totalInFolder) {
  421. // try next batch
  422. try {
  423. getNextBatch(batchSize, folder);
  424. hasMore = current < messagesInCurBatch.length;
  425. } catch (MessagingException e) {
  426. throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
  427. "Message retreival failed", e);
  428. }
  429. }
  430. return hasMore;
  431. }
  432. public Message next() {
  433. return hasNext() ? messagesInCurBatch[current++] : null;
  434. }
  435. public void remove() {
  436. throw new UnsupportedOperationException("Its read only mode...");
  437. }
  438. private SearchTerm getSearchTerm() {
  439. if (filters.size() == 0)
  440. return null;
  441. if (filters.size() == 1)
  442. return filters.get(0).getCustomSearch(folder);
  443. SearchTerm last = filters.get(0).getCustomSearch(folder);
  444. for (int i = 1; i < filters.size(); i++) {
  445. CustomFilter filter = filters.get(i);
  446. SearchTerm st = filter.getCustomSearch(folder);
  447. if (st != null) {
  448. last = new AndTerm(last, st);
  449. }
  450. }
  451. return last;
  452. }
  453. }
  454. class MailsSinceLastCheckFilter implements CustomFilter {
  455. private Date since;
  456. public MailsSinceLastCheckFilter(Date date) {
  457. since = date;
  458. }
  459. public SearchTerm getCustomSearch(Folder folder) {
  460. return new ReceivedDateTerm(ComparisonTerm.GE, since);
  461. }
  462. }
  // user settings stored in member variables
  private String user;
  private String password;
  private String host;
  private String protocol;
  private String folderNames;
  // regular expressions matched against full folder names
  private List<String> exclude = new ArrayList<String>();
  private List<String> include = new ArrayList<String>();
  private boolean recurse;
  private int batchSize;
  private int fetchSize;
  private int cTimeout;
  private int rTimeout;
  private Date fetchMailsSince;
  private String customFilter;
  private boolean processAttachment = true;

  // holds the current state
  private Store mailbox;
  private boolean connected = false;
  private FolderIterator folderIter;
  private MessageIterator msgIter;
  private List<CustomFilter> filters = new ArrayList<CustomFilter>();

  // Items prefetched for every message; populated once in the static initializer.
  private static final FetchProfile fp = new FetchProfile();
  // Bound to this class so that its log output is attributed to the mail
  // processor rather than to DataImporter.
  private static final Logger LOG = LoggerFactory.getLogger(MailEntityProcessor.class);

  // diagnostics (not read anywhere in this class at present)
  private int rowCount = 0;

  static {
    fp.add(FetchProfile.Item.ENVELOPE);
    fp.add(FetchProfile.Item.FLAGS);
    fp.add("X-Mailer");
  }

  // Fields To Index
  // single valued
  private static final String MESSAGE_ID = "messageId";
  private static final String SUBJECT = "subject";
  private static final String FROM = "from";
  private static final String SENT_DATE = "sentDate";
  private static final String XMAILER = "xMailer";
  // multi valued
  private static final String TO_CC_BCC = "allTo";
  private static final String FLAGS = "flags";
  private static final String CONTENT = "content";
  private static final String ATTACHMENT = "attachment";
  private static final String ATTACHMENT_NAMES = "attachmentNames";
  // flag values
  private static final String FLAG_ANSWERED = "answered";
  private static final String FLAG_DELETED = "deleted";
  private static final String FLAG_DRAFT = "draft";
  private static final String FLAG_FLAGGED = "flagged";
  private static final String FLAG_RECENT = "recent";
  private static final String FLAG_SEEN = "seen";
  514. private int getIntFromContext(String prop, int ifNull) {
  515. int v = ifNull;
  516. try {
  517. String val = context.getEntityAttribute(prop);
  518. if (val != null) {
  519. val = context.replaceTokens(val);
  520. v = Integer.valueOf(val);
  521. }
  522. } catch (NumberFormatException e) {
  523. //do nothing
  524. }
  525. return v;
  526. }
  527. private boolean getBoolFromContext(String prop, boolean ifNull) {
  528. boolean v = ifNull;
  529. String val = context.getEntityAttribute(prop);
  530. if (val != null) {
  531. val = context.replaceTokens(val);
  532. v = Boolean.valueOf(val);
  533. }
  534. return v;
  535. }
  536. private String getStringFromContext(String prop, String ifNull) {
  537. String v = ifNull;
  538. String val = context.getEntityAttribute(prop);
  539. if (val != null) {
  540. val = context.replaceTokens(val);
  541. v = val;
  542. }
  543. return v;
  544. }
  545. }