PageRenderTime 41ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/webmail-archiver/src/main/java/WebMail.java

https://github.com/tledoux/AQuA
Java | 399 lines | 272 code | 60 blank | 67 comment | 40 complexity | ca177dbd9de5179405330e0f7f0757f1 MD5 | raw file
  1. import java.io.ByteArrayInputStream;
  2. import java.io.File;
  3. import java.io.FileOutputStream;
  4. import java.io.IOException;
  5. import java.io.InputStream;
  6. import java.io.PrintStream;
  7. import java.net.InetAddress;
  8. import java.net.URLEncoder;
  9. import java.net.UnknownHostException;
  10. import java.util.Date;
  11. import java.util.Enumeration;
  12. import java.util.List;
  13. import java.util.Properties;
  14. import java.util.concurrent.atomic.AtomicInteger;
  15. import javax.mail.FetchProfile;
  16. import javax.mail.Folder;
  17. import javax.mail.Header;
  18. import javax.mail.Message;
  19. import javax.mail.MessagingException;
  20. import javax.mail.Multipart;
  21. import javax.mail.Part;
  22. import javax.mail.Session;
  23. import javax.mail.Store;
  24. import javax.mail.internet.ContentType;
  25. import javax.mail.internet.ParseException;
  26. import org.apache.log4j.Logger;
  27. import org.archive.io.arc.ARCWriter;
  28. import org.archive.util.ArchiveUtils;
  29. /**
 * Harvests a mail account from one of the supported providers (either POP3 or IMAP).
 * The collected mails are then stored in an ARC file.
  32. *
  33. * Coming from
  34. * http://java.sun.com/developer/onlineTraining/JavaMail/contents.html#JavaMailIntro
  35. * Reading of mail directly inspired from the demo : msgshow.java
  36. *
  37. * Introduction to mail can be found at http://en.wikipedia.org/wiki/MIME
  38. *
  39. */
  40. public class WebMail {
  41. static Logger LOG = Logger.getLogger(WebMail.class);
  42. /**
  43. * Tells whether we want compressed arcs or ordinary ones.
  44. */
  45. private static boolean ARC_COMPRESSED = false;
  46. /**
  47. * Tells where to generates the archives
  48. */
  49. private File storageDir = new File(System.getProperty("java.io.tmpdir"));
  50. /**
  51. * Variables to handle the exchange with the mail provider
  52. */
  53. private Session session = null;
  54. private Store store = null;
  55. private String username, password;
  56. private MailSettings mailSettings;
  57. private Folder folder;
  58. /**
  59. * Variables to handle the creation of the arc file
  60. */
  61. private File arcFile;
  62. private String hostIp;
  /**
   * Creates an unconfigured harvester. Credentials, provider settings and the
   * archive name are supplied afterwards through {@code setUserPass},
   * {@code setMailSettings} and {@code setArchive}.
   */
  public WebMail() {
    super();
  }
  68. /**
  69. * Setter for user and password information
  70. */
  71. public void setUserPass(String username, String password) {
  72. this.username = username;
  73. this.password = password;
  74. }
  75. /**
  76. * Setter for the information about a specific provider
  77. * @param settings
  78. */
  79. public void setMailSettings(MailSettings settings) {
  80. this.mailSettings = settings;
  81. }
  82. /**
  83. * Getter to the arc file generated
  84. * @return the arcFile
  85. */
  86. public File getArcFile() {
  87. return arcFile;
  88. }
  89. public void connect() throws Exception {
  90. Properties props = new Properties();
  91. session = Session.getDefaultInstance(props, null);
  92. store = session.getStore(mailSettings.getProtocol());
  93. store.connect(mailSettings.getHost(), mailSettings.getPort(),
  94. username, password);
  95. }
  96. public void openFolder(String folderName) throws MessagingException {
  97. // Make sure that for pop3
  98. if ( mailSettings.getProtocol() == null ||
  99. mailSettings.getProtocol().startsWith("pop3")
  100. ) {
  101. folderName = "INBOX";
  102. }
  103. // Open the Folder
  104. folder = store.getDefaultFolder();
  105. folder = folder.getFolder(folderName);
  106. if (folder == null) {
  107. throw new MessagingException("Invalid folder");
  108. }
  109. // try to open read-only
  110. folder.open(Folder.READ_ONLY);
  111. }
  112. public void closeFolder() throws Exception {
  113. folder.close(false);
  114. }
  115. public int getMessageCount() throws Exception {
  116. return folder.getMessageCount();
  117. }
  118. public int getNewMessageCount() throws Exception {
  119. return folder.getNewMessageCount();
  120. }
  121. public void disconnect() throws Exception {
  122. store.close();
  123. }
  124. public void setArchive(String prefixe) {
  125. StringBuffer sb = new StringBuffer();
  126. sb.append(prefixe).append("_");
  127. sb.append(this.username.replaceAll("@", "_at_"));
  128. sb.append("_").append(Long.toString(System.currentTimeMillis() / 1000L));
  129. sb.append(".arc");
  130. if (ARC_COMPRESSED) {
  131. sb.append(".gz");
  132. }
  133. this.arcFile = new File(storageDir, sb.toString());
  134. // Retrieve the IP
  135. try {
  136. InetAddress ip = InetAddress.getLocalHost();
  137. this.hostIp = ip.getHostAddress();
  138. } catch (UnknownHostException e) {
  139. this.hostIp = "127.0.0.1";
  140. }
  141. }
  142. public void dumpAllMessages() throws Exception {
  143. // Initialize the arc writer
  144. AtomicInteger ai = new AtomicInteger(1);
  145. PrintStream ps = new PrintStream(new FileOutputStream(this.arcFile));
  146. String now = ArchiveUtils.get14DigitDate();
  147. List<?> lmeta = null;
  148. // Attributes & Flags for all messages ..
  149. Message[] msgs = folder.getMessages();
  150. if (msgs.length == 0) {
  151. LOG.warn("No messages to archive !!!! ");
  152. return;
  153. }
  154. ARCWriter arcWriter = new ARCWriter(
  155. ai, ps, arcFile, ARC_COMPRESSED, now, lmeta);
  156. String prefixeURL = "mailto://" + this.username;
  157. // Use a suitable FetchProfile
  158. FetchProfile fp = new FetchProfile();
  159. fp.add(FetchProfile.Item.ENVELOPE);
  160. folder.fetch(msgs, fp);
  161. for (int i = 0; i < msgs.length; i++) {
  162. // for (int i = 0; i < 1; i++) {
  163. LOG.info("MESSAGE #" + (i + 1) + ":");
  164. dumpInitialMessage(msgs[i], arcWriter, prefixeURL);
  165. }
  166. arcWriter.close();
  167. }
  168. public void dumpInitialMessage(Part p, ARCWriter arcWriter, String prefixeURL) throws MessagingException, IOException {
  169. String uri = prefixeURL;
  170. long fetchBeginTimeStamp = System.currentTimeMillis();
  171. Message m = null;
  172. if (!(p instanceof Message)) {
  173. return;
  174. }
  175. m = (Message)p;
  176. LOG.info("SUBJECT: " + m.getSubject());
  177. uri = prefixeURL + "?subject=" + URLEncoder.encode(m.getSubject(), "UTF-8");
  178. Date d = m.getReceivedDate();
  179. if (d != null) {
  180. fetchBeginTimeStamp = d.getTime();
  181. } else {
  182. d = m.getSentDate();
  183. if (d != null) {
  184. fetchBeginTimeStamp = d.getTime();
  185. } else {
  186. LOG.warn("Not date for this message take now");
  187. }
  188. }
  189. dumpHeader(m, fetchBeginTimeStamp, arcWriter, uri);
  190. dumpPart(p, fetchBeginTimeStamp, arcWriter, uri);
  191. }
  192. public void dumpHeader(Message m, long receiveDate, ARCWriter arcWriter, String prefixeURL) throws MessagingException, IOException {
  193. String uri = prefixeURL;
  194. String contentType = null;
  195. try {
  196. contentType = m.getContentType(); // "message/rfc822"
  197. } catch (MessagingException e) {
  198. contentType = "message/rfc822";
  199. }
  200. Enumeration<?> e = m.getAllHeaders();
  201. StringBuffer sb = new StringBuffer();
  202. while (e.hasMoreElements()) {
  203. Header h = (Header)e.nextElement();
  204. sb.append(h.getName()).append(": ").append(h.getValue()).append("\n");
  205. }
  206. String s = sb.toString();
  207. byte[] b = s.getBytes("UTF-8");
  208. long recordLength = b.length;
  209. InputStream in = new ByteArrayInputStream(b);
  210. arcWriter.write(uri, contentType, this.hostIp, receiveDate, recordLength, in, true);
  211. LOG.info("---------------------------");
  212. }
  213. public void dumpPart(Part p, long receiveDate, ARCWriter arcWriter, String prefixeURL) throws MessagingException, IOException {
  214. String uri = prefixeURL;
  215. String contentType;
  216. try {
  217. contentType = p.getContentType();
  218. } catch (MessagingException e) {
  219. contentType = "no-type";
  220. }
  221. LOG.info("dumpPart CONTENT-TYPE: " + contentType);
  222. try {
  223. LOG.info("CONTENT-TYPE: " + (new ContentType(contentType)).toString());
  224. } catch (ParseException pex) {
  225. LOG.error("BAD CONTENT-TYPE: " + contentType);
  226. return;
  227. }
  228. /*
  229. * Using isMimeType to determine the content type avoids
  230. * fetching the actual content data until we need it.
  231. */
  232. long recordLength = 0;
  233. InputStream in = null;
  234. if (p.isMimeType("multipart/*")) {
  235. // Handle multipart and recursion
  236. dumpMultipart((Multipart)p.getContent(), contentType, receiveDate, arcWriter, uri);
  237. } else if (p.isMimeType("message/rfc822")) {
  238. LOG.info("This is a Nested Message");
  239. dumpPart((Part) p.getContent(), receiveDate, arcWriter, uri);
  240. } else if (p.isMimeType("text/*")) {
  241. // load in memory to know the size
  242. String charset = extractCharset(contentType);
  243. String s = (String)p.getContent();
  244. byte[] b = s.getBytes(charset);
  245. recordLength = b.length;
  246. LOG.info("This is " + contentType + "/" + charset + " of said size " + p.getSize() + " and real size " + recordLength);
  247. in = new ByteArrayInputStream(b);
  248. boolean enforceSize = true;
  249. arcWriter.write(uri, contentType, this.hostIp, receiveDate, recordLength, in, enforceSize);
  250. LOG.info("---------------------------");
  251. } else {
  252. LOG.info("This is " + contentType + " of size " + p.getSize());
  253. recordLength = p.getSize();
  254. in = p.getInputStream();
  255. // TODO need to serialize to a file to known the REAL size (should be true !!!)
  256. boolean enforceSize = false;
  257. arcWriter.write(uri, contentType, this.hostIp, receiveDate, recordLength, in, enforceSize);
  258. LOG.info("---------------------------");
  259. }
  260. if (in != null) in.close();
  261. }
  262. /**
  263. * Extract the charset from a contentType string like
  264. * text/plain; charset=ISO-8859-1
  265. * @param ct the content type string
  266. * @return a charset
  267. */
  268. public String extractCharset(String ct) {
  269. for (String s : ct.split("; ")) {
  270. // LOG.info(s);
  271. if (s.startsWith("charset=")) {
  272. return s.substring(8).replaceAll("\"", "");
  273. }
  274. }
  275. return "UTF-8";
  276. }
  277. public void dumpMultipart(Multipart mp, String contentType, long receiveDate, ARCWriter arcWriter, String prefixeURL) throws MessagingException, IOException {
  278. for (int i = 0, n = mp.getCount(); i < n; i++) {
  279. Part part = mp.getBodyPart(i);
  280. String uri = prefixeURL;
  281. long fetchBeginTimeStamp = receiveDate;
  282. long recordLength = 0;
  283. String ctP = part.getContentType();
  284. String disposition = part.getDisposition();
  285. if ((disposition != null) &&
  286. (disposition.equals(Part.ATTACHMENT) ||
  287. disposition.equals(Part.INLINE))
  288. ) {
  289. LOG.info("Part " + (i + 1) + " kind " + disposition + " type " + ctP + " name " + part.getFileName());
  290. if (part.getFileName() != null) {
  291. uri = prefixeURL + "#" + URLEncoder.encode(part.getFileName(), "UTF-8");
  292. } else {
  293. uri = prefixeURL + "#attachment" + (i + 1);
  294. }
  295. recordLength = part.getSize();
  296. InputStream in = part.getInputStream();
  297. // TODO need to serialize to a file to known the REAL size (should be true !!!)
  298. boolean enforceSize = false;
  299. arcWriter.write(uri, contentType, this.hostIp, fetchBeginTimeStamp, recordLength, in, enforceSize);
  300. in.close();
  301. } else {
  302. if (part.getFileName() != null) {
  303. uri = prefixeURL + "#" + URLEncoder.encode(part.getFileName(), "UTF-8");
  304. } else {
  305. uri = prefixeURL + "#part" + (i + 1);
  306. }
  307. LOG.info("Dump part " + (i+1));
  308. dumpPart(part, fetchBeginTimeStamp, arcWriter, uri);
  309. }
  310. }
  311. }
  312. /**
  313. * Main function for quick testing
  314. * @param args
  315. * @throws MessagingException
  316. */
  317. public static void main(String[] args) throws MessagingException {
  318. try {
  319. WebMail wm = new WebMail();
  320. wm.setUserPass("userX@gmail.com", "XXX");
  321. MailSettings sets = MailSettings.getSettings("GOOGLE_IMAP");
  322. if (sets == null) throw new Exception("Unknown");
  323. wm.setMailSettings(sets);
  324. wm.connect();
  325. // Restriction in POP3 only the inbox folder can be looked at
  326. wm.openFolder("INBOX");
  327. // wm.openFolder("For harvesting");
  328. int totalMessages = wm.getMessageCount();
  329. LOG.info("Total messages = " + totalMessages);
  330. LOG.info("-------------------------------");
  331. wm.setArchive("mail");
  332. wm.dumpAllMessages();
  333. LOG.info("Archive created at " + wm.getArcFile());
  334. } catch(Exception e) {
  335. LOG.error("Error " + e.getMessage(), e);
  336. System.exit(-1);
  337. }
  338. }
  339. }