/src/Main.java

https://gitlab.com/Autronius/Spider_email · Java · 127 lines · 82 code · 23 blank · 22 comment · 10 complexity · 9e80c21b177c62d0f1216510d4d55706 MD5 · raw file

  1. import java.io.BufferedReader;
  2. import java.io.BufferedWriter;
  3. import java.io.FileWriter;
  4. import java.io.IOException;
  5. import java.io.InputStreamReader;
  6. import java.io.Writer;
  7. import java.net.MalformedURLException;
  8. import java.net.URL;
  9. import java.net.URLConnection;
  10. import java.util.ArrayList;
  11. import java.util.Hashtable;
  12. import java.util.List;
  13. import java.util.regex.Matcher;
  14. import java.util.regex.Pattern;
  /**
   * Minimal e-mail harvesting crawler.
   *
   * <p>Downloads a root page, extracts every e-mail address and absolute URL found in it, then
   * downloads up to a configurable number of the discovered URLs and extracts the e-mail addresses
   * they contain. All distinct addresses are appended, one per line, to an output file and echoed
   * to standard output as they are found.
   *
   * <p>The crawl is single-level: links found on the linked pages are not followed, and relative
   * links are not resolved ({@link #relativeURL_Pattern} is an empty placeholder).
   */
  public class Main {

      /** Matches e-mail addresses such as {@code first.last@example.co.uk}. Source noted by the original author: mkyong. */
      static Pattern emailAddress_Pattern = Pattern.compile("[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})");

      /**
       * Matches absolute {@code http}, {@code https}, {@code ftp} and {@code file} URLs.
       * Original reference: http://www.mkyong.com/regular-expressions/domain-name-regular-expression-example/
       */
      static Pattern domainName_Pattern = Pattern.compile("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]");

      /** Placeholder for a relative-link pattern; currently empty and not used by the crawl. */
      static Pattern relativeURL_Pattern = Pattern.compile("");

      /** Maximum time to wait for a connection to be established, in milliseconds. */
      private static final int CONNECT_TIMEOUT_MS = 10000;

      /** Maximum time to wait for data once connected, in milliseconds. */
      private static final int READ_TIMEOUT_MS = 15000;

      /**
       * Program entry point.
       *
       * @param args {@code args[0]}: file the e-mail addresses are appended to;
       *             {@code args[1]}: root URL to crawl;
       *             {@code args[2]}: maximum number of URLs found on the root page to download
       *             (the root page itself is always downloaded)
       */
      public static void main(String[] args) {
          if (args == null || args.length < 3) {
              System.err.println("Usage: java Main <output-file> <root-url> <max-linked-pages>");
              return;
          }

          // Parse once, up front, so a bad value is reported before any network traffic.
          final int maxLinkedPages;
          try {
              maxLinkedPages = Integer.parseInt(args[2]);
          } catch (NumberFormatException e) {
              System.err.println("Third argument must be an integer, got: " + args[2]);
              return;
          }

          // Best effort: if the file cannot be opened the crawl still runs and
          // reports its findings on standard output.
          BufferedWriter output = null;
          try {
              output = new BufferedWriter(new FileWriter(args[0], true));
          } catch (IOException e) {
              System.err.println("Cannot open output file " + args[0] + ": " + e);
          }

          try {
              List<String> emails = crawl(args[1], maxLinkedPages);
              if (output != null) {
                  writeEmails(output, emails, args[0]);
              }
          } finally {
              // Closing flushes the buffer; without it the addresses never reach the file.
              closeAndReport(output, args[0]);
          }
      }

      /**
       * Adds every not-yet-seen match of {@code patternMatcher} to {@code listToAddTo}.
       *
       * <p>Each new match is also recorded in {@code doNotAddList} and printed to standard output
       * as {@code "Found: <match>"}. Matches already present as a key in {@code doNotAddList} are
       * skipped, which both de-duplicates results and lets callers pre-seed entries to exclude.
       *
       * @param patternMatcher matcher positioned on the text to scan; it is consumed by this call
       * @param listToAddTo    receives new matches in the order they are found
       * @param doNotAddList   keys are the values that must not be added (again); updated in place
       */
      public static void findResource(Matcher patternMatcher, List<String> listToAddTo, Hashtable<String, Boolean> doNotAddList) {
          while (patternMatcher.find()) {
              String match = patternMatcher.group();
              if (!doNotAddList.containsKey(match)) {
                  listToAddTo.add(match);
                  // Remember it so the same value is not added a second time.
                  doNotAddList.put(match, true);
                  System.out.println("Found: " + match);
              }
          }
      }

      /**
       * Downloads the resource at {@code Url} and returns its text.
       *
       * <p>Lines are joined with {@code '\n'} so that tokens on adjacent lines cannot fuse into a
       * single bogus match. Failures are reported on standard error and never thrown: a malformed
       * URL yields an empty string, and an I/O error yields whatever was read before it occurred.
       *
       * @param Url absolute URL of the resource to download
       * @return the downloaded text, each line terminated by {@code '\n'}; never {@code null}
       */
      public static String getHTMLFromURL(String Url) {
          StringBuilder html = new StringBuilder();
          BufferedReader reader = null;
          try {
              URLConnection conn = new URL(Url).openConnection();
              // Without timeouts a single unresponsive host would hang the whole crawl.
              conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
              conn.setReadTimeout(READ_TIMEOUT_MS);

              // Fixed charset so results do not depend on the platform default; the tokens
              // the patterns match are ASCII-only, so they decode identically either way.
              reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
              String line;
              while ((line = reader.readLine()) != null) {
                  html.append(line).append('\n');
              }
          } catch (MalformedURLException e) {
              System.err.println("Skipping malformed URL " + Url + ": " + e.getMessage());
          } catch (IOException e) {
              System.err.println("Failed to read " + Url + ": " + e);
          } finally {
              closeAndReport(reader, Url);
          }
          return html.toString();
      }

      /** Crawls the root page plus at most {@code maxLinkedPages} of its links; returns the distinct e-mails found. */
      private static List<String> crawl(String rootUrl, int maxLinkedPages) {
          Hashtable<String, Boolean> visited = new Hashtable<String, Boolean>();
          visited.put(rootUrl, true);
          Hashtable<String, Boolean> foundEmails = new Hashtable<String, Boolean>();
          List<String> emails = new ArrayList<String>();
          List<String> domains = new ArrayList<String>();

          String rootHtml = getHTMLFromURL(rootUrl);
          findResource(emailAddress_Pattern.matcher(rootHtml), emails, foundEmails);
          // NOTE(review): domainName_Pattern also accepts ftp:// and file:// links, so a hostile
          // page can point the crawler at local files; consider restricting this to http(s).
          findResource(domainName_Pattern.matcher(rootHtml), domains, visited);

          for (int i = 0; i < maxLinkedPages && i < domains.size(); i++) {
              String html = getHTMLFromURL(domains.get(i));
              findResource(emailAddress_Pattern.matcher(html), emails, foundEmails);
              // Links on this page are intentionally not collected: the crawl stops one level
              // below the root.
          }
          return emails;
      }

      /** Writes one e-mail address per line to {@code output}, reporting (not throwing) I/O errors. */
      private static void writeEmails(BufferedWriter output, List<String> emails, String fileName) {
          try {
              for (String email : emails) {
                  output.write(email);
                  output.newLine();
              }
              output.flush();
          } catch (IOException e) {
              System.err.println("Failed to write e-mail addresses to " + fileName + ": " + e);
          }
      }

      /** Closes {@code resource} if it is non-null, reporting (not throwing) any failure. */
      private static void closeAndReport(java.io.Closeable resource, String description) {
          if (resource == null) {
              return;
          }
          try {
              resource.close();
          } catch (IOException e) {
              System.err.println("Failed to close " + description + ": " + e);
          }
      }
  }