/src/Main.java
https://gitlab.com/Autronius/Spider_email · Java · 127 lines · 82 code · 23 blank · 22 comment · 10 complexity · 9e80c21b177c62d0f1216510d4d55706 MD5 · raw file
- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.FileWriter;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.io.Writer;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.net.URLConnection;
- import java.util.ArrayList;
- import java.util.Hashtable;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
/**
 * Minimal e-mail harvesting crawler.
 *
 * <p>Downloads a root page, collects every e-mail address and absolute URL
 * found in it, then downloads a bounded number of those URLs and collects the
 * e-mail addresses found there as well. Only links discovered on the root page
 * are followed (crawl depth 1). All unique addresses are appended to an output
 * file, one per line.
 */
public class Main {

    /** Matches e-mail addresses (pattern adapted from mkyong.com). */
    static final Pattern emailAddress_Pattern = Pattern.compile("[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})");

    /** Matches absolute http, https, ftp and file URLs. */
    static final Pattern domainName_Pattern = Pattern.compile("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]");

    /** Placeholder for relative links: the pattern is empty and the crawl does not use it. */
    static final Pattern relativeURL_Pattern = Pattern.compile("");

    /**
     * Program entry point.
     *
     * @param args {@code args[0]}: name of the file the e-mail addresses are appended to;
     *             {@code args[1]}: root URL to crawl;
     *             {@code args[2]}: maximum number of URLs found on the root page that
     *             are fetched and scanned for further addresses
     */
    public static void main(String[] args) {
        if (args == null || args.length < 3) {
            System.err.println("Usage: java Main <outputFile> <rootUrl> <maxLinkedPages>");
            return;
        }

        // Parse once up front instead of on every loop iteration, and report a
        // bad value instead of dying with a NumberFormatException.
        int maxLinkedPages;
        try {
            maxLinkedPages = Integer.parseInt(args[2]);
        } catch (NumberFormatException e) {
            System.err.println("Third argument must be an integer: " + args[2]);
            return;
        }

        String rootWebsiteHTML = getHTMLFromURL(args[1]);

        // The root URL is pre-registered so it is never queued as a link of itself.
        Hashtable<String, Boolean> visitedHashTable = new Hashtable<String, Boolean>();
        visitedHashTable.put(args[1], true);
        Hashtable<String, Boolean> foundEmails = new Hashtable<String, Boolean>();

        List<String> emails = new ArrayList<String>();
        findResource(emailAddress_Pattern.matcher(rootWebsiteHTML), emails, foundEmails);

        List<String> domains = new ArrayList<String>();
        findResource(domainName_Pattern.matcher(rootWebsiteHTML), domains, visitedHashTable);

        // Links found on the fetched pages are not queued, so the crawl stops
        // one level below the root.
        for (int i = 0; i < maxLinkedPages && i < domains.size(); i++) {
            String html = getHTMLFromURL(domains.get(i));
            findResource(emailAddress_Pattern.matcher(html), emails, foundEmails);
        }

        saveEmails(args[0], emails);
    }

    /**
     * Appends the given addresses to a file, one per line, and always closes
     * (and thereby flushes) the writer so buffered output is not lost.
     */
    private static void saveEmails(String fileName, List<String> emails) {
        BufferedWriter output = null;
        try {
            output = new BufferedWriter(new FileWriter(fileName, true));
            for (String email : emails) {
                output.write(email);
                output.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (output != null) {
                try {
                    output.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Collects every match of the given matcher that has not been seen before.
     *
     * <p>Each new match is appended to {@code listToAddTo}, recorded in
     * {@code doNotAddList} so it is not added again, and echoed to standard output.
     *
     * @param patternMatcher matcher already bound to the text to scan
     * @param listToAddTo    list receiving the new matches, in order of discovery
     * @param doNotAddList   matches already known; updated with every new match
     */
    public static void findResource(Matcher patternMatcher, List<String> listToAddTo, Hashtable<String, Boolean> doNotAddList) {
        while (patternMatcher.find()) {
            String match = patternMatcher.group();
            if (!doNotAddList.containsKey(match)) {
                listToAddTo.add(match);
                doNotAddList.put(match, true);
                System.out.println("Found: " + match);
            }
        }
    }

    /**
     * Downloads the content behind a URL as text.
     *
     * <p>Lines are joined with {@code '\n'} so that tokens on adjacent lines
     * cannot fuse into one another. On a malformed URL or an I/O error the
     * stack trace is printed and whatever was read so far (possibly the empty
     * string) is returned.
     *
     * @param Url address to fetch
     * @return the text read from the URL, never {@code null}
     */
    public static String getHTMLFromURL(String Url) {
        StringBuilder html = new StringBuilder();
        BufferedReader br = null;

        try {
            URL url = new URL(Url);
            URLConnection conn = url.openConnection();
            br = new BufferedReader(new InputStreamReader(conn.getInputStream()));

            String inputLine;
            while ((inputLine = br.readLine()) != null) {
                html.append(inputLine).append('\n');
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the reader even when reading failed part-way through.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        return html.toString();
    }
}