/paw/src/com/google/marvin/paw/OneBoxScraper.java
Java | 176 lines | 137 code | 11 blank | 28 comment | 72 complexity | 6218e7422a25aff79b49c6ac3a441677 MD5 | raw file
1package com.google.marvin.paw; 2 3import java.io.UnsupportedEncodingException; 4import java.net.URLEncoder; 5import java.util.regex.Matcher; 6import java.util.regex.Pattern; 7 8import org.htmlparser.parserapplications.StringExtractor; 9import org.htmlparser.util.ParserException; 10 11import android.util.Log; 12 13public class OneBoxScraper { 14 15 public static String processGoogleResults(String query) { 16 String processedResult = ""; 17 try { 18 String URL = "http://www.google.com/m?q=" + URLEncoder.encode(query, "UTF-8"); 19 StringExtractor se = new StringExtractor(URL); 20 String results = se.extractStrings(true); 21 22 //Uncomment this line to see the raw dump; 23 //very useful when trying to come up with scraping rules 24 //Log.e("PAW Debug", results); 25 26 /* Check for known one box types */ 27 // Weather 28 if ((processedResult.length() < 1) && (results.indexOf("Weather for") == 0)) { 29 int indexOfHumidity = results.indexOf("Humidity"); 30 if (indexOfHumidity != -1) { 31 int endIndex = results.indexOf("%", indexOfHumidity); 32 if (endIndex != -1) { 33 processedResult = results.substring(0, endIndex + 1); 34 //Log.e("PAW Debug", "Weather: " + processedResult); 35 } 36 } 37 } 38 // Flight tracker 39 if ((processedResult.length() < 1) && (results.indexOf("Track status of ") != -1)) { 40 int indexOfTrackStatus = results.indexOf("Track status of "); 41 int indexOfFlightTracker = results.indexOf("www.flightstats.com", indexOfTrackStatus); 42 if (indexOfFlightTracker != -1) { 43 processedResult = results.substring(indexOfTrackStatus, indexOfFlightTracker); 44 //Log.e("PAW Debug", "Flight tracker: " + processedResult); 45 } 46 } 47 // Calculator 48 if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) { 49 String firstLine = results.substring(0, results.indexOf("\n")); 50 if (firstLine.indexOf(" = ") != -1) { 51 processedResult = firstLine; 52 //Log.e("PAW Debug", "Calculator: " + processedResult); 53 } 54 } 55 // Finance 56 // This is tricky, the market line could be the first or the second line 57 if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) { 58 int firstLineBreak = results.indexOf("\n"); 59 String firstLine = results.substring(0, firstLineBreak); 60 if ((firstLine.indexOf(" NASDAQ") != -1) || (firstLine.indexOf(" NYSE") != -1)) { 61 // Copy the Symbol Market line 62 if (firstLine.indexOf(">") != -1) { 63 processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n"; 64 } 65 int secondLineBreak = results.indexOf("\n", firstLineBreak + 1); 66 String secondLine = results.substring(firstLineBreak + 1, secondLineBreak); 67 secondLine = secondLine.replace(" +", " Up by ").replace(" -", " Down by "); 68 processedResult = processedResult + secondLine + "\n"; 69 int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1); 70 String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak); 71 processedResult = processedResult + thirdLine; 72 //Log.e("PAW Debug", "Finance: " + processedResult); 73 } 74 } 75 if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) { 76 int zerothLineBreak = results.indexOf("\n"); 77 int firstLineBreak = results.indexOf("\n", zerothLineBreak + 1); 78 String firstLine = results.substring(zerothLineBreak + 1, firstLineBreak); 79 if ((firstLine.indexOf(" NASDAQ") != -1) || (firstLine.indexOf(" NYSE") != -1)) { 80 // Copy the Symbol Market line 81 if (firstLine.indexOf(">") != -1) { 82 processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n"; 83 } 84 int secondLineBreak = results.indexOf("\n", firstLineBreak + 1); 85 String secondLine = results.substring(firstLineBreak + 1, secondLineBreak); 86 secondLine = secondLine.replace(" +", " Up by ").replace(" -", " Down by "); 87 processedResult = processedResult + secondLine + "\n"; 88 int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1); 89 String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak); 90 processedResult = processedResult + thirdLine; 91 //Log.e("PAW Debug", "Finance: " + processedResult); 92 } 93 } 94 // Dictionary 95 if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) { 96 int firstLineBreak = results.indexOf("\n"); 97 String firstLine = results.substring(0, firstLineBreak); 98 if (firstLine.indexOf("Web definitions for ") != -1) { 99 if (firstLine.indexOf(">") != -1) { 100 processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n"; 101 } 102 int secondLineBreak = results.indexOf("\n", firstLineBreak + 1); 103 String secondLine = results.substring(firstLineBreak + 1, secondLineBreak); 104 processedResult = processedResult + secondLine + "\n"; 105 //Log.e("PAW Debug", "Dictionary: " + processedResult); 106 } 107 } 108 // Time 109 if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) { 110 int firstLineBreak = results.indexOf("\n"); 111 String firstLine = results.substring(0, firstLineBreak); 112 if ((firstLine.indexOf(":") != -1) 113 && ((firstLine.indexOf("am ") != -1) || (firstLine.indexOf("pm ") != -1))) { 114 processedResult = firstLine; 115 //Log.e("PAW Debug", "Time: " + processedResult); 116 } 117 } 118 // Sports 119 if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) { 120 int firstLineBreak = results.indexOf("\n"); 121 String firstLine = results.substring(0, firstLineBreak); 122 123 Pattern vsScorePattern = Pattern.compile("[a-zA-Z ]+[0-9]+ - [a-zA-Z ]+[0-9]+"); 124 Pattern recordScorePattern = Pattern.compile("[a-zA-Z ]+ \\([0-9]+-[0-9]+\\)"); 125 Matcher vsScoreMatcher = vsScorePattern.matcher(firstLine); 126 Matcher recordScoreMatcher = recordScorePattern.matcher(firstLine); 127 128 if (vsScoreMatcher.find()) { 129 processedResult = vsScoreMatcher.group(); 130 //Log.e("PAW Debug", "Sports: " + processedResult); 131 } else if (recordScoreMatcher.find()) { 132 processedResult = recordScoreMatcher.group(); 133 //Log.e("PAW Debug", "Sports: " + processedResult); 134 } 135 } 136 137 /* The following will result in a special action that is not speech */ 138 // Local search 139 if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) { 140 int firstLineBreak = results.indexOf("\n"); 141 String firstLine = results.substring(0, firstLineBreak); 142 String localResultsStr = "Local results "; 143 if (firstLine.indexOf(localResultsStr) == 0) { 144 processedResult = "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8"); 145 } 146 } 147 if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) { 148 int zerothLineBreak = results.indexOf("\n"); 149 int firstLineBreak = results.indexOf("\n", zerothLineBreak + 1); 150 String firstLine = results.substring(zerothLineBreak + 1, firstLineBreak); 151 String localResultsStr = "Local results "; 152 if (firstLine.indexOf(localResultsStr) == 0) { 153 processedResult = "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8"); 154 } 155 } 156 // YouTube 157 if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) { 158 int firstLineBreak = results.indexOf("\n"); 159 String firstLine = results.substring(0, firstLineBreak); 160 if (firstLine.indexOf("<http://www.youtube.com/watch?") == 0) { 161 processedResult = 162 "PAW_YOUTUBE:" 163 + firstLine.substring(firstLine.indexOf("<") + 1, firstLine.indexOf(">")); 164 } 165 } 166 167 } catch (ParserException e) { 168 // TODO Auto-generated catch block 169 e.printStackTrace(); 170 } catch (UnsupportedEncodingException e) { 171 // TODO Auto-generated catch block 172 e.printStackTrace(); 173 } 174 return processedResult; 175 } 176}