/shell/src/com/google/marvin/shell/OneBoxScraper.java
Java | 268 lines | 199 code | 17 blank | 52 comment | 95 complexity | dceb63e580612ebf791c2daae4c88ca4 MD5 | raw file
/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.marvin.shell;

import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.util.ParserException;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility class for scraping the one box and finding the result.
 * <p>
 * The text dump of a results page is checked against a fixed, ordered list of
 * rules (weather, flight tracker, calculator, finance, dictionary, time,
 * sports, world cup, local results, maps, YouTube, leading text). The first
 * rule that produces a non-empty string wins.
 *
 * @author clchen@google.com (Charles L. Chen)
 */
final public class OneBoxScraper {

    /** Prefix of the line that introduces a block of local results. */
    private static final String LOCAL_RESULTS_PREFIX = "Local results ";

    /** Prefix of a first line that is a link to a YouTube video. */
    private static final String YOUTUBE_LINK_PREFIX = "<http://www.youtube.com/watch?";

    // Patterns are immutable, so they are compiled once instead of per call.
    private static final Pattern VS_SCORE_PATTERN =
            Pattern.compile("[a-zA-Z ]+[0-9]+ - [a-zA-Z ]+[0-9]+");

    private static final Pattern RECORD_SCORE_PATTERN =
            Pattern.compile("[a-zA-Z ]+ \\([0-9]+-[0-9]+\\)");

    private static final Pattern ADDRESS_PATTERN =
            Pattern.compile("[0-9a-zA-Z ]+, [a-zA-Z ]+, [a-zA-Z. ]+ [0-9]+");

    private static final Pattern PHONE_PATTERN =
            Pattern.compile("\\([0-9][0-9][0-9]\\) [0-9-]+");

    private OneBoxScraper() {
    }

    /**
     * Fetches the results page for a query and extracts the one box answer.
     *
     * @param query the search text; it is URL-encoded as UTF-8 and appended to
     *            {@code baseUrl}
     * @param baseUrl the URL prefix the encoded query is appended to
     * @return the text of the recognized one box, an action string starting
     *         with {@code "PAW_MAPS:"} or {@code "PAW_YOUTUBE:"}, or an empty
     *         string when nothing was recognized or the page could not be
     *         parsed
     */
    public static String processGoogleResults(String query, String baseUrl) {
        String processedResult = "";
        try {
            String url = baseUrl + URLEncoder.encode(query, "UTF-8");
            StringExtractor se = new StringExtractor(url);
            String results = se.extractStrings(true);

            // When working out new scraping rules it is very useful to dump
            // the raw value of "results" to the log at this point.

            processedResult = scrapeOneBox(results, query);
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return processedResult;
    }

    /** Applies every scraping rule in priority order to the raw text dump. */
    private static String scrapeOneBox(String results, String query)
            throws UnsupportedEncodingException {
        if (results == null) {
            // NOTE(review): not known whether the extractor can return null;
            // treated as "nothing recognized" rather than a crash.
            return "";
        }

        /* Check for known one box types */
        String found = scrapeWeather(results);
        if (found.length() == 0) {
            found = scrapeFlightTracker(results);
        }
        // The line based rules only apply when there is at least one line
        // break, i.e. when the first line is properly terminated.
        if ((found.length() == 0) && (results.indexOf('\n') != -1)) {
            // Limit -1 keeps trailing empty segments, so lines[i] is exactly
            // the text between line break i-1 and line break i.
            found = scrapeLineRules(results.split("\n", -1), query);
        }
        if (found.length() == 0) {
            found = scrapeLeadingText(results);
        }
        return found;
    }

    /** Weather: keeps everything from the start up to the humidity percentage. */
    private static String scrapeWeather(String results) {
        if (!results.startsWith("Weather for")) {
            return "";
        }
        int humidityIndex = results.indexOf("Humidity");
        if (humidityIndex == -1) {
            return "";
        }
        int percentIndex = results.indexOf("%", humidityIndex);
        if (percentIndex == -1) {
            return "";
        }
        return results.substring(0, percentIndex + 1);
    }

    /** Flight tracker: keeps the text from "Track status of " up to the flightstats link. */
    private static String scrapeFlightTracker(String results) {
        int trackStatusIndex = results.indexOf("Track status of ");
        if (trackStatusIndex == -1) {
            return "";
        }
        int flightStatsIndex = results.indexOf("www.flightstats.com", trackStatusIndex);
        if (flightStatsIndex == -1) {
            return "";
        }
        return results.substring(trackStatusIndex, flightStatsIndex);
    }

    /**
     * Runs the rules that look at individual lines of the dump, in priority
     * order. A rule that needs more lines than are available simply does not
     * match; it never throws.
     *
     * @param lines the dump split on line breaks; has at least two elements
     */
    private static String scrapeLineRules(String[] lines, String query)
            throws UnsupportedEncodingException {
        String firstLine = lines[0];

        // Calculator
        if (firstLine.indexOf(" = ") != -1) {
            return firstLine;
        }

        // Finance
        // This is tricky, the market line could be the first or the second
        // line
        String found = scrapeFinance(lines, 0);
        if (found.length() == 0) {
            found = scrapeFinance(lines, 1);
        }
        if (found.length() == 0) {
            found = scrapeDictionary(lines);
        }
        if (found.length() == 0) {
            found = scrapeTime(firstLine);
        }
        if (found.length() == 0) {
            found = scrapeSports(firstLine);
        }
        if (found.length() == 0) {
            found = scrapeWorldCup(lines);
        }
        // Special cases for eyes-free shell: Speak the first location result
        if (found.length() == 0) {
            found = scrapeSpokenLocalResult(lines);
        }
        if (found.length() == 0) {
            found = scrapeAddressResult(lines);
        }
        /* The following will result in a special action that is not speech */
        if (found.length() == 0) {
            found = scrapeMapsAction(lines, query);
        }
        if (found.length() == 0) {
            found = scrapeYouTubeAction(firstLine);
        }
        return found;
    }

    /**
     * Finance: the market line (containing " NASDAQ" or " NYSE") followed by
     * the quote line and one more line.
     *
     * @param marketLineIndex index of the line expected to name the market
     */
    private static String scrapeFinance(String[] lines, int marketLineIndex) {
        if (lines.length <= marketLineIndex + 2) {
            return "";
        }
        String marketLine = lines[marketLineIndex];
        if ((marketLine.indexOf(" NASDAQ") == -1) && (marketLine.indexOf(" NYSE") == -1)) {
            return "";
        }
        String quoteLine = lines[marketLineIndex + 1]
                .replace(" +", " Up by ")
                .replace(" -", " Down by ");
        return linkTextHeader(marketLine) + quoteLine + "\n" + lines[marketLineIndex + 2];
    }

    /** Dictionary: the "Web definitions for" heading followed by the definition line. */
    private static String scrapeDictionary(String[] lines) {
        if ((lines.length < 2) || (lines[0].indexOf("Web definitions for ") == -1)) {
            return "";
        }
        return linkTextHeader(lines[0]) + lines[1] + "\n";
    }

    /** Time: a first line containing a colon and an "am "/"pm " marker. */
    private static String scrapeTime(String firstLine) {
        boolean hasColon = firstLine.indexOf(":") != -1;
        boolean hasMeridiem =
                (firstLine.indexOf("am ") != -1) || (firstLine.indexOf("pm ") != -1);
        return (hasColon && hasMeridiem) ? firstLine : "";
    }

    /** Sports: a "team score - team score" match, else a "team (wins-losses)" record. */
    private static String scrapeSports(String firstLine) {
        Matcher vsScoreMatcher = VS_SCORE_PATTERN.matcher(firstLine);
        if (vsScoreMatcher.find()) {
            return vsScoreMatcher.group();
        }
        Matcher recordScoreMatcher = RECORD_SCORE_PATTERN.matcher(firstLine);
        if (recordScoreMatcher.find()) {
            return recordScoreMatcher.group();
        }
        return "";
    }

    /** World cup: lines two and three when line four announces upcoming matches. */
    private static String scrapeWorldCup(String[] lines) {
        if (lines.length < 4) {
            return "";
        }
        if (lines[0].contains("2010 FIFA World Cup(tm)")
                && lines[3].equals("Upcoming matches:")) {
            return lines[1] + "\n" + lines[2];
        }
        return "";
    }

    /**
     * Local results block: name, address and phone are on lines three to five,
     * for example:
     *
     * <pre>
     * &lt;http://www.google.com/m?...&gt;Costco
     * 1000 N Rengstorff Ave, Mountain View, C.A. 94043
     * &lt;wtai://wp/mc;6509881841&gt;(650) 9881841 - Ratings: 3/5
     * </pre>
     */
    private static String scrapeSpokenLocalResult(String[] lines) {
        if ((lines.length < 5) || !lines[0].startsWith(LOCAL_RESULTS_PREFIX)) {
            return "";
        }
        return textAfterLink(lines[2]) + "\n" + lines[3] + "\n" + textAfterLink(lines[4]);
    }

    /**
     * A location result without the "Local results" heading: recognized by an
     * address on line two and a phone number on line three.
     */
    private static String scrapeAddressResult(String[] lines) {
        if (lines.length < 3) {
            return "";
        }
        String addressLine = lines[1];
        String phoneLine = lines[2];
        if (ADDRESS_PATTERN.matcher(addressLine).find()
                && PHONE_PATTERN.matcher(phoneLine).find()) {
            return textAfterLink(lines[0]) + "\n" + addressLine + "\n"
                    + textAfterLink(phoneLine);
        }
        return "";
    }

    /** Local search: a maps action when line one or two starts the local results block. */
    private static String scrapeMapsAction(String[] lines, String query)
            throws UnsupportedEncodingException {
        boolean onFirstLine = lines[0].startsWith(LOCAL_RESULTS_PREFIX);
        boolean onSecondLine =
                (lines.length > 1) && lines[1].startsWith(LOCAL_RESULTS_PREFIX);
        if (onFirstLine || onSecondLine) {
            return "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8");
        }
        return "";
    }

    /** YouTube: an action carrying the video URL found between '<' and '>' on line one. */
    private static String scrapeYouTubeAction(String firstLine) {
        if (!firstLine.startsWith(YOUTUBE_LINK_PREFIX)) {
            return "";
        }
        int linkEnd = firstLine.indexOf(">");
        if (linkEnd == -1) {
            // Unterminated link: there is no complete URL to hand over.
            return "";
        }
        // The line starts with '<', so the URL begins at index 1.
        return "PAW_YOUTUBE:" + firstLine.substring(1, linkEnd);
    }

    /**
     * Last resort: try to read the first result if there is no preceding link
     * since this will usually be a onebox of some sort.
     */
    private static String scrapeLeadingText(String results) {
        int linkStart = results.indexOf("<");
        if (linkStart == -1) {
            return "";
        }
        // The slice deliberately keeps the '<' itself, as callers have always
        // received it.
        String leadingText = results.substring(0, linkStart + 1);
        // If this is the weather box, try to trim it down by cutting it
        // off at humidity
        int percentIndex = leadingText.indexOf("%");
        if ((leadingText.length() > 1) && (percentIndex != -1)) {
            leadingText = leadingText.substring(0, percentIndex + 1);
        }
        return leadingText;
    }

    /**
     * Returns the text after the first '>' followed by a line break, or an
     * empty string when the line contains no '>'.
     */
    private static String linkTextHeader(String line) {
        int linkEnd = line.indexOf(">");
        return (linkEnd == -1) ? "" : line.substring(linkEnd + 1) + "\n";
    }

    /** Returns the text after the first '>', or the whole line when there is none. */
    private static String textAfterLink(String line) {
        return line.substring(line.indexOf(">") + 1);
    }
}