/shell/src/com/google/marvin/shell/OneBoxScraper.java

http://eyes-free.googlecode.com/ · Java · 268 lines · 199 code · 17 blank · 52 comment · 95 complexity · dceb63e580612ebf791c2daae4c88ca4 MD5 · raw file

  1. /*
  2. * Copyright (C) 2010 Google Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5. * use this file except in compliance with the License. You may obtain a copy of
  6. * the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13. * License for the specific language governing permissions and limitations under
  14. * the License.
  15. */
  16. package com.google.marvin.shell;
  17. import org.htmlparser.parserapplications.StringExtractor;
  18. import org.htmlparser.util.ParserException;
  19. import java.io.UnsupportedEncodingException;
  20. import java.net.URLEncoder;
  21. import java.util.regex.Matcher;
  22. import java.util.regex.Pattern;
  23. /**
  24. * Utility class for scraping the one box and finding the result
  25. *
  26. * @author clchen@google.com (Charles L. Chen)
  27. */
  28. final public class OneBoxScraper {
  29. private OneBoxScraper() {
  30. }
  31. public static String processGoogleResults(String query, String baseUrl) {
  32. String processedResult = "";
  33. try {
  34. String url = baseUrl + URLEncoder.encode(query, "UTF-8");
  35. StringExtractor se = new StringExtractor(url);
  36. String results = se.extractStrings(true);
  37. // Uncomment this line to see the raw dump;
  38. // very useful when trying to come up with scraping rules
  39. // Log.e("OneBoxScraper Debug", results);
  40. /* Check for known one box types */
  41. // Weather
  42. if ((processedResult.length() < 1) && (results.indexOf("Weather for") == 0)) {
  43. int indexOfHumidity = results.indexOf("Humidity");
  44. if (indexOfHumidity != -1) {
  45. int endIndex = results.indexOf("%", indexOfHumidity);
  46. if (endIndex != -1) {
  47. processedResult = results.substring(0, endIndex + 1);
  48. }
  49. }
  50. }
  51. // Flight tracker
  52. if ((processedResult.length() < 1) && (results.indexOf("Track status of ") != -1)) {
  53. int indexOfTrackStatus = results.indexOf("Track status of ");
  54. int indexOfFlightTracker = results.indexOf(
  55. "www.flightstats.com", indexOfTrackStatus);
  56. if (indexOfFlightTracker != -1) {
  57. processedResult = results.substring(indexOfTrackStatus, indexOfFlightTracker);
  58. }
  59. }
  60. // Calculator
  61. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  62. String firstLine = results.substring(0, results.indexOf("\n"));
  63. if (firstLine.indexOf(" = ") != -1) {
  64. processedResult = firstLine;
  65. }
  66. }
  67. // Finance
  68. // This is tricky, the market line could be the first or the second
  69. // line
  70. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  71. int firstLineBreak = results.indexOf("\n");
  72. String firstLine = results.substring(0, firstLineBreak);
  73. if ((firstLine.indexOf(" NASDAQ") != -1) || (firstLine.indexOf(" NYSE") != -1)) {
  74. // Copy the Symbol Market line
  75. if (firstLine.indexOf(">") != -1) {
  76. processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
  77. }
  78. int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
  79. String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
  80. secondLine = secondLine.replace(" +", " Up by ").replace(" -", " Down by ");
  81. processedResult = processedResult + secondLine + "\n";
  82. int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
  83. String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
  84. processedResult = processedResult + thirdLine;
  85. }
  86. }
  87. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  88. int zerothLineBreak = results.indexOf("\n");
  89. int firstLineBreak = results.indexOf("\n", zerothLineBreak + 1);
  90. String firstLine = results.substring(zerothLineBreak + 1, firstLineBreak);
  91. if ((firstLine.indexOf(" NASDAQ") != -1) || (firstLine.indexOf(" NYSE") != -1)) {
  92. // Copy the Symbol Market line
  93. if (firstLine.indexOf(">") != -1) {
  94. processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
  95. }
  96. int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
  97. String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
  98. secondLine = secondLine.replace(" +", " Up by ").replace(" -", " Down by ");
  99. processedResult = processedResult + secondLine + "\n";
  100. int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
  101. String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
  102. processedResult = processedResult + thirdLine;
  103. }
  104. }
  105. // Dictionary
  106. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  107. int firstLineBreak = results.indexOf("\n");
  108. String firstLine = results.substring(0, firstLineBreak);
  109. if (firstLine.indexOf("Web definitions for ") != -1) {
  110. if (firstLine.indexOf(">") != -1) {
  111. processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
  112. }
  113. int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
  114. String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
  115. processedResult = processedResult + secondLine + "\n";
  116. }
  117. }
  118. // Time
  119. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  120. int firstLineBreak = results.indexOf("\n");
  121. String firstLine = results.substring(0, firstLineBreak);
  122. if ((firstLine.indexOf(":") != -1)
  123. && ((firstLine.indexOf("am ") != -1) || (firstLine.indexOf("pm ") != -1))) {
  124. processedResult = firstLine;
  125. }
  126. }
  127. // Sports
  128. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  129. int firstLineBreak = results.indexOf("\n");
  130. String firstLine = results.substring(0, firstLineBreak);
  131. Pattern vsScorePattern = Pattern.compile("[a-zA-Z ]+[0-9]+ - [a-zA-Z ]+[0-9]+");
  132. Pattern recordScorePattern = Pattern.compile("[a-zA-Z ]+ \\([0-9]+-[0-9]+\\)");
  133. Matcher vsScoreMatcher = vsScorePattern.matcher(firstLine);
  134. Matcher recordScoreMatcher = recordScorePattern.matcher(firstLine);
  135. if (vsScoreMatcher.find()) {
  136. processedResult = vsScoreMatcher.group();
  137. } else if (recordScoreMatcher.find()) {
  138. processedResult = recordScoreMatcher.group();
  139. }
  140. }
  141. // World cup
  142. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  143. int firstLineBreak = results.indexOf("\n");
  144. String firstLine = results.substring(0, firstLineBreak);
  145. int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
  146. String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
  147. int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
  148. String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
  149. int fourthLineBreak = results.indexOf("\n", thirdLineBreak + 1);
  150. String fourthLine = results.substring(thirdLineBreak + 1, fourthLineBreak);
  151. if (firstLine.contains("2010 FIFA World Cup(tm)")
  152. && fourthLine.equals("Upcoming matches:")) {
  153. processedResult = secondLine + "\n" + thirdLine;
  154. }
  155. }
  156. // Special case for eyes-free shell: Speak the first location result
  157. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  158. int firstLineBreak = results.indexOf("\n");
  159. String firstLine = results.substring(0, firstLineBreak);
  160. String localResultsStr = "Local results ";
  161. if (firstLine.indexOf(localResultsStr) == 0) {
  162. int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
  163. int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
  164. int fourthLineBreak = results.indexOf("\n", thirdLineBreak + 1);
  165. int fifthLineBreak = results.indexOf("\n", fourthLineBreak + 1);
  166. // <http://www.google.com/m?defaultloc=Mountain+View%2C+CA+94043&amp;site=local&amp;q=costco+94043&amp;latlng=15926316227166107848&amp;mp=1&amp;zp&amp;source=m&amp;ct=res&amp;oi=local_result&amp;sa=X&amp;ei=Ll3CSvGMNZCNtge0z-83&amp;cd=1&amp;resnum=1>Costco
  167. String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
  168. // 1000 N Rengstorff Ave, Mountain View, C.A. 94043
  169. String fourthLine = results.substring(thirdLineBreak + 1, fourthLineBreak);
  170. // <wtai://wp/mc;6509881841>(650) 9881841 - Ratings: 3/5
  171. String fifthLine = results.substring(fourthLineBreak + 1, fifthLineBreak);
  172. processedResult = thirdLine.substring(thirdLine.indexOf(">") + 1) + "\n";
  173. processedResult = processedResult + fourthLine + "\n";
  174. processedResult = processedResult
  175. + fifthLine.substring(fifthLine.indexOf(">") + 1);
  176. }
  177. }
  178. // Special case for eyes-free shell: Speak the first location result
  179. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  180. int firstLineBreak = results.indexOf("\n");
  181. int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
  182. int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
  183. // <http://www.google.com/m?defaultloc=Mountain+View%2C+CA+94043&amp;site=local&amp;q=costco+94043&amp;latlng=15926316227166107848&amp;mp=1&amp;zp&amp;source=m&amp;ct=res&amp;oi=local_result&amp;sa=X&amp;ei=Ll3CSvGMNZCNtge0z-83&amp;cd=1&amp;resnum=1>Costco
  184. String firstLine = results.substring(0, firstLineBreak);
  185. // 1000 N Rengstorff Ave, Mountain View, C.A. 94043
  186. String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
  187. // <wtai://wp/mc;6509881841>(650) 9881841 - Ratings: 3/5
  188. String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
  189. Pattern addressPattern = Pattern.compile(
  190. "[0-9a-zA-Z ]+, [a-zA-Z ]+, [a-zA-Z. ]+ [0-9]+");
  191. Matcher addressMatcher = addressPattern.matcher(secondLine);
  192. Pattern phonePattern = Pattern.compile("\\([0-9][0-9][0-9]\\) [0-9-]+");
  193. Matcher phoneMatcher = phonePattern.matcher(thirdLine);
  194. if (addressMatcher.find() && phoneMatcher.find()) {
  195. processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
  196. processedResult = processedResult + secondLine + "\n";
  197. processedResult = processedResult
  198. + thirdLine.substring(thirdLine.indexOf(">") + 1);
  199. }
  200. }
  201. /* The following will result in a special action that is not speech */
  202. // Local search
  203. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  204. int firstLineBreak = results.indexOf("\n");
  205. String firstLine = results.substring(0, firstLineBreak);
  206. String localResultsStr = "Local results ";
  207. if (firstLine.indexOf(localResultsStr) == 0) {
  208. processedResult = "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8");
  209. }
  210. }
  211. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  212. int zerothLineBreak = results.indexOf("\n");
  213. int firstLineBreak = results.indexOf("\n", zerothLineBreak + 1);
  214. String firstLine = results.substring(zerothLineBreak + 1, firstLineBreak);
  215. String localResultsStr = "Local results ";
  216. if (firstLine.indexOf(localResultsStr) == 0) {
  217. processedResult = "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8");
  218. }
  219. }
  220. // YouTube
  221. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  222. int firstLineBreak = results.indexOf("\n");
  223. String firstLine = results.substring(0, firstLineBreak);
  224. if (firstLine.indexOf("<http://www.youtube.com/watch?") == 0) {
  225. processedResult = "PAW_YOUTUBE:" + firstLine.substring(
  226. firstLine.indexOf("<") + 1, firstLine.indexOf(">"));
  227. }
  228. }
  229. // Try to read the first result if there is no preceding link
  230. // since this will usually be a onebox of some sort.
  231. if ((processedResult.length() < 1) && (results.indexOf("<") != -1)) {
  232. int endIndex = results.indexOf("<", 0);
  233. if (endIndex != -1) {
  234. processedResult = results.substring(0, endIndex + 1);
  235. }
  236. // If this is the weather box, try to trim it down by cutting it
  237. // off at humidity
  238. if ((processedResult.length() > 1) && (processedResult.indexOf("%") != -1)) {
  239. processedResult = processedResult.substring(
  240. 0, processedResult.indexOf("%") + 1);
  241. }
  242. }
  243. // Log.e("processedResultLength", processedResult.length() + "");
  244. } catch (ParserException e) {
  245. e.printStackTrace();
  246. } catch (UnsupportedEncodingException e) {
  247. e.printStackTrace();
  248. }
  249. return processedResult;
  250. }
  251. }