/paw/src/com/google/marvin/paw/OneBoxScraper.java

http://eyes-free.googlecode.com/ · Java · 176 lines · 137 code · 11 blank · 28 comment · 72 complexity · 6218e7422a25aff79b49c6ac3a441677 MD5 · raw file

  1. package com.google.marvin.paw;
  2. import java.io.UnsupportedEncodingException;
  3. import java.net.URLEncoder;
  4. import java.util.regex.Matcher;
  5. import java.util.regex.Pattern;
  6. import org.htmlparser.parserapplications.StringExtractor;
  7. import org.htmlparser.util.ParserException;
  8. import android.util.Log;
  9. public class OneBoxScraper {
  10. public static String processGoogleResults(String query) {
  11. String processedResult = "";
  12. try {
  13. String URL = "http://www.google.com/m?q=" + URLEncoder.encode(query, "UTF-8");
  14. StringExtractor se = new StringExtractor(URL);
  15. String results = se.extractStrings(true);
  16. //Uncomment this line to see the raw dump;
  17. //very useful when trying to come up with scraping rules
  18. //Log.e("PAW Debug", results);
  19. /* Check for known one box types */
  20. // Weather
  21. if ((processedResult.length() < 1) && (results.indexOf("Weather for") == 0)) {
  22. int indexOfHumidity = results.indexOf("Humidity");
  23. if (indexOfHumidity != -1) {
  24. int endIndex = results.indexOf("%", indexOfHumidity);
  25. if (endIndex != -1) {
  26. processedResult = results.substring(0, endIndex + 1);
  27. //Log.e("PAW Debug", "Weather: " + processedResult);
  28. }
  29. }
  30. }
  31. // Flight tracker
  32. if ((processedResult.length() < 1) && (results.indexOf("Track status of ") != -1)) {
  33. int indexOfTrackStatus = results.indexOf("Track status of ");
  34. int indexOfFlightTracker = results.indexOf("www.flightstats.com", indexOfTrackStatus);
  35. if (indexOfFlightTracker != -1) {
  36. processedResult = results.substring(indexOfTrackStatus, indexOfFlightTracker);
  37. //Log.e("PAW Debug", "Flight tracker: " + processedResult);
  38. }
  39. }
  40. // Calculator
  41. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  42. String firstLine = results.substring(0, results.indexOf("\n"));
  43. if (firstLine.indexOf(" = ") != -1) {
  44. processedResult = firstLine;
  45. //Log.e("PAW Debug", "Calculator: " + processedResult);
  46. }
  47. }
  48. // Finance
  49. // This is tricky, the market line could be the first or the second line
  50. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  51. int firstLineBreak = results.indexOf("\n");
  52. String firstLine = results.substring(0, firstLineBreak);
  53. if ((firstLine.indexOf(" NASDAQ") != -1) || (firstLine.indexOf(" NYSE") != -1)) {
  54. // Copy the Symbol Market line
  55. if (firstLine.indexOf(">") != -1) {
  56. processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
  57. }
  58. int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
  59. String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
  60. secondLine = secondLine.replace(" +", " Up by ").replace(" -", " Down by ");
  61. processedResult = processedResult + secondLine + "\n";
  62. int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
  63. String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
  64. processedResult = processedResult + thirdLine;
  65. //Log.e("PAW Debug", "Finance: " + processedResult);
  66. }
  67. }
  68. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  69. int zerothLineBreak = results.indexOf("\n");
  70. int firstLineBreak = results.indexOf("\n", zerothLineBreak + 1);
  71. String firstLine = results.substring(zerothLineBreak + 1, firstLineBreak);
  72. if ((firstLine.indexOf(" NASDAQ") != -1) || (firstLine.indexOf(" NYSE") != -1)) {
  73. // Copy the Symbol Market line
  74. if (firstLine.indexOf(">") != -1) {
  75. processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
  76. }
  77. int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
  78. String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
  79. secondLine = secondLine.replace(" +", " Up by ").replace(" -", " Down by ");
  80. processedResult = processedResult + secondLine + "\n";
  81. int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
  82. String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
  83. processedResult = processedResult + thirdLine;
  84. //Log.e("PAW Debug", "Finance: " + processedResult);
  85. }
  86. }
  87. // Dictionary
  88. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  89. int firstLineBreak = results.indexOf("\n");
  90. String firstLine = results.substring(0, firstLineBreak);
  91. if (firstLine.indexOf("Web definitions for ") != -1) {
  92. if (firstLine.indexOf(">") != -1) {
  93. processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
  94. }
  95. int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
  96. String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
  97. processedResult = processedResult + secondLine + "\n";
  98. //Log.e("PAW Debug", "Dictionary: " + processedResult);
  99. }
  100. }
  101. // Time
  102. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  103. int firstLineBreak = results.indexOf("\n");
  104. String firstLine = results.substring(0, firstLineBreak);
  105. if ((firstLine.indexOf(":") != -1)
  106. && ((firstLine.indexOf("am ") != -1) || (firstLine.indexOf("pm ") != -1))) {
  107. processedResult = firstLine;
  108. //Log.e("PAW Debug", "Time: " + processedResult);
  109. }
  110. }
  111. // Sports
  112. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  113. int firstLineBreak = results.indexOf("\n");
  114. String firstLine = results.substring(0, firstLineBreak);
  115. Pattern vsScorePattern = Pattern.compile("[a-zA-Z ]+[0-9]+ - [a-zA-Z ]+[0-9]+");
  116. Pattern recordScorePattern = Pattern.compile("[a-zA-Z ]+ \\([0-9]+-[0-9]+\\)");
  117. Matcher vsScoreMatcher = vsScorePattern.matcher(firstLine);
  118. Matcher recordScoreMatcher = recordScorePattern.matcher(firstLine);
  119. if (vsScoreMatcher.find()) {
  120. processedResult = vsScoreMatcher.group();
  121. //Log.e("PAW Debug", "Sports: " + processedResult);
  122. } else if (recordScoreMatcher.find()) {
  123. processedResult = recordScoreMatcher.group();
  124. //Log.e("PAW Debug", "Sports: " + processedResult);
  125. }
  126. }
  127. /* The following will result in a special action that is not speech */
  128. // Local search
  129. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  130. int firstLineBreak = results.indexOf("\n");
  131. String firstLine = results.substring(0, firstLineBreak);
  132. String localResultsStr = "Local results ";
  133. if (firstLine.indexOf(localResultsStr) == 0) {
  134. processedResult = "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8");
  135. }
  136. }
  137. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  138. int zerothLineBreak = results.indexOf("\n");
  139. int firstLineBreak = results.indexOf("\n", zerothLineBreak + 1);
  140. String firstLine = results.substring(zerothLineBreak + 1, firstLineBreak);
  141. String localResultsStr = "Local results ";
  142. if (firstLine.indexOf(localResultsStr) == 0) {
  143. processedResult = "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8");
  144. }
  145. }
  146. // YouTube
  147. if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
  148. int firstLineBreak = results.indexOf("\n");
  149. String firstLine = results.substring(0, firstLineBreak);
  150. if (firstLine.indexOf("<http://www.youtube.com/watch?") == 0) {
  151. processedResult =
  152. "PAW_YOUTUBE:"
  153. + firstLine.substring(firstLine.indexOf("<") + 1, firstLine.indexOf(">"));
  154. }
  155. }
  156. } catch (ParserException e) {
  157. // TODO Auto-generated catch block
  158. e.printStackTrace();
  159. } catch (UnsupportedEncodingException e) {
  160. // TODO Auto-generated catch block
  161. e.printStackTrace();
  162. }
  163. return processedResult;
  164. }
  165. }