PageRenderTime 26ms CodeModel.GetById 1ms app.highlight 20ms RepoModel.GetById 1ms app.codeStats 0ms

/paw/src/com/google/marvin/paw/OneBoxScraper.java

http://eyes-free.googlecode.com/
Java | 176 lines | 137 code | 11 blank | 28 comment | 72 complexity | 6218e7422a25aff79b49c6ac3a441677 MD5 | raw file
  1package com.google.marvin.paw;
  2
  3import java.io.UnsupportedEncodingException;
  4import java.net.URLEncoder;
  5import java.util.regex.Matcher;
  6import java.util.regex.Pattern;
  7
  8import org.htmlparser.parserapplications.StringExtractor;
  9import org.htmlparser.util.ParserException;
 10
 11import android.util.Log;
 12
 13public class OneBoxScraper {
 14
 15  public static String processGoogleResults(String query) {
 16    String processedResult = "";
 17    try {
 18      String URL = "http://www.google.com/m?q=" + URLEncoder.encode(query, "UTF-8");
 19      StringExtractor se = new StringExtractor(URL);
 20      String results = se.extractStrings(true);
 21
 22      //Uncomment this line to see the raw dump;
 23      //very useful when trying to come up with scraping rules
 24      //Log.e("PAW Debug", results);
 25
 26      /* Check for known one box types */
 27      // Weather
 28      if ((processedResult.length() < 1) && (results.indexOf("Weather for") == 0)) {
 29        int indexOfHumidity = results.indexOf("Humidity");
 30        if (indexOfHumidity != -1) {
 31          int endIndex = results.indexOf("%", indexOfHumidity);
 32          if (endIndex != -1) {
 33            processedResult = results.substring(0, endIndex + 1);
 34            //Log.e("PAW Debug", "Weather: " + processedResult);
 35          }
 36        }
 37      }
 38      // Flight tracker
 39      if ((processedResult.length() < 1) && (results.indexOf("Track status of ") != -1)) {
 40        int indexOfTrackStatus = results.indexOf("Track status of ");
 41        int indexOfFlightTracker = results.indexOf("www.flightstats.com", indexOfTrackStatus);
 42        if (indexOfFlightTracker != -1) {
 43          processedResult = results.substring(indexOfTrackStatus, indexOfFlightTracker);
 44          //Log.e("PAW Debug", "Flight tracker: " + processedResult);
 45        }
 46      }
 47      // Calculator
 48      if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
 49        String firstLine = results.substring(0, results.indexOf("\n"));
 50        if (firstLine.indexOf(" = ") != -1) {
 51          processedResult = firstLine;
 52          //Log.e("PAW Debug", "Calculator: " + processedResult);
 53        }
 54      }
 55      // Finance
 56      // This is tricky, the market line could be the first or the second line
 57      if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
 58        int firstLineBreak = results.indexOf("\n");
 59        String firstLine = results.substring(0, firstLineBreak);
 60        if ((firstLine.indexOf(" NASDAQ") != -1) || (firstLine.indexOf(" NYSE") != -1)) {
 61          // Copy the Symbol Market line
 62          if (firstLine.indexOf(">") != -1) {
 63            processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
 64          }
 65          int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
 66          String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
 67          secondLine = secondLine.replace(" +", " Up by ").replace(" -", " Down by ");
 68          processedResult = processedResult + secondLine + "\n";
 69          int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
 70          String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
 71          processedResult = processedResult + thirdLine;
 72          //Log.e("PAW Debug", "Finance: " + processedResult);
 73        }
 74      }
 75      if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
 76        int zerothLineBreak = results.indexOf("\n");
 77        int firstLineBreak = results.indexOf("\n", zerothLineBreak + 1);
 78        String firstLine = results.substring(zerothLineBreak + 1, firstLineBreak);
 79        if ((firstLine.indexOf(" NASDAQ") != -1) || (firstLine.indexOf(" NYSE") != -1)) {
 80          // Copy the Symbol Market line
 81          if (firstLine.indexOf(">") != -1) {
 82            processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
 83          }
 84          int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
 85          String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
 86          secondLine = secondLine.replace(" +", " Up by ").replace(" -", " Down by ");
 87          processedResult = processedResult + secondLine + "\n";
 88          int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
 89          String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
 90          processedResult = processedResult + thirdLine;
 91          //Log.e("PAW Debug", "Finance: " + processedResult);
 92        }
 93      }
 94      // Dictionary
 95      if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
 96        int firstLineBreak = results.indexOf("\n");
 97        String firstLine = results.substring(0, firstLineBreak);
 98        if (firstLine.indexOf("Web definitions for ") != -1) {
 99          if (firstLine.indexOf(">") != -1) {
100            processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
101          }
102          int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
103          String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
104          processedResult = processedResult + secondLine + "\n";
105          //Log.e("PAW Debug", "Dictionary: " + processedResult);
106        }
107      }
108      // Time
109      if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
110        int firstLineBreak = results.indexOf("\n");
111        String firstLine = results.substring(0, firstLineBreak);
112        if ((firstLine.indexOf(":") != -1)
113            && ((firstLine.indexOf("am ") != -1) || (firstLine.indexOf("pm ") != -1))) {
114          processedResult = firstLine;
115          //Log.e("PAW Debug", "Time: " + processedResult);
116        }
117      }
118      // Sports
119      if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
120        int firstLineBreak = results.indexOf("\n");
121        String firstLine = results.substring(0, firstLineBreak);
122
123        Pattern vsScorePattern = Pattern.compile("[a-zA-Z ]+[0-9]+ - [a-zA-Z ]+[0-9]+");
124        Pattern recordScorePattern = Pattern.compile("[a-zA-Z ]+ \\([0-9]+-[0-9]+\\)");
125        Matcher vsScoreMatcher = vsScorePattern.matcher(firstLine);
126        Matcher recordScoreMatcher = recordScorePattern.matcher(firstLine);
127        
128        if (vsScoreMatcher.find()) {          
129          processedResult = vsScoreMatcher.group();
130          //Log.e("PAW Debug", "Sports: " + processedResult);
131        } else if (recordScoreMatcher.find()) {          
132          processedResult = recordScoreMatcher.group();
133          //Log.e("PAW Debug", "Sports: " + processedResult);
134        }
135      }
136     
137      /* The following will result in a special action that is not speech */
138      // Local search
139      if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
140        int firstLineBreak = results.indexOf("\n");
141        String firstLine = results.substring(0, firstLineBreak);
142        String localResultsStr = "Local results ";
143        if (firstLine.indexOf(localResultsStr) == 0) {
144          processedResult = "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8");
145        }
146      }
147      if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
148        int zerothLineBreak = results.indexOf("\n");
149        int firstLineBreak = results.indexOf("\n", zerothLineBreak + 1);
150        String firstLine = results.substring(zerothLineBreak + 1, firstLineBreak);
151        String localResultsStr = "Local results ";
152        if (firstLine.indexOf(localResultsStr) == 0) {
153          processedResult = "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8");
154        }
155      }
156      // YouTube
157      if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
158        int firstLineBreak = results.indexOf("\n");
159        String firstLine = results.substring(0, firstLineBreak);
160        if (firstLine.indexOf("<http://www.youtube.com/watch?") == 0) {
161          processedResult =
162              "PAW_YOUTUBE:"
163                  + firstLine.substring(firstLine.indexOf("<") + 1, firstLine.indexOf(">"));
164        }
165      }
166
167    } catch (ParserException e) {
168      // TODO Auto-generated catch block
169      e.printStackTrace();
170    } catch (UnsupportedEncodingException e) {
171      // TODO Auto-generated catch block
172      e.printStackTrace();
173    }
174    return processedResult;
175  }
176}