PageRenderTime 59ms CodeModel.GetById 2ms app.highlight 52ms RepoModel.GetById 1ms app.codeStats 0ms

/shell/src/com/google/marvin/shell/OneBoxScraper.java

http://eyes-free.googlecode.com/
Java | 268 lines | 199 code | 17 blank | 52 comment | 95 complexity | dceb63e580612ebf791c2daae4c88ca4 MD5 | raw file
  1/*
  2 * Copyright (C) 2010 Google Inc.
  3 *
  4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 * use this file except in compliance with the License. You may obtain a copy of
  6 * the License at
  7 *
  8 * http://www.apache.org/licenses/LICENSE-2.0
  9 *
 10 * Unless required by applicable law or agreed to in writing, software
 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 * License for the specific language governing permissions and limitations under
 14 * the License.
 15 */
 16
 17package com.google.marvin.shell;
 18
 19import org.htmlparser.parserapplications.StringExtractor;
 20import org.htmlparser.util.ParserException;
 21
 22import java.io.UnsupportedEncodingException;
 23import java.net.URLEncoder;
 24import java.util.regex.Matcher;
 25import java.util.regex.Pattern;
 26
 27/**
 28 * Utility class for scraping the one box and finding the result
 29 *
 30 * @author clchen@google.com (Charles L. Chen)
 31 */
 32final public class OneBoxScraper {
 33
 34    private OneBoxScraper() {
 35    }
 36
 37    public static String processGoogleResults(String query, String baseUrl) {
 38        String processedResult = "";
 39        try {
 40            String url = baseUrl + URLEncoder.encode(query, "UTF-8");
 41            StringExtractor se = new StringExtractor(url);
 42            String results = se.extractStrings(true);
 43
 44            // Uncomment this line to see the raw dump;
 45            // very useful when trying to come up with scraping rules
 46            // Log.e("OneBoxScraper Debug", results);
 47
 48            /* Check for known one box types */
 49            // Weather
 50            if ((processedResult.length() < 1) && (results.indexOf("Weather for") == 0)) {
 51                int indexOfHumidity = results.indexOf("Humidity");
 52                if (indexOfHumidity != -1) {
 53                    int endIndex = results.indexOf("%", indexOfHumidity);
 54                    if (endIndex != -1) {
 55                        processedResult = results.substring(0, endIndex + 1);
 56                    }
 57                }
 58            }
 59            // Flight tracker
 60            if ((processedResult.length() < 1) && (results.indexOf("Track status of ") != -1)) {
 61                int indexOfTrackStatus = results.indexOf("Track status of ");
 62                int indexOfFlightTracker = results.indexOf(
 63                        "www.flightstats.com", indexOfTrackStatus);
 64                if (indexOfFlightTracker != -1) {
 65                    processedResult = results.substring(indexOfTrackStatus, indexOfFlightTracker);
 66                }
 67            }
 68            // Calculator
 69            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
 70                String firstLine = results.substring(0, results.indexOf("\n"));
 71                if (firstLine.indexOf(" = ") != -1) {
 72                    processedResult = firstLine;
 73                }
 74            }
 75            // Finance
 76            // This is tricky, the market line could be the first or the second
 77            // line
 78            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
 79                int firstLineBreak = results.indexOf("\n");
 80                String firstLine = results.substring(0, firstLineBreak);
 81                if ((firstLine.indexOf(" NASDAQ") != -1) || (firstLine.indexOf(" NYSE") != -1)) {
 82                    // Copy the Symbol Market line
 83                    if (firstLine.indexOf(">") != -1) {
 84                        processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
 85                    }
 86                    int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
 87                    String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
 88                    secondLine = secondLine.replace(" +", " Up by ").replace(" -", " Down by ");
 89                    processedResult = processedResult + secondLine + "\n";
 90                    int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
 91                    String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
 92                    processedResult = processedResult + thirdLine;
 93                }
 94            }
 95            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
 96                int zerothLineBreak = results.indexOf("\n");
 97                int firstLineBreak = results.indexOf("\n", zerothLineBreak + 1);
 98                String firstLine = results.substring(zerothLineBreak + 1, firstLineBreak);
 99                if ((firstLine.indexOf(" NASDAQ") != -1) || (firstLine.indexOf(" NYSE") != -1)) {
100                    // Copy the Symbol Market line
101                    if (firstLine.indexOf(">") != -1) {
102                        processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
103                    }
104                    int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
105                    String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
106                    secondLine = secondLine.replace(" +", " Up by ").replace(" -", " Down by ");
107                    processedResult = processedResult + secondLine + "\n";
108                    int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
109                    String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
110                    processedResult = processedResult + thirdLine;
111                }
112            }
113            // Dictionary
114            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
115                int firstLineBreak = results.indexOf("\n");
116                String firstLine = results.substring(0, firstLineBreak);
117                if (firstLine.indexOf("Web definitions for ") != -1) {
118                    if (firstLine.indexOf(">") != -1) {
119                        processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
120                    }
121                    int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
122                    String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
123                    processedResult = processedResult + secondLine + "\n";
124                }
125            }
126            // Time
127            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
128                int firstLineBreak = results.indexOf("\n");
129                String firstLine = results.substring(0, firstLineBreak);
130                if ((firstLine.indexOf(":") != -1)
131                        && ((firstLine.indexOf("am ") != -1) || (firstLine.indexOf("pm ") != -1))) {
132                    processedResult = firstLine;
133                }
134            }
135            // Sports
136            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
137                int firstLineBreak = results.indexOf("\n");
138                String firstLine = results.substring(0, firstLineBreak);
139
140                Pattern vsScorePattern = Pattern.compile("[a-zA-Z ]+[0-9]+ - [a-zA-Z ]+[0-9]+");
141                Pattern recordScorePattern = Pattern.compile("[a-zA-Z ]+ \\([0-9]+-[0-9]+\\)");
142                Matcher vsScoreMatcher = vsScorePattern.matcher(firstLine);
143                Matcher recordScoreMatcher = recordScorePattern.matcher(firstLine);
144
145                if (vsScoreMatcher.find()) {
146                    processedResult = vsScoreMatcher.group();
147                } else if (recordScoreMatcher.find()) {
148                    processedResult = recordScoreMatcher.group();
149                }
150            }
151            // World cup
152            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
153                int firstLineBreak = results.indexOf("\n");
154                String firstLine = results.substring(0, firstLineBreak);
155                int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
156                String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
157                int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
158                String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
159                int fourthLineBreak = results.indexOf("\n", thirdLineBreak + 1);
160                String fourthLine = results.substring(thirdLineBreak + 1, fourthLineBreak);
161                if (firstLine.contains("2010 FIFA World Cup(tm)")
162                        && fourthLine.equals("Upcoming matches:")) {
163                    processedResult = secondLine + "\n" + thirdLine;
164                }
165            }
166
167            // Special case for eyes-free shell: Speak the first location result
168            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
169                int firstLineBreak = results.indexOf("\n");
170                String firstLine = results.substring(0, firstLineBreak);
171                String localResultsStr = "Local results ";
172                if (firstLine.indexOf(localResultsStr) == 0) {
173                    int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
174                    int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
175                    int fourthLineBreak = results.indexOf("\n", thirdLineBreak + 1);
176                    int fifthLineBreak = results.indexOf("\n", fourthLineBreak + 1);
177
178                    // <http://www.google.com/m?defaultloc=Mountain+View%2C+CA+94043&amp;site=local&amp;q=costco+94043&amp;latlng=15926316227166107848&amp;mp=1&amp;zp&amp;source=m&amp;ct=res&amp;oi=local_result&amp;sa=X&amp;ei=Ll3CSvGMNZCNtge0z-83&amp;cd=1&amp;resnum=1>Costco
179                    String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
180                    // 1000 N Rengstorff Ave, Mountain View, C.A. 94043
181                    String fourthLine = results.substring(thirdLineBreak + 1, fourthLineBreak);
182                    // <wtai://wp/mc;6509881841>(650) 9881841 - Ratings: 3/5
183                    String fifthLine = results.substring(fourthLineBreak + 1, fifthLineBreak);
184
185                    processedResult = thirdLine.substring(thirdLine.indexOf(">") + 1) + "\n";
186                    processedResult = processedResult + fourthLine + "\n";
187                    processedResult = processedResult
188                            + fifthLine.substring(fifthLine.indexOf(">") + 1);
189                }
190            }
191            // Special case for eyes-free shell: Speak the first location result
192            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
193                int firstLineBreak = results.indexOf("\n");
194                int secondLineBreak = results.indexOf("\n", firstLineBreak + 1);
195                int thirdLineBreak = results.indexOf("\n", secondLineBreak + 1);
196
197                // <http://www.google.com/m?defaultloc=Mountain+View%2C+CA+94043&amp;site=local&amp;q=costco+94043&amp;latlng=15926316227166107848&amp;mp=1&amp;zp&amp;source=m&amp;ct=res&amp;oi=local_result&amp;sa=X&amp;ei=Ll3CSvGMNZCNtge0z-83&amp;cd=1&amp;resnum=1>Costco
198                String firstLine = results.substring(0, firstLineBreak);
199                // 1000 N Rengstorff Ave, Mountain View, C.A. 94043
200                String secondLine = results.substring(firstLineBreak + 1, secondLineBreak);
201                // <wtai://wp/mc;6509881841>(650) 9881841 - Ratings: 3/5
202                String thirdLine = results.substring(secondLineBreak + 1, thirdLineBreak);
203
204                Pattern addressPattern = Pattern.compile(
205                        "[0-9a-zA-Z ]+, [a-zA-Z ]+, [a-zA-Z. ]+ [0-9]+");
206                Matcher addressMatcher = addressPattern.matcher(secondLine);
207                Pattern phonePattern = Pattern.compile("\\([0-9][0-9][0-9]\\) [0-9-]+");
208                Matcher phoneMatcher = phonePattern.matcher(thirdLine);
209
210                if (addressMatcher.find() && phoneMatcher.find()) {
211                    processedResult = firstLine.substring(firstLine.indexOf(">") + 1) + "\n";
212                    processedResult = processedResult + secondLine + "\n";
213                    processedResult = processedResult
214                            + thirdLine.substring(thirdLine.indexOf(">") + 1);
215                }
216            }
217
218            /* The following will result in a special action that is not speech */
219            // Local search
220            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
221                int firstLineBreak = results.indexOf("\n");
222                String firstLine = results.substring(0, firstLineBreak);
223                String localResultsStr = "Local results ";
224                if (firstLine.indexOf(localResultsStr) == 0) {
225                    processedResult = "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8");
226                }
227            }
228            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
229                int zerothLineBreak = results.indexOf("\n");
230                int firstLineBreak = results.indexOf("\n", zerothLineBreak + 1);
231                String firstLine = results.substring(zerothLineBreak + 1, firstLineBreak);
232                String localResultsStr = "Local results ";
233                if (firstLine.indexOf(localResultsStr) == 0) {
234                    processedResult = "PAW_MAPS:" + URLEncoder.encode(query, "UTF-8");
235                }
236            }
237            // YouTube
238            if ((processedResult.length() < 1) && (results.indexOf("\n") != -1)) {
239                int firstLineBreak = results.indexOf("\n");
240                String firstLine = results.substring(0, firstLineBreak);
241                if (firstLine.indexOf("<http://www.youtube.com/watch?") == 0) {
242                    processedResult = "PAW_YOUTUBE:" + firstLine.substring(
243                            firstLine.indexOf("<") + 1, firstLine.indexOf(">"));
244                }
245            }
246            // Try to read the first result if there is no preceding link
247            // since this will usually be a onebox of some sort.
248            if ((processedResult.length() < 1) && (results.indexOf("<") != -1)) {
249                int endIndex = results.indexOf("<", 0);
250                if (endIndex != -1) {
251                    processedResult = results.substring(0, endIndex + 1);
252                }
253                // If this is the weather box, try to trim it down by cutting it
254                // off at humidity
255                if ((processedResult.length() > 1) && (processedResult.indexOf("%") != -1)) {
256                    processedResult = processedResult.substring(
257                            0, processedResult.indexOf("%") + 1);
258                }
259            }
260            // Log.e("processedResultLength", processedResult.length() + "");
261        } catch (ParserException e) {
262            e.printStackTrace();
263        } catch (UnsupportedEncodingException e) {
264            e.printStackTrace();
265        }
266        return processedResult;
267    }
268}