GDataImporter.java | searchcode

/extensions/gdata/src/com/google/refine/extension/gdata/GDataImporter.java

http://google-refine.googlecode.com/
Java | 491 lines | 392 code | 58 blank | 41 comment | 59 complexity | 24f2a649b4deb4bffb75f3d13259fd72 MD5 | raw file
Possible License(s): JSON, LGPL-2.1, MIT, Apache-2.0, BSD-3-Clause

/*
 * Copyright (c) 2010, Thomas F. Morris
 *        All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions are met:
 * - Redistributions of source code must retain the above copyright notice, this 
 *   list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice, 
 *   this list of conditions and the following disclaimer in the documentation 
 *   and/or other materials provided with the distribution.
 * 
 * Neither the name of Google nor the names of its contributors may be used to 
 * endorse or promote products derived from this software without specific 
 * prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package com.google.refine.extension.gdata;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.json.JSONObject;

import com.google.gdata.client.GoogleService;
import com.google.gdata.client.spreadsheet.CellQuery;
import com.google.gdata.client.spreadsheet.SpreadsheetService;
import com.google.gdata.data.spreadsheet.Cell;
import com.google.gdata.data.spreadsheet.CellEntry;
import com.google.gdata.data.spreadsheet.CellFeed;
import com.google.gdata.data.spreadsheet.SpreadsheetEntry;
import com.google.gdata.data.spreadsheet.WorksheetEntry;
import com.google.gdata.util.ServiceException;

import com.google.refine.ProjectMetadata;
import com.google.refine.importers.TabularImportingParserBase;
import com.google.refine.importers.TabularImportingParserBase.TableDataReader;
import com.google.refine.importing.ImportingJob;
import com.google.refine.model.Project;
import com.google.refine.util.JSONUtilities;

/**
 * Google Refine parser for Google Spreadsheets.
 * 
 * @author Tom Morris <tfmorris@gmail.com>
 * @copyright 2010 Thomas F. Morris
 * @license New BSD http://www.opensource.org/licenses/bsd-license.php
 */
public class GDataImporter {
    static public void parse(
        String token,
        Project project,
        ProjectMetadata metadata,
        final ImportingJob job,
        int limit,
        JSONObject options,
        List<Exception> exceptions) {
    
        String docType = JSONUtilities.getString(options, "docType", null);
        if ("spreadsheet".equals(docType)) {
            SpreadsheetService service = GDataExtension.getSpreadsheetService(token);
            parse(
                service,
                project,
                metadata,
                job,
                limit,
                options,
                exceptions
            );
        } else if ("table".equals(docType)) {
            GoogleService service = GDataExtension.getFusionTablesGoogleService(token);
            parse(
                service,
                project,
                metadata,
                job,
                limit,
                options,
                exceptions
            );
        }
    }
    
    static public void parse(
        SpreadsheetService service,
        Project project,
        ProjectMetadata metadata,
        final ImportingJob job,
        int limit,
        JSONObject options,
        List<Exception> exceptions) {
        
        String docUrlString = JSONUtilities.getString(options, "docUrl", null);
        String worksheetUrlString = JSONUtilities.getString(options, "sheetUrl", null);
        if (docUrlString != null && worksheetUrlString != null) {
            try {
                parseOneWorkSheet(
                    service,
                    project,
                    metadata,
                    job,
                    new URL(docUrlString),
                    new URL(worksheetUrlString),
                    limit,
                    options,
                    exceptions);
            } catch (MalformedURLException e) {
                e.printStackTrace();
                exceptions.add(e);
            }
        }
    }
    
    static public void parseOneWorkSheet(
        SpreadsheetService service,
        Project project,
        ProjectMetadata metadata,
        final ImportingJob job,
        URL docURL,
        URL worksheetURL,
        int limit,
        JSONObject options,
        List<Exception> exceptions) {
        
        try {
            WorksheetEntry worksheetEntry = service.getEntry(worksheetURL, WorksheetEntry.class);
            String spreadsheetName = docURL.toExternalForm();
            try {
                SpreadsheetEntry spreadsheetEntry = service.getEntry(docURL, SpreadsheetEntry.class);
                spreadsheetName = spreadsheetEntry.getTitle().getPlainText();
            } catch (ServiceException e) { // RedirectRequiredException among others
                // fall back to just using the URL (better for traceability anyway?)
            }
            
            String fileSource = spreadsheetName + " # " +
                worksheetEntry.getTitle().getPlainText();
            
            setProgress(job, fileSource, 0);
            TabularImportingParserBase.readTable(
                project,
                metadata,
                job,
                new WorksheetBatchRowReader(job, fileSource, service, worksheetEntry, 20),
                fileSource,
                limit,
                options,
                exceptions
            );
            setProgress(job, fileSource, 100);
        } catch (IOException e) {
            e.printStackTrace();
            exceptions.add(e);
        } catch (ServiceException e) {
            e.printStackTrace();
            exceptions.add(e);
        }
    }
    
    static private void setProgress(ImportingJob job, String fileSource, int percent) {
        JSONObject progress = JSONUtilities.getObject(job.config, "progress");
        if (progress == null) {
            progress = new JSONObject();
            JSONUtilities.safePut(job.config, "progress", progress);
        }
        JSONUtilities.safePut(progress, "message", "Reading " + fileSource);
        JSONUtilities.safePut(progress, "percent", percent);
    }
    
    static private class WorksheetBatchRowReader implements TableDataReader {
        final ImportingJob job;
        final String fileSource;
        
        final SpreadsheetService service;
        final WorksheetEntry worksheet;
        final int batchSize;
        
        final int totalRows;
        
        int nextRow = 0; // 0-based
        int batchRowStart = 0; // 0-based
        List<List<Object>> rowsOfCells = null;
        
        public WorksheetBatchRowReader(ImportingJob job, String fileSource,
                SpreadsheetService service, WorksheetEntry worksheet,
                int batchSize) {
            this.job = job;
            this.fileSource = fileSource;
            this.service = service;
            this.worksheet = worksheet;
            this.batchSize = batchSize;
            
            this.totalRows = worksheet.getRowCount();
        }
        
        @Override
        public List<Object> getNextRowOfCells() throws IOException {
            if (rowsOfCells == null || (nextRow >= batchRowStart + rowsOfCells.size() && nextRow < totalRows)) {
                int newBatchRowStart = batchRowStart + (rowsOfCells == null ? 0 : rowsOfCells.size());
                try {
                    rowsOfCells = getRowsOfCells(
                        service,
                        worksheet,
                        newBatchRowStart + 1, // convert to 1-based
                        batchSize);
                    
                    batchRowStart = newBatchRowStart;
                    
                    setProgress(job, fileSource, batchRowStart * 100 / totalRows);
                } catch (ServiceException e) {
                    throw new IOException(e);
                }
            }
            
            if (rowsOfCells != null && nextRow - batchRowStart < rowsOfCells.size()) {
                return rowsOfCells.get(nextRow++ - batchRowStart);
            } else {
                return null;
            }
        }
        
        
        List<List<Object>> getRowsOfCells(
            SpreadsheetService service,
            WorksheetEntry worksheet,
            int startRow, // 1-based
            int rowCount
        ) throws IOException, ServiceException {
            URL cellFeedUrl = worksheet.getCellFeedUrl();
            
            int minRow = startRow;
            int maxRow = Math.min(worksheet.getRowCount(), startRow + rowCount - 1);
            int cols = worksheet.getColCount();
            int rows = worksheet.getRowCount();
            
            CellQuery cellQuery = new CellQuery(cellFeedUrl);
            cellQuery.setMinimumRow(minRow);
            cellQuery.setMaximumRow(maxRow);
            cellQuery.setMaximumCol(cols);
            cellQuery.setMaxResults(rows * cols);
            cellQuery.setReturnEmpty(false);
            
            CellFeed cellFeed = service.query(cellQuery, CellFeed.class);
            List<CellEntry> cellEntries = cellFeed.getEntries();
            
            List<List<Object>> rowsOfCells = new ArrayList<List<Object>>(rowCount);
            for (CellEntry cellEntry : cellEntries) {
                Cell cell = cellEntry.getCell();
                if (cell != null) {
                    int row = cell.getRow() - startRow;
                    int col = cell.getCol() - 1;
                    
                    while (row >= rowsOfCells.size()) {
                        rowsOfCells.add(new ArrayList<Object>());
                    }
                    List<Object> rowOfCells = rowsOfCells.get(row);
                    
                    while (col >= rowOfCells.size()) {
                        rowOfCells.add(null);
                    }
                    rowOfCells.set(col, cell.getValue());
                }
            }
            return rowsOfCells;
        }
    }
    
    static public void parse(
            GoogleService service,
            Project project,
            ProjectMetadata metadata,
            final ImportingJob job,
            int limit,
            JSONObject options,
            List<Exception> exceptions) {
        
        String docUrlString = JSONUtilities.getString(options, "docUrl", null);
        String id = getFTid(docUrlString); // Use GDataExtension.getFusionTableKey(url) ?
        // TODO: Allow arbitrary Fusion Tables URL instead of (in addition to?) constructing our own?
        
        try {
            List<FTColumnData> columns = new ArrayList<GDataImporter.FTColumnData>();
            List<List<String>> rows = GDataExtension.runFusionTablesSelect(service, "DESCRIBE " + id);
            if (rows.size() > 1) {
                for (int i = 1; i < rows.size(); i++) {
                    List<String> row = rows.get(i);
                    if (row.size() >= 2) {
                        FTColumnData cd = new FTColumnData();
                        cd.name = row.get(1);
                        cd.type = FTColumnType.STRING;
                        
                        if (row.size() > 2) {
                            String type = row.get(2).toLowerCase();
                            if (type.equals("number")) {
                                cd.type = FTColumnType.NUMBER;
                            } else if (type.equals("datetime")) {
                                cd.type = FTColumnType.DATETIME;
                            } else if (type.equals("location")) {
                                cd.type = FTColumnType.LOCATION;
                            }
                        }
                        columns.add(cd);
                    }
                }
                
                setProgress(job, docUrlString, -1);
                
                // Force these options for the next call because each fusion table
                // is strictly structured with a single line of headers.
                JSONUtilities.safePut(options, "ignoreLines", 0); // number of blank lines at the beginning to ignore
                JSONUtilities.safePut(options, "headerLines", 1); // number of header lines
                
                TabularImportingParserBase.readTable(
                    project,
                    metadata,
                    job,
                    new FusionTableBatchRowReader(job, docUrlString, service, id, columns, 100),
                    docUrlString,
                    limit,
                    options,
                    exceptions
                );
                setProgress(job, docUrlString, 100);
            }
        } catch (IOException e) {
            e.printStackTrace();
            exceptions.add(e);
        } catch (ServiceException e) {
            e.printStackTrace();
            exceptions.add(e);
        }
    }
    
    static private String getFTid(String url) {
        if (url == null) {
            return null;
        }
        int equal = url.lastIndexOf('=');
        if (equal < 0) {
            return null;
        }
        return url.substring(equal + 1);
    }
    
    static private enum FTColumnType {
        STRING,
        NUMBER,
        DATETIME,
        LOCATION
    }
    
    final static private class FTColumnData {
        String name;
        FTColumnType type;
    }
    
    static private class FusionTableBatchRowReader implements TableDataReader {
        final ImportingJob job;
        final String fileSource;
        
        final GoogleService service;
        final List<FTColumnData> columns;
        final int batchSize;
        
        final String baseQuery;
        
        int nextRow = 0; // 0-based
        int batchRowStart = 0; // 0-based
        boolean end = false;
        List<List<Object>> rowsOfCells = null;
        boolean usedHeaders = false;
        
        public FusionTableBatchRowReader(ImportingJob job, String fileSource,
                GoogleService service, String tableId, List<FTColumnData> columns,
                int batchSize) {
            this.job = job;
            this.fileSource = fileSource;
            this.service = service;
            this.columns = columns;
            this.batchSize = batchSize;
            
            StringBuffer sb = new StringBuffer();
            sb.append("SELECT ");
            
            boolean first = true;
            for (FTColumnData cd : columns) {
                if (first) {
                    first = false;
                } else {
                    sb.append(",");
                }
                sb.append("'");
                sb.append(cd.name);
                sb.append("'");
            }
            sb.append(" FROM ");
            sb.append(tableId);
            
            baseQuery = sb.toString();
        }
        
        @Override
        public List<Object> getNextRowOfCells() throws IOException {
            if (!usedHeaders) {
                List<Object> row = new ArrayList<Object>(columns.size());
                for (FTColumnData cd : columns) {
                    row.add(cd.name);
                }
                usedHeaders = true;
                return row;
            }
            
            if (rowsOfCells == null || (nextRow >= batchRowStart + rowsOfCells.size() && !end)) {
                int newBatchRowStart = batchRowStart + (rowsOfCells == null ? 0 : rowsOfCells.size());
                try {
                    rowsOfCells = getRowsOfCells(newBatchRowStart);
                    batchRowStart = newBatchRowStart;
                    
                    setProgress(job, fileSource, -1 /* batchRowStart * 100 / totalRows */);
                } catch (ServiceException e) {
                    throw new IOException(e);
                }
            }
            
            if (rowsOfCells != null && nextRow - batchRowStart < rowsOfCells.size()) {
                return rowsOfCells.get(nextRow++ - batchRowStart);
            } else {
                return null;
            }
        }
        
        
        private List<List<Object>> getRowsOfCells(int startRow) throws IOException, ServiceException {
            List<List<Object>> rowsOfCells = new ArrayList<List<Object>>(batchSize);
            
            String query = baseQuery + " OFFSET " + startRow + " LIMIT " + batchSize;
            
            List<List<String>> rows = GDataExtension.runFusionTablesSelect(service, query);
            if (rows.size() > 1) {
                for (int i = 1; i < rows.size(); i++) {
                    List<String> row = rows.get(i);
                    List<Object> rowOfCells = new ArrayList<Object>(row.size());
                    for (int j = 0; j < row.size() && j < columns.size(); j++) {
                        String text = row.get(j);
                        if (text.isEmpty()) {
                            rowOfCells.add(null);
                        } else {
                            FTColumnData cd = columns.get(j);
                            if (cd.type == FTColumnType.NUMBER) {
                                try {
                                    rowOfCells.add(Long.parseLong(text));
                                    continue;
                                } catch (NumberFormatException e) {
                                    // ignore
                                }
                                try {
                                    double d = Double.parseDouble(text);
                                    if (!Double.isInfinite(d) && !Double.isNaN(d)) {
                                        rowOfCells.add(d);
                                        continue;
                                    }
                                } catch (NumberFormatException e) {
                                    // ignore
                                }
                            }
                            
                            rowOfCells.add(text);
                        }
                    }
                    rowsOfCells.add(rowOfCells);
                }
            }
            end = rows.size() < batchSize + 1;
            return rowsOfCells;
        }
    }
}