PageRenderTime 43ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/extensions/gdata/src/com/google/refine/extension/gdata/GDataImporter.java

http://google-refine.googlecode.com/
Java | 491 lines | 392 code | 58 blank | 41 comment | 59 complexity | 24f2a649b4deb4bffb75f3d13259fd72 MD5 | raw file
Possible License(s): JSON, LGPL-2.1, MIT, Apache-2.0, BSD-3-Clause
  1. /*
  2. * Copyright (c) 2010, Thomas F. Morris
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * - Redistributions of source code must retain the above copyright notice, this
  8. * list of conditions and the following disclaimer.
  9. * - Redistributions in binary form must reproduce the above copyright notice,
  10. * this list of conditions and the following disclaimer in the documentation
  11. * and/or other materials provided with the distribution.
  12. *
  13. * Neither the name of Google nor the names of its contributors may be used to
  14. * endorse or promote products derived from this software without specific
  15. * prior written permission.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  19. * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  20. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
  21. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  22. * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  23. * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  24. * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  25. * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  26. * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  27. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28. */
  29. package com.google.refine.extension.gdata;
  30. import java.io.IOException;
  31. import java.net.MalformedURLException;
  32. import java.net.URL;
  33. import java.util.ArrayList;
  34. import java.util.List;
  35. import org.json.JSONObject;
  36. import com.google.gdata.client.GoogleService;
  37. import com.google.gdata.client.spreadsheet.CellQuery;
  38. import com.google.gdata.client.spreadsheet.SpreadsheetService;
  39. import com.google.gdata.data.spreadsheet.Cell;
  40. import com.google.gdata.data.spreadsheet.CellEntry;
  41. import com.google.gdata.data.spreadsheet.CellFeed;
  42. import com.google.gdata.data.spreadsheet.SpreadsheetEntry;
  43. import com.google.gdata.data.spreadsheet.WorksheetEntry;
  44. import com.google.gdata.util.ServiceException;
  45. import com.google.refine.ProjectMetadata;
  46. import com.google.refine.importers.TabularImportingParserBase;
  47. import com.google.refine.importers.TabularImportingParserBase.TableDataReader;
  48. import com.google.refine.importing.ImportingJob;
  49. import com.google.refine.model.Project;
  50. import com.google.refine.util.JSONUtilities;
  51. /**
  52. * Google Refine parser for Google Spreadsheets.
  53. *
  54. * @author Tom Morris <tfmorris@gmail.com>
  55. * @copyright 2010 Thomas F. Morris
  56. * @license New BSD http://www.opensource.org/licenses/bsd-license.php
  57. */
  58. public class GDataImporter {
  59. static public void parse(
  60. String token,
  61. Project project,
  62. ProjectMetadata metadata,
  63. final ImportingJob job,
  64. int limit,
  65. JSONObject options,
  66. List<Exception> exceptions) {
  67. String docType = JSONUtilities.getString(options, "docType", null);
  68. if ("spreadsheet".equals(docType)) {
  69. SpreadsheetService service = GDataExtension.getSpreadsheetService(token);
  70. parse(
  71. service,
  72. project,
  73. metadata,
  74. job,
  75. limit,
  76. options,
  77. exceptions
  78. );
  79. } else if ("table".equals(docType)) {
  80. GoogleService service = GDataExtension.getFusionTablesGoogleService(token);
  81. parse(
  82. service,
  83. project,
  84. metadata,
  85. job,
  86. limit,
  87. options,
  88. exceptions
  89. );
  90. }
  91. }
  92. static public void parse(
  93. SpreadsheetService service,
  94. Project project,
  95. ProjectMetadata metadata,
  96. final ImportingJob job,
  97. int limit,
  98. JSONObject options,
  99. List<Exception> exceptions) {
  100. String docUrlString = JSONUtilities.getString(options, "docUrl", null);
  101. String worksheetUrlString = JSONUtilities.getString(options, "sheetUrl", null);
  102. if (docUrlString != null && worksheetUrlString != null) {
  103. try {
  104. parseOneWorkSheet(
  105. service,
  106. project,
  107. metadata,
  108. job,
  109. new URL(docUrlString),
  110. new URL(worksheetUrlString),
  111. limit,
  112. options,
  113. exceptions);
  114. } catch (MalformedURLException e) {
  115. e.printStackTrace();
  116. exceptions.add(e);
  117. }
  118. }
  119. }
  120. static public void parseOneWorkSheet(
  121. SpreadsheetService service,
  122. Project project,
  123. ProjectMetadata metadata,
  124. final ImportingJob job,
  125. URL docURL,
  126. URL worksheetURL,
  127. int limit,
  128. JSONObject options,
  129. List<Exception> exceptions) {
  130. try {
  131. WorksheetEntry worksheetEntry = service.getEntry(worksheetURL, WorksheetEntry.class);
  132. String spreadsheetName = docURL.toExternalForm();
  133. try {
  134. SpreadsheetEntry spreadsheetEntry = service.getEntry(docURL, SpreadsheetEntry.class);
  135. spreadsheetName = spreadsheetEntry.getTitle().getPlainText();
  136. } catch (ServiceException e) { // RedirectRequiredException among others
  137. // fall back to just using the URL (better for traceability anyway?)
  138. }
  139. String fileSource = spreadsheetName + " # " +
  140. worksheetEntry.getTitle().getPlainText();
  141. setProgress(job, fileSource, 0);
  142. TabularImportingParserBase.readTable(
  143. project,
  144. metadata,
  145. job,
  146. new WorksheetBatchRowReader(job, fileSource, service, worksheetEntry, 20),
  147. fileSource,
  148. limit,
  149. options,
  150. exceptions
  151. );
  152. setProgress(job, fileSource, 100);
  153. } catch (IOException e) {
  154. e.printStackTrace();
  155. exceptions.add(e);
  156. } catch (ServiceException e) {
  157. e.printStackTrace();
  158. exceptions.add(e);
  159. }
  160. }
  161. static private void setProgress(ImportingJob job, String fileSource, int percent) {
  162. JSONObject progress = JSONUtilities.getObject(job.config, "progress");
  163. if (progress == null) {
  164. progress = new JSONObject();
  165. JSONUtilities.safePut(job.config, "progress", progress);
  166. }
  167. JSONUtilities.safePut(progress, "message", "Reading " + fileSource);
  168. JSONUtilities.safePut(progress, "percent", percent);
  169. }
  170. static private class WorksheetBatchRowReader implements TableDataReader {
  171. final ImportingJob job;
  172. final String fileSource;
  173. final SpreadsheetService service;
  174. final WorksheetEntry worksheet;
  175. final int batchSize;
  176. final int totalRows;
  177. int nextRow = 0; // 0-based
  178. int batchRowStart = 0; // 0-based
  179. List<List<Object>> rowsOfCells = null;
  180. public WorksheetBatchRowReader(ImportingJob job, String fileSource,
  181. SpreadsheetService service, WorksheetEntry worksheet,
  182. int batchSize) {
  183. this.job = job;
  184. this.fileSource = fileSource;
  185. this.service = service;
  186. this.worksheet = worksheet;
  187. this.batchSize = batchSize;
  188. this.totalRows = worksheet.getRowCount();
  189. }
  190. @Override
  191. public List<Object> getNextRowOfCells() throws IOException {
  192. if (rowsOfCells == null || (nextRow >= batchRowStart + rowsOfCells.size() && nextRow < totalRows)) {
  193. int newBatchRowStart = batchRowStart + (rowsOfCells == null ? 0 : rowsOfCells.size());
  194. try {
  195. rowsOfCells = getRowsOfCells(
  196. service,
  197. worksheet,
  198. newBatchRowStart + 1, // convert to 1-based
  199. batchSize);
  200. batchRowStart = newBatchRowStart;
  201. setProgress(job, fileSource, batchRowStart * 100 / totalRows);
  202. } catch (ServiceException e) {
  203. throw new IOException(e);
  204. }
  205. }
  206. if (rowsOfCells != null && nextRow - batchRowStart < rowsOfCells.size()) {
  207. return rowsOfCells.get(nextRow++ - batchRowStart);
  208. } else {
  209. return null;
  210. }
  211. }
  212. List<List<Object>> getRowsOfCells(
  213. SpreadsheetService service,
  214. WorksheetEntry worksheet,
  215. int startRow, // 1-based
  216. int rowCount
  217. ) throws IOException, ServiceException {
  218. URL cellFeedUrl = worksheet.getCellFeedUrl();
  219. int minRow = startRow;
  220. int maxRow = Math.min(worksheet.getRowCount(), startRow + rowCount - 1);
  221. int cols = worksheet.getColCount();
  222. int rows = worksheet.getRowCount();
  223. CellQuery cellQuery = new CellQuery(cellFeedUrl);
  224. cellQuery.setMinimumRow(minRow);
  225. cellQuery.setMaximumRow(maxRow);
  226. cellQuery.setMaximumCol(cols);
  227. cellQuery.setMaxResults(rows * cols);
  228. cellQuery.setReturnEmpty(false);
  229. CellFeed cellFeed = service.query(cellQuery, CellFeed.class);
  230. List<CellEntry> cellEntries = cellFeed.getEntries();
  231. List<List<Object>> rowsOfCells = new ArrayList<List<Object>>(rowCount);
  232. for (CellEntry cellEntry : cellEntries) {
  233. Cell cell = cellEntry.getCell();
  234. if (cell != null) {
  235. int row = cell.getRow() - startRow;
  236. int col = cell.getCol() - 1;
  237. while (row >= rowsOfCells.size()) {
  238. rowsOfCells.add(new ArrayList<Object>());
  239. }
  240. List<Object> rowOfCells = rowsOfCells.get(row);
  241. while (col >= rowOfCells.size()) {
  242. rowOfCells.add(null);
  243. }
  244. rowOfCells.set(col, cell.getValue());
  245. }
  246. }
  247. return rowsOfCells;
  248. }
  249. }
  250. static public void parse(
  251. GoogleService service,
  252. Project project,
  253. ProjectMetadata metadata,
  254. final ImportingJob job,
  255. int limit,
  256. JSONObject options,
  257. List<Exception> exceptions) {
  258. String docUrlString = JSONUtilities.getString(options, "docUrl", null);
  259. String id = getFTid(docUrlString); // Use GDataExtension.getFusionTableKey(url) ?
  260. // TODO: Allow arbitrary Fusion Tables URL instead of (in addition to?) constructing our own?
  261. try {
  262. List<FTColumnData> columns = new ArrayList<GDataImporter.FTColumnData>();
  263. List<List<String>> rows = GDataExtension.runFusionTablesSelect(service, "DESCRIBE " + id);
  264. if (rows.size() > 1) {
  265. for (int i = 1; i < rows.size(); i++) {
  266. List<String> row = rows.get(i);
  267. if (row.size() >= 2) {
  268. FTColumnData cd = new FTColumnData();
  269. cd.name = row.get(1);
  270. cd.type = FTColumnType.STRING;
  271. if (row.size() > 2) {
  272. String type = row.get(2).toLowerCase();
  273. if (type.equals("number")) {
  274. cd.type = FTColumnType.NUMBER;
  275. } else if (type.equals("datetime")) {
  276. cd.type = FTColumnType.DATETIME;
  277. } else if (type.equals("location")) {
  278. cd.type = FTColumnType.LOCATION;
  279. }
  280. }
  281. columns.add(cd);
  282. }
  283. }
  284. setProgress(job, docUrlString, -1);
  285. // Force these options for the next call because each fusion table
  286. // is strictly structured with a single line of headers.
  287. JSONUtilities.safePut(options, "ignoreLines", 0); // number of blank lines at the beginning to ignore
  288. JSONUtilities.safePut(options, "headerLines", 1); // number of header lines
  289. TabularImportingParserBase.readTable(
  290. project,
  291. metadata,
  292. job,
  293. new FusionTableBatchRowReader(job, docUrlString, service, id, columns, 100),
  294. docUrlString,
  295. limit,
  296. options,
  297. exceptions
  298. );
  299. setProgress(job, docUrlString, 100);
  300. }
  301. } catch (IOException e) {
  302. e.printStackTrace();
  303. exceptions.add(e);
  304. } catch (ServiceException e) {
  305. e.printStackTrace();
  306. exceptions.add(e);
  307. }
  308. }
  309. static private String getFTid(String url) {
  310. if (url == null) {
  311. return null;
  312. }
  313. int equal = url.lastIndexOf('=');
  314. if (equal < 0) {
  315. return null;
  316. }
  317. return url.substring(equal + 1);
  318. }
  319. static private enum FTColumnType {
  320. STRING,
  321. NUMBER,
  322. DATETIME,
  323. LOCATION
  324. }
  325. final static private class FTColumnData {
  326. String name;
  327. FTColumnType type;
  328. }
  329. static private class FusionTableBatchRowReader implements TableDataReader {
  330. final ImportingJob job;
  331. final String fileSource;
  332. final GoogleService service;
  333. final List<FTColumnData> columns;
  334. final int batchSize;
  335. final String baseQuery;
  336. int nextRow = 0; // 0-based
  337. int batchRowStart = 0; // 0-based
  338. boolean end = false;
  339. List<List<Object>> rowsOfCells = null;
  340. boolean usedHeaders = false;
  341. public FusionTableBatchRowReader(ImportingJob job, String fileSource,
  342. GoogleService service, String tableId, List<FTColumnData> columns,
  343. int batchSize) {
  344. this.job = job;
  345. this.fileSource = fileSource;
  346. this.service = service;
  347. this.columns = columns;
  348. this.batchSize = batchSize;
  349. StringBuffer sb = new StringBuffer();
  350. sb.append("SELECT ");
  351. boolean first = true;
  352. for (FTColumnData cd : columns) {
  353. if (first) {
  354. first = false;
  355. } else {
  356. sb.append(",");
  357. }
  358. sb.append("'");
  359. sb.append(cd.name);
  360. sb.append("'");
  361. }
  362. sb.append(" FROM ");
  363. sb.append(tableId);
  364. baseQuery = sb.toString();
  365. }
  366. @Override
  367. public List<Object> getNextRowOfCells() throws IOException {
  368. if (!usedHeaders) {
  369. List<Object> row = new ArrayList<Object>(columns.size());
  370. for (FTColumnData cd : columns) {
  371. row.add(cd.name);
  372. }
  373. usedHeaders = true;
  374. return row;
  375. }
  376. if (rowsOfCells == null || (nextRow >= batchRowStart + rowsOfCells.size() && !end)) {
  377. int newBatchRowStart = batchRowStart + (rowsOfCells == null ? 0 : rowsOfCells.size());
  378. try {
  379. rowsOfCells = getRowsOfCells(newBatchRowStart);
  380. batchRowStart = newBatchRowStart;
  381. setProgress(job, fileSource, -1 /* batchRowStart * 100 / totalRows */);
  382. } catch (ServiceException e) {
  383. throw new IOException(e);
  384. }
  385. }
  386. if (rowsOfCells != null && nextRow - batchRowStart < rowsOfCells.size()) {
  387. return rowsOfCells.get(nextRow++ - batchRowStart);
  388. } else {
  389. return null;
  390. }
  391. }
  392. private List<List<Object>> getRowsOfCells(int startRow) throws IOException, ServiceException {
  393. List<List<Object>> rowsOfCells = new ArrayList<List<Object>>(batchSize);
  394. String query = baseQuery + " OFFSET " + startRow + " LIMIT " + batchSize;
  395. List<List<String>> rows = GDataExtension.runFusionTablesSelect(service, query);
  396. if (rows.size() > 1) {
  397. for (int i = 1; i < rows.size(); i++) {
  398. List<String> row = rows.get(i);
  399. List<Object> rowOfCells = new ArrayList<Object>(row.size());
  400. for (int j = 0; j < row.size() && j < columns.size(); j++) {
  401. String text = row.get(j);
  402. if (text.isEmpty()) {
  403. rowOfCells.add(null);
  404. } else {
  405. FTColumnData cd = columns.get(j);
  406. if (cd.type == FTColumnType.NUMBER) {
  407. try {
  408. rowOfCells.add(Long.parseLong(text));
  409. continue;
  410. } catch (NumberFormatException e) {
  411. // ignore
  412. }
  413. try {
  414. double d = Double.parseDouble(text);
  415. if (!Double.isInfinite(d) && !Double.isNaN(d)) {
  416. rowOfCells.add(d);
  417. continue;
  418. }
  419. } catch (NumberFormatException e) {
  420. // ignore
  421. }
  422. }
  423. rowOfCells.add(text);
  424. }
  425. }
  426. rowsOfCells.add(rowOfCells);
  427. }
  428. }
  429. end = rows.size() < batchSize + 1;
  430. return rowsOfCells;
  431. }
  432. }
  433. }