PageRenderTime 5693ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 0ms

/nutchindexing/nutch-1.2/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java

https://bitbucket.org/AlexeyD/hibench
Java | 92 lines | 46 code | 13 blank | 33 comment | 16 complexity | bfabca239bacfb6f45e729151b226a6a MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.nutch.parse.msexcel;
  18. // JDK imports
  19. import java.io.InputStream;
  20. // Jakarta POI imports
  21. import org.apache.poi.hssf.usermodel.HSSFCell;
  22. import org.apache.poi.hssf.usermodel.HSSFRow;
  23. import org.apache.poi.hssf.usermodel.HSSFSheet;
  24. import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  25. // Nutch imports
  26. import org.apache.nutch.parse.ms.MSExtractor;
  27. /**
  28. * Excel Text and Properties extractor.
  29. *
  30. * @author Rohit Kulkarni & Ashish Vaidya
  31. * @author Jérôme Charron
  32. */
  33. class ExcelExtractor extends MSExtractor {
  34. protected String extractText(InputStream input) throws Exception {
  35. StringBuilder resultText = new StringBuilder();
  36. HSSFWorkbook wb = new HSSFWorkbook(input);
  37. if (wb == null) {
  38. return resultText.toString();
  39. }
  40. HSSFSheet sheet;
  41. HSSFRow row;
  42. HSSFCell cell;
  43. int sNum = 0;
  44. int rNum = 0;
  45. int cNum = 0;
  46. sNum = wb.getNumberOfSheets();
  47. for (int i=0; i<sNum; i++) {
  48. if ((sheet = wb.getSheetAt(i)) == null) {
  49. continue;
  50. }
  51. rNum = sheet.getLastRowNum();
  52. for (int j=0; j<=rNum; j++) {
  53. if ((row = sheet.getRow(j)) == null){
  54. continue;
  55. }
  56. cNum = row.getLastCellNum();
  57. for (int k=0; k<cNum; k++) {
  58. if ((cell = row.getCell((short) k)) != null) {
  59. /*if(HSSFDateUtil.isCellDateFormatted(cell) == true) {
  60. resultText.append(cell.getDateCellValue().toString())
  61. } else
  62. */
  63. if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
  64. resultText.append(cell.getStringCellValue()).append(" ");
  65. } else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
  66. double d = cell.getNumericCellValue();
  67. resultText.append(d).append(" ");
  68. }
  69. /* else if(cell.getCellType() == HSSFCell.CELL_TYPE_FORMULA){
  70. resultText.append(cell.getCellFormula());
  71. }
  72. */
  73. }
  74. }
  75. }
  76. }
  77. return resultText.toString();
  78. }
  79. }