/nutchindexing/nutch-1.2/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
Java | 92 lines | 46 code | 13 blank | 33 comment | 16 complexity | bfabca239bacfb6f45e729151b226a6a MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
- /**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.nutch.parse.msexcel;
- // JDK imports
- import java.io.InputStream;
- // Jakarta POI imports
- import org.apache.poi.hssf.usermodel.HSSFCell;
- import org.apache.poi.hssf.usermodel.HSSFRow;
- import org.apache.poi.hssf.usermodel.HSSFSheet;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- // Nutch imports
- import org.apache.nutch.parse.ms.MSExtractor;
- /**
- * Excel Text and Properties extractor.
- *
- * @author Rohit Kulkarni & Ashish Vaidya
- * @author Jérôme Charron
- */
- class ExcelExtractor extends MSExtractor {
-
- protected String extractText(InputStream input) throws Exception {
-
- StringBuilder resultText = new StringBuilder();
- HSSFWorkbook wb = new HSSFWorkbook(input);
- if (wb == null) {
- return resultText.toString();
- }
-
- HSSFSheet sheet;
- HSSFRow row;
- HSSFCell cell;
- int sNum = 0;
- int rNum = 0;
- int cNum = 0;
-
- sNum = wb.getNumberOfSheets();
-
- for (int i=0; i<sNum; i++) {
- if ((sheet = wb.getSheetAt(i)) == null) {
- continue;
- }
- rNum = sheet.getLastRowNum();
- for (int j=0; j<=rNum; j++) {
- if ((row = sheet.getRow(j)) == null){
- continue;
- }
- cNum = row.getLastCellNum();
-
- for (int k=0; k<cNum; k++) {
- if ((cell = row.getCell((short) k)) != null) {
- /*if(HSSFDateUtil.isCellDateFormatted(cell) == true) {
- resultText.append(cell.getDateCellValue().toString())
- } else
- */
- if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
- resultText.append(cell.getStringCellValue()).append(" ");
- } else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
- double d = cell.getNumericCellValue();
- resultText.append(d).append(" ");
- }
- /* else if(cell.getCellType() == HSSFCell.CELL_TYPE_FORMULA){
- resultText.append(cell.getCellFormula());
- }
- */
- }
- }
- }
- }
- return resultText.toString();
- }
-
- }