/edu/uncc/parsets/data/old/CSVParser.java

https://code.google.com/p/parsets/ · Java · 226 lines · 168 code · 22 blank · 36 comment · 49 complexity · 667f3e7ef038064fb629ad6bbdf093c5 MD5 · raw file

  1. package edu.uncc.parsets.data.old;
  2. import java.io.BufferedReader;
  3. import java.io.File;
  4. import java.io.FileNotFoundException;
  5. import java.io.FileReader;
  6. import java.io.IOException;
  7. import au.com.bytecode.opencsv.CSVReader;
  8. import edu.uncc.parsets.data.LocalDB;
  9. import edu.uncc.parsets.util.PSLogging;
  10. /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
  11. * Copyright (c) 2009, Robert Kosara, Caroline Ziemkiewicz,
  12. * and others (see Authors.txt for full list)
  13. * All rights reserved.
  14. *
  15. * Redistribution and use in source and binary forms, with or without
  16. * modification, are permitted provided that the following conditions are met:
  17. *
  18. * * Redistributions of source code must retain the above copyright
  19. * notice, this list of conditions and the following disclaimer.
  20. * * Redistributions in binary form must reproduce the above copyright
  21. * notice, this list of conditions and the following disclaimer in the
  22. * documentation and/or other materials provided with the distribution.
  23. * * Neither the name of UNC Charlotte nor the names of its contributors
  24. * may be used to endorse or promote products derived from this software
  25. * without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY ITS AUTHORS ''AS IS'' AND ANY
  28. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  29. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  30. * DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
  31. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  32. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  33. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  34. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  35. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  36. * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  37. \* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  38. public class CSVParser {
  39. private String csvFileName;
  40. private CSVParserListener callBack;
  41. private CSVDataSet dataSet;
  42. private char separator = ';';
  43. private String[] columns;
  44. private CSVReader parser;
  45. public CSVParser(String fileName, CSVParserListener receiver) {
  46. csvFileName = fileName;
  47. callBack = receiver;
  48. dataSet = new CSVDataSet(fileName);
  49. }
  50. /**
  51. * Call the parser to read in a dataset, and parse the XML metadata file if
  52. * it exists. The parser runs in a separate thread, which is returned. The
  53. * caller can wait for the thread to finish using {@link Thread#join()},
  54. * or provide a callback that receives progress notifications.
  55. */
  56. public Thread analyzeCSVFile() {
  57. MetaDataParser mp = new MetaDataParser();
  58. String metafilename = csvFileName.substring(0, csvFileName.lastIndexOf('.'))
  59. + ".xml";
  60. if (new File(metafilename).exists()) {
  61. mp.parse(dataSet, metafilename);
  62. } else {
  63. if (metafilename.contains("_")) {
  64. metafilename = metafilename.substring(0, metafilename.lastIndexOf("_"))+".xml";
  65. if (new File(metafilename).exists()) {
  66. mp.parse(dataSet, metafilename);
  67. String name = new File(csvFileName).getName();
  68. name = name.substring(0, name.lastIndexOf('.'));
  69. name = name.replace('_', ' ');
  70. dataSet.setName(name);
  71. }
  72. }
  73. }
  74. Thread t = new Thread() {
  75. public void run() {
  76. analyzeFile();
  77. }
  78. };
  79. t.start();
  80. return t;
  81. }
  82. private void analyzeFile() {
  83. float numLinesEstimate = 1000;
  84. BufferedReader reader = null;
  85. try {
  86. reader = new BufferedReader(new FileReader(csvFileName));
  87. String firstLine = reader.readLine();
  88. // guess separator char
  89. int numCommas = 0;
  90. int numSemicolons = 0;
  91. if (firstLine != null) {
  92. for (int i = 0; i < firstLine.length(); i++) {
  93. char c = firstLine.charAt(i);
  94. if (c == ',')
  95. numCommas++;
  96. else if (c == ';')
  97. numSemicolons++;
  98. }
  99. if (numCommas > numSemicolons)
  100. separator = ',';
  101. }
  102. CSVReader parser = new CSVReader(new FileReader(csvFileName), separator);
  103. String[] headerLine = parser.readNext();
  104. for (String columnName : headerLine)
  105. dataSet.instantiateDimension(columnName);
  106. int numColumns = headerLine.length;
  107. if (firstLine != null) {
  108. int numBytes = firstLine.length()+1;
  109. int numLines = 1;
  110. String columns[];
  111. while (((columns = parser.readNext()) != null) && (numLines < 100)) {
  112. if (columns.length != numColumns) {
  113. PSLogging.logger.error("Found "+columns.length+" columns instead of "+numColumns+" in line "+numLines);
  114. if (callBack != null)
  115. callBack.errorWrongNumberOfColumns(numColumns, columns.length, numLines);
  116. return;
  117. }
  118. numLines++;
  119. for (int i = 0; i < columns.length; i++) {
  120. numBytes += columns[i].length()+1;
  121. dataSet.getDimension(i).addValue(columns[i]);
  122. }
  123. dataSet.setNumRecords(numLines);
  124. }
  125. File f = new File(csvFileName);
  126. numLinesEstimate = (int) (f.length() / numBytes) * numLines;
  127. numLinesEstimate /= 100f; // to scale from 0 to 100
  128. while (columns != null) {
  129. if (columns.length != numColumns) {
  130. PSLogging.logger.error("Found "+columns.length+" columns instead of "+numColumns+" in line "+numLines);
  131. if (callBack != null)
  132. callBack.errorWrongNumberOfColumns(numColumns, columns.length, numLines);
  133. return;
  134. }
  135. numLines++;
  136. if ((numLines & 0xff) == 0 && callBack != null)
  137. callBack.setProgress((int)(numLines/numLinesEstimate));
  138. for (int i = 0; i < columns.length; i++)
  139. dataSet.getDimension(i).addValue(columns[i]);
  140. dataSet.setNumRecords(numLines);
  141. columns = parser.readNext();
  142. }
  143. }
  144. } catch (FileNotFoundException e) {
  145. PSLogging.logger.error("File not found: "+csvFileName, e);
  146. if (callBack != null)
  147. callBack.errorFileNotFound(csvFileName);
  148. } catch (IOException e) {
  149. PSLogging.logger.error("IOException while reading file: "+csvFileName, e);
  150. if (callBack != null)
  151. callBack.errorReadingFile(csvFileName);
  152. } finally {
  153. try {
  154. reader.close();
  155. } catch (IOException e) {
  156. PSLogging.logger.error("IOException while closing file: "+csvFileName, e);
  157. }
  158. }
  159. if (callBack != null)
  160. callBack.setDataSet(dataSet);
  161. }
  162. public void streamToDB(LocalDB db) {
  163. try {
  164. parser = new CSVReader(new FileReader(csvFileName), separator);
  165. columns = parser.readNext();
  166. db.addLocalDBDataSet(dataSet, this);
  167. if (callBack != null)
  168. callBack.importDone();
  169. } catch (Exception e) {
  170. PSLogging.logger.error("Error streaming data", e);
  171. if (callBack != null)
  172. callBack.errorReadingFile(csvFileName);
  173. }
  174. }
  175. public float[] readNextLine() {
  176. try {
  177. columns = parser.readNext();
  178. if (columns != null) {
  179. float values[] = new float[columns.length];
  180. for (int i = 0; i < columns.length; i++) {
  181. switch (dataSet.getDimension(i).getDataType()) {
  182. case categorical:
  183. values[i] = dataSet.getDimension(i).getNumForKey(columns[i]);
  184. break;
  185. case numerical:
  186. values[i] = Float.valueOf(columns[i]);
  187. break;
  188. default:
  189. values[i] = 0;
  190. break;
  191. }
  192. // return statement was here earlier, makes no sense
  193. }
  194. return values;
  195. }
  196. } catch (Exception e) {
  197. PSLogging.logger.error("Error reading line", e);
  198. }
  199. return null;
  200. }
  201. public CSVDataSet getDataSet() {
  202. return dataSet;
  203. }
  204. }