PageRenderTime 44ms CodeModel.GetById 9ms app.highlight 29ms RepoModel.GetById 1ms app.codeStats 1ms

/edu/uncc/parsets/data/old/CSVParser.java

https://code.google.com/p/parsets/
Java | 226 lines | 168 code | 22 blank | 36 comment | 49 complexity | 667f3e7ef038064fb629ad6bbdf093c5 MD5 | raw file
  1package edu.uncc.parsets.data.old;
  2
  3import java.io.BufferedReader;
  4import java.io.File;
  5import java.io.FileNotFoundException;
  6import java.io.FileReader;
  7import java.io.IOException;
  8
  9import au.com.bytecode.opencsv.CSVReader;
 10import edu.uncc.parsets.data.LocalDB;
 11import edu.uncc.parsets.util.PSLogging;
 12
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
 * Copyright (c) 2009, Robert Kosara, Caroline Ziemkiewicz,
 *                     and others (see Authors.txt for full list)
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of UNC Charlotte nor the names of its contributors
 *      may be used to endorse or promote products derived from this software
 *      without specific prior written permission.
 *      
 * THIS SOFTWARE IS PROVIDED BY ITS AUTHORS ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 41
 42public class CSVParser {
 43
 44	private String csvFileName;
 45
 46	private CSVParserListener callBack;
 47
 48	private CSVDataSet dataSet;
 49
 50	private char separator = ';';
 51
 52	private String[] columns;
 53
 54	private CSVReader parser;
 55		
 56	public CSVParser(String fileName, CSVParserListener receiver) {
 57		csvFileName = fileName;
 58		callBack = receiver;
 59		dataSet = new CSVDataSet(fileName);
 60	}
 61
 62	/**
 63	 * Call the parser to read in a dataset, and parse the XML metadata file if
 64	 * it exists. The parser runs in a separate thread, which is returned. The
 65	 * caller can wait for the thread to finish using {@link Thread#join()},
 66	 * or provide a callback that receives progress notifications.
 67	 */
 68	public Thread analyzeCSVFile() {
 69		MetaDataParser mp = new MetaDataParser();
 70		String metafilename = csvFileName.substring(0, csvFileName.lastIndexOf('.'))
 71				+ ".xml";
 72		if (new File(metafilename).exists()) {
 73			mp.parse(dataSet, metafilename);
 74		} else {
 75			if (metafilename.contains("_")) {
 76				metafilename = metafilename.substring(0, metafilename.lastIndexOf("_"))+".xml";
 77				if (new File(metafilename).exists()) {
 78					mp.parse(dataSet, metafilename);
 79					String name = new File(csvFileName).getName();
 80					name = name.substring(0, name.lastIndexOf('.'));
 81					name = name.replace('_', ' ');
 82					dataSet.setName(name);
 83				}
 84			}
 85		}
 86		Thread t = new Thread() {
 87			public void run() {
 88				analyzeFile();
 89			}
 90		};
 91		t.start();
 92		return t;
 93	}
 94	
 95	private void analyzeFile() {
 96		
 97		float numLinesEstimate = 1000;
 98
 99		BufferedReader reader = null;
100
101		try {
102			reader = new BufferedReader(new FileReader(csvFileName));
103			String firstLine = reader.readLine();
104			// guess separator char
105			int numCommas = 0;
106			int numSemicolons = 0;
107			if (firstLine != null) {
108				for (int i = 0; i < firstLine.length(); i++) {
109					char c = firstLine.charAt(i);
110					if (c == ',')
111						numCommas++;
112					else if (c == ';')
113						numSemicolons++;
114				}
115				if (numCommas > numSemicolons)
116					separator = ',';
117			}
118			CSVReader parser = new CSVReader(new FileReader(csvFileName), separator);
119			String[] headerLine = parser.readNext();
120			for (String columnName : headerLine)
121				dataSet.instantiateDimension(columnName);
122
123			int numColumns = headerLine.length;
124			
125			if (firstLine != null) {
126				int numBytes = firstLine.length()+1;
127				int numLines = 1;
128				String columns[];
129				while (((columns = parser.readNext()) != null) && (numLines < 100)) {
130					if (columns.length != numColumns) {
131						PSLogging.logger.error("Found "+columns.length+" columns instead of "+numColumns+" in line "+numLines);
132						if (callBack != null)
133							callBack.errorWrongNumberOfColumns(numColumns, columns.length, numLines);
134						return;
135					}
136					numLines++;
137					for (int i = 0; i < columns.length; i++) {
138						numBytes += columns[i].length()+1;
139						dataSet.getDimension(i).addValue(columns[i]);
140					}
141					dataSet.setNumRecords(numLines);
142				}
143				File f = new File(csvFileName);
144				numLinesEstimate = (int) (f.length() / numBytes) * numLines;
145				numLinesEstimate /= 100f; // to scale from 0 to 100
146
147				while (columns != null) {
148					if (columns.length != numColumns) {
149						PSLogging.logger.error("Found "+columns.length+" columns instead of "+numColumns+" in line "+numLines);
150						if (callBack != null)
151							callBack.errorWrongNumberOfColumns(numColumns, columns.length, numLines);
152						return;
153					}
154					numLines++;
155					if ((numLines & 0xff) == 0 && callBack != null)
156						callBack.setProgress((int)(numLines/numLinesEstimate));
157					for (int i = 0; i < columns.length; i++)
158						dataSet.getDimension(i).addValue(columns[i]);
159					dataSet.setNumRecords(numLines);
160					columns = parser.readNext();
161				}
162			}
163		} catch (FileNotFoundException e) {
164			PSLogging.logger.error("File not found: "+csvFileName, e);
165			if (callBack != null)
166				callBack.errorFileNotFound(csvFileName);
167		} catch (IOException e) {
168			PSLogging.logger.error("IOException while reading file: "+csvFileName, e);
169			if (callBack != null)
170				callBack.errorReadingFile(csvFileName);
171		} finally {
172			try {
173				reader.close();
174			} catch (IOException e) {
175				PSLogging.logger.error("IOException while closing file: "+csvFileName, e);
176			}
177		}
178		if (callBack != null)
179			callBack.setDataSet(dataSet);
180	}
181	
182	public void streamToDB(LocalDB db) {
183		try {
184			parser = new CSVReader(new FileReader(csvFileName), separator);
185			columns = parser.readNext();
186			db.addLocalDBDataSet(dataSet, this);
187			if (callBack != null)
188				callBack.importDone();
189		} catch (Exception e) {
190			PSLogging.logger.error("Error streaming data", e);
191			if (callBack != null)
192				callBack.errorReadingFile(csvFileName);
193		}
194	}
195
196	public float[] readNextLine() {
197		try {
198			columns = parser.readNext();
199			if (columns != null) {
200				float values[] = new float[columns.length];
201				for (int i = 0; i < columns.length; i++) {
202					switch (dataSet.getDimension(i).getDataType()) {
203					case categorical:
204						values[i] = dataSet.getDimension(i).getNumForKey(columns[i]);
205						break;
206					case numerical:
207						values[i] = Float.valueOf(columns[i]);
208						break;
209					default:
210						values[i] = 0;
211						break;
212					}
213					// return statement was here earlier, makes no sense
214				}
215				return values;
216			}
217		} catch (Exception e) {
218			PSLogging.logger.error("Error reading line", e);
219		}
220		return null;
221	}
222	
223	public CSVDataSet getDataSet() {
224		return dataSet;
225	}
226}