/src/main/java/racetrack/util/RFC4180CSVReader.java
Java | 211 lines | 122 code | 23 blank | 66 comment | 96 complexity | d4b04cfcfedf40519f5e9f556ddc7785 MD5 | raw file
- /*
- Copyright 2019 David Trimm
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
- package racetrack.util;
- import java.io.BufferedInputStream;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileReader;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.IOException;
- import java.io.UnsupportedEncodingException;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.zip.GZIPInputStream;
- import racetrack.framework.BundlesDT;
- /**
- * Read a CSV encoded using the RFC4180 Standard file. Caveats include
- * - Files ending with a .gz will be automatically unzipped
- * - blank fields will be replaced with the notset string
- *
- *@author D. Trimm
- *@version 1.0
- */
- public class RFC4180CSVReader {
- /**
- * Consumer for the parsed tokens
- */
- private CSVTokenConsumer consumer;
- /**
- * Flag to indicate that parsing should continue
- */
- private boolean keep_going = true;
- /**
- * Construct the reader and run it through the file. This is the much faster version but does not support
- * different encodings.
- *
- *@param file csv file to parse
- *@param consumer consumer to direct tokens to
- *@param encoding null if no decoding is specified; else the specified decoding string will be used (e.g., "UTF-8")
- */
- public void BrokenRFC4180CSVReader(File file, CSVTokenConsumer consumer) throws IOException {
- this.consumer = consumer; BufferedReader in = null;
- try {
- // Open the file
- if (file.getName().toLowerCase().endsWith(".gz")) in = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))));
- else in = new BufferedReader(new FileReader(file));
- // Parse the line... and read additional lines if necessary ... finish a single row
- String line; int line_no = 0; while ((line = in.readLine()) != null && keep_going) { line_no++;
- List<String> tokens = new ArrayList<String>(); StringBuffer row = new StringBuffer(), sb = new StringBuffer();
- State state = State.TOKEN_START;
- int char_i = 0; char c = '\n', last_c = '\n'; while (char_i < line.length()) {
- last_c = c; c = line.charAt(char_i); char_i++; row.append(c);
- if (state == State.TOKEN_START) {
- if (c == '\"') { state = State.ESCAPED_TOKEN; c = ' '; /* reset the dquotes parser */ }
- else if (c == ',') { state = State.TOKEN_START; tokens.add(""); }
- else { state = State.NORMAL_TOKEN; sb.append(c); }
- } else if (state == State.NORMAL_TOKEN) {
- if (c == ',') { state = State.TOKEN_START; tokens.add(sb.toString()); sb = new StringBuffer(); }
- else { sb.append(c); }
- } else if (state == State.ESCAPED_TOKEN) {
- if (char_i == line.length() && c == '\"') { state = State.TOKEN_START; tokens.add(sb.toString()); sb = new StringBuffer(); }
- else if (char_i == line.length()) { line = in.readLine(); char_i = 0; sb.append(c); }
- else if (c == '\"' && last_c == '\"') { sb.append('\"'); c = ' '; /* reset the dquotes parser */ }
- else if (c == ',' && last_c == '\"') { state = State.TOKEN_START; tokens.add(sb.toString()); sb = new StringBuffer(); }
- else if (c == '\"') { /* wait */ }
- else { sb.append(c); }
- }
- }
- if (c == ',') tokens.add("");
- if (sb.length() > 0) tokens.add(sb.toString());
- pushTokens(tokens, row.toString(), line_no);
- }
- } catch (IOException ioe) {
- throw ioe;
- } finally {
- if (in != null) in.close();
- }
- }
- enum State { TOKEN_START, ESCAPED_TOKEN, NORMAL_TOKEN };
- /**
- * Constructor... just calls the other version...
- */
- public RFC4180CSVReader(File file, CSVTokenConsumer consumer) throws IOException { this(file, consumer, "UTF-8"); }
- /**
- * Construct the reader and run it through the file. This is the slower version... but does support different encodings.
- *
- *@param file csv file to parse
- *@param consumer consumer to direct tokens to
- *@param encoding null if no decoding is specified; else the specified decoding string will be used (e.g., "UTF-8")
- *@param old_version just to differentiate the newer version
- */
- public RFC4180CSVReader(File file, CSVTokenConsumer consumer, String encoding) throws IOException {
- this.consumer = consumer; InputStream in = null; List<Byte> bytes = new ArrayList<Byte>();
- try {
- if (file.getName().toLowerCase().endsWith(".gz")) in = new BufferedInputStream(new GZIPInputStream(new FileInputStream(file)), 1024*1024*8);
- else in = new BufferedInputStream(new FileInputStream(file), 1024*1024*8);
- int line_no = 1; boolean in_dquotes = false, last_was_dquote = false; List<String> tokens = new ArrayList<String>(); StringBuffer sb = new StringBuffer(), line = new StringBuffer();
- while (in.available() > 0 && keep_going) {
- int c = in.read(); line.append((char) c);
- if (c == -1) { /* Shouldn't Happen */ tokens.add(sb.toString()); pushTokens(tokens, line.toString(), line_no); sb.delete(0,sb.length()); line.delete(0,line.length()); tokens.clear(); line_no++;
- } else if (in_dquotes) {
- if (last_was_dquote) {
- if (c == ',') { /* End Token */ addToken(tokens, sb, bytes, encoding); in_dquotes = false; last_was_dquote = false;
- } else if (c == '\r') { /* Ingore */ in_dquotes = false; last_was_dquote = false;
- } else if (c == '\n') { /* End Token, End Line */ addToken(tokens, sb, bytes, encoding); pushTokens(tokens, line.toString(), line_no); line.delete(0,line.length()); tokens.clear(); in_dquotes = false; line_no++; last_was_dquote = false;
- } else { sb.append((char) c); bytes.add((byte) c); last_was_dquote = false; }
- } else if (c == '\"') { last_was_dquote = true;
- } else { sb.append((char) c); bytes.add((byte) c); }
- } else if (c == ',') { /* End Token */ addToken(tokens, sb, bytes, encoding);
- } else if (c == '\"') { /* Enter Quotes */ in_dquotes = true;
- } else if (c == '\r') { /* Ignore */
- } else if (c == '\n') { /* End Token, End Line */ addToken(tokens, sb, bytes, encoding); pushTokens(tokens, line.toString(), line_no); line.delete(0,line.length()); tokens.clear(); line_no++;
- } else { sb.append((char) c); bytes.add((byte) c); }
- }
- in.close(); in = null;
- } catch (IOException ioe) { throw ioe;
- } finally { if (in != null) in.close();
- }
- }
- /**
- * Add a token -- if encoding is set, use the bytes list to decode the token. Otherwise, use the stringbuffer version. After token is added, clear both structures.
- */
- private void addToken(List<String> tokens, StringBuffer sb, List<Byte> bytes, String encoding) throws UnsupportedEncodingException {
- // System.err.print("sb = \"" + sb.toString() + "\" bytes.size() = " + bytes.size() + " :::: ");
- if (encoding == null) { tokens.add(sb.toString()); } else {
- byte as_array[] = new byte[bytes.size()];
- for (int i=0;i<as_array.length;i++) as_array[i] = bytes.get(i);
- tokens.add(new String(as_array, encoding));
- }
- // System.err.println("added token \"" + tokens.get(tokens.size()-1) + "\"");
- sb.delete(0,sb.length()); bytes.clear();
- }
- /**
- *
- */
- private void pushTokens(List<String> tokens, String line, int line_no) {
- String array[] = new String[tokens.size()];
- for (int i=0;i<array.length;i++) {
- array[i] = tokens.get(i);
- if (array[i] == null || array[i].length() == 0) array[i] = BundlesDT.NOTSET;
- }
- keep_going = consumer.consume(array, line, line_no);
- }
- /**
- *
- */
- public static void main(String args[]) {
- try {
- /* RFC4180CSVReader reader = */ new RFC4180CSVReader(new File(args[0]), new CSVTokenConsumer() {
- public boolean consume(String tokens[], String line, int line_no) {
- System.out.println("@ " + line_no + " | Tokens = " + tokens.length);
- for (int i=0;i<tokens.length;i++) System.out.println(" T[" + i + "] = \"" + tokens[i] + "\" (" + BundlesDT.getEntityDataType(tokens[i]) + ")");
- return true;
- } public void commentLine(String line) { } },"UTF-8" );
- long ts0 = System.currentTimeMillis();
- /* reader = */ new RFC4180CSVReader(new File(args[0]), new NullConsumer());
- long ts1 = System.currentTimeMillis();
- /* reader = */ new RFC4180CSVReader(new File(args[0]), new NullConsumer(), null);
- long ts2 = System.currentTimeMillis();
- System.err.println("Time | line (new) = " + (ts1 - ts0) + " ms | byte (old) = " + (ts2 - ts1) + " ms");
- } catch (IOException ioe) {
- System.err.println("IOException: " + ioe);
- }
- }
- }
- /**
- * Null consumer
- */
- class NullConsumer implements CSVTokenConsumer {
- public boolean consume (String tokens[], String line, int line_no) { return true; }
- public void commentLine(String line) { }
- }