RFC4180CSVReader.java

/src/main/java/racetrack/util/RFC4180CSVReader.java

https://bitbucket.org/dcode/racetrack
Java | 211 lines | 122 code | 23 blank | 66 comment | 96 complexity | d4b04cfcfedf40519f5e9f556ddc7785 MD5 | raw file

/* 

Copyright 2019 David Trimm

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

*/
package racetrack.util;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

import racetrack.framework.BundlesDT;

/**
 * Read a CSV file encoded using the RFC4180 standard.  The whole file is parsed
 * inside the constructor; each completed row is handed to the supplied consumer.
 * Caveats include
 * - Files ending with a .gz will be automatically unzipped
 * - blank fields will be replaced with the notset string
 *
 *@author  D. Trimm
 *@version 1.0
 */
public class RFC4180CSVReader {
  /**
   * Size of the read buffer wrapped around the (possibly gzipped) file stream
   */
  private static final int BUFFER_SIZE = 1024*1024*8;

  /**
   * Consumer for the parsed tokens
   */
  private CSVTokenConsumer consumer;
  /**
   * Flag to indicate that parsing should continue; updated with the return value
   * of every call to the consumer
   */
  private boolean          keep_going = true;

  /**
   * Line-oriented parser.  Because of its void return type this is an ordinary method, not a
   * constructor -- {@code new RFC4180CSVReader(file, consumer)} uses the byte-oriented constructors instead.
   * It decodes the file with the platform default charset and does not support other encodings.
   *
   * Limitations visible in the implementation:
   * - when a quoted field spans several physical lines, the line break itself is not added to the token
   * - the reported line number only counts the first physical line of each record
   *
   *@param file         csv file to parse
   *@param consumer     consumer to direct tokens to
   *
   *@throws IOException if the file cannot be opened or read
   */
  public void BrokenRFC4180CSVReader(File file, CSVTokenConsumer consumer) throws IOException {
    this.consumer = consumer;
    BufferedReader in = new BufferedReader(new InputStreamReader(openPossiblyGzipped(file)));
    try {
      // Parse the line... and read additional lines if necessary ... finish a single row
      String line; int line_no = 0;
      while ((line = in.readLine()) != null && keep_going) {
        line_no++;
        List<String>  tokens = new ArrayList<String>();
        StringBuilder row    = new StringBuilder(), sb = new StringBuilder();
        State         state  = State.TOKEN_START;

        int char_i = 0; char c = '\n', last_c = '\n';
        while (char_i < line.length()) {
          last_c = c; c = line.charAt(char_i); char_i++; row.append(c);

          if (state == State.TOKEN_START) {
            if (c == '"') {
              state = State.ESCAPED_TOKEN;
              c = ' '; // reset the dquotes parser so the opening quote is not mistaken for a closing one
            } else if (c == ',') {
              tokens.add("");
            } else {
              state = State.NORMAL_TOKEN;
              sb.append(c);
            }
          } else if (state == State.NORMAL_TOKEN) {
            if (c == ',') {
              state = State.TOKEN_START;
              tokens.add(sb.toString()); sb.setLength(0);
            } else {
              sb.append(c);
            }
          } else if (state == State.ESCAPED_TOKEN) {
            boolean at_eol = (char_i == line.length());
            if (at_eol && c == '"') {
              // Closing quote is the last character of the line
              state = State.TOKEN_START;
              tokens.add(sb.toString()); sb.setLength(0);
            } else if (at_eol) {
              // Quoted field continues on the next physical line
              line = in.readLine(); char_i = 0; sb.append(c);
              // End of file inside an unterminated quoted field: stop scanning instead of
              // dereferencing null; the partial token is flushed below
              if (line == null) line = "";
            } else if (c == '"' && last_c == '"') {
              sb.append('"');
              c = ' '; // reset the dquotes parser
            } else if (c == ',' && last_c == '"') {
              state = State.TOKEN_START;
              tokens.add(sb.toString()); sb.setLength(0);
            } else if (c == '"') {
              // wait -- the next character decides whether this is an escape or a closing quote
            } else {
              sb.append(c);
            }
          }
        }

        if (c == ',')        tokens.add("");            // trailing comma means a trailing empty field
        if (sb.length() > 0) tokens.add(sb.toString());

        pushTokens(tokens, row.toString(), line_no);
      }
    } finally {
      in.close();
    }
  }

  /**
   * Parser states used by {@link #BrokenRFC4180CSVReader(File, CSVTokenConsumer)}
   */
  enum State { TOKEN_START, ESCAPED_TOKEN, NORMAL_TOKEN }

  /**
   * Constructor... just calls the other version with "UTF-8" as the encoding.
   *
   *@param file         csv file to parse
   *@param consumer     consumer to direct tokens to
   *
   *@throws IOException if the file cannot be opened or read
   */
  public RFC4180CSVReader(File file, CSVTokenConsumer consumer) throws IOException { this(file, consumer, "UTF-8"); }

  /**
   * Construct the reader and run it through the file one byte at a time.  Delimiters (comma, double quote,
   * carriage return, line feed) are matched on single bytes, and the bytes of each field are decoded
   * afterwards -- so the encoding is expected to represent those delimiters as their single ASCII bytes
   * (true for e.g. "UTF-8").
   *
   * The raw line handed to the consumer is built from the undecoded bytes (one char per byte) and still
   * contains its line terminator.  A last record that is not terminated by a line feed is delivered as well.
   *
   *@param file         csv file to parse
   *@param consumer     consumer to direct tokens to
   *@param encoding     null if no decoding is specified (each byte becomes one char); else the specified
   *                    decoding string will be used (e.g., "UTF-8")
   *
   *@throws IOException if the file cannot be opened or read, or if the encoding name is not supported
   */
  public RFC4180CSVReader(File file, CSVTokenConsumer consumer, String encoding) throws IOException {
    this.consumer = consumer;
    InputStream in = new BufferedInputStream(openPossiblyGzipped(file), BUFFER_SIZE);
    try {
      List<String>          tokens = new ArrayList<String>();
      ByteArrayOutputStream token  = new ByteArrayOutputStream();
      StringBuilder         line   = new StringBuilder();
      int                   line_no = 1;
      boolean               in_dquotes = false, last_was_dquote = false;

      // read() returning -1 is the only reliable end-of-stream signal; available() is just an estimate
      int c;
      while (keep_going && (c = in.read()) != -1) {
        line.append((char) c);

        if (in_dquotes) {
          boolean is_delimiter = (c == ',' || c == '\r' || c == '\n');
          if (last_was_dquote && is_delimiter) {
            // The previous double quote closed the field; handle the delimiter as unquoted input below
            in_dquotes = false; last_was_dquote = false;
          } else {
            if      (last_was_dquote) { /* Escaped dquote (or stray char) */ token.write(c); last_was_dquote = false; }
            else if (c == '"')        { /* Escape or closing quote?       */ last_was_dquote = true; }
            else                      { token.write(c); }
            continue;
          }
        }

        if        (c == ',')  { /* End Token           */ addToken(tokens, token, encoding);
        } else if (c == '"')  { /* Enter Quotes        */ in_dquotes = true;
        } else if (c == '\r') { /* Ignore              */
        } else if (c == '\n') { /* End Token, End Line */ addToken(tokens, token, encoding);
                                                          pushTokens(tokens, line.toString(), line_no);
                                                          line.setLength(0); tokens.clear(); line_no++;
        } else                { token.write(c); }
      }

      // Input ended without a final line feed: deliver whatever was collected for the last record
      if (line.length() > 0) {
        addToken(tokens, token, encoding);
        pushTokens(tokens, line.toString(), line_no);
      }
    } finally {
      in.close();
    }
  }

  /**
   * Open the file as a raw byte stream, adding gzip decompression when the file name ends with ".gz"
   * (case-insensitive).  The file handle is closed again if the gzip wrapper cannot be created.
   *
   *@param file file to open
   *
   *@return unbuffered stream over the (decompressed) file contents
   *
   *@throws IOException if the file cannot be opened or is not valid gzip data
   */
  private static InputStream openPossiblyGzipped(File file) throws IOException {
    InputStream raw = new FileInputStream(file);
    if (file.getName().toLowerCase().endsWith(".gz") == false) return raw;
    boolean wrapped = false;
    try {
      InputStream unzipped = new GZIPInputStream(raw);
      wrapped = true;
      return unzipped;
    } finally {
      if (wrapped == false) raw.close();
    }
  }

  /**
   * Add a token -- decode the collected bytes with the specified encoding, or one char per byte
   * (ISO-8859-1) if no encoding is set.  After the token is added, the byte buffer is cleared.
   *
   *@param tokens   token list for the current row
   *@param bytes    bytes collected for the current field
   *@param encoding charset name, or null for no decoding
   *
   *@throws UnsupportedEncodingException if the encoding name is not supported
   */
  private static void addToken(List<String> tokens, ByteArrayOutputStream bytes, String encoding) throws UnsupportedEncodingException {
    tokens.add(bytes.toString(encoding == null ? "ISO-8859-1" : encoding));
    bytes.reset();
  }

  /**
   * Hand a completed row to the consumer, replacing null or empty tokens with the notset string.
   * The consumer's return value decides whether parsing continues.
   *
   *@param tokens  tokens of the row
   *@param line    raw text of the row
   *@param line_no line number of the row
   */
  private void pushTokens(List<String> tokens, String line, int line_no) {
    String array[] = new String[tokens.size()];
    for (int i=0;i<array.length;i++) {
      array[i] = tokens.get(i);
      if (array[i] == null || array[i].length() == 0) array[i] = BundlesDT.NOTSET;
    }
    keep_going = consumer.consume(array, line, line_no);
  }

  /**
   * Test harness: print every token of the csv file named by the first argument, then time two
   * further passes over the same file (one decoding as UTF-8, one without decoding).
   *
   *@param args command line arguments; args[0] is the csv file to parse
   */
  public static void main(String args[]) {
    if (args.length < 1) {
      System.err.println("Usage: RFC4180CSVReader <csv-file>");
      return;
    }
    try {
      File file = new File(args[0]);

      new RFC4180CSVReader(file, new CSVTokenConsumer() {
        public boolean consume(String tokens[], String line, int line_no) {
          System.out.println("@ " + line_no + " | Tokens = " + tokens.length);
          for (int i=0;i<tokens.length;i++) System.out.println("  T[" + i + "] = \"" + tokens[i] + "\" (" + BundlesDT.getEntityDataType(tokens[i]) + ")");
          return true;
        }
        public void commentLine(String line) { }
      }, "UTF-8");

      // Both timed runs go through the byte-oriented constructor: the first decodes as UTF-8
      // (via the two argument constructor), the second passes a null encoding
      long ts0 = System.currentTimeMillis();
      new RFC4180CSVReader(file, new NullConsumer());
      long ts1 = System.currentTimeMillis();
      new RFC4180CSVReader(file, new NullConsumer(), null);
      long ts2 = System.currentTimeMillis();

      System.err.println("Time | line (new) = " + (ts1 - ts0) + " ms | byte (old) = " + (ts2 - ts1) + " ms");

    } catch (IOException ioe) {
      System.err.println("IOException: " + ioe);
    }
  }
}

/**
 * Consumer that discards everything it is handed; used by the reader's main method for timing runs.
 */
class NullConsumer implements CSVTokenConsumer {
  /**
   * Ignore the row.
   *
   *@return always true
   */
  public boolean consume(String tokens[], String line, int line_no) {
    return true;
  }

  /**
   * Ignore the comment line.
   */
  public void commentLine(String line) {
    // intentionally empty
  }
}