PageRenderTime 36ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/java/racetrack/util/RFC4180CSVReader.java

https://bitbucket.org/dcode/racetrack
Java | 211 lines | 122 code | 23 blank | 66 comment | 96 complexity | d4b04cfcfedf40519f5e9f556ddc7785 MD5 | raw file
  1. /*
  2. Copyright 2019 David Trimm
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package racetrack.util;
  14. import java.io.BufferedInputStream;
  15. import java.io.BufferedReader;
  16. import java.io.File;
  17. import java.io.FileInputStream;
  18. import java.io.FileReader;
  19. import java.io.InputStream;
  20. import java.io.InputStreamReader;
  21. import java.io.IOException;
  22. import java.io.UnsupportedEncodingException;
  23. import java.util.ArrayList;
  24. import java.util.List;
  25. import java.util.zip.GZIPInputStream;
  26. import racetrack.framework.BundlesDT;
  27. /**
  28. * Read a CSV file encoded according to the RFC 4180 standard. Caveats include
  29. * - Files ending with a .gz will be automatically unzipped
  30. * - blank fields will be replaced with the notset string
  31. *
  32. *@author D. Trimm
  33. *@version 1.0
  34. */
  35. public class RFC4180CSVReader {
  36. /**
  37. * Consumer for the parsed tokens
  38. */
  39. private CSVTokenConsumer consumer;
  40. /**
  41. * Flag to indicate that parsing should continue
  42. */
  43. private boolean keep_going = true;
  44. /**
  45. * Construct the reader and run it through the file. This is the much faster version but does not support
  46. * different encodings.
  47. *
  48. *@param file csv file to parse
  49. *@param consumer consumer to direct tokens to
  50. *@param encoding null if no decoding is specified; else the specified decoding string will be used (e.g., "UTF-8")
  51. */
  52. public void BrokenRFC4180CSVReader(File file, CSVTokenConsumer consumer) throws IOException {
  53. this.consumer = consumer; BufferedReader in = null;
  54. try {
  55. // Open the file
  56. if (file.getName().toLowerCase().endsWith(".gz")) in = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))));
  57. else in = new BufferedReader(new FileReader(file));
  58. // Parse the line... and read additional lines if necessary ... finish a single row
  59. String line; int line_no = 0; while ((line = in.readLine()) != null && keep_going) { line_no++;
  60. List<String> tokens = new ArrayList<String>(); StringBuffer row = new StringBuffer(), sb = new StringBuffer();
  61. State state = State.TOKEN_START;
  62. int char_i = 0; char c = '\n', last_c = '\n'; while (char_i < line.length()) {
  63. last_c = c; c = line.charAt(char_i); char_i++; row.append(c);
  64. if (state == State.TOKEN_START) {
  65. if (c == '\"') { state = State.ESCAPED_TOKEN; c = ' '; /* reset the dquotes parser */ }
  66. else if (c == ',') { state = State.TOKEN_START; tokens.add(""); }
  67. else { state = State.NORMAL_TOKEN; sb.append(c); }
  68. } else if (state == State.NORMAL_TOKEN) {
  69. if (c == ',') { state = State.TOKEN_START; tokens.add(sb.toString()); sb = new StringBuffer(); }
  70. else { sb.append(c); }
  71. } else if (state == State.ESCAPED_TOKEN) {
  72. if (char_i == line.length() && c == '\"') { state = State.TOKEN_START; tokens.add(sb.toString()); sb = new StringBuffer(); }
  73. else if (char_i == line.length()) { line = in.readLine(); char_i = 0; sb.append(c); }
  74. else if (c == '\"' && last_c == '\"') { sb.append('\"'); c = ' '; /* reset the dquotes parser */ }
  75. else if (c == ',' && last_c == '\"') { state = State.TOKEN_START; tokens.add(sb.toString()); sb = new StringBuffer(); }
  76. else if (c == '\"') { /* wait */ }
  77. else { sb.append(c); }
  78. }
  79. }
  80. if (c == ',') tokens.add("");
  81. if (sb.length() > 0) tokens.add(sb.toString());
  82. pushTokens(tokens, row.toString(), line_no);
  83. }
  84. } catch (IOException ioe) {
  85. throw ioe;
  86. } finally {
  87. if (in != null) in.close();
  88. }
  89. }
  /**
   * Scanner states of the line-oriented parser: at the start of a field, inside a double-quoted
   * field, or inside an unquoted field.
   */
  enum State {
    TOKEN_START,
    ESCAPED_TOKEN,
    NORMAL_TOKEN
  }
  /**
   * Convenience constructor: parses the file right away, delegating to the three-argument
   * constructor with "UTF-8" as the encoding.
   *
   * @param file csv file to parse
   * @param consumer consumer to direct tokens to
   * @throws IOException propagated from the three-argument constructor
   */
  public RFC4180CSVReader(File file, CSVTokenConsumer consumer) throws IOException {
    this(file, consumer, "UTF-8");
  }
  /**
   * Construct the reader and run it through the file, one byte at a time. This is the slower
   * version... but does support different encodings.
   *
   * <p>Parsing rules implemented here: a comma ends a token and a line feed ends a row; a
   * carriage return outside quotes is skipped; inside a double-quoted token commas and line
   * breaks are kept, and a doubled double quote yields a single one. When {@code encoding} is
   * null each byte is used directly as a char; otherwise the token's bytes are decoded with the
   * named charset. The raw row text handed to the consumer is always the undecoded bytes.
   *
   * <p>A last row that is not terminated by a line feed is still delivered to the consumer.
   *
   * @param file csv file to parse; a name ending in ".gz" (any case) is unzipped on the fly
   * @param consumer consumer to direct tokens to
   * @param encoding null if no decoding is specified; else the specified decoding string will be used (e.g., "UTF-8")
   * @throws IOException if the file cannot be opened or read, or the encoding name is not supported
   */
  public RFC4180CSVReader(File file, CSVTokenConsumer consumer, String encoding) throws IOException {
    this.consumer = consumer;
    List<Byte> bytes = new ArrayList<Byte>();
    InputStream in = null;
    try {
      // Build the stream chain step by step: `in` is only reassigned after a wrapper has been
      // constructed successfully, so the finally block closes whatever was opened so far.
      in = new FileInputStream(file);
      if (file.getName().toLowerCase().endsWith(".gz")) in = new GZIPInputStream(in);
      in = new BufferedInputStream(in, 1024*1024*8);

      int line_no = 1;
      boolean in_dquotes = false, last_was_dquote = false;
      List<String> tokens = new ArrayList<String>();
      StringBuffer sb = new StringBuffer();
      StringBuilder line = new StringBuilder();

      // read() returning -1 is the end-of-stream signal; available() is only an estimate.
      int c;
      while (keep_going && (c = in.read()) != -1) {
        line.append((char) c);
        if (in_dquotes) {
          if (last_was_dquote) {
            // The previous byte was a quote inside a quoted token: this byte decides its meaning
            if (c == ',') { /* End Token */
              addToken(tokens, sb, bytes, encoding);
              in_dquotes = false;
              last_was_dquote = false;
            } else if (c == '\r') { /* Ignore */
              in_dquotes = false;
              last_was_dquote = false;
            } else if (c == '\n') { /* End Token, End Line */
              addToken(tokens, sb, bytes, encoding);
              pushTokens(tokens, line.toString(), line_no);
              line.setLength(0);
              tokens.clear();
              in_dquotes = false;
              last_was_dquote = false;
              line_no++;
            } else { /* Includes the second quote of a doubled quote */
              sb.append((char) c);
              bytes.add((byte) c);
              last_was_dquote = false;
            }
          } else if (c == '\"') {
            last_was_dquote = true;
          } else {
            sb.append((char) c);
            bytes.add((byte) c);
          }
        } else if (c == ',') { /* End Token */
          addToken(tokens, sb, bytes, encoding);
        } else if (c == '\"') { /* Enter Quotes */
          in_dquotes = true;
        } else if (c == '\r') { /* Ignore */
        } else if (c == '\n') { /* End Token, End Line */
          addToken(tokens, sb, bytes, encoding);
          pushTokens(tokens, line.toString(), line_no);
          line.setLength(0);
          tokens.clear();
          line_no++;
        } else {
          sb.append((char) c);
          bytes.add((byte) c);
        }
      }

      // Flush a final row that was not terminated by a line feed
      if (keep_going && line.length() > 0) {
        addToken(tokens, sb, bytes, encoding);
        pushTokens(tokens, line.toString(), line_no);
      }
    } finally {
      if (in != null) in.close();
    }
  }
  131. /**
  132. * Add a token -- if encoding is set, use the bytes list to decode the token. Otherwise, use the stringbuffer version. After token is added, clear both structures.
  133. */
  134. private void addToken(List<String> tokens, StringBuffer sb, List<Byte> bytes, String encoding) throws UnsupportedEncodingException {
  135. // System.err.print("sb = \"" + sb.toString() + "\" bytes.size() = " + bytes.size() + " :::: ");
  136. if (encoding == null) { tokens.add(sb.toString()); } else {
  137. byte as_array[] = new byte[bytes.size()];
  138. for (int i=0;i<as_array.length;i++) as_array[i] = bytes.get(i);
  139. tokens.add(new String(as_array, encoding));
  140. }
  141. // System.err.println("added token \"" + tokens.get(tokens.size()-1) + "\"");
  142. sb.delete(0,sb.length()); bytes.clear();
  143. }
  144. /**
  145. *
  146. */
  147. private void pushTokens(List<String> tokens, String line, int line_no) {
  148. String array[] = new String[tokens.size()];
  149. for (int i=0;i<array.length;i++) {
  150. array[i] = tokens.get(i);
  151. if (array[i] == null || array[i].length() == 0) array[i] = BundlesDT.NOTSET;
  152. }
  153. keep_going = consumer.consume(array, line, line_no);
  154. }
  155. /**
  156. *
  157. */
  158. public static void main(String args[]) {
  159. try {
  160. /* RFC4180CSVReader reader = */ new RFC4180CSVReader(new File(args[0]), new CSVTokenConsumer() {
  161. public boolean consume(String tokens[], String line, int line_no) {
  162. System.out.println("@ " + line_no + " | Tokens = " + tokens.length);
  163. for (int i=0;i<tokens.length;i++) System.out.println(" T[" + i + "] = \"" + tokens[i] + "\" (" + BundlesDT.getEntityDataType(tokens[i]) + ")");
  164. return true;
  165. } public void commentLine(String line) { } },"UTF-8" );
  166. long ts0 = System.currentTimeMillis();
  167. /* reader = */ new RFC4180CSVReader(new File(args[0]), new NullConsumer());
  168. long ts1 = System.currentTimeMillis();
  169. /* reader = */ new RFC4180CSVReader(new File(args[0]), new NullConsumer(), null);
  170. long ts2 = System.currentTimeMillis();
  171. System.err.println("Time | line (new) = " + (ts1 - ts0) + " ms | byte (old) = " + (ts2 - ts1) + " ms");
  172. } catch (IOException ioe) {
  173. System.err.println("IOException: " + ioe);
  174. }
  175. }
  176. }
  177. /**
  178. * Null consumer
  179. */
  180. class NullConsumer implements CSVTokenConsumer {
  181. public boolean consume (String tokens[], String line, int line_no) { return true; }
  182. public void commentLine(String line) { }
  183. }