/src/CsvHelper/CsvParser.cs

https://github.com/JoshClose/CsvHelper · C# · 1166 lines · 875 code · 192 blank · 99 comment · 224 complexity · 5d986ec7ab74d4e81ab3d16fa5935b92 MD5 · raw file

  1. // Copyright 2009-2021 Josh Close
  2. // This file is a part of CsvHelper and is dual licensed under MS-PL and Apache 2.0.
  3. // See LICENSE.txt for details or visit http://www.opensource.org/licenses/ms-pl.html for MS-PL and http://opensource.org/licenses/Apache-2.0 for Apache 2.0.
  4. // https://github.com/JoshClose/CsvHelper
  5. using CsvHelper.Configuration;
  6. using System;
  7. using System.Buffers;
  8. using System.Collections.Generic;
  9. using System.Diagnostics;
  10. using System.Globalization;
  11. using System.IO;
  12. using System.Linq;
  13. using System.Runtime.CompilerServices;
  14. using System.Text;
  15. using System.Text.RegularExpressions;
  16. using System.Threading.Tasks;
  17. namespace CsvHelper
  18. {
  19. /// <summary>
  20. /// Parses a CSV file.
  21. /// </summary>
  22. public class CsvParser : IParser, IDisposable
  23. {
  // Collaborators: the caller's configuration and reader, plus the field cache used by GetField.
  private readonly CsvConfiguration configuration;
  private readonly TextReader reader;
  private readonly FieldCache fieldCache = new FieldCache();

  // Settings copied from the configuration once, in the constructor.
  private readonly bool allowComments;
  private readonly BadDataFound badDataFound;
  private readonly bool cacheFields;
  private readonly char comment;
  private readonly bool countBytes;
  private readonly string[] delimiterValues;
  private readonly bool detectDelimiter;
  private readonly Encoding encoding;
  private readonly char escape;
  private readonly bool ignoreBlankLines;
  private readonly bool isNewLineSet;
  private readonly bool leaveOpen;
  private readonly bool lineBreakInQuotedFieldIsBadData;
  private readonly CsvMode mode;
  private readonly string newLine;
  private readonly char newLineFirstChar;
  private readonly char quote;
  private readonly TrimOptions trimOptions;
  private readonly char[] whiteSpaceChars;

  // Start from the configuration but may be replaced by DetectDelimiter.
  private string delimiter;
  private char delimiterFirstChar;

  // Read buffer and the positions tracked inside it.
  private char[] buffer;
  private int bufferSize;
  private int charsRead;
  private int bufferPosition;
  private int rowStartPosition;
  private int fieldStartPosition;

  // Counters exposed through Row, RawRow, CharCount and ByteCount.
  private int row;
  private int rawRow;
  private long charCount;
  private long byteCount;

  // Fields of the current row; fieldsPosition is the number of entries in use.
  private Field[] fields;
  private int fieldsPosition;

  // Scratch buffer that receives field text after quotes/escapes are removed.
  private char[] processFieldBuffer;
  private int processFieldBufferSize;

  // State carried between characters and between buffer refills.
  private ParserState state;
  private bool inQuotes;
  private bool inEscape;
  private int quoteCount;
  private bool fieldIsBadData;
  private bool fieldIsQuoted;

  // Index of the next delimiter/newline character to match. They start at 1 because
  // ReadLine matches the first character before ReadDelimiter/ReadNewLine is called.
  private int delimiterPosition = 1;
  private int newLinePosition = 1;

  // Set while the indexer is running GetField.
  private bool isProcessingField;

  private bool disposed;
  /// <inheritdoc/>
  public long CharCount
  {
      get { return charCount; }
  }

  /// <inheritdoc/>
  public long ByteCount
  {
      get { return byteCount; }
  }

  /// <inheritdoc/>
  public int Row
  {
      get { return row; }
  }
  /// <inheritdoc/>
  public string[] Record
  {
      get
      {
          // No fields have been added for the current row.
          if (fieldsPosition == 0)
          {
              return null;
          }

          // Materialize every field through the indexer so each value is processed the same way.
          var values = new string[fieldsPosition];
          var fieldIndex = 0;
          while (fieldIndex < values.Length)
          {
              values[fieldIndex] = this[fieldIndex];
              fieldIndex++;
          }

          return values;
      }
  }
  /// <inheritdoc/>
  public string RawRecord
  {
      get
      {
          // Everything between the start of the row and the current read position.
          var rawLength = bufferPosition - rowStartPosition;
          return new string(buffer, rowStartPosition, rawLength);
      }
  }

  /// <inheritdoc/>
  public int Count
  {
      get { return fieldsPosition; }
  }

  /// <inheritdoc/>
  public int RawRow
  {
      get { return rawRow; }
  }

  /// <inheritdoc/>
  public string Delimiter
  {
      get { return delimiter; }
  }

  /// <inheritdoc/>
  public CsvContext Context { get; private set; }

  /// <inheritdoc/>
  public IParserConfiguration Configuration
  {
      get { return configuration; }
  }
  /// <inheritdoc/>
  public string this[int index]
  {
      get
      {
          // GetField may invoke the BadDataFound callback. Re-entering the indexer from
          // inside that callback is not supported, so it is rejected explicitly.
          if (isProcessingField)
          {
              var message =
                  $"You can't access {nameof(IParser)}[int] or {nameof(IParser)}.{nameof(IParser.Record)} inside of the {nameof(BadDataFound)} callback. " +
                  $"Use {nameof(BadDataFoundArgs)}.{nameof(BadDataFoundArgs.Field)} and {nameof(BadDataFoundArgs)}.{nameof(BadDataFoundArgs.RawRecord)} instead."
              ;
              throw new ParserException(Context, message);
          }

          isProcessingField = true;
          try
          {
              return GetField(index);
          }
          finally
          {
              // Always clear the flag. If GetField throws (for example a BadDataFound
              // callback that throws, or an out-of-range index) a flag left set would make
              // every later access fail with the misleading re-entrancy error above.
              isProcessingField = false;
          }
      }
  }
  /// <summary>
  /// Initializes a new instance of the <see cref="CsvParser"/> class using a
  /// <see cref="CsvConfiguration"/> created from the given culture.
  /// </summary>
  /// <param name="reader">The reader to parse from.</param>
  /// <param name="culture">The culture used to create the configuration.</param>
  /// <param name="leaveOpen">Value assigned to <c>LeaveOpen</c> on the created configuration.</param>
  public CsvParser(TextReader reader, CultureInfo culture, bool leaveOpen = false)
      : this(reader, new CsvConfiguration(culture) { LeaveOpen = leaveOpen })
  {
  }
  /// <summary>
  /// Initializes a new instance of the <see cref="CsvParser"/> class.
  /// </summary>
  /// <param name="reader">The reader to parse from.</param>
  /// <param name="configuration">The configuration. It is validated before any value is copied from it.</param>
  /// <exception cref="ArgumentNullException"><paramref name="configuration"/> is <c>null</c>.</exception>
  public CsvParser(TextReader reader, CsvConfiguration configuration)
  {
      // Fail with a descriptive exception instead of a NullReferenceException on the
      // Validate() call below. The reader is intentionally not validated here so that
      // existing behavior is preserved (Dispose tolerates a null reader).
      if (configuration == null)
      {
          throw new ArgumentNullException(nameof(configuration));
      }

      configuration.Validate();

      this.reader = reader;
      this.configuration = configuration;
      Context = new CsvContext(this);

      // Snapshot the settings into fields so the parsing loop does not go through the configuration.
      allowComments = configuration.AllowComments;
      badDataFound = configuration.BadDataFound;
      bufferSize = configuration.BufferSize;
      cacheFields = configuration.CacheFields;
      comment = configuration.Comment;
      countBytes = configuration.CountBytes;
      delimiter = configuration.Delimiter;
      delimiterFirstChar = configuration.Delimiter[0];
      delimiterValues = configuration.DetectDelimiterValues;
      detectDelimiter = configuration.DetectDelimiter;
      encoding = configuration.Encoding;
      escape = configuration.Escape;
      ignoreBlankLines = configuration.IgnoreBlankLines;
      isNewLineSet = configuration.IsNewLineSet;
      leaveOpen = configuration.LeaveOpen;
      lineBreakInQuotedFieldIsBadData = configuration.LineBreakInQuotedFieldIsBadData;
      newLine = configuration.NewLine;
      newLineFirstChar = configuration.NewLine[0];
      mode = configuration.Mode;
      processFieldBufferSize = configuration.ProcessFieldBufferSize;
      quote = configuration.Quote;
      whiteSpaceChars = configuration.WhiteSpaceChars;
      trimOptions = configuration.TrimOptions;

      // Working storage: the read buffer, the scratch buffer for processed fields,
      // and the per-row field table (grown on demand by AddField).
      buffer = new char[bufferSize];
      processFieldBuffer = new char[processFieldBufferSize];
      fields = new Field[128];
  }
  /// <inheritdoc/>
  public bool Read()
  {
      // A new row starts wherever the previous one stopped.
      fieldStartPosition = rowStartPosition = bufferPosition;
      quoteCount = fieldsPosition = 0;
      row++;
      rawRow++;

      char c = '\0';
      char cPrev = '\0';

      for (; ; )
      {
          var bufferExhausted = bufferPosition >= charsRead;
          if (bufferExhausted)
          {
              if (!FillBuffer())
              {
                  // Nothing more to read; finish whatever was in progress.
                  return ReadEndOfFile();
              }

              if (detectDelimiter && row == 1)
              {
                  DetectDelimiter();
              }
          }

          if (ReadLine(ref c, ref cPrev) == ReadLineResult.Complete)
          {
              return true;
          }
      }
  }
  /// <inheritdoc/>
  public async Task<bool> ReadAsync()
  {
      // Keep this in step with the synchronous Read method.
      rowStartPosition = bufferPosition;
      fieldStartPosition = rowStartPosition;
      fieldsPosition = 0;
      quoteCount = 0;
      row++;
      rawRow++;

      var c = '\0';
      var cPrev = c;

      while (true)
      {
          if (bufferPosition >= charsRead)
          {
              // Library code has no need to resume on the caller's synchronization context;
              // not capturing it avoids deadlocks when a caller blocks on the returned task.
              if (!await FillBufferAsync().ConfigureAwait(false))
              {
                  return ReadEndOfFile();
              }

              if (row == 1 && detectDelimiter)
              {
                  DetectDelimiter();
              }
          }

          if (ReadLine(ref c, ref cPrev) == ReadLineResult.Complete)
          {
              return true;
          }
      }
  }
  /// <summary>
  /// Chooses the delimiter from the configured candidate values by inspecting the
  /// buffered text line by line. The first line that contains at least one candidate
  /// decides: the candidate with the most occurrences on that line wins, and ties go
  /// to the candidate listed first. If no line contains a candidate, the current
  /// delimiter is left unchanged.
  /// </summary>
  private void DetectDelimiter()
  {
      if (delimiterValues == null || delimiterValues.Length == 0)
      {
          // Nothing to choose from; keep the configured delimiter.
          return;
      }

      var text = new string(buffer, 0, charsRead);
      var lineEndingChars = new[] { '\r', '\n' };
      var lineStart = 0;

      while (lineStart < text.Length)
      {
          // Find where the next line starts. The line ending is kept as part of the current line.
          int nextLineStart;
          if (isNewLineSet)
          {
              // Ordinal search: a culture-sensitive IndexOf can fail to find
              // control-character sequences such as "\n" on some platforms.
              var newLineIndex = text.IndexOf(newLine, lineStart, StringComparison.Ordinal);
              nextLineStart = newLineIndex > -1 ? newLineIndex + newLine.Length : text.Length;
          }
          else
          {
              // No explicit newline configured: split on \r, \n or \r\n, the same
              // line endings ReadLine accepts in this case.
              var lineEndingIndex = text.IndexOfAny(lineEndingChars, lineStart);
              if (lineEndingIndex < 0)
              {
                  nextLineStart = text.Length;
              }
              else
              {
                  nextLineStart = lineEndingIndex + 1;
                  if (text[lineEndingIndex] == '\r' && nextLineStart < text.Length && text[nextLineStart] == '\n')
                  {
                      nextLineStart++;
                  }
              }
          }

          var line = text.Substring(lineStart, nextLineStart - lineStart);

          string bestDelimiter = null;
          var bestCount = 0;
          foreach (var candidate in delimiterValues)
          {
              if (string.IsNullOrEmpty(candidate))
              {
                  // An empty candidate would match at every position and cannot be a delimiter.
                  continue;
              }

              // Regex.Escape makes the candidate match literally.
              var count = Regex.Matches(line, Regex.Escape(candidate)).Count;
              if (count > bestCount)
              {
                  // Strictly greater keeps the first-listed candidate on ties.
                  bestCount = count;
                  bestDelimiter = candidate;
              }
          }

          if (bestDelimiter != null)
          {
              delimiter = bestDelimiter;
              delimiterFirstChar = delimiter[0];
              configuration.Validate();
              return;
          }

          lineStart = nextLineStart;
      }
  }
  /// <summary>
  /// Consumes characters from the buffer until a row is complete or the buffer runs out.
  /// </summary>
  /// <param name="c">The current character; carried across calls so parsing can resume after a refill.</param>
  /// <param name="cPrev">The character read before <paramref name="c"/>.</param>
  /// <returns>
  /// <see cref="ReadLineResult.Complete"/> when a row has been read;
  /// <see cref="ReadLineResult.Incomplete"/> when the buffer must be refilled first.
  /// </returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private ReadLineResult ReadLine(ref char c, ref char cPrev)
  {
      while (bufferPosition < charsRead)
      {
          if (state != ParserState.None)
          {
              // Continue the state before doing anything else.
              ReadLineResult result;
              switch (state)
              {
                  case ParserState.Spaces:
                      result = ReadSpaces(ref c);
                      break;
                  case ParserState.BlankLine:
                      result = ReadBlankLine(ref c);
                      break;
                  case ParserState.Delimiter:
                      result = ReadDelimiter(ref c);
                      break;
                  case ParserState.LineEnding:
                      result = ReadLineEnding(ref c);
                      break;
                  case ParserState.NewLine:
                      result = ReadNewLine(ref c);
                      break;
                  default:
                      throw new InvalidOperationException($"Parser state '{state}' is not valid.");
              }

              var shouldReturn =
                  // Buffer needs to be filled.
                  result == ReadLineResult.Incomplete ||
                  // Done reading row.
                  result == ReadLineResult.Complete && (state == ParserState.LineEnding || state == ParserState.NewLine)
              ;

              if (result == ReadLineResult.Complete)
              {
                  state = ParserState.None;
              }

              if (shouldReturn)
              {
                  return result;
              }
          }

          cPrev = c;
          c = buffer[bufferPosition];
          bufferPosition++;
          charCount++;

          if (countBytes)
          {
              byteCount += encoding.GetByteCount(new char[] { c });
          }

          // A row that starts with the comment character or a line ending is skipped entirely.
          var isFirstCharOfRow = rowStartPosition == bufferPosition - 1;
          if (isFirstCharOfRow && (allowComments && c == comment || ignoreBlankLines && ((c == '\r' || c == '\n') && !isNewLineSet || c == newLineFirstChar && isNewLineSet)))
          {
              state = ParserState.BlankLine;
              var result = ReadBlankLine(ref c);
              if (result == ReadLineResult.Complete)
              {
                  state = ParserState.None;

                  continue;
              }
              else
              {
                  return ReadLineResult.Incomplete;
              }
          }

          if (mode == CsvMode.RFC4180)
          {
              var isFirstCharOfField = fieldStartPosition == bufferPosition - 1;
              if (isFirstCharOfField)
              {
                  if ((trimOptions & TrimOptions.Trim) == TrimOptions.Trim && ArrayHelper.Contains(whiteSpaceChars, c))
                  {
                      // Skip through whitespace. This is so we can process the field later.
                      var result = ReadSpaces(ref c);
                      if (result == ReadLineResult.Incomplete)
                      {
                          // The buffer ended inside the leading whitespace. Move the field
                          // start past it so that, after the refill, the next character is
                          // treated as the first character of the field again. Otherwise
                          // fieldIsQuoted would keep the previous field's value and a quoted
                          // field preceded by whitespace would be misparsed. The skipped
                          // whitespace would have been trimmed from the field anyway.
                          fieldStartPosition = bufferPosition;

                          return result;
                      }
                  }

                  // Fields are only quoted if the first character is a quote.
                  // If not, read until a delimiter or newline is found.
                  fieldIsQuoted = c == quote;
              }

              if (fieldIsQuoted)
              {
                  if (c == quote || c == escape)
                  {
                      quoteCount++;

                      if (!inQuotes && !isFirstCharOfField && cPrev != escape)
                      {
                          fieldIsBadData = true;
                      }
                      else if (!fieldIsBadData)
                      {
                          // Don't process field quotes after bad data has been detected.
                          inQuotes = !inQuotes;
                      }
                  }

                  if (inQuotes)
                  {
                      // Count a raw line for \r, or for \n that is not the second half of \r\n.
                      if (c == '\r' || c == '\n' && cPrev != '\r')
                      {
                          rawRow++;
                      }

                      // We don't care about anything else if we're in quotes.
                      continue;
                  }
              }
              else
              {
                  if (c == quote || c == escape)
                  {
                      // If the field isn't quoted but contains a
                      // quote or escape, it has bad data.
                      fieldIsBadData = true;
                  }
              }
          }
          else if (mode == CsvMode.Escape)
          {
              if (inEscape)
              {
                  // The character after an escape is taken as-is.
                  inEscape = false;

                  continue;
              }

              if (c == escape)
              {
                  inEscape = true;

                  continue;
              }
          }

          if (c == delimiterFirstChar)
          {
              state = ParserState.Delimiter;
              var result = ReadDelimiter(ref c);
              if (result == ReadLineResult.Incomplete)
              {
                  return result;
              }

              state = ParserState.None;

              continue;
          }

          if (!isNewLineSet && (c == '\r' || c == '\n'))
          {
              state = ParserState.LineEnding;
              var result = ReadLineEnding(ref c);
              if (result == ReadLineResult.Complete)
              {
                  state = ParserState.None;
              }

              return result;
          }

          if (isNewLineSet && c == newLineFirstChar)
          {
              state = ParserState.NewLine;
              var result = ReadNewLine(ref c);
              if (result == ReadLineResult.Complete)
              {
                  state = ParserState.None;
              }

              return result;
          }
      }

      return ReadLineResult.Incomplete;
  }
  /// <summary>
  /// Advances past consecutive whitespace characters, starting with <paramref name="c"/>.
  /// </summary>
  /// <param name="c">The current character; on completion, the first non-whitespace character read.</param>
  /// <returns>
  /// <see cref="ReadLineResult.Complete"/> once <paramref name="c"/> is not whitespace;
  /// <see cref="ReadLineResult.Incomplete"/> if the buffer ran out first.
  /// </returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private ReadLineResult ReadSpaces(ref char c)
  {
      for (; ; )
      {
          if (!ArrayHelper.Contains(whiteSpaceChars, c))
          {
              return ReadLineResult.Complete;
          }

          if (bufferPosition >= charsRead)
          {
              return ReadLineResult.Incomplete;
          }

          c = buffer[bufferPosition];
          bufferPosition += 1;
          charCount += 1;

          if (countBytes)
          {
              byteCount += encoding.GetByteCount(new char[] { c });
          }
      }
  }
  /// <summary>
  /// Skips the rest of a line that is being ignored, through its line ending.
  /// When the line ending has been read, the next row starts at the current position
  /// and the row counters are advanced.
  /// </summary>
  /// <param name="c">The current character.</param>
  /// <returns>
  /// The result of reading the line ending, or <see cref="ReadLineResult.Incomplete"/>
  /// if the buffer ran out first.
  /// </returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private ReadLineResult ReadBlankLine(ref char c)
  {
      while (bufferPosition < charsRead)
      {
          var atLineEnding = c == '\r' || c == '\n';
          if (!atLineEnding)
          {
              // Still inside the skipped line; move to the next character.
              c = buffer[bufferPosition];
              bufferPosition++;
              charCount++;

              if (countBytes)
              {
                  byteCount += encoding.GetByteCount(new char[] { c });
              }

              continue;
          }

          var lineEndingResult = ReadLineEnding(ref c);
          if (lineEndingResult == ReadLineResult.Complete)
          {
              fieldStartPosition = rowStartPosition = bufferPosition;
              row++;
              rawRow++;
          }

          return lineEndingResult;
      }

      return ReadLineResult.Incomplete;
  }
  /// <summary>
  /// Matches the remaining characters of the delimiter (the first one has already been
  /// matched by the caller) and, on a full match, adds the field that precedes it.
  /// </summary>
  /// <param name="c">The current character.</param>
  /// <returns>
  /// <see cref="ReadLineResult.Incomplete"/> when the buffer ran out;
  /// otherwise <see cref="ReadLineResult.Complete"/>, whether or not the delimiter matched.
  /// </returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private ReadLineResult ReadDelimiter(ref char c)
  {
      var matchIndex = delimiterPosition;
      while (matchIndex < delimiter.Length)
      {
          if (bufferPosition >= charsRead)
          {
              return ReadLineResult.Incomplete;
          }

          delimiterPosition++;

          c = buffer[bufferPosition];
          if (c != delimiter[matchIndex])
          {
              // Not the delimiter after all: restore the previous character and carry on with the field.
              c = buffer[bufferPosition - 1];
              delimiterPosition = 1;

              return ReadLineResult.Complete;
          }

          bufferPosition++;
          charCount++;

          if (countBytes)
          {
              byteCount += encoding.GetByteCount(new[] { c });
          }

          if (bufferPosition >= charsRead)
          {
              return ReadLineResult.Incomplete;
          }

          matchIndex++;
      }

      // The whole delimiter was read; the field is everything before it.
      var fieldLength = bufferPosition - fieldStartPosition - delimiter.Length;
      AddField(fieldStartPosition, fieldLength);

      fieldStartPosition = bufferPosition;
      delimiterPosition = 1;
      fieldIsBadData = false;

      return ReadLineResult.Complete;
  }
  /// <summary>
  /// Finishes reading a <c>\r</c>, <c>\n</c> or <c>\r\n</c> line ending whose first
  /// character is <paramref name="c"/>. In the <see cref="ParserState.LineEnding"/>
  /// state the field preceding the line ending is added.
  /// </summary>
  /// <param name="c">The current character.</param>
  /// <returns>
  /// <see cref="ReadLineResult.Incomplete"/> when the character after a <c>\r</c> is
  /// not in the buffer yet; otherwise <see cref="ReadLineResult.Complete"/>.
  /// </returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private ReadLineResult ReadLineEnding(ref char c)
  {
      var lineEndingLength = 1;

      if (c == '\r')
      {
          // Need to look at the next character to tell \r from \r\n.
          if (bufferPosition >= charsRead)
          {
              return ReadLineResult.Incomplete;
          }

          c = buffer[bufferPosition];

          if (c == '\n')
          {
              lineEndingLength = 2;
              bufferPosition++;
              charCount++;

              if (countBytes)
              {
                  byteCount += encoding.GetByteCount(new char[] { c });
              }
          }
      }

      if (state == ParserState.LineEnding)
      {
          var fieldLength = bufferPosition - fieldStartPosition - lineEndingLength;
          AddField(fieldStartPosition, fieldLength);
      }

      fieldIsBadData = false;

      return ReadLineResult.Complete;
  }
  /// <summary>
  /// Matches the remaining characters of the configured newline (the first one has
  /// already been matched by the caller) and, on a full match, adds the field that precedes it.
  /// </summary>
  /// <param name="c">The current character.</param>
  /// <returns>
  /// <see cref="ReadLineResult.Incomplete"/> when the buffer ran out;
  /// otherwise <see cref="ReadLineResult.Complete"/>, whether or not the newline matched.
  /// </returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private ReadLineResult ReadNewLine(ref char c)
  {
      var matchIndex = newLinePosition;
      while (matchIndex < newLine.Length)
      {
          if (bufferPosition >= charsRead)
          {
              return ReadLineResult.Incomplete;
          }

          newLinePosition++;

          c = buffer[bufferPosition];
          if (c != newLine[matchIndex])
          {
              // Not the newline sequence: restore the previous character and reset the match.
              c = buffer[bufferPosition - 1];
              newLinePosition = 1;

              return ReadLineResult.Complete;
          }

          bufferPosition++;
          charCount++;

          if (countBytes)
          {
              byteCount += encoding.GetByteCount(new[] { c });
          }

          if (bufferPosition >= charsRead)
          {
              return ReadLineResult.Incomplete;
          }

          matchIndex++;
      }

      // The whole newline was read; the field is everything before it.
      var fieldLength = bufferPosition - fieldStartPosition - newLine.Length;
      AddField(fieldStartPosition, fieldLength);

      fieldStartPosition = bufferPosition;
      newLinePosition = 1;
      fieldIsBadData = false;

      return ReadLineResult.Complete;
  }
  /// <summary>
  /// Finishes the current row when the reader has no more data, based on the state
  /// the parser was in when the buffer ran out.
  /// </summary>
  /// <returns><c>true</c> if the row contains at least one field; otherwise <c>false</c>.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private bool ReadEndOfFile()
  {
      var state = this.state;
      this.state = ParserState.None;

      if (state == ParserState.BlankLine)
      {
          // The input ended inside a skipped line; there is no row.
          return false;
      }

      if (state == ParserState.Delimiter)
      {
          // delimiterPosition is the number of delimiter characters matched so far.
          var matchedChars = delimiterPosition;
          delimiterPosition = 1;

          if (matchedChars < delimiter.Length)
          {
              // The input ended in the middle of a multi-character delimiter, so the
              // characters read are field data, not a delimiter. Subtracting the full
              // delimiter length here would cut real characters off the field.
              AddField(fieldStartPosition, bufferPosition - fieldStartPosition);

              return true;
          }

          // A complete delimiter was the last thing in the input: add the field before
          // it and the empty field after it.
          AddField(fieldStartPosition, bufferPosition - fieldStartPosition - delimiter.Length);
          fieldStartPosition = bufferPosition;
          AddField(fieldStartPosition, bufferPosition - fieldStartPosition);

          return true;
      }

      if (state == ParserState.LineEnding)
      {
          // Only a trailing \r can leave this state pending; exclude it from the field.
          AddField(fieldStartPosition, bufferPosition - fieldStartPosition - 1);

          return true;
      }

      if (state == ParserState.NewLine)
      {
          // newLinePosition is the number of newline characters matched so far.
          var matchedChars = newLinePosition;
          newLinePosition = 1;

          // Only a fully matched newline is excluded from the field; a partial match
          // at the end of the input is ordinary field data.
          var newLineLength = matchedChars < newLine.Length ? 0 : newLine.Length;
          AddField(fieldStartPosition, bufferPosition - fieldStartPosition - newLineLength);

          return true;
      }

      if (rowStartPosition < bufferPosition)
      {
          // Unterminated last row: whatever remains is the final field.
          AddField(fieldStartPosition, bufferPosition - fieldStartPosition);
      }

      return fieldsPosition > 0;
  }
  /// <summary>
  /// Records a field of the current row and resets the quote counter for the next field.
  /// </summary>
  /// <param name="start">Position of the field in the buffer; stored relative to the row start.</param>
  /// <param name="length">Length of the field in characters.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private void AddField(in int start, in int length)
  {
      var capacity = fields.Length;
      if (fieldsPosition >= capacity)
      {
          // Out of slots; double the table.
          Array.Resize(ref fields, capacity * 2);
      }

      var slot = fieldsPosition;
      fields[slot].Start = start - rowStartPosition;
      fields[slot].Length = length;
      fields[slot].QuoteCount = quoteCount;
      fields[slot].IsBad = fieldIsBadData;

      fieldsPosition = slot + 1;
      quoteCount = 0;
  }
  /// <summary>
  /// Moves the unfinished row to the front of the buffer and reads more characters
  /// after it, doubling the buffer first when the current row already fills it.
  /// </summary>
  /// <returns><c>false</c> if the reader returned no characters; otherwise <c>true</c>.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private bool FillBuffer()
  {
      // Don't forget the async method below.

      var rowFillsBuffer = rowStartPosition == 0 && charCount > 0 && charsRead == bufferSize;
      if (rowFillsBuffer)
      {
          // The record is longer than the memory buffer. Increase the buffer.
          bufferSize *= 2;
          var largerBuffer = new char[bufferSize];
          Array.Copy(buffer, largerBuffer, buffer.Length);
          buffer = largerBuffer;
      }

      // Characters of the current row that must be kept.
      var carriedOver = charsRead > rowStartPosition ? charsRead - rowStartPosition : 0;

      Array.Copy(buffer, rowStartPosition, buffer, 0, carriedOver);

      // Positions are now relative to the row being at the start of the buffer.
      fieldStartPosition -= rowStartPosition;
      rowStartPosition = 0;
      bufferPosition = carriedOver;

      var freshChars = reader.Read(buffer, carriedOver, buffer.Length - carriedOver);
      charsRead = freshChars;
      if (freshChars == 0)
      {
          return false;
      }

      charsRead = freshChars + carriedOver;

      return true;
  }
  /// <summary>
  /// Asynchronous counterpart of <see cref="FillBuffer"/>: moves the unfinished row to
  /// the front of the buffer and reads more characters after it, doubling the buffer
  /// first when the current row already fills it.
  /// </summary>
  /// <returns><c>false</c> if the reader returned no characters; otherwise <c>true</c>.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private async Task<bool> FillBufferAsync()
  {
      if (rowStartPosition == 0 && charCount > 0 && charsRead == bufferSize)
      {
          // The record is longer than the memory buffer. Increase the buffer.
          bufferSize *= 2;

          var tempBuffer = new char[bufferSize];
          buffer.CopyTo(tempBuffer, 0);
          buffer = tempBuffer;
      }

      // Keep the characters of the current row and shift them to the start of the buffer.
      var charsLeft = Math.Max(charsRead - rowStartPosition, 0);

      Array.Copy(buffer, rowStartPosition, buffer, 0, charsLeft);

      fieldStartPosition -= rowStartPosition;
      rowStartPosition = 0;
      bufferPosition = charsLeft;

      // Library code has no need to resume on the caller's synchronization context;
      // not capturing it avoids deadlocks when a caller blocks on the returned task.
      charsRead = await reader.ReadAsync(buffer, charsLeft, buffer.Length - charsLeft).ConfigureAwait(false);
      if (charsRead == 0)
      {
          return false;
      }

      charsRead += charsLeft;

      return true;
  }
  /// <summary>
  /// Returns the processed text of a field of the current row.
  /// </summary>
  /// <param name="index">Zero-based index of the field.</param>
  /// <returns>The field value; <see cref="string.Empty"/> for a zero-length field.</returns>
  /// <exception cref="IndexOutOfRangeException">
  /// <paramref name="index"/> is negative or not less than the number of fields in the row.
  /// </exception>
  /// <exception cref="InvalidOperationException">The configured mode is not handled.</exception>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private string GetField(in int index)
  {
      // Valid indexes are 0 .. fieldsPosition - 1. The slot at fieldsPosition is unused
      // and may still hold data from an earlier row, so it must not be readable.
      if (index < 0 || index >= fieldsPosition)
      {
          throw new IndexOutOfRangeException();
      }

      ref var field = ref fields[index];

      if (field.Length == 0)
      {
          return string.Empty;
      }

      // Field.Start is stored relative to the row start.
      var start = field.Start + rowStartPosition;
      var length = field.Length;
      var quoteCount = field.QuoteCount;

      ProcessedField processedField;
      switch (mode)
      {
          case CsvMode.RFC4180:
              processedField = field.IsBad
                  ? ProcessRFC4180BadField(start, length)
                  : ProcessRFC4180Field(start, length, quoteCount);
              break;
          case CsvMode.Escape:
              processedField = ProcessEscapeField(start, length);
              break;
          case CsvMode.NoEscape:
              processedField = ProcessNoEscapeField(start, length);
              break;
          default:
              throw new InvalidOperationException($"ParseMode '{mode}' is not handled.");
      }

      var value = cacheFields
          ? fieldCache.GetField(processedField.Buffer, processedField.Start, processedField.Length)
          : new string(processedField.Buffer, processedField.Start, processedField.Length);

      return value;
  }
  /// <summary>
  /// Processes a field in RFC 4180 mode: trims if configured, removes the surrounding
  /// quotes and removes escape characters. Fields that are not properly quoted are
  /// handed to <see cref="ProcessRFC4180BadField"/>.
  /// </summary>
  /// <param name="start">Position of the raw field in the buffer.</param>
  /// <param name="length">Length of the raw field.</param>
  /// <param name="quoteCount">Number of quote/escape characters counted while reading the field.</param>
  /// <returns>The location of the processed field text.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  protected ProcessedField ProcessRFC4180Field(in int start, in int length, in int quoteCount)
  {
      var fieldStart = start;
      var fieldLength = length;

      var trimOutsideQuotes = (trimOptions & TrimOptions.Trim) == TrimOptions.Trim;
      if (trimOutsideQuotes)
      {
          ArrayHelper.Trim(buffer, ref fieldStart, ref fieldLength, whiteSpaceChars);
      }

      if (quoteCount == 0)
      {
          // Not quoted.
          // No processing needed.
          return new ProcessedField(fieldStart, fieldLength, buffer);
      }

      // A well-formed quoted field has a quote on both ends and is more than a single quote char.
      var isProperlyQuoted =
          buffer[fieldStart] == quote &&
          buffer[fieldStart + fieldLength - 1] == quote &&
          fieldLength != 1;
      if (!isProperlyQuoted)
      {
          return ProcessRFC4180BadField(start, length);
      }

      if (lineBreakInQuotedFieldIsBadData)
      {
          var scanEnd = fieldStart + fieldLength;
          for (var scan = fieldStart; scan < scanEnd; scan++)
          {
              var scanned = buffer[scan];
              if (scanned == '\r' || scanned == '\n')
              {
                  return ProcessRFC4180BadField(start, length);
              }
          }
      }

      // Remove the quotes from the ends.
      fieldStart++;
      fieldLength -= 2;

      var trimInsideQuotes = (trimOptions & TrimOptions.InsideQuotes) == TrimOptions.InsideQuotes;
      if (trimInsideQuotes)
      {
          ArrayHelper.Trim(buffer, ref fieldStart, ref fieldLength, whiteSpaceChars);
      }

      if (quoteCount == 2)
      {
          // The only quotes are the ends of the field.
          // No more processing is needed.
          return new ProcessedField(fieldStart, fieldLength, buffer);
      }

      if (fieldLength > processFieldBuffer.Length)
      {
          // Make sure the field processing buffer is large enough.
          while (fieldLength > processFieldBufferSize)
          {
              processFieldBufferSize *= 2;
          }

          processFieldBuffer = new char[processFieldBufferSize];
      }

      // Remove escapes: an escape char is dropped and the char after it is copied as-is.
      var afterEscape = false;
      var written = 0;
      var fieldEnd = fieldStart + fieldLength;
      for (var i = fieldStart; i < fieldEnd; i++)
      {
          var current = buffer[i];
          if (!afterEscape && current == escape)
          {
              afterEscape = true;

              continue;
          }

          afterEscape = false;
          processFieldBuffer[written] = current;
          written++;
      }

      return new ProcessedField(0, written, processFieldBuffer);
  }
  /// <summary>
  /// Processes a field that is known to contain bad data in RFC 4180 mode. The
  /// configured <see cref="BadDataFound"/> callback, if any, is invoked with the raw
  /// field first; the field is then trimmed if configured and, when it starts with a
  /// quote, escapes are removed up to the closing quote.
  /// </summary>
  /// <param name="start">Position of the raw field in the buffer.</param>
  /// <param name="length">Length of the raw field.</param>
  /// <returns>The location of the processed field text.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  protected ProcessedField ProcessRFC4180BadField(in int start, in int length)
  {
      // If field is already known to be bad, different rules can be applied.

      var rawField = new string(buffer, start, length);
      var args = new BadDataFoundArgs(rawField, RawRecord, Context);
      badDataFound?.Invoke(args);

      var fieldStart = start;
      var fieldLength = length;

      if ((trimOptions & TrimOptions.Trim) == TrimOptions.Trim)
      {
          ArrayHelper.Trim(buffer, ref fieldStart, ref fieldLength, whiteSpaceChars);
      }

      var startsWithQuote = buffer[fieldStart] == quote;
      if (!startsWithQuote)
      {
          // If the field doesn't start with a quote, don't process it.
          return new ProcessedField(fieldStart, fieldLength, buffer);
      }

      if (fieldLength > processFieldBuffer.Length)
      {
          // Make sure the field processing buffer is large enough.
          while (fieldLength > processFieldBufferSize)
          {
              processFieldBufferSize *= 2;
          }

          processFieldBuffer = new char[processFieldBufferSize];
      }

      // Remove escapes until the last quote is found.
      // Examples of the shapes handled here:
      //   a,"b",c
      //   a,"b "" c",d
      //   a,"b "c d",e
      var afterEscape = false;
      var written = 0;
      var current = '\0';
      var closingQuoteSeen = false;
      var fieldEnd = fieldStart + fieldLength;

      // Start after the opening quote.
      for (var i = fieldStart + 1; i < fieldEnd; i++)
      {
          var previous = current;
          current = buffer[i];

          if (afterEscape)
          {
              afterEscape = false;

              if (current == quote)
              {
                  // Ignore the quote after an escape.
                  continue;
              }

              if (previous == quote)
              {
                  // The escape and quote are the same character.
                  // This is the end of the field.
                  // Don't process escapes for the rest of the field.
                  closingQuoteSeen = true;
              }
          }

          if (!closingQuoteSeen && current == escape)
          {
              afterEscape = true;

              continue;
          }

          processFieldBuffer[written] = current;
          written++;
      }

      return new ProcessedField(0, written, processFieldBuffer);
  }
  /// <summary>
  /// Processes a field in escape mode: trims if configured and removes escape
  /// characters, keeping the character that follows each escape.
  /// </summary>
  /// <param name="start">Position of the raw field in the buffer.</param>
  /// <param name="length">Length of the raw field.</param>
  /// <returns>The location of the processed field text in the field processing buffer.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  protected ProcessedField ProcessEscapeField(in int start, in int length)
  {
      var fieldStart = start;
      var fieldLength = length;

      var shouldTrim = (trimOptions & TrimOptions.Trim) == TrimOptions.Trim;
      if (shouldTrim)
      {
          ArrayHelper.Trim(buffer, ref fieldStart, ref fieldLength, whiteSpaceChars);
      }

      if (fieldLength > processFieldBuffer.Length)
      {
          // Make sure the field processing buffer is large enough.
          while (fieldLength > processFieldBufferSize)
          {
              processFieldBufferSize *= 2;
          }

          processFieldBuffer = new char[processFieldBufferSize];
      }

      // Remove escapes: an escape char is dropped and the char after it is copied as-is.
      var afterEscape = false;
      var written = 0;
      var fieldEnd = fieldStart + fieldLength;
      for (var i = fieldStart; i < fieldEnd; i++)
      {
          var current = buffer[i];
          if (!afterEscape && current == escape)
          {
              afterEscape = true;

              continue;
          }

          afterEscape = false;
          processFieldBuffer[written] = current;
          written++;
      }

      return new ProcessedField(0, written, processFieldBuffer);
  }
  /// <summary>
  /// Processes a field in no-escape mode: the field is only trimmed, if configured.
  /// </summary>
  /// <param name="start">Position of the raw field in the buffer.</param>
  /// <param name="length">Length of the raw field.</param>
  /// <returns>The location of the field text in the read buffer.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  protected ProcessedField ProcessNoEscapeField(in int start, in int length)
  {
      var fieldStart = start;
      var fieldLength = length;

      var shouldTrim = (trimOptions & TrimOptions.Trim) == TrimOptions.Trim;
      if (shouldTrim)
      {
          ArrayHelper.Trim(buffer, ref fieldStart, ref fieldLength, whiteSpaceChars);
      }

      return new ProcessedField(fieldStart, fieldLength, buffer);
  }
  /// <inheritdoc/>
  public void Dispose()
  {
      // Cleanup logic belongs in Dispose(bool); this method only forwards to it
      // and tells the GC that finalization is not needed.
      Dispose(true);
      GC.SuppressFinalize(this);
  }
  /// <summary>
  /// Releases the resources held by the parser. Calls after the first have no effect.
  /// </summary>
  /// <param name="disposing">
  /// <c>true</c> when called from <see cref="Dispose()"/>; the reader is then disposed
  /// unless the parser was configured to leave it open.
  /// </param>
  protected virtual void Dispose(bool disposing)
  {
      if (disposed)
      {
          return;
      }

      // Managed state: the reader, unless the caller asked for it to stay open.
      if (disposing && !leaveOpen)
      {
          if (reader != null)
          {
              reader.Dispose();
          }
      }

      // There are no unmanaged resources to free.
      disposed = true;
  }
  /// <summary>
  /// The result of processing a raw field: a range of characters inside a buffer
  /// that holds the field text.
  /// </summary>
  [DebuggerDisplay("Start = {Start}, Length = {Length}, Buffer.Length = {Buffer.Length}")]
  protected readonly ref struct ProcessedField
  {
      /// <summary>
      /// The buffer that contains the field.
      /// </summary>
      public readonly char[] Buffer;

      /// <summary>
      /// The start of the field in the buffer.
      /// </summary>
      public readonly int Start;

      /// <summary>
      /// The length of the field in the buffer.
      /// </summary>
      public readonly int Length;

      /// <summary>
      /// Creates a new instance of ProcessedField.
      /// </summary>
      /// <param name="start">The start of the field in the buffer.</param>
      /// <param name="length">The length of the field in the buffer.</param>
      /// <param name="buffer">The buffer that contains the field.</param>
      public ProcessedField(int start, int length, char[] buffer)
      {
          Buffer = buffer;
          Start = start;
          Length = length;
      }
  }
  /// <summary>
  /// Outcome of a read step.
  /// </summary>
  private enum ReadLineResult
  {
      /// <summary>Default value.</summary>
      None = 0,

      /// <summary>The step finished.</summary>
      Complete = 1,

      /// <summary>The buffer ran out before the step could finish.</summary>
      Incomplete = 2,
  }
  /// <summary>
  /// The multi-character construct the parser is in the middle of reading, so that
  /// reading can resume after the buffer is refilled.
  /// </summary>
  private enum ParserState
  {
      /// <summary>Not inside any construct.</summary>
      None = 0,

      /// <summary>Resumed with ReadSpaces.</summary>
      Spaces = 1,

      /// <summary>Resumed with ReadBlankLine.</summary>
      BlankLine = 2,

      /// <summary>Resumed with ReadDelimiter.</summary>
      Delimiter = 3,

      /// <summary>Resumed with ReadLineEnding.</summary>
      LineEnding = 4,

      /// <summary>Resumed with ReadNewLine.</summary>
      NewLine = 5,
  }
  /// <summary>
  /// Location and read-time facts about one raw field of the current row.
  /// </summary>
  [DebuggerDisplay("Start = {Start}, Length = {Length}, QuoteCount = {QuoteCount}, IsBad = {IsBad}")]
  private struct Field
  {
      /// <summary>
      /// Starting position of the field.
      /// This is an offset from <see cref="rowStartPosition"/>.
      /// </summary>
      public int Start;

      /// <summary>
      /// Length of the raw field in characters.
      /// </summary>
      public int Length;

      /// <summary>
      /// Number of quote/escape characters counted while the field was read.
      /// </summary>
      public int QuoteCount;

      /// <summary>
      /// Whether bad data was detected while the field was read.
      /// </summary>
      public bool IsBad;
  }
  973. }
  974. }