/fparsec/main/FParsecCS/CharStream.cs

http://github.com/sandersn/fing · C# · 2073 lines · 1467 code · 161 blank · 445 comment · 673 complexity · cbb1fa5f9620a84c92c17759d91abb76 MD5 · raw file

Large files are truncated click here to view the full file

  1. // Copyright (c) Stephan Tolksdorf 2007-2009
  2. // License: Simplified BSD License. See accompanying documentation.
  3. #if !LOW_TRUST
  4. using System;
  5. using System.IO;
  6. using System.Collections.Generic;
  7. using System.Text;
  8. using System.Text.RegularExpressions;
  9. using System.Diagnostics;
  10. using System.Reflection;
  11. using System.Runtime.Serialization;
  12. using System.Runtime.InteropServices;
  13. using System.Runtime.CompilerServices;
  14. namespace FParsec {
  15. /// <summary>Provides access to the char content of a binary Stream (or a String) through
  16. /// an iterator-based interface that is especially well suited for parser applications.</summary>
  17. public unsafe sealed class CharStream : IDisposable {
  18. // In order to facilitate efficient backtracking we divide the stream into overlapping
  19. // blocks with equal number of chars. The blocks are overlapping, so that
  20. // backtracking over short distances at a block boundary doesn't trigger a reread of the
  21. // previous block.
  22. //
//            Block 0
//
//  -----------------|--------  Block 1
//                    Overlap
//                    --------|--------|--------  Block 2
//                                      Overlap
//                                      --------|--------|--------
//                                                        (...)
//  a '-' symbolizes a char, a '|' a block boundary.
  32. //
  33. //
  34. // In general there's no fixed relationship between the number of input bytes and the
  35. // number of input chars. Worse, the encoding can be stateful, which makes it necessary
  36. // to persist the decoder state over block boundaries. If we later want to
  37. // be able to reread a certain block, we therefore need to keep record of various
  38. // bits of information describing the state of the input stream at the beginning of a block:
  39. private class BlockInfo {
  40. /// <summary>the byte stream index of the first char in the block after the OverhangCharsAtBlockBegin</summary>
  41. public long ByteIndex;
  42. /// <summary>the value of the CharStream's ByteBufferIndex before the block is read</summary>
  43. public int ByteBufferIndex;
  44. /// <summary>the number of bytes in the stream from ByteIndex to the first char after the OverhangCharsAfterOverlap</summary>
  45. public int NumberOfBytesInOverlap;
  46. /// <summary>the last char in the overlap with the previous block (used for integrity checking)</summary>
  47. public char LastCharInOverlap;
  48. /// <summary>chars at the block begin that were already read together with chars of the last block before the overlap</summary>
  49. public string OverhangCharsAtBlockBegin;
  50. /// <summary>chars after the overlap with the previous block that were already read together with the overlap chars</summary>
  51. public string OverhangCharsAfterOverlap;
  52. public DecoderState DecoderStateAtBlockBegin;
  53. public DecoderState DecoderStateAfterOverlap;
  54. public BlockInfo(long byteIndex, int byteBufferIndex,
  55. int nBytesInOverlapCount, char lastCharInOverlap,
  56. string overhangCharsAtBlockBegin, DecoderState decoderStateAtBlockBegin,
  57. string overhangCharsAfterOverlap, DecoderState decoderStateAfterOverlap)
  58. {
  59. this.ByteIndex = byteIndex;
  60. this.ByteBufferIndex = byteBufferIndex;
  61. this.NumberOfBytesInOverlap = nBytesInOverlapCount;
  62. this.LastCharInOverlap = lastCharInOverlap;
  63. this.OverhangCharsAtBlockBegin = overhangCharsAtBlockBegin;
  64. this.OverhangCharsAfterOverlap = overhangCharsAfterOverlap;
  65. this.DecoderStateAtBlockBegin = decoderStateAtBlockBegin;
  66. this.DecoderStateAfterOverlap = decoderStateAfterOverlap;
  67. }
  68. }
  69. // Unfortunately the Decoder API has no explicit methods for managing the state,
  70. // which forces us to abuse the comparatively inefficient serialization API for this purpose.
  71. // (The absence of explicit state management or at least a deep cloning method in the Decoder interface
  72. // is almost as puzzling as the absence of such methods in System.Random).
  73. private static Dictionary<Type, MemberInfo[]> SerializableMemberInfoCache;
  74. private static MemberInfo[] GetSerializableDecoderMemberInfo(Decoder decoder) {
  75. Type type = decoder.GetType();
  76. if (!type.IsSerializable) return null;
  77. MemberInfo[] smis;
  78. if (SerializableMemberInfoCache == null) {
  79. SerializableMemberInfoCache = new Dictionary<Type,MemberInfo[]>(8);
  80. }
  81. lock (SerializableMemberInfoCache) {
  82. if (!SerializableMemberInfoCache.TryGetValue(type, out smis) ) {
  83. smis = FormatterServices.GetSerializableMembers(type, new StreamingContext(StreamingContextStates.Clone));
  84. SerializableMemberInfoCache.Add(type, smis);
  85. }
  86. }
  87. return smis;
  88. }
  89. private struct DecoderState {
  90. private object[] DecoderData;
  91. public DecoderState(Decoder decoder, MemberInfo[] serializableDecoderMembers) {
  92. DecoderData = serializableDecoderMembers != null
  93. ? FormatterServices.GetObjectData(decoder, serializableDecoderMembers)
  94. : null;
  95. }
  96. public void WriteTo(ref Decoder decoder, MemberInfo[] serializableDecoderMembers) {
  97. if (DecoderData != null) {
  98. //Decoder newDecoder = (Decoder) FormatterServices.GetUninitializedObject(decoder.GetType());
  99. //FormatterServices.PopulateObjectMembers(newDecoder, serializableDecoderMembers, DecoderData);
  100. //decoder = newDecoder;
  101. FormatterServices.PopulateObjectMembers(decoder, serializableDecoderMembers, DecoderData);
  102. } else {
  103. decoder.Reset();
  104. }
  105. }
  106. }
  107. private const int DefaultBlockSize = 3*(1 << 16); // 3*2^16 = 200k
  108. private const int DefaultByteBufferLength = (1 << 12);
  109. private static int MinimumByteBufferLength = 128; // must be larger than longest detectable preamble (we can only guess here)
  110. private const char EOS = '\uFFFF';
  111. // For ease of use, we need the iterators to hold a reference to the CharStream. If we stored
  112. // a CharStream reference directly in the iterator, the JIT would emit a call to the write barrier
  113. // thunk for each write to the reference field. As we want to use iterators mainly as immutable values,
  114. // we need them to be structs for performance reasons, and since structs are constantly copied
  115. // by design, we would get frequent write barrier calls*. Redirecting the CharStream
  116. // access through an "Anchor" allows us to relieve the GC from having to keep track of all the
  117. // CharStream references in the iterators. The trick is that an Anchor instance does not contain
  118. // any reference to a managed object, only a GCHandle to the CharStream and other value type members
  119. // important to the Iterators. Because the Anchor struct only has primitive members, we can take
  120. // an unmanaged pointer which the GC doesn't need to track. To avoid most GCHandle.Target accesses,
  121. // the CharStream stores pieces of information important to the iterators directly in the Anchor.
  122. //
  123. // * Just to be clear: Write barrier calls are rather cheap (about the cost of a virtual function
  124. // call) and overall FParsec performance is only marginally influenced by this optimization.
  125. // (Each Reply<_,_> value alone currently triggers 2-3 write barrier calls, even when it is
  126. // allocated on the stack and all fields are initialized to 0/null!).
  127. internal Anchor* anchor; // allocated and assigned during construction,
  128. // freed and set to null during disposal
  129. /// <summary>Represents the link between a CharStream and its Iterators.
  130. /// May be allocated on the unmanaged heap and holds a GCHandle, hence must be properly freed.</summary>
  131. internal struct Anchor {
  132. public int Block;
  133. /// <summary>The index of the last block of the stream, or Int32.MaxValue if the end of stream has not yet been detected.</summary>
  134. public int LastBlock;
  135. public GCHandle StreamHandle;
  136. /// <summary>Begin of the used part of the char buffer (stays constant). Is null if the CharStream is empty.</summary>
  137. public char* BufferBegin;
  138. /// <summary>End of the used part of the char buffer (varies for a multi-block stream). Is null if the CharStream is empty.</summary>
  139. public char* BufferEnd;
  140. public long CharIndex;
  141. public long CharIndexPlusOffset;
  142. public long CharIndexOffset;
  143. public long EndIndex;
  144. public int BlockSizeMinusOverlap;
  145. public bool NeedToFree;
  146. public static Anchor* Create(CharStream stream) {
  147. // We create the anchor instance on the unmanaged heap. An alternative would be to use a
  148. // pinned pointer, but that would carry the risk of fragmenting the managed heap
  149. // (because an Anchor is a small object that can be long-lived).
  150. // (If AllocHGlobal becomes a bottleneck, we could replace it with a pool allocator.)
  151. Anchor* p = (Anchor*) Marshal.AllocHGlobal(sizeof(Anchor));
  152. p->NeedToFree = true;
  153. p->StreamHandle = GCHandle.Alloc(stream, GCHandleType.Normal);
  154. return p;
  155. }
  156. public static void Free(Anchor *p) {
  157. p->StreamHandle.Free();
  158. if (p->NeedToFree) Marshal.FreeHGlobal((IntPtr) p);
  159. }
  160. }
  161. /// <summary>The Encoding that is used for decoding the underlying byte stream, or
  162. /// System.Text.UnicodeEncoding in case the stream was directly constructed
  163. /// from a string.</summary>
  164. public Encoding Encoding { get; private set; }
  165. // If the CharStream is constructed from a binary stream, we use a managed string as the char
  166. // buffer. This allows us to apply regular expressions directly to the input.
  167. // In the case of multi-block CharStreams we thus have to mutate the buffer string through pointers.
  168. // This is safe as long as we use a newly constructed string and we don't pass a reference
  169. // to the internal buffer string to the "outside world". (The one instance where we have to pass
  170. // a reference to the buffer string is regex matching. See the docs for Iterator.Match(regex) for more info.)
  171. //
  172. // Apart from Iter.Match(regex) we access the internal buffer only through a pinned pointer.
  173. // This way we avoid the overhead of redundant bounds checking and can support strings, char arrays
  174. // and unmanaged char buffers through the same interface. Accessing the buffer through pointers
  175. // is also a requirement for accessing the CharStream data through an Anchor pointer (see above).
  176. //
  177. // Pinning a string or char array makes life more difficult for the GC. However, as long as
  178. // the buffer is only short-lived or large enough to be allocated on the large object heap,
  179. // there shouldn't be a problem. Furthermore, the buffer strings for CharStreams constructed
  180. // from a binary stream are allocated through the StringBuffer interface and hence always live
  181. // on the large object heap. Thus, the only scenario to really worry about (and which the
  182. // documentation explicitly warns about) is when a large number of small CharStreams
  183. // are constructed directly from strings or char arrays and are used for an extended period of time.
  184. /// <summary>The string holding the char buffer, or null if the buffer is not part of a .NET string.</summary>
  185. internal string BufferString;
  186. /// <summary>A pointer to the beginning of BufferString, or null if BufferString is null.</summary>
  187. internal char* BufferStringPointer;
  188. /// <summary>Holds the GCHandle for CharStreams directly constructed from strings or char arrays.</summary>
  189. private GCHandle BufferHandle;
  190. /// <summary>Holds the StringBuffer for CharStreams constructed from a binary stream.</summary>
  191. private StringBuffer Buffer;
  192. private MultiBlockData Data;
  193. /// <summary>Contains the data and methods needed in case the input byte stream
  194. /// is large enough to span multiple blocks of the CharStream.</summary>
  195. private class MultiBlockData {
  196. public Anchor* anchor;
  197. public Stream Stream;
// we keep a separate record of the Stream.Position, so that we don't need to require Stream.CanSeek
  199. public long StreamPosition;
  200. public bool LeaveOpen;
  201. public int MaxCharCountForOneByte;
  202. public Decoder Decoder;
  203. public MemberInfo[] SerializableDecoderMembers;
  204. public int BlockSize;
  205. public int BlockOverlap;
  206. /// <summary>BufferBegin + BlockSize - minRegexSpace</summary>
  207. public char* RegexSpaceThreshold;
  208. /// <summary>The byte stream index of the first unused byte in the ByteBuffer.</summary>
  209. public long ByteIndex { get { return StreamPosition - (ByteBufferCount - ByteBufferIndex); } }
  210. public List<BlockInfo> Blocks;
  211. public byte[] ByteBuffer;
  212. public int ByteBufferIndex;
  213. public int ByteBufferCount;
  214. /// <summary>Refills the ByteBuffer if no unused byte is remaining.
  215. /// Returns the number of unused bytes in the (refilled) ByteBuffer.</summary>
  216. private int FillByteBuffer() {
  217. int n = ByteBufferCount - ByteBufferIndex;
  218. if (n > 0) return n;
  219. return ClearAndRefillByteBuffer(0);
  220. }
  221. /// <summary>Refills the ByteBuffer starting at the given index. If the underlying byte
  222. /// stream contains enough bytes, the ByteBuffer is filled up to the ByteBuffer.Length.
  223. /// Returns the number of bytes available for consumption in the refilled ByteBuffer.</summary>
  224. private int ClearAndRefillByteBuffer(int byteBufferIndex) {
  225. Debug.Assert(byteBufferIndex >= 0 && byteBufferIndex <= ByteBuffer.Length);
  226. // Stream.Read is not guaranteed to use all the provided output buffer, so we need
  227. // to call it in a loop when we want to rely on the buffer being fully filled
  228. // (unless we reach the end of the stream). Knowing that the buffer always gets
  229. // completely filled allows us to calculate the buffer utilization after skipping
  230. // a certain number of input bytes. For most streams there will be only one loop
  231. // iteration anyway (or two at the end of the stream).
  232. int i = byteBufferIndex;
  233. int m = ByteBuffer.Length - byteBufferIndex;
  234. while (m != 0) {
  235. int c = Stream.Read(ByteBuffer, i, m);
  236. if (c == 0) break;
  237. i += c;
  238. m -= c;
  239. }
  240. int n = i - byteBufferIndex;
  241. ByteBufferIndex = byteBufferIndex;
  242. ByteBufferCount = byteBufferIndex + n;
  243. StreamPosition += n;
  244. return n;
  245. }
/// <summary>Reads up to the given maximum number of chars into the given buffer.
/// If more than the maximum number of chars have to be read from the stream in order to
/// fill the buffer (due to the way the Decoder API works), the overhang chars are
/// returned through the output parameter.
/// Returns a pointer to one char after the last char read.</summary>
/// <param name="buffer">Destination for the decoded chars.</param>
/// <param name="maxCount">Maximum number of chars to write to buffer (must be non-negative).</param>
/// <param name="overhangChars">Receives the chars that were decoded but did not fit into
/// the maxCount chars, or null if there are none.</param>
/// <exception cref="DecoderFallbackException">Rethrown from the Decoder after the entry
/// "Stream.Position" has been added to the exception's Data dictionary.</exception>
private char* ReadCharsFromStream(char* buffer, int maxCount, out string overhangChars) {
    Debug.Assert(maxCount >= 0);
    fixed (byte* byteBuffer = ByteBuffer) {
        overhangChars = null;
        try {
            // Phase 1: decode directly into the destination buffer for as long as the remaining
            // space is at least MaxCharCountForOneByte chars.
            while (maxCount >= MaxCharCountForOneByte) {// if maxCount < MaxCharCountForOneByte, Convert could throw
                int nBytesInByteBuffer = FillByteBuffer();
                // no bytes left even after a refill attempt: flush the decoder
                bool flush = nBytesInByteBuffer == 0;
                int bytesUsed, charsUsed; bool completed = false;
                Decoder.Convert(byteBuffer + ByteBufferIndex, nBytesInByteBuffer,
                                buffer, maxCount, flush,
                                out bytesUsed, out charsUsed, out completed);
                ByteBufferIndex += bytesUsed; // Convert consumed bytesUsed bytes from the byte buffer
                buffer += charsUsed;
                maxCount -= charsUsed;
                if (flush && completed) return buffer;
            }
            if (maxCount == 0) return buffer;
            // Phase 2: fewer than MaxCharCountForOneByte chars of space remain. Decode into a
            // small scratch buffer and copy char by char, so that chars which don't fit
            // can be handed back as overhang instead of overflowing the destination.
            char* cs = stackalloc char[MaxCharCountForOneByte];
            for (;;) {
                int nBytesInByteBuffer = FillByteBuffer();
                bool flush = nBytesInByteBuffer == 0;
                int bytesUsed, charsUsed; bool completed;
                Decoder.Convert(byteBuffer + ByteBufferIndex, nBytesInByteBuffer,
                                cs, MaxCharCountForOneByte, flush,
                                out bytesUsed, out charsUsed, out completed);
                ByteBufferIndex += bytesUsed;
                if (charsUsed > 0) {
                    int i = 0;
                    do {
                        *(buffer++) = cs[i++];
                        if (--maxCount == 0) {
                            // destination is full: the rest of the scratch buffer is the overhang
                            if (i < charsUsed) overhangChars = new string(cs, i, charsUsed - i);
                            return buffer;
                        }
                    } while (i < charsUsed);
                }
                if (flush && completed) return buffer;
            }
        } catch (DecoderFallbackException e) {
            // Convert threw before ByteBufferIndex was advanced for the failing call, so
            // ByteIndex still refers to the first byte that was passed to that call.
            e.Data.Add("Stream.Position", ByteIndex + e.Index);
            throw;
        }
    }
}
/// <summary> Reads a block of chars (must be different from the current block)
/// into the BufferString. Returns a pointer to the first char of the new block,
/// or null if no chars could be read.</summary>
/// <param name="block">The index of the block to read. If it is greater than the anchor's
/// LastBlock, null is returned and no state is changed.</param>
/// <exception cref="InvalidOperationException">block is the current block, or block is not 0
/// while the anchor's current block index is negative (the call from the constructor).</exception>
/// <exception cref="NotSupportedException">block lies before the current block, is not block 0,
/// and no serializable decoder members are available.</exception>
/// <exception cref="IOException">The data read does not agree with the information recorded
/// in Blocks or in the anchor ("stream integrity error").</exception>
internal char* ReadBlock(int block) {
    if (block > anchor->LastBlock) return null;
    int prevBlock = anchor->Block;
    if (block == prevBlock) throw new InvalidOperationException();
    if (SerializableDecoderMembers == null && block > 0) {
        // Without serializable decoder members the decoder state can't be restored for an
        // arbitrary block, so a block > 0 can only be reached by reading forward block by block.
        if (prevBlock > block)
            throw new NotSupportedException("The CharStream does not support seeking backwards over ranges longer than the block overlap because the Encoding's Decoder is not serializable.");
        while (prevBlock + 1 < block) ReadBlock(++prevBlock);
    }
    BlockInfo bi = Blocks[block];
    int blockSizeMinusOverlap = BlockSize - BlockOverlap;
    // char index of the block begin relative to the stream begin (computed in 64 bits)
    long charIndex = Math.BigMul(block, blockSizeMinusOverlap);
    char* bufferBegin = anchor->BufferBegin;
    // begin: first char that is new relative to the previously buffered block;
    // buffer: running write position
    char* begin, buffer;
    int nCharsToRead;
    // fill [0 ... BlockOverlap-1] if block > 0
    if (prevBlock == block - 1) {
        // Moving one block forward: the end of the buffered block is the overlap with the
        // requested block, so it is moved to the buffer begin.
        // NOTE(review): BlockOverlap*2 is presumably a byte count (2 bytes per char);
        // MemMove is defined outside this view - confirm.
        MemMove(bufferBegin, bufferBegin + blockSizeMinusOverlap, BlockOverlap*2);
        Debug.Assert(bufferBegin[BlockOverlap - 1] == bi.LastCharInOverlap);
        begin = buffer = bufferBegin + BlockOverlap;
    } else if (prevBlock >= 0) {
        // Any other jump: reposition the byte stream at the recorded block begin.
        Stream.Seek(bi.ByteIndex, SeekOrigin.Begin); // will throw if Stream can't seek
        // now that there was no exception, we can change the state...
        StreamPosition = bi.ByteIndex;
        ClearAndRefillByteBuffer(bi.ByteBufferIndex);
        bi.DecoderStateAtBlockBegin.WriteTo(ref Decoder, SerializableDecoderMembers); // will reset Decoder if block == 0
        if (prevBlock == block + 1) {
            // move the overlap into [BlockSize - BlockOverlap, BlockSize - 1] before it gets overwritten
            MemMove(bufferBegin + blockSizeMinusOverlap, bufferBegin, BlockOverlap*2);
        }
        begin = buffer = bufferBegin;
        if (block > 0) {
            // Re-read the overlap with the previous block and verify it against the record.
            nCharsToRead = BlockOverlap;
            if (bi.OverhangCharsAtBlockBegin != null) {
                nCharsToRead -= bi.OverhangCharsAtBlockBegin.Length;
                for (int i = 0; i < bi.OverhangCharsAtBlockBegin.Length; ++i)
                    *(buffer++) = bi.OverhangCharsAtBlockBegin[i];
            }
            string overhangCharsAfterOverlap;
            buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlap);
            if (   buffer != bufferBegin + BlockOverlap
                || ByteIndex != bi.ByteIndex + bi.NumberOfBytesInOverlap
                || *(buffer - 1) != bi.LastCharInOverlap
                || overhangCharsAfterOverlap != bi.OverhangCharsAfterOverlap)
                throw new IOException("CharStream: stream integrity error");
        }
    } else { // ReadBlock was called from the constructor
        if (block != 0) throw new InvalidOperationException();
        begin = buffer = bufferBegin;
    }
    // fill [0 ... BlockSize-BlockOverlap-1] if block == 0
    // and [BlockOverlap ... BlockSize-BlockOverlap-1] otherwise
    if (block == 0) {
        nCharsToRead = blockSizeMinusOverlap;
    } else {
        nCharsToRead = blockSizeMinusOverlap - BlockOverlap;
        if (bi.OverhangCharsAfterOverlap != null) {
            nCharsToRead -= bi.OverhangCharsAfterOverlap.Length;
            for (int i = 0; i < bi.OverhangCharsAfterOverlap.Length; ++i)
                *(buffer++) = bi.OverhangCharsAfterOverlap[i];
        }
    }
    string overhangCharsAtNextBlockBegin;
    buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAtNextBlockBegin);
    // The byte position reached here is the begin of the next block.
    long byteIndexAtNextBlockBegin = ByteIndex;
    int byteBufferIndexAtNextBlockBegin = ByteBufferIndex;
    // fill [BlockSize-BlockOverlap ... BlockSize-1]
    if (block == Blocks.Count - 1) { // next block hasn't yet been read
        DecoderState decoderStateAtNextBlockBegin = new DecoderState(Decoder, SerializableDecoderMembers);
        nCharsToRead = BlockOverlap;
        if (overhangCharsAtNextBlockBegin != null) {
            nCharsToRead -= overhangCharsAtNextBlockBegin.Length;
            for (int i = 0; i < overhangCharsAtNextBlockBegin.Length; ++i)
                *(buffer++) = overhangCharsAtNextBlockBegin[i];
        }
        string overhangCharsAfterOverlapWithNextBlock;
        buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlapWithNextBlock);
        if (anchor->LastBlock == Int32.MaxValue) { // last block hasn't yet been detected
            if (buffer == bufferBegin + BlockSize) {
                // The buffer was filled completely: record the info needed to read the next block.
                DecoderState decoderStateAfterOverlapWithNextBlock = new DecoderState(Decoder, SerializableDecoderMembers);
                int nBytesInOverlapWithNextBlock = (int)(ByteIndex - byteIndexAtNextBlockBegin);
                Blocks.Add(new BlockInfo(byteIndexAtNextBlockBegin, byteBufferIndexAtNextBlockBegin,
                                         nBytesInOverlapWithNextBlock, *(buffer - 1),
                                         overhangCharsAtNextBlockBegin, decoderStateAtNextBlockBegin,
                                         overhangCharsAfterOverlapWithNextBlock, decoderStateAfterOverlapWithNextBlock));
            } else { // we reached the end of the stream
                anchor->LastBlock = block;
                anchor->EndIndex = anchor->CharIndexOffset + charIndex + (buffer - bufferBegin);
            }
        } else if (anchor->EndIndex != anchor->CharIndexOffset + charIndex + (buffer - bufferBegin)) {
            // The last block was re-read, but it now ends at a different char index.
            throw new IOException("CharStream: stream integrity error");
        }
    } else {
        // The next block is already recorded: verify that we arrived at the recorded position.
        BlockInfo nbi = Blocks[block + 1];
        if (   buffer != bufferBegin + blockSizeMinusOverlap
            || byteIndexAtNextBlockBegin != nbi.ByteIndex
            || byteBufferIndexAtNextBlockBegin != nbi.ByteBufferIndex
            || overhangCharsAtNextBlockBegin != nbi.OverhangCharsAtBlockBegin)
            throw new IOException("CharStream: stream integrity error");
        if (prevBlock != block + 1 || (block == 0 && SerializableDecoderMembers == null)) { // jumping back to block 0 is supported even if the decoder is not serializable
            // Re-read the overlap with the next block and verify it against the record.
            nCharsToRead = BlockOverlap;
            if (overhangCharsAtNextBlockBegin != null) {
                nCharsToRead -= overhangCharsAtNextBlockBegin.Length;
                for (int i = 0; i < overhangCharsAtNextBlockBegin.Length; ++i)
                    *(buffer++) = overhangCharsAtNextBlockBegin[i];
            }
            string overhangCharsAfterOverlapWithNextBlock;
            buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlapWithNextBlock);
            int nBytesInOverlapWithNextBlock = (int)(ByteIndex - byteIndexAtNextBlockBegin);
            if (   buffer != bufferBegin + BlockSize
                || nBytesInOverlapWithNextBlock != nbi.NumberOfBytesInOverlap
                || *(buffer - 1) != nbi.LastCharInOverlap
                || overhangCharsAfterOverlapWithNextBlock != nbi.OverhangCharsAfterOverlap)
                throw new IOException("CharStream: stream integrity error");
        } else {
            // Moved back by exactly one block: the overlap chars are already in place, so only
            // the byte position and the decoder state have to be advanced past the overlap.
            Debug.Assert(bufferBegin[BlockSize - 1] == nbi.LastCharInOverlap);
            buffer += BlockOverlap; // we already copied the chars at the beginning of this function
            // number of overlap bytes that lie beyond the bytes currently held in the ByteBuffer
            int off = nbi.NumberOfBytesInOverlap - (ByteBufferCount - ByteBufferIndex);
            if (off > 0) {
                // we wouldn't have gotten here if the Stream didn't support seeking
                Stream.Seek(off, SeekOrigin.Current);
                StreamPosition += off;
                ClearAndRefillByteBuffer(off%ByteBuffer.Length);
            } else {
                ByteBufferIndex += nbi.NumberOfBytesInOverlap;
            }
            nbi.DecoderStateAfterOverlap.WriteTo(ref Decoder, SerializableDecoderMembers);
        }
    }
    // Publish the new block to the iterators through the anchor.
    anchor->Block = block;
    anchor->CharIndex = charIndex;
    anchor->CharIndexPlusOffset = anchor->CharIndexOffset + charIndex;
    anchor->BufferEnd = buffer;
    // null signals that no char was added beyond 'begin'
    return begin == buffer ? null : begin;
}
  434. }
  435. /// <summary>Reads all remaining chars into the given buffer. If the remaining stream
  436. /// content holds more than the given maximum number of chars, an exception will be thrown.</summary>
  437. private static int ReadAllRemainingCharsFromStream(char* buffer, int maxCount, byte[] byteBuffer, int byteBufferIndex, int byteBufferCount, Stream stream, long streamPosition, Decoder decoder) {
  438. Debug.Assert(maxCount > 0 && byteBufferIndex >= 0 && byteBufferIndex < byteBufferCount);
  439. fixed (byte* pByteBuffer = byteBuffer) {
  440. bool flush = false;
  441. int bufferCount = 0;
  442. for (;;) {
  443. try {
  444. bufferCount += decoder.GetChars(pByteBuffer + byteBufferIndex, byteBufferCount - byteBufferIndex,
  445. buffer + bufferCount, maxCount - bufferCount, flush);
  446. } catch (DecoderFallbackException e) {
  447. e.Data.Add("Stream.Position", streamPosition - (byteBufferCount - byteBufferIndex) + e.Index);
  448. throw;
  449. }
  450. if (flush) break;
  451. byteBufferIndex = 0; // GetChars consumed all bytes in the byte buffer
  452. byteBufferCount = stream.Read(byteBuffer, 0, byteBuffer.Length);
  453. streamPosition += byteBufferCount;
  454. flush = byteBufferCount == 0;
  455. }
  456. return bufferCount;
  457. }
  458. }
/// <summary>The current block in BufferString (read from the anchor).</summary>
private int Block { get { return anchor->Block; } }
/// <summary>The number of chars between the anchor's BufferBegin and BufferEnd,
/// i.e. in the used part of the char buffer.</summary>
private int BufferCount { get { return PositiveDistance(anchor->BufferBegin, anchor->BufferEnd); } }
/// <summary>The index of the first char in the stream, i.e. Begin.Index.
/// This value is determined by the streamBeginIndex argument of some of the CharStream constructors.
/// By default this value is 0.</summary>
public long BeginIndex { get { return anchor->CharIndexOffset; } }
/// <summary>The index of the last char of the stream plus 1,
/// or Int64.MaxValue if the end of stream has not yet been detected.</summary>
public long EndIndex { get { return anchor->EndIndex; } }
/// <summary>Obsolete alias for BeginIndex.</summary>
[Obsolete("CharStream.IndexOffset has been renamed to CharStream.BeginIndex.")]
public long IndexOffset { get { return BeginIndex; } }
/// <summary>Obsolete alias for EndIndex.</summary>
[Obsolete("CharStream.EndOfStream has been renamed to CharStream.EndIndex.")]
public long EndOfStream { get { return EndIndex; } }
  474. // we don't have a public constructor that only takes a string to avoid potential confusion with a filepath constructor
  475. internal CharStream(string chars) {
  476. Debug.Assert(chars != null);
  477. BufferString = chars;
  478. // ByteBufferIndex = 0; // we recycle ByteBufferIndex for BufferStringIndex
  479. BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
  480. char* bufferBegin = (char*)BufferHandle.AddrOfPinnedObject();
  481. BufferStringPointer = bufferBegin;
  482. CharConstructorContinue(bufferBegin, chars.Length, 0);
  483. }
  484. /// <summary>Constructs a CharStream from the chars in the string argument between the indices index (inclusive) and index + length (exclusive).</summary>
  485. /// <exception cref="ArgumentNullException">chars is null.</exception>
  486. /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0 and index + length ≤ chars.Length.</exception>
  487. public CharStream(string chars, int index, int length) : this(chars, index, length, 0) {}
  488. /// <summary>Constructs a CharStream from the chars in the string argument between the indices index (inclusive) and index + length (exclusive). The first char in the stream is assigned the index streamBeginIndex.</summary>
  489. /// <exception cref="ArgumentNullException">chars is null.</exception>
  490. /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0, index + length ≤ chars.Length and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
  491. public CharStream(string chars, int index, int length, long streamBeginIndex) {
  492. if (chars == null) throw new ArgumentNullException("chars");
  493. if (index < 0) throw new ArgumentOutOfRangeException("index", "The index is negative.");
  494. if (length < 0 || length > chars.Length - index) throw new ArgumentOutOfRangeException("length", "The length is out of range.");
  495. if (streamBeginIndex < 0 || streamBeginIndex >= (1L << 60)) throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
  496. BufferString = chars;
  497. BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
  498. char* pBufferString = (char*)BufferHandle.AddrOfPinnedObject();
  499. BufferStringPointer = pBufferString;
  500. CharConstructorContinue(pBufferString + index, length, streamBeginIndex);
  501. }
  502. /// <summary>Constructs a CharStream from the chars in the char array argument between the indices index (inclusive) and index + length (exclusive).</summary>
  503. /// <exception cref="ArgumentNullException">chars is null.</exception>
  504. /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0 and index + length ≤ chars.Length.</exception>
  505. public CharStream(char[] chars, int index, int length) : this(chars, index, length, 0) { }
  506. /// <summary>Constructs a CharStream from the chars in the char array argument between the indices index (inclusive) and index + length (exclusive). The first char in the stream is assigned the index streamBeginIndex.</summary>
  507. /// <exception cref="NullReferenceException">chars is null.</exception>
  508. /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0, index + length ≤ chars.Length and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
  509. public CharStream(char[] chars, int index, int length, long streamBeginIndex) {
  510. if (chars == null) throw new ArgumentNullException("chars");
  511. if (index < 0) throw new ArgumentOutOfRangeException("index", "The index is negative.");
  512. if (length < 0 || length > chars.Length - index) throw new ArgumentOutOfRangeException("length", "The length is out of range.");
  513. if (streamBeginIndex < 0 || streamBeginIndex >= (1L << 60)) throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
  514. BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
  515. char* bufferBegin = (char*)BufferHandle.AddrOfPinnedObject() + index;
  516. CharConstructorContinue(bufferBegin, length, streamBeginIndex);
  517. }
  518. /// <summary>Constructs a CharStream from the length chars at the pointer address.</summary>
  519. /// <exception cref="ArgumentNullException">chars is null.</exception>
  520. /// <exception cref="ArgumentOutOfRangeException">length is negative.</exception>
  521. public CharStream(char* chars, int length) : this(chars, length, 0) {}
  /// <summary>Constructs a CharStream from the length chars at the pointer address. The first char in the stream is assigned the index streamBeginIndex.</summary>
  /// <exception cref="ArgumentNullException">chars is null.</exception>
  /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: length ≥ 0 and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
  public CharStream(char* chars, int length, long streamBeginIndex) {
      if (chars == null) {
          throw new ArgumentNullException("chars");
      }
      if (length < 0) {
          throw new ArgumentOutOfRangeException("length", "The length is negative.");
      }
      // reject lengths for which the end pointer would wrap around
      char* end = unchecked(chars + length);
      if (end < chars) {
          throw new ArgumentOutOfRangeException("length", "The length is out of range.");
      }
      bool beginIndexIsValid = streamBeginIndex >= 0 && streamBeginIndex < (1L << 60);
      if (!beginIndexIsValid) {
          throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
      }
      CharConstructorContinue(chars, length, streamBeginIndex);
  }
  /// <summary>Shared tail of the char array and char pointer constructors: sets the encoding to Encoding.Unicode,
  /// creates the anchor and initializes it for a single block that spans all length chars.</summary>
  private void CharConstructorContinue(char* bufferBegin, int length, long streamBeginIndex) {
      Debug.Assert((bufferBegin != null || length == 0) && length >= 0 && bufferBegin <= unchecked(bufferBegin + length) && streamBeginIndex >= 0 && streamBeginIndex < (1L << 60));
      Encoding = Encoding.Unicode;
      var a = Anchor.Create(this);
      this.anchor = a;

      // an empty stream is represented by null buffer pointers, even if bufferBegin is not null
      bool hasChars = length != 0;
      a->BufferBegin = hasChars ? bufferBegin : null;
      a->BufferEnd = hasChars ? bufferBegin + length : null;
      a->BlockSizeMinusOverlap = length; // is 0 for an empty stream

      // there is only a single block, which starts at char index 0
      a->Block = 0;
      a->LastBlock = 0;
      a->CharIndex = 0;

      a->CharIndexOffset = streamBeginIndex;
      a->CharIndexPlusOffset = streamBeginIndex;
      a->EndIndex = streamBeginIndex + length;
  }
  /// <summary>Constructs a CharStream over the length chars starting at begin, using the anchor supplied by the caller.
  /// The Block, LastBlock and CharIndex members of the anchor are expected to be 0 already (see the assert) and are not written here.
  /// The first char in the stream is assigned the index streamIndexOffset.</summary>
  internal CharStream(string chars, char* pChars, char* begin, int length, long streamIndexOffset, Anchor* newUninitializedAnchor) {
      Debug.Assert((chars == null ? pChars == null : pChars <= begin)
                   && (begin != null || length == 0) && length >= 0 && begin <= unchecked(begin + length) && streamIndexOffset >= 0 && streamIndexOffset < (1L << 60));
      Debug.Assert(newUninitializedAnchor->NeedToFree == false && !newUninitializedAnchor->StreamHandle.IsAllocated
                   && newUninitializedAnchor->Block == 0 && newUninitializedAnchor->LastBlock == 0 && newUninitializedAnchor->CharIndex == 0);
      BufferString = chars;
      BufferStringPointer = pChars;
      Encoding = Encoding.Unicode;
      this.anchor = newUninitializedAnchor;

      // an empty stream is represented by null buffer pointers, even if begin is not null
      bool hasChars = length != 0;
      newUninitializedAnchor->BufferBegin = hasChars ? begin : null;
      newUninitializedAnchor->BufferEnd = hasChars ? begin + length : null;
      newUninitializedAnchor->BlockSizeMinusOverlap = length; // is 0 for an empty stream

      newUninitializedAnchor->CharIndexPlusOffset = streamIndexOffset;
      newUninitializedAnchor->CharIndexOffset = streamIndexOffset;
      newUninitializedAnchor->EndIndex = streamIndexOffset + length;
      newUninitializedAnchor->StreamHandle = GCHandle.Alloc(this, GCHandleType.Normal);
  }
  /// <summary>Constructs a CharStream from the file at the given path, with byte-order mark detection enabled and the default buffer parameters.<br/>
  /// Is equivalent to CharStream(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan), false, encoding, true, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
  public CharStream(string path, Encoding encoding)
      : this(path,
             encoding,
             true,
             DefaultBlockSize,
             DefaultBlockSize/3,
             ((DefaultBlockSize/3)*2)/3,
             DefaultByteBufferLength) { }
  /// <summary>Constructs a CharStream from the file at the given path, using the default buffer parameters.<br/>
  /// Is equivalent to CharStream(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan), false, encoding, detectEncodingFromByteOrderMarks, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
  public CharStream(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
      : this(path,
             encoding,
             detectEncodingFromByteOrderMarks,
             DefaultBlockSize,
             DefaultBlockSize/3,
             ((DefaultBlockSize/3)*2)/3,
             DefaultByteBufferLength) { }
  /// <summary>Constructs a CharStream from the file at the given path.<br/>
  /// Is equivalent to CharStream(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan), false, encoding, detectEncodingFromByteOrderMarks, blockSize, blockOverlap, minRegexSpace, byteBufferLength).</summary>
  /// <exception cref="ArgumentNullException">encoding is null.</exception>
  public CharStream(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks,
                    int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
  {
      if (encoding == null) {
          throw new ArgumentNullException("encoding");
      }
      var fileStream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan);
      try {
          // leaveOpen is false because the file stream is opened here and not by the caller
          StreamConstructorContinue(fileStream, false, encoding, detectEncodingFromByteOrderMarks,
                                    blockSize, blockOverlap, minRegexSpace, byteBufferLength);
      } catch {
          // do not leave the file open if the initialization fails
          fileStream.Dispose();
          throw;
      }
  }
  /// <summary>Constructs a CharStream from a byte Stream, with leaveOpen set to false, byte-order mark detection enabled and the default buffer parameters.<br/>
  /// Is equivalent to CharStream(stream, false, encoding, true, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
  public CharStream(Stream stream, Encoding encoding)
      : this(stream,
             false,
             encoding,
             true,
             DefaultBlockSize,
             DefaultBlockSize/3,
             ((DefaultBlockSize/3)*2)/3,
             DefaultByteBufferLength) { }
  /// <summary>Constructs a CharStream from a byte Stream, with byte-order mark detection enabled and the default buffer parameters.<br/>
  /// Is equivalent to CharStream(stream, leaveOpen, encoding, true, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
  public CharStream(Stream stream, bool leaveOpen, Encoding encoding)
      : this(stream,
             leaveOpen,
             encoding,
             true,
             DefaultBlockSize,
             DefaultBlockSize/3,
             ((DefaultBlockSize/3)*2)/3,
             DefaultByteBufferLength) { }
  /// <summary>Constructs a CharStream from a byte Stream, using the default buffer parameters.<br/>
  /// Is equivalent to CharStream(stream, leaveOpen, encoding, detectEncodingFromByteOrderMarks, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
  public CharStream(Stream stream, bool leaveOpen, Encoding encoding, bool detectEncodingFromByteOrderMarks)
      : this(stream,
             leaveOpen,
             encoding,
             detectEncodingFromByteOrderMarks,
             DefaultBlockSize,
             DefaultBlockSize/3,
             ((DefaultBlockSize/3)*2)/3,
             DefaultByteBufferLength) { }
  /// <summary>Constructs a CharStream from a byte Stream.</summary>
  /// <param name="stream">The byte stream providing the input.</param>
  /// <param name="leaveOpen">Indicates whether the byte Stream should be left open when the CharStream has finished reading it.</param>
  /// <param name="encoding">The (default) Encoding used for decoding the byte Stream into chars.</param>
  /// <param name="detectEncodingFromByteOrderMarks">Indicates whether the constructor should detect the encoding from a unicode byte-order mark at the beginning of the stream. An encoding detected from a byte-order mark overrides the default encoding.</param>
  /// <param name="blockSize">The number of chars per block. The default is 3×2^16 ≈ 200k.</param>
  /// <param name="blockOverlap">The number of chars at the end of a block that are preserved when reading the next block into the char buffer. It must be less than blockSize/2, but not less than encoding.GetMaxCharCount(1). The default is blockSize/3.</param>
  /// <param name="minRegexSpace">The number of chars that are guaranteed to be visible to a regular expression when it is matched on the stream (assuming there are enough chars remaining in the stream). Must not be greater than blockOverlap. The default is 2/3 of blockOverlap.</param>
  /// <param name="byteBufferLength">The size of the byte buffer used for decoding purposes. The default is 2^12 = 4KB.</param>
  /// <exception cref="ArgumentNullException">stream or encoding is null.</exception>
  /// <exception cref="ArgumentException">stream is not readable.</exception>
  public CharStream(Stream stream, bool leaveOpen,
                    Encoding encoding, bool detectEncodingFromByteOrderMarks,
                    int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
  {
      if (stream == null) throw new ArgumentNullException("stream");
      if (!stream.CanRead) throw new ArgumentException("stream is not readable");
      if (encoding == null) throw new ArgumentNullException("encoding");
      // Pass the caller's leaveOpen on. A hard-coded false here would close
      // the caller's stream even when leaveOpen was requested.
      StreamConstructorContinue(stream, leaveOpen, encoding, detectEncodingFromByteOrderMarks,
                                blockSize, blockOverlap, minRegexSpace, byteBufferLength);
  }
  /// <summary>Test hook: the unit tests modify this flag via reflection, so its name and its being non-readonly must be preserved.</summary>
  private static bool DoNotRoundUpBlockSizeToSimplifyTesting = false;
  636. private void StreamConstructorContinue(Stream stream, bool leaveOpen,
  637. Encoding encoding, bool detectEncodingFromByteOrderMarks,
  638. int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
  639. {
  640. if (byteBufferLength < MinimumByteBufferLength) byteBufferLength = MinimumByteBufferLength;
  641. int bytesInStream = -1;
  642. long streamPosition;
  643. if (stream.CanSeek) {
  644. streamPosition = stream.Position;
  645. long streamLength = stream.Length - streamPosition;
  646. if (streamLength <= Int32.MaxValue) {
  647. bytesInStream = (int)streamLength;
  648. if (bytesInStream < byteBufferLength) byteBufferLength = bytesInStream;
  649. }
  650. } else {
  651. streamPosition = 0;
  652. }
  653. byte[] byteBuffer = new byte[byteBufferLength];
  654. int byteBufferCount = 0;
  655. do {
  656. int n = stream.Read(byteBuffer, byteBufferCount, byteBufferLength - byteBufferCount);
  657. if (n == 0) {
  658. bytesInStream = byteBufferCount;
  659. break;
  660. }
  661. byteBufferCount += n;
  662. } while (byteBufferCount < MinimumByteBufferLength);
  663. streamPosition += byteBufferCount;
  664. int preambleLength = Helper.DetectPreamble(byteBuffer, byteBufferCount, ref encoding, detectEncodingFromByteOrderMarks);
  665. bytesInStream -= preambleLength;
  666. Encoding = encoding;
  667. Decoder decoder = encoding.GetDecoder();
  668. // we allow such small block sizes only to simplify testing
  669. if (blockSize < 8) blockSize = DefaultBlockSize;
  670. bool allCharsFitIntoOneBlock = false;
  671. if (bytesInStream >= 0 && bytesInStream/4 <= blockSize) {
  672. if (bytesInStream != 0) {
  673. try {
  674. int maxCharCount = Encoding.GetMaxCharCount(bytesInStream); // may throw ArgumentOutOfRangeException
  675. if (blockSize >= maxCharCount) {
  676. allCharsFitIntoOneBlock = true;
  677. blockSize = maxCharCount;
  678. }
  679. } catch (ArgumentOutOfRangeException) { }
  680. } else {
  681. allCharsFitIntoOneBlock = true;
  682. blockSize = 0;
  683. }
  684. }
  685. var buffer = StringBuffer.Create(blockSize);
  686. Debug.Assert(buffer.Length >= blockSize && (blockSize > 0 || buffer.StringPointer == null));
  687. Buffer = buffer;
  688. BufferString = buffer.String;
  689. BufferStringPointer = buffer.StringPointer;
  690. char* bufferBegin = buffer.StringPointer + buffer.Index;
  691. try {
  692. if (allCharsFitIntoOneBlock) {
  693. int bufferCount = preambleLength == byteBufferCount
  694. ? 0
  695. : ReadAllRemainingCharsFromStream(bufferBegin, buffer.Length, byteBuffer, preambleLength, byteBufferCount, stream, streamPosition, decoder);
  696. if (!leaveOpen) stream.Close();
  697. var anchor = Anchor.Create(this);
  698. this.anchor = anchor;
  699. anchor->BlockSizeMinusOverlap = bufferCount;
  700. anchor->EndIndex = bufferCount;
  701. if (bufferCount != 0) {
  702. anchor->BufferBegin = bufferBegin;
  703. anchor->BufferEnd = bufferBegin + bufferCount;
  704. } else {
  705. anchor->BufferBegin = null;
  706. anchor->BufferEnd = null;
  707. }
  708. anchor->Block = 0;
  709. anchor->LastBlock = 0;
  710. anchor->CharIndex = 0;
  711. anchor->CharIndexOffset = 0;
  712. anchor->CharIndexPlusOffset = 0;
  713. } else {
  714. if (!DoNotRoundUpBlockSizeToSimplifyTesting) blockSize = buffer.Length;
  715. var d = new MultiBlockData();
  716. Data = d;
  717. d.Stream = stream;
  718. d.StreamPosition = streamPosition;
  719. d.LeaveOpen = leaveOpen;
  720. d.Decoder = decoder;
  721. d.ByteBuffer = byteBuffer;
  722. d.ByteBufferIndex = preambleLength;
  723. d.ByteBufferCount = byteBufferCount;
  724. d.MaxCharCountForOneByte = Math.Max(1, Encoding.GetMaxCharCount(1));
  725. d.SerializableDecoderMembers = GetSerializableDecoderMemberInfo(decoder);
  726. if (blockSize < 3*d.MaxCharCountForOneByte) blockSize = 3*d.MaxCharCountForOneByte;
  727. // MaxCharCountForOneByte == the maximum number of overhang chars
  728. if( Math.Min(blockOverlap, blockSize - 2*blockOverlap) < d.MaxCharCountForOneByte
  729. || blockOverlap >= blockSize/2) blockOverlap = blockSize/3;
  730. if (minRegexSpace < 0 || minRegexSpace > blockOverlap) minRegexSpace = 2*blockOverlap/3;
  731. d.BlockSize = blockSize;
  732. d.BlockOverlap = blockOverlap;
  733. d.RegexSpaceThreshold = bufferBegin + (blockSize - minRegexSpace);
  734. var anchor = Anchor.Create(this);
  735. this.anchor = anchor;
  736. d.anchor = anchor;
  737. anchor->BlockSizeMinusOverlap = blockSize - blockOverlap;
  738. anchor->EndIndex = Int64.MaxValue;
  739. anchor->BufferBegin = bufferBegin;
  740. anchor->BufferEnd = bufferBegin;
  741. anchor->Block = -2; // special value recognized by ReadBlock
  742. anchor->LastBlock = Int32.MaxValue;
  743. anchor->CharIndex = 0;
  744. anchor->CharIndexOffset = 0;
  745. anchor->CharIndexPlusOffset = 0;
  746. d.Blocks = new List<BlockInfo>();
  747. // the first block has no overlap with a previous block
  748. d.Blocks.Add(new BlockInfo(preambleLength, preambleLength, 0, EOS, null, new DecoderState(), null, new Decod