/fparsec/main/FParsecCS/CharStream.cs
http://github.com/sandersn/fing · C# · 2073 lines · 1467 code · 161 blank · 445 comment · 673 complexity · cbb1fa5f9620a84c92c17759d91abb76 MD5 · raw file
Large files are truncated click here to view the full file
- // Copyright (c) Stephan Tolksdorf 2007-2009
- // License: Simplified BSD License. See accompanying documentation.
- #if !LOW_TRUST
- using System;
- using System.IO;
- using System.Collections.Generic;
- using System.Text;
- using System.Text.RegularExpressions;
- using System.Diagnostics;
- using System.Reflection;
- using System.Runtime.Serialization;
- using System.Runtime.InteropServices;
- using System.Runtime.CompilerServices;
- namespace FParsec {
- /// <summary>Provides access to the char content of a binary Stream (or a String) through
- /// an iterator-based interface that is especially well suited for parser applications.</summary>
- public unsafe sealed class CharStream : IDisposable {
- // In order to facilitate efficient backtracking we divide the stream into overlapping
- // blocks with equal number of chars. The blocks are overlapping, so that
- // backtracking over short distances at a block boundary doesn't trigger a reread of the
- // previous block.
- //
- //           Block 0
- //
- //  -----------------|--------  Block 1
- //                    Overlap
- //                    --------|--------|--------  Block 2
- //                                      Overlap
- //                                      --------|--------|--------
- //                                                        (...)
- // a '-' symbolizes a char, a '|' a block boundary.
- //
- //
- // In general there's no fixed relationship between the number of input bytes and the
- // number of input chars. Worse, the encoding can be stateful, which makes it necessary
- // to persist the decoder state over block boundaries. If we later want to
- // be able to reread a certain block, we therefore need to keep record of various
- // bits of information describing the state of the input stream at the beginning of a block:
/// <summary>Records the state of the input stream at the beginning of a block,
/// so that the block can be read again at a later time.</summary>
private class BlockInfo {
    /// <summary>Byte stream index of the first char in the block that comes after the OverhangCharsAtBlockBegin.</summary>
    public long ByteIndex;

    /// <summary>Value of the CharStream's ByteBufferIndex before the block is read.</summary>
    public int ByteBufferIndex;

    /// <summary>Number of bytes in the stream between ByteIndex and the first char that comes after the OverhangCharsAfterOverlap.</summary>
    public int NumberOfBytesInOverlap;

    /// <summary>Last char in the overlap with the previous block (used for integrity checking).</summary>
    public char LastCharInOverlap;

    /// <summary>Chars at the block begin that were already read together with chars of the last block before the overlap.</summary>
    public string OverhangCharsAtBlockBegin;

    /// <summary>Chars after the overlap with the previous block that were already read together with the overlap chars.</summary>
    public string OverhangCharsAfterOverlap;

    /// <summary>Decoder state captured at the begin of the block.</summary>
    public DecoderState DecoderStateAtBlockBegin;

    /// <summary>Decoder state captured after the overlap with the previous block has been read.</summary>
    public DecoderState DecoderStateAfterOverlap;

    public BlockInfo(long byteIndex, int byteBufferIndex,
                     int nBytesInOverlapCount, char lastCharInOverlap,
                     string overhangCharsAtBlockBegin, DecoderState decoderStateAtBlockBegin,
                     string overhangCharsAfterOverlap, DecoderState decoderStateAfterOverlap)
    {
        // state at the begin of the block
        ByteIndex                 = byteIndex;
        ByteBufferIndex           = byteBufferIndex;
        OverhangCharsAtBlockBegin = overhangCharsAtBlockBegin;
        DecoderStateAtBlockBegin  = decoderStateAtBlockBegin;

        // state after the overlap with the previous block
        NumberOfBytesInOverlap    = nBytesInOverlapCount;
        LastCharInOverlap         = lastCharInOverlap;
        OverhangCharsAfterOverlap = overhangCharsAfterOverlap;
        DecoderStateAfterOverlap  = decoderStateAfterOverlap;
    }
}
- // Unfortunately the Decoder API has no explicit methods for managing the state,
- // which forces us to abuse the comparatively inefficient serialization API for this purpose.
- // (The absence of explicit state management or at least a deep cloning method in the Decoder interface
- // is almost as puzzling as the absence of such methods in System.Random).
// Cache of the serializable members of each Decoder type seen so far.
// The dictionary is created by the static field initializer instead of lazily inside
// GetSerializableDecoderMemberInfo: an unsynchronized "if (cache == null) cache = new ..."
// would allow two threads to create two different dictionaries and then lock on
// different objects. With a readonly, eagerly created instance every thread
// synchronizes on the same object.
private static readonly Dictionary<Type, MemberInfo[]> SerializableMemberInfoCache = new Dictionary<Type, MemberInfo[]>(8);

/// <summary>Returns the serializable members of the runtime type of the given decoder,
/// or null if that type is not marked as serializable. The result is cached per type;
/// access to the cache is synchronized.</summary>
private static MemberInfo[] GetSerializableDecoderMemberInfo(Decoder decoder) {
    Type type = decoder.GetType();
    if (!type.IsSerializable) return null;
    MemberInfo[] smis;
    lock (SerializableMemberInfoCache) {
        if (!SerializableMemberInfoCache.TryGetValue(type, out smis)) {
            smis = FormatterServices.GetSerializableMembers(type, new StreamingContext(StreamingContextStates.Clone));
            SerializableMemberInfoCache.Add(type, smis);
        }
    }
    return smis;
}
/// <summary>Holds the values of the serializable members of a Decoder, so that the
/// decoder can later be put back into the captured state.</summary>
private struct DecoderState {
    /// <summary>The captured member values, or null if no serializable members were supplied.</summary>
    private object[] DecoderData;

    /// <summary>Captures the values of the given members of the decoder.
    /// If serializableDecoderMembers is null, nothing is captured.</summary>
    public DecoderState(Decoder decoder, MemberInfo[] serializableDecoderMembers) {
        if (serializableDecoderMembers == null) {
            DecoderData = null;
        } else {
            DecoderData = FormatterServices.GetObjectData(decoder, serializableDecoderMembers);
        }
    }

    /// <summary>Writes the captured member values into the given decoder.
    /// If nothing was captured, the decoder is reset instead.</summary>
    public void WriteTo(ref Decoder decoder, MemberInfo[] serializableDecoderMembers) {
        if (DecoderData == null) {
            decoder.Reset();
            return;
        }
        // The values are populated into the existing decoder instance;
        // the reference passed in is not replaced by a new object.
        FormatterServices.PopulateObjectMembers(decoder, serializableDecoderMembers, DecoderData);
    }
}
- private const int DefaultBlockSize = 3*(1 << 16); // 3*2^16 = 200k
- private const int DefaultByteBufferLength = (1 << 12);
- private static int MinimumByteBufferLength = 128; // must be larger than longest detectable preamble (we can only guess here)
- private const char EOS = '\uFFFF';
- // For ease of use, we need the iterators to hold a reference to the CharStream. If we stored
- // a CharStream reference directly in the iterator, the JIT would emit a call to the write barrier
- // thunk for each write to the reference field. As we want to use iterators mainly as immutable values,
- // we need them to be structs for performance reasons, and since structs are constantly copied
- // by design, we would get frequent write barrier calls*. Redirecting the CharStream
- // access through an "Anchor" allows us to relieve the GC from having to keep track of all the
- // CharStream references in the iterators. The trick is that an Anchor instance does not contain
- // any reference to a managed object, only a GCHandle to the CharStream and other value type members
- // important to the Iterators. Because the Anchor struct only has primitive members, we can take
- // an unmanaged pointer which the GC doesn't need to track. To avoid most GCHandle.Target accesses,
- // the CharStream stores pieces of information important to the iterators directly in the Anchor.
- //
- // * Just to be clear: Write barrier calls are rather cheap (about the cost of a virtual function
- // call) and overall FParsec performance is only marginally influenced by this optimization.
- // (Each Reply<_,_> value alone currently triggers 2-3 write barrier calls, even when it is
- // allocated on the stack and all fields are initialized to 0/null!).
- internal Anchor* anchor; // allocated and assigned during construction,
- // freed and set to null during disposal
/// <summary>The link between a CharStream and its Iterators.
/// An Anchor may live on the unmanaged heap and owns a GCHandle, so it has to be
/// released with Anchor.Free.</summary>
internal struct Anchor {
    public int Block;
    /// <summary>Index of the last block of the stream, or Int32.MaxValue as long as the end of the stream has not been detected.</summary>
    public int LastBlock;
    public GCHandle StreamHandle;
    /// <summary>Begin of the used part of the char buffer (stays constant). Null if the CharStream is empty.</summary>
    public char* BufferBegin;
    /// <summary>End of the used part of the char buffer (varies for a multi-block stream). Null if the CharStream is empty.</summary>
    public char* BufferEnd;
    public long CharIndex;
    public long CharIndexPlusOffset;
    public long CharIndexOffset;
    public long EndIndex;
    public int BlockSizeMinusOverlap;
    public bool NeedToFree;

    /// <summary>Allocates an Anchor on the unmanaged heap and links it to the given stream.
    /// Only NeedToFree and StreamHandle are set here; all other fields have to be
    /// initialized by the caller.</summary>
    public static Anchor* Create(CharStream stream) {
        // The anchor is placed on the unmanaged heap. Pinning a managed instance would
        // also work, but an Anchor is small and potentially long-lived, so pinning it
        // could fragment the managed heap.
        // (Should AllocHGlobal ever become a bottleneck, a pool allocator could take its place.)
        IntPtr memory = Marshal.AllocHGlobal(sizeof(Anchor));
        Anchor* anchor = (Anchor*) memory;
        anchor->NeedToFree = true;
        anchor->StreamHandle = GCHandle.Alloc(stream, GCHandleType.Normal);
        return anchor;
    }

    /// <summary>Frees the GCHandle of the anchor and, if NeedToFree is set,
    /// also the unmanaged memory of the anchor itself.</summary>
    public static void Free(Anchor *p) {
        p->StreamHandle.Free();
        if (!p->NeedToFree) return;
        Marshal.FreeHGlobal((IntPtr) p);
    }
}
- /// <summary>The Encoding that is used for decoding the underlying byte stream, or
- /// System.Text.UnicodeEncoding in case the stream was directly constructed
- /// from a string.</summary>
- public Encoding Encoding { get; private set; }
- // If the CharStream is constructed from a binary stream, we use a managed string as the char
- // buffer. This allows us to apply regular expressions directly to the input.
- // In the case of multi-block CharStreams we thus have to mutate the buffer string through pointers.
- // This is safe as long as we use a newly constructed string and we don't pass a reference
- // to the internal buffer string to the "outside world". (The one instance where we have to pass
- // a reference to the buffer string is regex matching. See the docs for Iterator.Match(regex) for more info.)
- //
- // Apart from Iter.Match(regex) we access the internal buffer only through a pinned pointer.
- // This way we avoid the overhead of redundant bounds checking and can support strings, char arrays
- // and unmanaged char buffers through the same interface. Accessing the buffer through pointers
- // is also a requirement for accessing the CharStream data through an Anchor pointer (see above).
- //
- // Pinning a string or char array makes life more difficult for the GC. However, as long as
- // the buffer is only short-lived or large enough to be allocated on the large object heap,
- // there shouldn't be a problem. Furthermore, the buffer strings for CharStreams constructed
- // from a binary stream are allocated through the StringBuffer interface and hence always live
- // on the large object heap. Thus, the only scenario to really worry about (and which the
- // documentation explicitly warns about) is when a large number of small CharStreams
- // are constructed directly from strings or char arrays and are used for an extended period of time.
- /// <summary>The string holding the char buffer, or null if the buffer is not part of a .NET string.</summary>
- internal string BufferString;
- /// <summary>A pointer to the beginning of BufferString, or null if BufferString is null.</summary>
- internal char* BufferStringPointer;
- /// <summary>Holds the GCHandle for CharStreams directly constructed from strings or char arrays.</summary>
- private GCHandle BufferHandle;
- /// <summary>Holds the StringBuffer for CharStreams constructed from a binary stream.</summary>
- private StringBuffer Buffer;
- private MultiBlockData Data;
- /// <summary>Contains the data and methods needed in case the input byte stream
- /// is large enough to span multiple blocks of the CharStream.</summary>
- private class MultiBlockData {
- public Anchor* anchor;
- public Stream Stream;
- // we keep a separate record of the Stream.Position, so that we don't need to require Stream.CanSeek
- public long StreamPosition;
- public bool LeaveOpen;
- public int MaxCharCountForOneByte;
- public Decoder Decoder;
- public MemberInfo[] SerializableDecoderMembers;
- public int BlockSize;
- public int BlockOverlap;
- /// <summary>BufferBegin + BlockSize - minRegexSpace</summary>
- public char* RegexSpaceThreshold;
/// <summary>The byte stream index of the first byte in the ByteBuffer that has not been used yet.</summary>
public long ByteIndex {
    get {
        int unusedBytes = ByteBufferCount - ByteBufferIndex;
        return StreamPosition - unusedBytes;
    }
}
- public List<BlockInfo> Blocks;
- public byte[] ByteBuffer;
- public int ByteBufferIndex;
- public int ByteBufferCount;
/// <summary>Returns the number of bytes in the ByteBuffer that have not been used yet.
/// If there are none, the ByteBuffer is first refilled starting at index 0 and the
/// number of bytes in the refilled ByteBuffer is returned.</summary>
private int FillByteBuffer() {
    int unused = ByteBufferCount - ByteBufferIndex;
    return unused > 0 ? unused : ClearAndRefillByteBuffer(0);
}
/// <summary>Refills the ByteBuffer from the given index onwards. The ByteBuffer is filled
/// up to ByteBuffer.Length unless the underlying byte stream ends before that.
/// Updates ByteBufferIndex, ByteBufferCount and StreamPosition and returns the number of
/// bytes that are available for consumption in the refilled ByteBuffer.</summary>
private int ClearAndRefillByteBuffer(int byteBufferIndex) {
    Debug.Assert(byteBufferIndex >= 0 && byteBufferIndex <= ByteBuffer.Length);
    // A single Stream.Read call may fill only part of the supplied range, so we keep
    // reading until the buffer is full or Read reports that nothing more is available.
    // Being able to rely on a completely filled buffer lets us compute the buffer
    // utilization after skipping a given number of input bytes. Most streams need only
    // one iteration (or two at the end of the stream).
    int end = ByteBuffer.Length;
    int pos = byteBufferIndex;
    while (pos != end) {
        int nRead = Stream.Read(ByteBuffer, pos, end - pos);
        if (nRead == 0) break;
        pos += nRead;
    }
    int n = pos - byteBufferIndex;
    ByteBufferIndex = byteBufferIndex;
    ByteBufferCount = pos;
    StreamPosition += n;
    return n;
}
- /// <summary>Reads up to the given maximum number of chars into the given buffer.
- /// If more than the maximum number of chars have to be read from the stream in order to
- /// fill the buffer (due to the way the Decoder API works), the overhang chars are
- /// returned through the output parameter (which is set to null if there are no overhang chars).
- /// Returns a pointer to one char after the last char read. A DecoderFallbackException thrown during decoding is rethrown after the byte stream position of the error has been added to its Data under the key "Stream.Position".</summary>
- private char* ReadCharsFromStream(char* buffer, int maxCount, out string overhangChars) {
- Debug.Assert(maxCount >= 0);
- fixed (byte* byteBuffer = ByteBuffer) {
- overhangChars = null;
- try {
- while (maxCount >= MaxCharCountForOneByte) {// if maxCount < MaxCharCountForOneByte, Convert could throw
- int nBytesInByteBuffer = FillByteBuffer();
- bool flush = nBytesInByteBuffer == 0; // no bytes could be read anymore: let the decoder flush its internal state
- int bytesUsed, charsUsed; bool completed = false;
- Decoder.Convert(byteBuffer + ByteBufferIndex, nBytesInByteBuffer,
- buffer, maxCount, flush,
- out bytesUsed, out charsUsed, out completed);
- ByteBufferIndex += bytesUsed; // Convert consumed bytesUsed bytes from the byte buffer
- buffer += charsUsed;
- maxCount -= charsUsed;
- if (flush && completed) return buffer; // the input is exhausted and the decoder has been flushed
- }
- if (maxCount == 0) return buffer; // the requested number of chars was read exactly, no overhang
- char* cs = stackalloc char[MaxCharCountForOneByte]; // fewer than MaxCharCountForOneByte chars remain to be read: decode into a scratch buffer that is large enough for Convert
- for (;;) {
- int nBytesInByteBuffer = FillByteBuffer();
- bool flush = nBytesInByteBuffer == 0;
- int bytesUsed, charsUsed; bool completed;
- Decoder.Convert(byteBuffer + ByteBufferIndex, nBytesInByteBuffer,
- cs, MaxCharCountForOneByte, flush,
- out bytesUsed, out charsUsed, out completed);
- ByteBufferIndex += bytesUsed;
- if (charsUsed > 0) {
- int i = 0;
- do {
- *(buffer++) = cs[i++];
- if (--maxCount == 0) {
- if (i < charsUsed) overhangChars = new string(cs, i, charsUsed - i); // chars decoded beyond maxCount are handed back to the caller as overhang
- return buffer;
- }
- } while (i < charsUsed);
- }
- if (flush && completed) return buffer;
- }
- } catch (DecoderFallbackException e) {
- e.Data.Add("Stream.Position", ByteIndex + e.Index); // ByteIndex is the stream index of the first unused byte in the ByteBuffer
- throw;
- }
- }
- }
- /// <summary> Reads a block of chars (must be different from the current block)
- /// into the BufferString. Returns a pointer to the first newly available char of the block (when advancing from the directly preceding block this is the first char after the overlap with that block),
- /// or null if no chars could be read or if the block index is greater than anchor->LastBlock. Throws an InvalidOperationException if block is the current block, a NotSupportedException if a backward jump would require restoring the state of a non-serializable decoder, and an IOException if the reread data is inconsistent with the recorded block information.</summary>
- internal char* ReadBlock(int block) {
- if (block > anchor->LastBlock) return null; // the requested block lies behind the last block of the stream
- int prevBlock = anchor->Block;
- if (block == prevBlock) throw new InvalidOperationException();
- if (SerializableDecoderMembers == null && block > 0) {
- if (prevBlock > block)
- throw new NotSupportedException("The CharStream does not support seeking backwards over ranges longer than the block overlap because the Encoding's Decoder is not serializable.");
- while (prevBlock + 1 < block) ReadBlock(++prevBlock); // no decoder state can be restored, so the blocks in between are read one after the other
- }
- BlockInfo bi = Blocks[block];
- int blockSizeMinusOverlap = BlockSize - BlockOverlap;
- long charIndex = Math.BigMul(block, blockSizeMinusOverlap); // char index of the block begin, not yet including anchor->CharIndexOffset
- char* bufferBegin = anchor->BufferBegin;
- char* begin, buffer;
- int nCharsToRead;
- // fill [0 ... BlockOverlap-1] if block > 0
- if (prevBlock == block - 1) { // advancing by exactly one block: the overlap chars are already in the buffer
- MemMove(bufferBegin, bufferBegin + blockSizeMinusOverlap, BlockOverlap*2); // NOTE(review): BlockOverlap*2 is presumably a byte count (2 bytes per char) - confirm against MemMove
- Debug.Assert(bufferBegin[BlockOverlap - 1] == bi.LastCharInOverlap);
- begin = buffer = bufferBegin + BlockOverlap;
- } else if (prevBlock >= 0) { // any other jump: reposition the byte stream and restore the recorded decoder state
- Stream.Seek(bi.ByteIndex, SeekOrigin.Begin); // will throw if Stream can't seek
- // now that there was no exception, we can change the state...
- StreamPosition = bi.ByteIndex;
- ClearAndRefillByteBuffer(bi.ByteBufferIndex);
- bi.DecoderStateAtBlockBegin.WriteTo(ref Decoder, SerializableDecoderMembers); // will reset Decoder if block == 0
- if (prevBlock == block + 1) {
- // move the overlap into [BlockSize - BlockOverlap, BlockSize - 1] before it gets overwritten
- MemMove(bufferBegin + blockSizeMinusOverlap, bufferBegin, BlockOverlap*2);
- }
- begin = buffer = bufferBegin;
- if (block > 0) {
- nCharsToRead = BlockOverlap;
- if (bi.OverhangCharsAtBlockBegin != null) {
- nCharsToRead -= bi.OverhangCharsAtBlockBegin.Length;
- for (int i = 0; i < bi.OverhangCharsAtBlockBegin.Length; ++i)
- *(buffer++) = bi.OverhangCharsAtBlockBegin[i];
- }
- string overhangCharsAfterOverlap;
- buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlap);
- if ( buffer != bufferBegin + BlockOverlap
- || ByteIndex != bi.ByteIndex + bi.NumberOfBytesInOverlap
- || *(buffer - 1) != bi.LastCharInOverlap
- || overhangCharsAfterOverlap != bi.OverhangCharsAfterOverlap)
- throw new IOException("CharStream: stream integrity error"); // the reread overlap differs from what was recorded when the block was first read
- }
- } else { // ReadBlock was called from the constructor
- if (block != 0) throw new InvalidOperationException();
- begin = buffer = bufferBegin;
- }
- // fill [0 ... BlockSize-BlockOverlap-1] if block == 0
- // and [BlockOverlap ... BlockSize-BlockOverlap-1] otherwise
- if (block == 0) {
- nCharsToRead = blockSizeMinusOverlap;
- } else {
- nCharsToRead = blockSizeMinusOverlap - BlockOverlap;
- if (bi.OverhangCharsAfterOverlap != null) {
- nCharsToRead -= bi.OverhangCharsAfterOverlap.Length;
- for (int i = 0; i < bi.OverhangCharsAfterOverlap.Length; ++i)
- *(buffer++) = bi.OverhangCharsAfterOverlap[i];
- }
- }
- string overhangCharsAtNextBlockBegin;
- buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAtNextBlockBegin);
- long byteIndexAtNextBlockBegin = ByteIndex;
- int byteBufferIndexAtNextBlockBegin = ByteBufferIndex;
- // fill [BlockSize-BlockOverlap ... BlockSize-1]
- if (block == Blocks.Count - 1) { // next block hasn't yet been read
- DecoderState decoderStateAtNextBlockBegin = new DecoderState(Decoder, SerializableDecoderMembers);
- nCharsToRead = BlockOverlap;
- if (overhangCharsAtNextBlockBegin != null) {
- nCharsToRead -= overhangCharsAtNextBlockBegin.Length;
- for (int i = 0; i < overhangCharsAtNextBlockBegin.Length; ++i)
- *(buffer++) = overhangCharsAtNextBlockBegin[i];
- }
- string overhangCharsAfterOverlapWithNextBlock;
- buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlapWithNextBlock);
- if (anchor->LastBlock == Int32.MaxValue) { // last block hasn't yet been detected
- if (buffer == bufferBegin + BlockSize) { // the buffer was filled completely, so a record for the next block is added
- DecoderState decoderStateAfterOverlapWithNextBlock = new DecoderState(Decoder, SerializableDecoderMembers);
- int nBytesInOverlapWithNextBlock = (int)(ByteIndex - byteIndexAtNextBlockBegin);
- Blocks.Add(new BlockInfo(byteIndexAtNextBlockBegin, byteBufferIndexAtNextBlockBegin,
- nBytesInOverlapWithNextBlock, *(buffer - 1),
- overhangCharsAtNextBlockBegin, decoderStateAtNextBlockBegin,
- overhangCharsAfterOverlapWithNextBlock, decoderStateAfterOverlapWithNextBlock));
- } else { // we reached the end of the stream
- anchor->LastBlock = block;
- anchor->EndIndex = anchor->CharIndexOffset + charIndex + (buffer - bufferBegin);
- }
- } else if (anchor->EndIndex != anchor->CharIndexOffset + charIndex + (buffer - bufferBegin)) {
- throw new IOException("CharStream: stream integrity error"); // the end of the stream is not where it was detected before
- }
- } else {
- BlockInfo nbi = Blocks[block + 1];
- if (buffer != bufferBegin + blockSizeMinusOverlap
- || byteIndexAtNextBlockBegin != nbi.ByteIndex
- || byteBufferIndexAtNextBlockBegin != nbi.ByteBufferIndex
- || overhangCharsAtNextBlockBegin != nbi.OverhangCharsAtBlockBegin)
- throw new IOException("CharStream: stream integrity error");
- if (prevBlock != block + 1 || (block == 0 && SerializableDecoderMembers == null)) { // jumping back to block 0 is supported even if the decoder is not serializable
- nCharsToRead = BlockOverlap;
- if (overhangCharsAtNextBlockBegin != null) {
- nCharsToRead -= overhangCharsAtNextBlockBegin.Length;
- for (int i = 0; i < overhangCharsAtNextBlockBegin.Length; ++i)
- *(buffer++) = overhangCharsAtNextBlockBegin[i];
- }
- string overhangCharsAfterOverlapWithNextBlock;
- buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlapWithNextBlock);
- int nBytesInOverlapWithNextBlock = (int)(ByteIndex - byteIndexAtNextBlockBegin);
- if (buffer != bufferBegin + BlockSize
- || nBytesInOverlapWithNextBlock != nbi.NumberOfBytesInOverlap
- || *(buffer - 1) != nbi.LastCharInOverlap
- || overhangCharsAfterOverlapWithNextBlock != nbi.OverhangCharsAfterOverlap)
- throw new IOException("CharStream: stream integrity error");
- } else {
- Debug.Assert(bufferBegin[BlockSize - 1] == nbi.LastCharInOverlap);
- buffer += BlockOverlap; // we already copied the chars at the beginning of this function
- int off = nbi.NumberOfBytesInOverlap - (ByteBufferCount - ByteBufferIndex); // number of overlap bytes that lie behind the unused bytes of the ByteBuffer
- if (off > 0) {
- // we wouldn't have gotten here if the Stream didn't support seeking
- Stream.Seek(off, SeekOrigin.Current);
- StreamPosition += off;
- ClearAndRefillByteBuffer(off%ByteBuffer.Length);
- } else {
- ByteBufferIndex += nbi.NumberOfBytesInOverlap; // all overlap bytes are still in the ByteBuffer and can simply be skipped
- }
- nbi.DecoderStateAfterOverlap.WriteTo(ref Decoder, SerializableDecoderMembers);
- }
- }
- anchor->Block = block;
- anchor->CharIndex = charIndex;
- anchor->CharIndexPlusOffset = anchor->CharIndexOffset + charIndex;
- anchor->BufferEnd = buffer;
- return begin == buffer ? null : begin; // null if no char was placed behind begin
- }
- }
/// <summary>Decodes the unused bytes of the byte buffer and all remaining bytes of the
/// stream into the given char buffer and returns the number of chars written.
/// If the remaining stream content holds more than maxCount chars, an exception will be thrown.
/// A DecoderFallbackException is rethrown after the byte stream position of the error
/// has been added to its Data under the key "Stream.Position".</summary>
private static int ReadAllRemainingCharsFromStream(char* buffer, int maxCount, byte[] byteBuffer, int byteBufferIndex, int byteBufferCount, Stream stream, long streamPosition, Decoder decoder) {
    Debug.Assert(maxCount > 0 && byteBufferIndex >= 0 && byteBufferIndex < byteBufferCount);
    fixed (byte* pBytes = byteBuffer) {
        int nChars = 0;
        bool atEndOfStream = false;
        while (true) {
            try {
                nChars += decoder.GetChars(pBytes + byteBufferIndex, byteBufferCount - byteBufferIndex,
                                           buffer + nChars, maxCount - nChars, atEndOfStream);
            } catch (DecoderFallbackException e) {
                long byteIndex = streamPosition - (byteBufferCount - byteBufferIndex);
                e.Data.Add("Stream.Position", byteIndex + e.Index);
                throw;
            }
            // the call that flushed the decoder was the last one
            if (atEndOfStream) return nChars;
            // GetChars consumed all bytes in the byte buffer, so it can be refilled from index 0
            byteBufferIndex = 0;
            byteBufferCount = stream.Read(byteBuffer, 0, byteBuffer.Length);
            streamPosition += byteBufferCount;
            atEndOfStream = byteBufferCount == 0;
        }
    }
}
- /// <summary>The index of the block that is currently held in the char buffer (anchor->Block).</summary>
- private int Block { get { return anchor->Block; } }
- /// <summary>The number of chars in the used part of the char buffer, i.e. the distance between anchor->BufferBegin and anchor->BufferEnd.</summary>
- private int BufferCount { get { return PositiveDistance(anchor->BufferBegin, anchor->BufferEnd); } }
- /// <summary>The index of the first char in the stream, i.e. Begin.Index.
- /// This value is determined by the streamBeginIndex argument of some of the CharStream constructors.
- /// By default this value is 0.</summary>
- public long BeginIndex { get { return anchor->CharIndexOffset; } }
- /// <summary>The index of the last char of the stream plus 1,
- /// or Int64.MaxValue if the end of stream has not yet been detected.</summary>
- public long EndIndex { get { return anchor->EndIndex; } }
- [Obsolete("CharStream.IndexOffset has been renamed to CharStream.BeginIndex.")]
- public long IndexOffset { get { return BeginIndex; } } // obsolete alias, forwards to BeginIndex
- [Obsolete("CharStream.EndOfStream has been renamed to CharStream.EndIndex.")]
- public long EndOfStream { get { return EndIndex; } } // obsolete alias, forwards to EndIndex
// There is deliberately no public constructor that takes only a string,
// because it could be confused with a constructor taking a file path.
/// <summary>Constructs a CharStream over all chars of the given string, with stream begin index 0.
/// The string is pinned and its address is stored in BufferStringPointer.</summary>
internal CharStream(string chars) {
    Debug.Assert(chars != null);
    BufferString = chars;
    BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
    BufferStringPointer = (char*)BufferHandle.AddrOfPinnedObject();
    CharConstructorContinue(BufferStringPointer, chars.Length, 0);
}
/// <summary>Constructs a CharStream from the chars of the string argument in the index range from index (inclusive) to index + length (exclusive). The first char in the stream is assigned the index 0.</summary>
/// <exception cref="ArgumentNullException">chars is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0 and index + length ≤ chars.Length.</exception>
public CharStream(string chars, int index, int length)
    : this(chars, index, length, 0) { }

/// <summary>Constructs a CharStream from the chars of the string argument in the index range from index (inclusive) to index + length (exclusive). The first char in the stream is assigned the index streamBeginIndex.</summary>
/// <exception cref="ArgumentNullException">chars is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0, index + length ≤ chars.Length and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
public CharStream(string chars, int index, int length, long streamBeginIndex) {
    if (chars == null) {
        throw new ArgumentNullException("chars");
    }
    if (index < 0) {
        throw new ArgumentOutOfRangeException("index", "The index is negative.");
    }
    int maxLength = chars.Length - index;
    if (length < 0 || length > maxLength) {
        throw new ArgumentOutOfRangeException("length", "The length is out of range.");
    }
    bool beginIndexIsValid = streamBeginIndex >= 0 && streamBeginIndex < (1L << 60);
    if (!beginIndexIsValid) {
        throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
    }

    // Pin the string, so that the char buffer can be accessed through a pointer.
    BufferString = chars;
    BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
    char* stringBegin = (char*)BufferHandle.AddrOfPinnedObject();
    BufferStringPointer = stringBegin;
    CharConstructorContinue(stringBegin + index, length, streamBeginIndex);
}
- /// <summary>Constructs a CharStream from the chars in the char array argument between the indices index (inclusive) and index + length (exclusive).</summary>
- /// <exception cref="ArgumentNullException">chars is null.</exception>
- /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0 and index + length ≤ chars.Length.</exception>
- public CharStream(char[] chars, int index, int length) : this(chars, index, length, 0) { }
- /// <summary>Constructs a CharStream from the chars in the char array argument between the indices index (inclusive) and index + length (exclusive). The first char in the stream is assigned the index streamBeginIndex.</summary>
- /// <exception cref="ArgumentNullException">chars is null.</exception>
- /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0, index + length ≤ chars.Length and 0 ≤ streamBeginIndex < 2^60.</exception>
- public CharStream(char[] chars, int index, int length, long streamBeginIndex) {
- if (chars == null) throw new ArgumentNullException("chars");
- if (index < 0) throw new ArgumentOutOfRangeException("index", "The index is negative.");
- if (length < 0 || length > chars.Length - index) throw new ArgumentOutOfRangeException("length", "The length is out of range.");
- if (streamBeginIndex < 0 || streamBeginIndex >= (1L << 60)) throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
- BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned); // pin the array, so that the buffer can be accessed through a pointer
- char* bufferBegin = (char*)BufferHandle.AddrOfPinnedObject() + index; // BufferString is not assigned here, since the buffer is not part of a string
- CharConstructorContinue(bufferBegin, length, streamBeginIndex);
- }
/// <summary>Constructs a CharStream from the length chars at the pointer address. The first char in the stream is assigned the index 0.</summary>
/// <exception cref="ArgumentNullException">chars is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">length is negative.</exception>
public CharStream(char* chars, int length)
    : this(chars, length, 0) { }

/// <summary>Constructs a CharStream from the length chars at the pointer address. The first char in the stream is assigned the index streamBeginIndex.</summary>
/// <exception cref="ArgumentNullException">chars is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: length ≥ 0 and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
public CharStream(char* chars, int length, long streamBeginIndex) {
    if (chars == null) {
        throw new ArgumentNullException("chars");
    }
    if (length < 0) {
        throw new ArgumentOutOfRangeException("length", "The length is negative.");
    }
    // The end pointer must not wrap around the address space.
    char* end = unchecked(chars + length);
    if (chars > end) {
        throw new ArgumentOutOfRangeException("length", "The length is out of range.");
    }
    bool beginIndexIsValid = streamBeginIndex >= 0 && streamBeginIndex < (1L << 60);
    if (!beginIndexIsValid) {
        throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
    }
    CharConstructorContinue(chars, length, streamBeginIndex);
}
/// <summary>Shared tail of the constructors that take the chars directly: sets the Encoding
/// to Encoding.Unicode, creates the Anchor and initializes it for a stream that consists
/// of a single block of length chars starting at bufferBegin.</summary>
private void CharConstructorContinue(char* bufferBegin, int length, long streamBeginIndex) {
    Debug.Assert((bufferBegin != null || length == 0) && length >= 0 && bufferBegin <= unchecked(bufferBegin + length) && streamBeginIndex >= 0 && streamBeginIndex < (1L << 60));
    Encoding = Encoding.Unicode;

    Anchor* a = Anchor.Create(this);
    this.anchor = a;

    // For an empty stream both buffer pointers have to be null.
    char* begin = null;
    char* end = null;
    if (length != 0) {
        begin = bufferBegin;
        end = bufferBegin + length;
    }
    a->BufferBegin = begin;
    a->BufferEnd = end;
    a->BlockSizeMinusOverlap = length; // is 0 for an empty stream

    // The whole content is held in a single block, which is both the first and the last one.
    a->Block = 0;
    a->LastBlock = 0;
    a->CharIndex = 0;
    a->CharIndexOffset = streamBeginIndex;
    a->CharIndexPlusOffset = streamBeginIndex;
    a->EndIndex = streamBeginIndex + length;
}
/// <summary>Constructs a CharStream over length chars starting at begin, using an Anchor
/// that is supplied by the caller instead of being allocated by Anchor.Create.
/// The anchor is expected to have NeedToFree == false, no allocated StreamHandle and
/// Block, LastBlock and CharIndex equal to 0 (see the assertions); those fields are not written here,
/// except for StreamHandle, which receives a new GCHandle to this CharStream.</summary>
internal CharStream(string chars, char* pChars, char* begin, int length, long streamIndexOffset, Anchor* newUninitializedAnchor) {
    Debug.Assert((chars == null ? pChars == null : pChars <= begin)
                 && (begin != null || length == 0) && length >= 0 && begin <= unchecked(begin + length) && streamIndexOffset >= 0 && streamIndexOffset < (1L << 60));
    Debug.Assert(newUninitializedAnchor->NeedToFree == false && !newUninitializedAnchor->StreamHandle.IsAllocated
                 && newUninitializedAnchor->Block == 0 && newUninitializedAnchor->LastBlock == 0 && newUninitializedAnchor->CharIndex == 0);
    BufferString = chars;
    BufferStringPointer = pChars;
    Encoding = Encoding.Unicode;

    Anchor* a = newUninitializedAnchor;
    this.anchor = a;

    // For an empty stream both buffer pointers have to be null.
    char* bufferBegin = null;
    char* bufferEnd = null;
    if (length != 0) {
        bufferBegin = begin;
        bufferEnd = begin + length;
    }
    a->BufferBegin = bufferBegin;
    a->BufferEnd = bufferEnd;
    a->BlockSizeMinusOverlap = length; // is 0 for an empty stream

    a->CharIndexOffset = streamIndexOffset;
    a->CharIndexPlusOffset = streamIndexOffset;
    a->EndIndex = streamIndexOffset + length;
    a->StreamHandle = GCHandle.Alloc(this, GCHandleType.Normal);
}
/// <summary>Constructs a CharStream from the file at the given path, with byte-order-mark
/// detection enabled and the default block parameters.<br/>
/// Is equivalent to CharStream(path, encoding, true, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
/// <param name="path">The path of the file to read.</param>
/// <param name="encoding">The (default) Encoding used for decoding the file content into chars.</param>
public CharStream(string path, Encoding encoding)
    : this(path,
           encoding,
           true,                       // detectEncodingFromByteOrderMarks
           DefaultBlockSize,           // blockSize
           DefaultBlockSize/3,         // blockOverlap
           ((DefaultBlockSize/3)*2)/3, // minRegexSpace
           DefaultByteBufferLength)    // byteBufferLength
{ }
/// <summary>Constructs a CharStream from the file at the given path, using the default block parameters.<br/>
/// Is equivalent to CharStream(path, encoding, detectEncodingFromByteOrderMarks, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
/// <param name="path">The path of the file to read.</param>
/// <param name="encoding">The (default) Encoding used for decoding the file content into chars.</param>
/// <param name="detectEncodingFromByteOrderMarks">Forwarded unchanged to the full path-based constructor.</param>
public CharStream(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
    : this(path,
           encoding,
           detectEncodingFromByteOrderMarks,
           DefaultBlockSize,           // blockSize
           DefaultBlockSize/3,         // blockOverlap
           ((DefaultBlockSize/3)*2)/3, // minRegexSpace
           DefaultByteBufferLength)    // byteBufferLength
{ }
/// <summary>Constructs a CharStream from the file at the given path.<br/>
/// The file is opened with new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan)
/// and handed to the stream-based construction logic with leaveOpen = false.</summary>
/// <param name="path">The path of the file to read.</param>
/// <param name="encoding">The (default) Encoding used for decoding the file content into chars.</param>
/// <param name="detectEncodingFromByteOrderMarks">Forwarded unchanged to StreamConstructorContinue.</param>
/// <param name="blockSize">Forwarded unchanged to StreamConstructorContinue.</param>
/// <param name="blockOverlap">Forwarded unchanged to StreamConstructorContinue.</param>
/// <param name="minRegexSpace">Forwarded unchanged to StreamConstructorContinue.</param>
/// <param name="byteBufferLength">Forwarded unchanged to StreamConstructorContinue.</param>
/// <exception cref="ArgumentNullException">encoding is null.</exception>
public CharStream(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks,
                  int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
{
    if (encoding == null) {
        throw new ArgumentNullException("encoding");
    }

    const int fileBufferSize = 4096;
    var file = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read,
                              fileBufferSize, FileOptions.SequentialScan);
    try {
        StreamConstructorContinue(file, false, encoding, detectEncodingFromByteOrderMarks,
                                  blockSize, blockOverlap, minRegexSpace, byteBufferLength);
    } catch {
        // Construction failed, so nobody else will ever dispose the file stream we just opened.
        file.Dispose();
        throw;
    }
}
/// <summary>Constructs a CharStream from a byte Stream, passing leaveOpen = false and
/// detectEncodingFromByteOrderMarks = true together with the default block parameters.<br/>
/// Is equivalent to CharStream(stream, false, encoding, true, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
/// <param name="stream">The byte stream providing the input.</param>
/// <param name="encoding">The (default) Encoding used for decoding the byte Stream into chars.</param>
public CharStream(Stream stream, Encoding encoding)
    : this(stream,
           false,                      // leaveOpen
           encoding,
           true,                       // detectEncodingFromByteOrderMarks
           DefaultBlockSize,           // blockSize
           DefaultBlockSize/3,         // blockOverlap
           ((DefaultBlockSize/3)*2)/3, // minRegexSpace
           DefaultByteBufferLength)    // byteBufferLength
{ }
/// <summary>Constructs a CharStream from a byte Stream, passing
/// detectEncodingFromByteOrderMarks = true together with the default block parameters.<br/>
/// Is equivalent to CharStream(stream, leaveOpen, encoding, true, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
/// <param name="stream">The byte stream providing the input.</param>
/// <param name="leaveOpen">Forwarded unchanged as the leaveOpen argument of the full stream-based constructor.</param>
/// <param name="encoding">The (default) Encoding used for decoding the byte Stream into chars.</param>
public CharStream(Stream stream, bool leaveOpen, Encoding encoding)
    : this(stream,
           leaveOpen,
           encoding,
           true,                       // detectEncodingFromByteOrderMarks
           DefaultBlockSize,           // blockSize
           DefaultBlockSize/3,         // blockOverlap
           ((DefaultBlockSize/3)*2)/3, // minRegexSpace
           DefaultByteBufferLength)    // byteBufferLength
{ }
/// <summary>Constructs a CharStream from a byte Stream, using the default block parameters.<br/>
/// Is equivalent to CharStream(stream, leaveOpen, encoding, detectEncodingFromByteOrderMarks, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
/// <param name="stream">The byte stream providing the input.</param>
/// <param name="leaveOpen">Forwarded unchanged as the leaveOpen argument of the full stream-based constructor.</param>
/// <param name="encoding">The (default) Encoding used for decoding the byte Stream into chars.</param>
/// <param name="detectEncodingFromByteOrderMarks">Forwarded unchanged to the full stream-based constructor.</param>
public CharStream(Stream stream, bool leaveOpen, Encoding encoding, bool detectEncodingFromByteOrderMarks)
    : this(stream,
           leaveOpen,
           encoding,
           detectEncodingFromByteOrderMarks,
           DefaultBlockSize,           // blockSize
           DefaultBlockSize/3,         // blockOverlap
           ((DefaultBlockSize/3)*2)/3, // minRegexSpace
           DefaultByteBufferLength)    // byteBufferLength
{ }
/// <summary>Constructs a CharStream from a byte Stream.</summary>
/// <param name="stream">The byte stream providing the input.</param>
/// <param name="leaveOpen">Indicates whether the byte Stream should be left open when the CharStream has finished reading it.</param>
/// <param name="encoding">The (default) Encoding used for decoding the byte Stream into chars.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether the constructor should detect the encoding from a unicode byte-order mark at the beginning of the stream. An encoding detected from a byte-order mark overrides the default encoding.</param>
/// <param name="blockSize">The number of chars per block. The default is 3×2^16 ≈ 200k.</param>
/// <param name="blockOverlap">The number of chars at the end of a block that are preserved when reading the next block into the char buffer. It must be less than blockSize/2, but not less than encoding.GetMaxCharCount(1). The default is blockSize/3.</param>
/// <param name="minRegexSpace">The number of chars that are guaranteed to be visible to a regular expression when it is matched on the stream (assuming there are enough chars remaining in the stream). Must not be greater than blockOverlap. The default is 2/3 of blockOverlap.</param>
/// <param name="byteBufferLength">The size of the byte buffer used for decoding purposes. The default is 2^12 = 4KB.</param>
/// <exception cref="ArgumentNullException">stream or encoding is null.</exception>
/// <exception cref="ArgumentException">stream is not readable.</exception>
public CharStream(Stream stream, bool leaveOpen,
                  Encoding encoding, bool detectEncodingFromByteOrderMarks,
                  int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
{
    if (stream == null) throw new ArgumentNullException("stream");
    if (!stream.CanRead) throw new ArgumentException("stream is not readable");
    if (encoding == null) throw new ArgumentNullException("encoding");
    // Forward the caller's leaveOpen choice. A hard-coded `false` here would make the
    // CharStream close the caller's stream even when leaveOpen == true was requested.
    StreamConstructorContinue(stream, leaveOpen, encoding, detectEncodingFromByteOrderMarks,
                              blockSize, blockOverlap, minRegexSpace, byteBufferLength);
}
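- /// <summary>Test hook: this flag is only modified (via reflection) by the unit tests. When it is set, StreamConstructorContinue does not round the block size up to the length of the allocated buffer.</summary>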
- private static bool DoNotRoundUpBlockSizeToSimplifyTesting = false;
- private void StreamConstructorContinue(Stream stream, bool leaveOpen,
- Encoding encoding, bool detectEncodingFromByteOrderMarks,
- int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
- {
- if (byteBufferLength < MinimumByteBufferLength) byteBufferLength = MinimumByteBufferLength;
- int bytesInStream = -1;
- long streamPosition;
- if (stream.CanSeek) {
- streamPosition = stream.Position;
- long streamLength = stream.Length - streamPosition;
- if (streamLength <= Int32.MaxValue) {
- bytesInStream = (int)streamLength;
- if (bytesInStream < byteBufferLength) byteBufferLength = bytesInStream;
- }
- } else {
- streamPosition = 0;
- }
- byte[] byteBuffer = new byte[byteBufferLength];
- int byteBufferCount = 0;
- do {
- int n = stream.Read(byteBuffer, byteBufferCount, byteBufferLength - byteBufferCount);
- if (n == 0) {
- bytesInStream = byteBufferCount;
- break;
- }
- byteBufferCount += n;
- } while (byteBufferCount < MinimumByteBufferLength);
- streamPosition += byteBufferCount;
- int preambleLength = Helper.DetectPreamble(byteBuffer, byteBufferCount, ref encoding, detectEncodingFromByteOrderMarks);
- bytesInStream -= preambleLength;
- Encoding = encoding;
- Decoder decoder = encoding.GetDecoder();
- // we allow such small block sizes only to simplify testing
- if (blockSize < 8) blockSize = DefaultBlockSize;
- bool allCharsFitIntoOneBlock = false;
- if (bytesInStream >= 0 && bytesInStream/4 <= blockSize) {
- if (bytesInStream != 0) {
- try {
- int maxCharCount = Encoding.GetMaxCharCount(bytesInStream); // may throw ArgumentOutOfRangeException
- if (blockSize >= maxCharCount) {
- allCharsFitIntoOneBlock = true;
- blockSize = maxCharCount;
- }
- } catch (ArgumentOutOfRangeException) { }
- } else {
- allCharsFitIntoOneBlock = true;
- blockSize = 0;
- }
- }
- var buffer = StringBuffer.Create(blockSize);
- Debug.Assert(buffer.Length >= blockSize && (blockSize > 0 || buffer.StringPointer == null));
- Buffer = buffer;
- BufferString = buffer.String;
- BufferStringPointer = buffer.StringPointer;
- char* bufferBegin = buffer.StringPointer + buffer.Index;
- try {
- if (allCharsFitIntoOneBlock) {
- int bufferCount = preambleLength == byteBufferCount
- ? 0
- : ReadAllRemainingCharsFromStream(bufferBegin, buffer.Length, byteBuffer, preambleLength, byteBufferCount, stream, streamPosition, decoder);
- if (!leaveOpen) stream.Close();
- var anchor = Anchor.Create(this);
- this.anchor = anchor;
- anchor->BlockSizeMinusOverlap = bufferCount;
- anchor->EndIndex = bufferCount;
- if (bufferCount != 0) {
- anchor->BufferBegin = bufferBegin;
- anchor->BufferEnd = bufferBegin + bufferCount;
- } else {
- anchor->BufferBegin = null;
- anchor->BufferEnd = null;
- }
- anchor->Block = 0;
- anchor->LastBlock = 0;
- anchor->CharIndex = 0;
- anchor->CharIndexOffset = 0;
- anchor->CharIndexPlusOffset = 0;
- } else {
- if (!DoNotRoundUpBlockSizeToSimplifyTesting) blockSize = buffer.Length;
- var d = new MultiBlockData();
- Data = d;
- d.Stream = stream;
- d.StreamPosition = streamPosition;
- d.LeaveOpen = leaveOpen;
- d.Decoder = decoder;
- d.ByteBuffer = byteBuffer;
- d.ByteBufferIndex = preambleLength;
- d.ByteBufferCount = byteBufferCount;
- d.MaxCharCountForOneByte = Math.Max(1, Encoding.GetMaxCharCount(1));
- d.SerializableDecoderMembers = GetSerializableDecoderMemberInfo(decoder);
- if (blockSize < 3*d.MaxCharCountForOneByte) blockSize = 3*d.MaxCharCountForOneByte;
- // MaxCharCountForOneByte == the maximum number of overhang chars
- if( Math.Min(blockOverlap, blockSize - 2*blockOverlap) < d.MaxCharCountForOneByte
- || blockOverlap >= blockSize/2) blockOverlap = blockSize/3;
- if (minRegexSpace < 0 || minRegexSpace > blockOverlap) minRegexSpace = 2*blockOverlap/3;
- d.BlockSize = blockSize;
- d.BlockOverlap = blockOverlap;
- d.RegexSpaceThreshold = bufferBegin + (blockSize - minRegexSpace);
- var anchor = Anchor.Create(this);
- this.anchor = anchor;
- d.anchor = anchor;
- anchor->BlockSizeMinusOverlap = blockSize - blockOverlap;
- anchor->EndIndex = Int64.MaxValue;
- anchor->BufferBegin = bufferBegin;
- anchor->BufferEnd = bufferBegin;
- anchor->Block = -2; // special value recognized by ReadBlock
- anchor->LastBlock = Int32.MaxValue;
- anchor->CharIndex = 0;
- anchor->CharIndexOffset = 0;
- anchor->CharIndexPlusOffset = 0;
- d.Blocks = new List<BlockInfo>();
- // the first block has no overlap with a previous block
- d.Blocks.Add(new BlockInfo(preambleLength, preambleLength, 0, EOS, null, new DecoderState(), null, new Decod…