// CharStream.cs - License: Simplified BSD License. See accomp…
//
// /fparsec/main/FParsecCS/CharStream.cs
//
// http://github.com/sandersn/fing · C# · 2073 lines · 1467 code · 161 blank · 445 comment · 673 complexity · cbb1fa5f9620a84c92c17759d91abb76 MD5 · raw file
// Large files are truncated click here to view the full file

// Copyright (c) Stephan Tolksdorf 2007-2009
// License: Simplified BSD License. See accompanying documentation.

#if !LOW_TRUST

using System;
using System.IO;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Diagnostics;
using System.Reflection;
using System.Runtime.Serialization;
using System.Runtime.InteropServices;
using System.Runtime.CompilerServices;


namespace FParsec {
/// <summary>Provides access to the char content of a binary Stream (or a String) through
/// an iterator-based interface that is especially well suited for parser applications.</summary>
public unsafe sealed class CharStream : IDisposable {

    // In order to facilitate efficient backtracking we divide the stream into overlapping
    // blocks with equal number of chars. The blocks are overlapping, so that
    // backtracking over short distances at a block boundary doesn't trigger a reread of the
    // previous block.
    //
    //              Block 0
    //
    //    -----------------|--------  Block 1
    //                       Overlap
    //                      --------|--------|--------  Block 2
    //                                        Overlap
    //                                        --------|--------|--------
    //                                                                  (...)
    //  a '-' symbolizes a char, a '|' a block boundary.
    //
    //
    // In general there's no fixed relationship between the number of input bytes and the
    // number of input chars. Worse, the encoding can be stateful, which makes it necessary
    // to persist the decoder state over block boundaries. If we later want to
    // be able to reread a certain block, we therefore need to keep record of various
    // bits of information describing the state of the input stream at the beginning of a block:

    /// <summary>Records the state of the input stream and of the decoder at the beginning of a
    /// block, so that the block can be read again (and verified) later.</summary>
    private class BlockInfo {
        /// <summary>Byte stream index of the first char in the block that follows the OverhangCharsAtBlockBegin.</summary>
        public long ByteIndex;
        /// <summary>Value of the CharStream's ByteBufferIndex before the block is read.</summary>
        public int ByteBufferIndex;

        /// <summary>Number of bytes in the stream between ByteIndex and the first char that follows the OverhangCharsAfterOverlap.</summary>
        public int NumberOfBytesInOverlap;

        /// <summary>Last char of the overlap with the previous block (used for integrity checking).</summary>
        public char LastCharInOverlap;

        /// <summary>Chars at the block begin that had already been read together with chars of the last block before the overlap.</summary>
        public string OverhangCharsAtBlockBegin;
        /// <summary>Chars after the overlap with the previous block that had already been read together with the overlap chars.</summary>
        public string OverhangCharsAfterOverlap;

        /// <summary>Decoder state captured at the begin of the block.</summary>
        public DecoderState DecoderStateAtBlockBegin;
        /// <summary>Decoder state captured after the overlap with the previous block.</summary>
        public DecoderState DecoderStateAfterOverlap;


        /// <summary>Stores the given values in the corresponding fields.</summary>
        public BlockInfo(long byteIndex, int byteBufferIndex,
                         int nBytesInOverlapCount, char lastCharInOverlap,
                         string overhangCharsAtBlockBegin, DecoderState decoderStateAtBlockBegin,
                         string overhangCharsAfterOverlap, DecoderState decoderStateAfterOverlap)
        {
            // position in the byte stream / byte buffer
            ByteIndex       = byteIndex;
            ByteBufferIndex = byteBufferIndex;

            // state at the begin of the block
            OverhangCharsAtBlockBegin = overhangCharsAtBlockBegin;
            DecoderStateAtBlockBegin  = decoderStateAtBlockBegin;

            // state at the end of the overlap with the previous block
            NumberOfBytesInOverlap    = nBytesInOverlapCount;
            LastCharInOverlap         = lastCharInOverlap;
            OverhangCharsAfterOverlap = overhangCharsAfterOverlap;
            DecoderStateAfterOverlap  = decoderStateAfterOverlap;
        }
    }

    // Unfortunately the Decoder API has no explicit methods for managing the state,
    // which forces us to abuse the comparatively inefficient serialization API for this purpose.
    // (The absence of explicit state management or at least a deep cloning method in the Decoder interface
    // is almost as puzzling as the absence of such methods in System.Random).

    // The cache is created eagerly by the static field initializer. (Creating it lazily outside
    // of the lock would allow two threads to each create their own dictionary and then
    // synchronize on different objects, leaving the cache effectively unprotected.)
    private static readonly Dictionary<Type, MemberInfo[]> SerializableMemberInfoCache = new Dictionary<Type, MemberInfo[]>(8);

    /// <summary>Returns the serializable members of the given decoder's runtime type,
    /// or null if that type is not marked as serializable.
    /// The member arrays are cached per decoder type; access to the cache is synchronized.</summary>
    /// <param name="decoder">The decoder whose type is inspected. Must not be null.</param>
    private static MemberInfo[] GetSerializableDecoderMemberInfo(Decoder decoder) {
        Type type = decoder.GetType();
        if (!type.IsSerializable) return null;
        MemberInfo[] smis;
        lock (SerializableMemberInfoCache) {
            if (!SerializableMemberInfoCache.TryGetValue(type, out smis)) {
                // first request for this decoder type: determine the members once and remember them
                smis = FormatterServices.GetSerializableMembers(type, new StreamingContext(StreamingContextStates.Clone));
                SerializableMemberInfoCache.Add(type, smis);
            }
        }
        return smis;
    }

    /// <summary>A snapshot of the serializable member values of a Decoder.
    /// If no serializable member infos are available, the snapshot is empty and
    /// "restoring" it resets the decoder.</summary>
    private struct DecoderState {
        private object[] DecoderData;

        /// <summary>Captures the values of the given serializable members of the decoder.
        /// If serializableDecoderMembers is null, nothing is captured.</summary>
        public DecoderState(Decoder decoder, MemberInfo[] serializableDecoderMembers) {
            if (serializableDecoderMembers == null) {
                DecoderData = null;
            } else {
                DecoderData = FormatterServices.GetObjectData(decoder, serializableDecoderMembers);
            }
        }

        /// <summary>Writes the captured member values back into the given decoder,
        /// or resets the decoder if nothing was captured.</summary>
        public void WriteTo(ref Decoder decoder, MemberInfo[] serializableDecoderMembers) {
            if (DecoderData == null) {
                decoder.Reset();
                return;
            }
            // The existing decoder instance is populated in place. (The ref parameter would also
            // permit replacing the decoder with a freshly populated uninitialized instance,
            // but that alternative is not used.)
            FormatterServices.PopulateObjectMembers(decoder, serializableDecoderMembers, DecoderData);
        }
    }

    // NOTE(review): presumably the default value for MultiBlockData.BlockSize (a block size in chars) — confirm where it is applied.
    private const int DefaultBlockSize = 3*(1 << 16); // 3*2^16 = 196608 (roughly 200k)
    // NOTE(review): presumably the default length of MultiBlockData.ByteBuffer — confirm where it is applied.
    private const int DefaultByteBufferLength = (1 << 12); // 2^12 = 4096
    private static int MinimumByteBufferLength = 128; // must be larger than longest detectable preamble (we can only guess here)
    // NOTE(review): the name suggests this is the sentinel char signalling the end of the stream — confirm against the iterator code.
    private const char EOS = '\uFFFF';

    // For ease of use, we need the iterators to hold a reference to the CharStream. If we stored
    // a CharStream reference directly in the iterator, the JIT would emit a call to the write barrier
    // thunk for each write to the reference field. As we want to use iterators mainly as immutable values,
    // we need them to be structs for performance reasons, and since structs are constantly copied
    // by design, we would get frequent write barrier calls*.  Redirecting the CharStream
    // access through an "Anchor" allows us to relieve the GC from having to keep track of all the
    // CharStream references in the iterators. The trick is that an Anchor instance does not contain
    // any reference to a managed object, only a GCHandle to the CharStream and other value type members
    // important to the Iterators. Because the Anchor struct only has primitive members, we can take
    // an unmanaged pointer which the GC doesn't need to track. To avoid most GCHandle.Target accesses,
    // the CharStream stores pieces of information important to the iterators directly in the Anchor.
    //
    // * Just to be clear: Write barrier calls are rather cheap (about the cost of a virtual function
    //   call) and overall FParsec performance is only marginally influenced by this optimization.
    //   (Each Reply<_,_> value alone currently triggers 2-3 write barrier calls, even when it is
    //   allocated on the stack and all fields are initialized to 0/null!).

    internal Anchor* anchor; // allocated and assigned during construction,
                             // freed and set to null during disposal

    /// <summary>Represents the link between a CharStream and its Iterators.
    /// May be allocated on the unmanaged heap and holds a GCHandle, hence must be properly freed.</summary>
    internal struct Anchor {
        /// <summary>The index of the block that was read last (see MultiBlockData.ReadBlock).</summary>
        public int Block;
        /// <summary>The index of the last block of the stream, or Int32.MaxValue if the end of stream has not yet been detected.</summary>
        public int LastBlock;
        /// <summary>The GCHandle through which the CharStream can be reached.</summary>
        public GCHandle StreamHandle;
        /// <summary>Begin of the used part of the char buffer (stays constant). Is null if the CharStream is empty.</summary>
        public char* BufferBegin;
        /// <summary>End of the used part of the char buffer (varies for a multi-block stream). Is null if the CharStream is empty.</summary>
        public char* BufferEnd;
        /// <summary>The char index of the begin of the current block, not including CharIndexOffset.</summary>
        public long CharIndex;
        /// <summary>CharIndex + CharIndexOffset.</summary>
        public long CharIndexPlusOffset;
        /// <summary>The index of the first char of the stream (exposed as CharStream.BeginIndex).</summary>
        public long CharIndexOffset;
        /// <summary>The index of the last char of the stream plus 1 (exposed as CharStream.EndIndex).</summary>
        public long EndIndex;
        public int BlockSizeMinusOverlap;
        /// <summary>Whether Free has to release the memory of the Anchor with Marshal.FreeHGlobal.</summary>
        public bool NeedToFree;

        /// <summary>Allocates an Anchor on the unmanaged heap and links it to the given stream
        /// through a normal GCHandle. Only NeedToFree and StreamHandle are initialized here.</summary>
        public static Anchor* Create(CharStream stream) {
            // The anchor lives on the unmanaged heap. Pinning a managed instance instead would
            // risk fragmenting the managed heap, since an Anchor is small and can be long-lived.
            // (Should AllocHGlobal ever become a bottleneck, a pool allocator could replace it.)
            IntPtr memory = Marshal.AllocHGlobal(sizeof(Anchor));
            Anchor* newAnchor = (Anchor*) memory;
            newAnchor->NeedToFree = true;
            newAnchor->StreamHandle = GCHandle.Alloc(stream, GCHandleType.Normal);
            return newAnchor;
        }

        /// <summary>Frees the GCHandle held by the Anchor and, if NeedToFree is set,
        /// also the unmanaged memory of the Anchor itself.</summary>
        public static void Free(Anchor *p) {
            p->StreamHandle.Free();
            if (!p->NeedToFree) return;
            Marshal.FreeHGlobal((IntPtr) p);
        }
    }

    /// <summary>The Encoding that is used for decoding the underlying byte stream, or
    /// System.Text.UnicodeEncoding in case the stream was directly constructed
    /// from a string.</summary>
    public  Encoding Encoding { get; private set; }

    // If the CharStream is constructed from a binary stream, we use a managed string as the char
    // buffer. This allows us to apply regular expressions directly to the input.
    // In the case of multi-block CharStreams we thus have to mutate the buffer string through pointers.
    // This is safe as long as we use a newly constructed string and we don't pass a reference
    // to the internal buffer string to the "outside world". (The one instance where we have to pass
    // a reference to the buffer string is regex matching. See the docs for Iterator.Match(regex) for more info.)
    //
    // Apart from Iter.Match(regex) we access the internal buffer only through a pinned pointer.
    // This way we avoid the overhead of redundant bounds checking and can support strings, char arrays
    // and unmanaged char buffers through the same interface. Accessing the buffer through pointers
    // is also a requirement for accessing the CharStream data through an Anchor pointer (see above).
    //
    // Pinning a string or char array makes life more difficult for the GC. However, as long as
    // the buffer is only short-lived or large enough to be allocated on the large object heap,
    // there shouldn't be a problem. Furthermore, the buffer strings for CharStreams constructed
    // from a binary stream are allocated through the StringBuffer interface and hence always live
    // on the large object heap. Thus, the only scenario to really worry about (and which the
    // documentation explicitly warns about) is when a large number of small CharStreams
    // are constructed directly from strings or char arrays and are used for an extended period of time.

    /// <summary>The string holding the char buffer, or null if the buffer is not part of a .NET string.</summary>
    internal string BufferString;
    /// <summary>A pointer to the beginning of BufferString, or null if BufferString is null.</summary>
    internal char* BufferStringPointer;

    /// <summary>Holds the GCHandle for CharStreams directly constructed from strings or char arrays.</summary>
    private GCHandle BufferHandle;
    /// <summary>Holds the StringBuffer for CharStreams constructed from a binary stream.</summary>
    private StringBuffer Buffer;

    /// <summary>The multi-block state of this CharStream (see MultiBlockData).
    /// NOTE(review): presumably null when the stream content fits into a single block — confirm against the constructors.</summary>
    private MultiBlockData Data;

    /// <summary>Contains the data and methods needed in case the input byte stream
    /// is large enough to span multiple blocks of the CharStream.</summary>
    private class MultiBlockData {
        public Anchor* anchor;

        public Stream Stream;
        // we keep a separate record of the Stream.Position, so that we don't need to require Stream.CanSeek
        public long StreamPosition;
        public bool LeaveOpen;

        public int MaxCharCountForOneByte;
        public Decoder Decoder;
        public MemberInfo[] SerializableDecoderMembers;

        public int BlockSize;
        public int BlockOverlap;
        /// <summary>BufferBegin + BlockSize - minRegexSpace</summary>
        public char* RegexSpaceThreshold;

        /// <summary>The byte stream index of the first unused byte in the ByteBuffer.</summary>
        public long ByteIndex { get { return StreamPosition - (ByteBufferCount - ByteBufferIndex); } }

        public List<BlockInfo> Blocks;

        public byte[] ByteBuffer;
        public int ByteBufferIndex;
        public int ByteBufferCount;

         /// <summary>Refills the ByteBuffer if no unused byte is remaining.
        /// Returns the number of unused bytes in the (refilled) ByteBuffer.</summary>
        private int FillByteBuffer() {
            // Only touch the stream when every buffered byte has already been consumed.
            int nUnusedBytes = ByteBufferCount - ByteBufferIndex;
            return nUnusedBytes > 0 ? nUnusedBytes : ClearAndRefillByteBuffer(0);
        }

        /// <summary>Discards the current content of the ByteBuffer and reads new bytes from the
        /// stream into the ByteBuffer, beginning at the given buffer index. Unless the end of the
        /// stream is reached first, the ByteBuffer is filled up to its full length.
        /// Returns the number of bytes that are available for consumption after the refill.</summary>
        private int ClearAndRefillByteBuffer(int byteBufferIndex) {
            Debug.Assert(byteBufferIndex >= 0 && byteBufferIndex <= ByteBuffer.Length);
            // A single Stream.Read call may deliver fewer bytes than requested, so we keep
            // reading until the buffer is full or the stream reports its end. Being able to
            // rely on a completely filled buffer lets us calculate the buffer utilization
            // after a certain number of input bytes has been skipped. Most streams need only
            // one iteration (or two at the end of the stream).
            int writeIndex = byteBufferIndex;
            int bufferLength = ByteBuffer.Length;
            while (writeIndex != bufferLength) {
                int nRead = Stream.Read(ByteBuffer, writeIndex, bufferLength - writeIndex);
                if (nRead == 0) break; // end of stream
                writeIndex += nRead;
            }
            int nBytesRead = writeIndex - byteBufferIndex;
            ByteBufferIndex = byteBufferIndex;
            ByteBufferCount = writeIndex;
            StreamPosition += nBytesRead;
            return nBytesRead;
        }

        /// <summary>Reads up to the given maximum number of chars into the given buffer.
        /// If more than the maximum number of chars have to be read from the stream in order to
        /// fill the buffer (due to the way the Decoder API works), the overhang chars are
        /// returned through the output parameter.
        /// Returns a pointer to one char after the last char read.</summary>
        /// <param name="buffer">The destination for the decoded chars.</param>
        /// <param name="maxCount">The maximum number of chars to write to buffer. Must not be negative.</param>
        /// <param name="overhangChars">Is set to the chars that were decoded beyond maxCount, or to null if there are none.</param>
        /// <exception cref="DecoderFallbackException">Rethrown from the Decoder after a "Stream.Position" entry has been added to the exception's Data.</exception>
        private char* ReadCharsFromStream(char* buffer, int maxCount, out string overhangChars) {
            Debug.Assert(maxCount >= 0);
            fixed (byte* byteBuffer = ByteBuffer) {
                overhangChars = null;
                try {
                    // Phase 1: decode directly into the output buffer as long as the remaining
                    // space can hold at least MaxCharCountForOneByte chars.
                    while (maxCount >= MaxCharCountForOneByte) {// if maxCount < MaxCharCountForOneByte, Convert could throw
                        int nBytesInByteBuffer = FillByteBuffer();
                        bool flush = nBytesInByteBuffer == 0; // no more bytes available: flush the decoder
                        int bytesUsed, charsUsed; bool completed = false;
                        Decoder.Convert(byteBuffer + ByteBufferIndex, nBytesInByteBuffer,
                                        buffer, maxCount, flush,
                                        out bytesUsed, out charsUsed, out completed);
                        ByteBufferIndex += bytesUsed; // Convert consumed bytesUsed bytes from the byte buffer
                        buffer += charsUsed;
                        maxCount -= charsUsed;
                        if (flush && completed) return buffer; // end of stream reached and decoder flushed
                    }
                    if (maxCount == 0) return buffer;

                    // Phase 2: fewer than MaxCharCountForOneByte chars are still wanted. We decode
                    // into a small temporary buffer and copy the chars over one by one; any chars
                    // decoded beyond maxCount are handed back as overhangChars.
                    char* cs = stackalloc char[MaxCharCountForOneByte];
                    for (;;) {
                        int nBytesInByteBuffer = FillByteBuffer();
                        bool flush = nBytesInByteBuffer == 0;
                        int bytesUsed, charsUsed; bool completed;
                        Decoder.Convert(byteBuffer + ByteBufferIndex, nBytesInByteBuffer,
                                        cs, MaxCharCountForOneByte, flush,
                                        out bytesUsed, out charsUsed, out completed);
                        ByteBufferIndex += bytesUsed;
                        if (charsUsed > 0) {
                            int i = 0;
                            do {
                                *(buffer++) = cs[i++];
                                if (--maxCount == 0) {
                                    // the output buffer is full; the remaining decoded chars become the overhang
                                    if (i < charsUsed) overhangChars = new string(cs, i, charsUsed - i);
                                    return buffer;
                                }
                            } while (i < charsUsed);
                        }
                        if (flush && completed) return buffer;
                    }
                } catch (DecoderFallbackException e) {
                    // annotate the exception with ByteIndex + e.Index before rethrowing it
                    e.Data.Add("Stream.Position", ByteIndex + e.Index);
                    throw;
                }
            }
        }

        /// <summary> Reads a block of chars (must be different from the current block)
        /// into the BufferString. Returns a pointer to the first char of the new block,
        /// or null if no chars could be read.</summary>
        /// <param name="block">The index of the block to read. If it is greater than anchor->LastBlock, null is returned.</param>
        /// <exception cref="InvalidOperationException">block is the current block, or no block
        /// has been read yet (the current block index is negative) and block is not 0.</exception>
        /// <exception cref="NotSupportedException">The Decoder is not serializable and block is
        /// greater than 0 but less than the current block.</exception>
        /// <exception cref="IOException">The data read from the stream is not consistent with
        /// the information recorded when the data was read previously.</exception>
        internal char* ReadBlock(int block) {
            if (block > anchor->LastBlock) return null;
            int prevBlock = anchor->Block;
            if (block == prevBlock) throw new InvalidOperationException();
            if (SerializableDecoderMembers == null && block > 0) {
                // Without serializable decoder members the decoder state can't be restored, so we
                // can't jump backwards (except to block 0), and a forward jump has to read
                // all intermediate blocks in sequence.
                if (prevBlock > block)
                    throw new NotSupportedException("The CharStream does not support seeking backwards over ranges longer than the block overlap because the Encoding's Decoder is not serializable.");
                while (prevBlock + 1 < block) ReadBlock(++prevBlock);
            }

            BlockInfo bi = Blocks[block];
            int blockSizeMinusOverlap = BlockSize - BlockOverlap;
            long charIndex = Math.BigMul(block, blockSizeMinusOverlap); // 64-bit product
            char* bufferBegin = anchor->BufferBegin;
            char* begin, buffer;
            int nCharsToRead;

            // fill [0 ... BlockOverlap-1] if block > 0
            if (prevBlock == block - 1) {
                // Moving forward by one block: the overlap is already in the buffer, we only
                // need to move it from the end of the buffer to the beginning.
                // NOTE(review): MemMove is defined outside this view; the count BlockOverlap*2 is presumably in bytes — confirm.
                MemMove(bufferBegin, bufferBegin + blockSizeMinusOverlap, BlockOverlap*2);
                Debug.Assert(bufferBegin[BlockOverlap - 1] == bi.LastCharInOverlap);
                begin = buffer = bufferBegin + BlockOverlap;
            } else if (prevBlock >= 0) {
                // Any other jump: reposition the stream, the byte buffer and the decoder to the
                // state recorded for the beginning of the requested block.
                Stream.Seek(bi.ByteIndex, SeekOrigin.Begin); // will throw if Stream can't seek
                // now that there was no exception, we can change the state...
                StreamPosition = bi.ByteIndex;
                ClearAndRefillByteBuffer(bi.ByteBufferIndex);
                bi.DecoderStateAtBlockBegin.WriteTo(ref Decoder, SerializableDecoderMembers); // will reset Decoder if block == 0
                if (prevBlock == block + 1) {
                    // move the overlap into [BlockSize - BlockOverlap, BlockSize - 1] before it gets overwritten
                    MemMove(bufferBegin + blockSizeMinusOverlap, bufferBegin, BlockOverlap*2);
                }
                begin = buffer = bufferBegin;
                if (block > 0) {
                    // re-read the overlap with the previous block and check it against the recorded values
                    nCharsToRead = BlockOverlap;
                    if (bi.OverhangCharsAtBlockBegin != null) {
                        nCharsToRead -= bi.OverhangCharsAtBlockBegin.Length;
                        for (int i = 0; i < bi.OverhangCharsAtBlockBegin.Length; ++i)
                            *(buffer++) = bi.OverhangCharsAtBlockBegin[i];
                    }
                    string overhangCharsAfterOverlap;
                    buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlap);
                    if (   buffer != bufferBegin + BlockOverlap
                        || ByteIndex != bi.ByteIndex + bi.NumberOfBytesInOverlap
                        || *(buffer - 1) != bi.LastCharInOverlap
                        || overhangCharsAfterOverlap != bi.OverhangCharsAfterOverlap)
                        throw new IOException("CharStream: stream integrity error");
                }
            } else { // ReadBlock was called from the constructor
                if (block != 0) throw new InvalidOperationException();
                begin = buffer = bufferBegin;
            }

            // fill [0            ... BlockSize-BlockOverlap-1] if block == 0
            // and  [BlockOverlap ... BlockSize-BlockOverlap-1] otherwise
            if (block == 0) {
                nCharsToRead = blockSizeMinusOverlap;
            } else {
                nCharsToRead = blockSizeMinusOverlap - BlockOverlap;
                if (bi.OverhangCharsAfterOverlap != null) {
                    // these chars were already decoded together with the overlap chars
                    nCharsToRead -= bi.OverhangCharsAfterOverlap.Length;
                    for (int i = 0; i < bi.OverhangCharsAfterOverlap.Length; ++i)
                        *(buffer++) = bi.OverhangCharsAfterOverlap[i];
                }
            }
            string overhangCharsAtNextBlockBegin;
            buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAtNextBlockBegin);

            // the byte stream position at which the next block (i.e. its overlap with this block) begins
            long byteIndexAtNextBlockBegin = ByteIndex;
            int byteBufferIndexAtNextBlockBegin = ByteBufferIndex;

            // fill [BlockSize-BlockOverlap ... BlockSize-1]
            if (block == Blocks.Count - 1) { // next block hasn't yet been read
                DecoderState decoderStateAtNextBlockBegin = new DecoderState(Decoder, SerializableDecoderMembers);
                nCharsToRead = BlockOverlap;
                if (overhangCharsAtNextBlockBegin != null) {
                    nCharsToRead -= overhangCharsAtNextBlockBegin.Length;
                    for (int i = 0; i < overhangCharsAtNextBlockBegin.Length; ++i)
                        *(buffer++) = overhangCharsAtNextBlockBegin[i];
                }
                string overhangCharsAfterOverlapWithNextBlock;
                buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlapWithNextBlock);
                if (anchor->LastBlock == Int32.MaxValue) { // last block hasn't yet been detected
                    if (buffer == bufferBegin + BlockSize) {
                        // the buffer was completely filled: record the information needed for reading the next block
                        DecoderState decoderStateAfterOverlapWithNextBlock = new DecoderState(Decoder, SerializableDecoderMembers);
                        int nBytesInOverlapWithNextBlock = (int)(ByteIndex - byteIndexAtNextBlockBegin);
                        Blocks.Add(new BlockInfo(byteIndexAtNextBlockBegin, byteBufferIndexAtNextBlockBegin,
                                                 nBytesInOverlapWithNextBlock, *(buffer - 1),
                                                 overhangCharsAtNextBlockBegin, decoderStateAtNextBlockBegin,
                                                 overhangCharsAfterOverlapWithNextBlock, decoderStateAfterOverlapWithNextBlock));
                    } else { // we reached the end of the stream
                        anchor->LastBlock = block;
                        anchor->EndIndex = anchor->CharIndexOffset + charIndex + (buffer - bufferBegin);
                    }
                } else if (anchor->EndIndex != anchor->CharIndexOffset + charIndex + (buffer - bufferBegin)) {
                    // the end of the stream was detected before, but now we computed a different end index
                    throw new IOException("CharStream: stream integrity error");
                }
            } else {
                // The next block has been read before, so what we just read must match
                // the information recorded for the beginning of the next block.
                BlockInfo nbi = Blocks[block + 1];
                if (buffer != bufferBegin + blockSizeMinusOverlap
                    || byteIndexAtNextBlockBegin != nbi.ByteIndex
                    || byteBufferIndexAtNextBlockBegin != nbi.ByteBufferIndex
                    || overhangCharsAtNextBlockBegin != nbi.OverhangCharsAtBlockBegin)
                    throw new IOException("CharStream: stream integrity error");

                if (prevBlock != block + 1 || (block == 0 && SerializableDecoderMembers == null)) { // jumping back to block 0 is supported even if the decoder is not serializable
                    // read the overlap with the next block from the stream and verify it
                    nCharsToRead = BlockOverlap;
                    if (overhangCharsAtNextBlockBegin != null) {
                        nCharsToRead -= overhangCharsAtNextBlockBegin.Length;
                        for (int i = 0; i < overhangCharsAtNextBlockBegin.Length; ++i)
                            *(buffer++) = overhangCharsAtNextBlockBegin[i];
                    }
                    string overhangCharsAfterOverlapWithNextBlock;
                    buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlapWithNextBlock);
                    int nBytesInOverlapWithNextBlock = (int)(ByteIndex - byteIndexAtNextBlockBegin);
                    if (buffer != bufferBegin + BlockSize
                        || nBytesInOverlapWithNextBlock != nbi.NumberOfBytesInOverlap
                        || *(buffer - 1) != nbi.LastCharInOverlap
                        || overhangCharsAfterOverlapWithNextBlock != nbi.OverhangCharsAfterOverlap)
                        throw new IOException("CharStream: stream integrity error");
                } else {
                    // We moved back by exactly one block, so the overlap chars are already in
                    // place. Instead of decoding them again, we skip the corresponding bytes
                    // and restore the decoder state recorded for the end of the overlap.
                    Debug.Assert(bufferBegin[BlockSize - 1] == nbi.LastCharInOverlap);
                    buffer += BlockOverlap; // we already copied the chars at the beginning of this function
                    // the number of overlap bytes that are not contained in the unused part of the byte buffer
                    int off = nbi.NumberOfBytesInOverlap - (ByteBufferCount - ByteBufferIndex);
                    if (off > 0) {
                        // we wouldn't have gotten here if the Stream didn't support seeking
                        Stream.Seek(off, SeekOrigin.Current);
                        StreamPosition += off;
                        // NOTE(review): the refill index off%ByteBuffer.Length presumably reproduces the byte buffer
                        // index that sequential reading with completely filled buffers would have led to — confirm.
                        ClearAndRefillByteBuffer(off%ByteBuffer.Length);
                    } else {
                        ByteBufferIndex += nbi.NumberOfBytesInOverlap;
                    }
                    nbi.DecoderStateAfterOverlap.WriteTo(ref Decoder, SerializableDecoderMembers);
                }
            }

            // publish the new block through the anchor
            anchor->Block = block;
            anchor->CharIndex = charIndex;
            anchor->CharIndexPlusOffset = anchor->CharIndexOffset + charIndex;
            anchor->BufferEnd = buffer;
            return begin == buffer ? null : begin;
        }
    }

    /// <summary>Decodes the unused bytes in the given byte buffer and all bytes remaining in the
    /// stream into the given char buffer and returns the number of chars written. If the remaining
    /// stream content holds more than the given maximum number of chars, an exception will be thrown.</summary>
    /// <exception cref="DecoderFallbackException">Rethrown from the decoder after a "Stream.Position" entry has been added to the exception's Data.</exception>
    private static int ReadAllRemainingCharsFromStream(char* buffer, int maxCount, byte[] byteBuffer, int byteBufferIndex, int byteBufferCount, Stream stream, long streamPosition, Decoder decoder) {
        Debug.Assert(maxCount > 0 && byteBufferIndex >= 0 && byteBufferIndex < byteBufferCount);
        fixed (byte* bytes = byteBuffer) {
            int nCharsWritten = 0;
            bool isFinalCall = false; // true once the stream has reported its end
            while (true) {
                int nUnusedBytes = byteBufferCount - byteBufferIndex;
                try {
                    nCharsWritten += decoder.GetChars(bytes + byteBufferIndex, nUnusedBytes,
                                                      buffer + nCharsWritten, maxCount - nCharsWritten, isFinalCall);
                } catch (DecoderFallbackException e) {
                    e.Data.Add("Stream.Position", streamPosition - nUnusedBytes + e.Index);
                    throw;
                }
                if (isFinalCall) return nCharsWritten; // the decoder has been flushed, we're done

                // GetChars consumed every byte in the byte buffer, so we refill it from the start
                byteBufferIndex = 0;
                byteBufferCount = stream.Read(byteBuffer, 0, byteBuffer.Length);
                streamPosition += byteBufferCount;
                isFinalCall = byteBufferCount == 0;
            }
        }
    }

    /// <summary>The current block in BufferString.</summary>
    private int Block {
        get { return anchor->Block; }
    }

    /// <summary>The number of chars in BufferString.</summary>
    private int BufferCount {
        get {
            char* bufferBegin = anchor->BufferBegin;
            char* bufferEnd   = anchor->BufferEnd;
            return PositiveDistance(bufferBegin, bufferEnd);
        }
    }

    /// <summary>The index of the first char in the stream, i.e. Begin.Index.
    /// This value is determined by the streamBeginIndex argument of some of the CharStream constructors.
    /// By default this value is 0.</summary>
    public long BeginIndex {
        get { return anchor->CharIndexOffset; }
    }

    /// <summary>The index of the last char of the stream plus 1,
    /// or Int64.MaxValue if the end of stream has not yet been detected.</summary>
    public long EndIndex {
        get { return anchor->EndIndex; }
    }

    /// <summary>Obsolete alias for BeginIndex.</summary>
    [Obsolete("CharStream.IndexOffset has been renamed to CharStream.BeginIndex.")]
    public long IndexOffset {
        get { return BeginIndex; }
    }

    /// <summary>Obsolete alias for EndIndex.</summary>
    [Obsolete("CharStream.EndOfStream has been renamed to CharStream.EndIndex.")]
    public long EndOfStream {
        get { return EndIndex; }
    }

    // we don't have a public constructor that only takes a string to avoid potential confusion with a filepath constructor
    /// <summary>Constructs a CharStream from all chars of the string argument.
    /// The string is pinned and the stream content begins at its first char.</summary>
    internal CharStream(string chars) {
        Debug.Assert(chars != null);
        BufferString = chars;
        // Pin the string, so that the raw pointer taken below stays valid while the handle is allocated.
        BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
        BufferStringPointer = (char*)BufferHandle.AddrOfPinnedObject();
        CharConstructorContinue(BufferStringPointer, chars.Length, 0);
    }
    /// <summary>Constructs a CharStream from the chars in the string argument between the indices index (inclusive) and index + length (exclusive).</summary>
    /// <exception cref="ArgumentNullException">chars is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0 and index + length ≤ chars.Length.</exception>
    public CharStream(string chars, int index, int length)
        : this(chars, index, length, 0) { }

    /// <summary>Constructs a CharStream from the chars in the string argument between the indices index (inclusive) and index + length (exclusive). The first char in the stream is assigned the index streamBeginIndex.</summary>
    /// <exception cref="ArgumentNullException">chars is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0, index + length ≤ chars.Length and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
    public CharStream(string chars, int index, int length, long streamBeginIndex) {
        // validate the arguments before any resource is acquired
        if (chars == null) {
            throw new ArgumentNullException("chars");
        }
        if (index < 0) {
            throw new ArgumentOutOfRangeException("index", "The index is negative.");
        }
        bool isLengthInRange = length >= 0 && length <= chars.Length - index;
        if (!isLengthInRange) {
            throw new ArgumentOutOfRangeException("length", "The length is out of range.");
        }
        bool isBeginIndexInRange = streamBeginIndex >= 0 && streamBeginIndex < (1L << 60);
        if (!isBeginIndexInRange) {
            throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
        }

        BufferString = chars;
        // Pin the string, so that the raw pointer taken below stays valid while the handle is allocated.
        BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
        BufferStringPointer = (char*)BufferHandle.AddrOfPinnedObject();
        // the stream content begins index chars after the begin of the pinned string
        CharConstructorContinue(BufferStringPointer + index, length, streamBeginIndex);
    }

    /// <summary>Creates a CharStream over the section of the char array that starts at index (inclusive) and ends at index + length (exclusive). Forwards to the overload with a streamBeginIndex argument, passing 0 as the stream begin index.</summary>
    /// <param name="chars">The char array providing the chars.</param>
    /// <param name="index">The index of the first char of the section.</param>
    /// <param name="length">The number of chars in the section.</param>
    /// <exception cref="ArgumentNullException">chars is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0 and index + length ≤ chars.Length.</exception>
    public CharStream(char[] chars, int index, int length)
           : this(chars, index, length, 0) { }

    /// <summary>Constructs a CharStream from the chars in the char array argument between the indices index (inclusive) and index + length (exclusive). The first char in the stream is assigned the index streamBeginIndex.</summary>
    /// <param name="chars">The char array providing the chars. It is pinned by this constructor.</param>
    /// <param name="index">The index of the first char of the section.</param>
    /// <param name="length">The number of chars in the section.</param>
    /// <param name="streamBeginIndex">The stream index assigned to the first char.</param>
    /// <exception cref="ArgumentNullException">chars is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0, index + length ≤ chars.Length and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
    public CharStream(char[] chars, int index, int length, long streamBeginIndex) {
        if (chars == null) throw new ArgumentNullException("chars");
        if (index < 0) throw new ArgumentOutOfRangeException("index", "The index is negative.");
        // length is compared with the remaining element count, which cannot overflow
        // because index is known to be non-negative at this point
        if (length < 0 || length > chars.Length - index) throw new ArgumentOutOfRangeException("length", "The length is out of range.");
        if (streamBeginIndex < 0 || streamBeginIndex >= (1L << 60)) throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");

        // pin the array so that the raw pointer handed to CharConstructorContinue stays valid
        BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
        // the cast applies to the pinned address only; index is then added in units of char
        char* bufferBegin = (char*)BufferHandle.AddrOfPinnedObject() + index;
        CharConstructorContinue(bufferBegin, length, streamBeginIndex);
    }

    /// <summary>Creates a CharStream over the length chars stored at the pointer address. Forwards to the overload with a streamBeginIndex argument, passing 0 as the stream begin index.</summary>
    /// <param name="chars">The pointer to the first char.</param>
    /// <param name="length">The number of chars.</param>
    /// <exception cref="ArgumentNullException">chars is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">length is negative.</exception>
    public CharStream(char* chars, int length)
           : this(chars, length, 0) { }

    /// <summary>Creates a CharStream over the length chars stored at the pointer address. The first char in the stream is assigned the index streamBeginIndex.</summary>
    /// <param name="chars">The pointer to the first char.</param>
    /// <param name="length">The number of chars.</param>
    /// <param name="streamBeginIndex">The stream index assigned to the first char.</param>
    /// <exception cref="ArgumentNullException">chars is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: length ≥ 0 and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
    public CharStream(char* chars, int length, long streamBeginIndex) {
        if (chars == null) {
            throw new ArgumentNullException("chars");
        }
        if (length < 0) {
            throw new ArgumentOutOfRangeException("length", "The length is negative.");
        }
        // The end pointer is computed in an unchecked context, so that a wrap-around of
        // the address shows up as an end pointer that lies before the begin pointer.
        char* end = unchecked(chars + length);
        if (chars > end) {
            throw new ArgumentOutOfRangeException("length", "The length is out of range.");
        }
        bool beginIndexInRange = streamBeginIndex >= 0 && streamBeginIndex < (1L << 60);
        if (!beginIndexInRange) {
            throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
        }

        CharConstructorContinue(chars, length, streamBeginIndex);
    }

    // Shared tail of the constructors that wrap an in-memory char buffer: sets the encoding
    // to Encoding.Unicode, creates the anchor and initializes it as a single block (block 0)
    // spanning the length chars at bufferBegin, with stream indices starting at
    // streamBeginIndex. The arguments must already have been validated by the caller.
    private void CharConstructorContinue(char* bufferBegin, int length, long streamBeginIndex) {
        Debug.Assert(   (bufferBegin != null || length == 0)
                     && length >= 0
                     && bufferBegin <= unchecked(bufferBegin + length)
                     && streamBeginIndex >= 0
                     && streamBeginIndex < (1L << 60));
        Encoding = Encoding.Unicode;
        var a = Anchor.Create(this);
        this.anchor = a;

        // block bookkeeping: there is exactly one block
        a->Block = 0;
        a->LastBlock = 0;

        // index bookkeeping: the stream is positioned at its first char
        a->CharIndex = 0;
        a->CharIndexOffset = streamBeginIndex;
        a->CharIndexPlusOffset = streamBeginIndex;
        a->EndIndex = streamBeginIndex + length;

        if (length == 0) {
            // an empty stream always gets null buffer pointers, whatever bufferBegin is
            a->BufferBegin = null;
            a->BufferEnd = null;
            a->BlockSizeMinusOverlap = 0;
        } else {
            a->BufferBegin = bufferBegin;
            a->BufferEnd = bufferBegin + length;
            a->BlockSizeMinusOverlap = length;
        }
    }

    // Builds a single-block CharStream on top of an anchor supplied by the caller instead of
    // creating a new one. The stream spans the length chars at begin; chars/pChars are stored
    // as BufferString/BufferStringPointer. The anchor's Block, LastBlock and CharIndex fields
    // are expected to be 0 already (see the assert) and are not written here. The anchor's
    // StreamHandle is set to a normal GCHandle referencing this CharStream.
    internal CharStream(string chars, char* pChars, char* begin, int length, long streamIndexOffset, Anchor* newUninitializedAnchor) {
        Debug.Assert(   (chars == null ? pChars == null : pChars <= begin)
                     && (begin != null || length == 0)
                     && length >= 0
                     && begin <= unchecked(begin + length)
                     && streamIndexOffset >= 0
                     && streamIndexOffset < (1L << 60));
        Debug.Assert(   newUninitializedAnchor->NeedToFree == false
                     && !newUninitializedAnchor->StreamHandle.IsAllocated
                     && newUninitializedAnchor->Block == 0
                     && newUninitializedAnchor->LastBlock == 0
                     && newUninitializedAnchor->CharIndex == 0);

        BufferString = chars;
        BufferStringPointer = pChars;
        Encoding = Encoding.Unicode;

        var a = newUninitializedAnchor;
        this.anchor = a;
        if (length == 0) {
            // an empty stream always gets null buffer pointers, whatever begin is
            a->BufferBegin = null;
            a->BufferEnd = null;
            a->BlockSizeMinusOverlap = 0;
        } else {
            a->BufferBegin = begin;
            a->BufferEnd = begin + length;
            a->BlockSizeMinusOverlap = length;
        }
        a->CharIndexOffset = streamIndexOffset;
        a->CharIndexPlusOffset = streamIndexOffset;
        a->EndIndex = streamIndexOffset + length;
        a->StreamHandle = GCHandle.Alloc(this, GCHandleType.Normal);
    }

    /// <summary>Creates a CharStream that reads the file at the given path.<br/>Forwards to the path constructor with all parameters, enabling byte-order mark detection and using the default buffer parameters, i.e. it is equivalent to CharStream(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan), false, encoding, true, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
    /// <param name="path">The path of the file to read.</param>
    /// <param name="encoding">The (default) Encoding used for decoding the file content into chars.</param>
    public CharStream(string path, Encoding encoding)
           : this(path,
                  encoding,
                  true,                           // detectEncodingFromByteOrderMarks
                  DefaultBlockSize,               // blockSize
                  DefaultBlockSize/3,             // blockOverlap
                  ((DefaultBlockSize/3)*2)/3,     // minRegexSpace
                  DefaultByteBufferLength) { }

    /// <summary>Creates a CharStream that reads the file at the given path.<br/>Forwards to the path constructor with all parameters, using the default buffer parameters, i.e. it is equivalent to CharStream(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan), false, encoding, detectEncodingFromByteOrderMarks, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
    /// <param name="path">The path of the file to read.</param>
    /// <param name="encoding">The (default) Encoding used for decoding the file content into chars.</param>
    /// <param name="detectEncodingFromByteOrderMarks">Indicates whether the encoding should be detected from a unicode byte-order mark at the beginning of the file.</param>
    public CharStream(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
           : this(path,
                  encoding,
                  detectEncodingFromByteOrderMarks,
                  DefaultBlockSize,               // blockSize
                  DefaultBlockSize/3,             // blockOverlap
                  ((DefaultBlockSize/3)*2)/3,     // minRegexSpace
                  DefaultByteBufferLength) { }

    /// <summary>Creates a CharStream that reads the file at the given path.<br/>Is equivalent to CharStream(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan), false, encoding, detectEncodingFromByteOrderMarks, blockSize, blockOverlap, minRegexSpace, byteBufferLength), except that the file stream is disposed again if the construction fails.</summary>
    /// <exception cref="ArgumentNullException">encoding is null.</exception>
    public CharStream(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks,
                      int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
    {
        // the encoding is checked before the file is opened
        if (encoding == null) {
            throw new ArgumentNullException("encoding");
        }
        FileStream fileStream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan);
        try {
            // leaveOpen is false: the file stream was opened by this constructor
            StreamConstructorContinue(fileStream, false,
                                      encoding, detectEncodingFromByteOrderMarks,
                                      blockSize, blockOverlap, minRegexSpace, byteBufferLength);
        } catch {
            // the construction failed, so the stream opened above is released before rethrowing
            fileStream.Dispose();
            throw;
        }
    }

    /// <summary>Creates a CharStream that reads from a byte Stream.<br/>Forwards to the stream constructor with all parameters, i.e. it is equivalent to CharStream(stream, false, encoding, true, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
    /// <param name="stream">The byte stream providing the input.</param>
    /// <param name="encoding">The (default) Encoding used for decoding the byte Stream into chars.</param>
    public CharStream(Stream stream, Encoding encoding)
           : this(stream,
                  false,                          // leaveOpen
                  encoding,
                  true,                           // detectEncodingFromByteOrderMarks
                  DefaultBlockSize,               // blockSize
                  DefaultBlockSize/3,             // blockOverlap
                  ((DefaultBlockSize/3)*2)/3,     // minRegexSpace
                  DefaultByteBufferLength) { }

    /// <summary>Creates a CharStream that reads from a byte Stream.<br/>Forwards to the stream constructor with all parameters, i.e. it is equivalent to CharStream(stream, leaveOpen, encoding, true, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
    /// <param name="stream">The byte stream providing the input.</param>
    /// <param name="leaveOpen">Indicates whether the byte Stream should be left open when the CharStream has finished reading it.</param>
    /// <param name="encoding">The (default) Encoding used for decoding the byte Stream into chars.</param>
    public CharStream(Stream stream, bool leaveOpen, Encoding encoding)
           : this(stream,
                  leaveOpen,
                  encoding,
                  true,                           // detectEncodingFromByteOrderMarks
                  DefaultBlockSize,               // blockSize
                  DefaultBlockSize/3,             // blockOverlap
                  ((DefaultBlockSize/3)*2)/3,     // minRegexSpace
                  DefaultByteBufferLength) { }

    /// <summary>Creates a CharStream that reads from a byte Stream.<br/>Forwards to the stream constructor with all parameters, i.e. it is equivalent to CharStream(stream, leaveOpen, encoding, detectEncodingFromByteOrderMarks, defaultBlockSize, defaultBlockSize/3, ((defaultBlockSize/3)*2)/3, defaultByteBufferLength).</summary>
    /// <param name="stream">The byte stream providing the input.</param>
    /// <param name="leaveOpen">Indicates whether the byte Stream should be left open when the CharStream has finished reading it.</param>
    /// <param name="encoding">The (default) Encoding used for decoding the byte Stream into chars.</param>
    /// <param name="detectEncodingFromByteOrderMarks">Indicates whether the encoding should be detected from a unicode byte-order mark at the beginning of the stream.</param>
    public CharStream(Stream stream, bool leaveOpen, Encoding encoding, bool detectEncodingFromByteOrderMarks)
           : this(stream,
                  leaveOpen,
                  encoding,
                  detectEncodingFromByteOrderMarks,
                  DefaultBlockSize,               // blockSize
                  DefaultBlockSize/3,             // blockOverlap
                  ((DefaultBlockSize/3)*2)/3,     // minRegexSpace
                  DefaultByteBufferLength) { }

    /// <summary>Constructs a CharStream from a byte Stream.</summary>
    /// <param name="stream">The byte stream providing the input.</param>
    /// <param name="leaveOpen">Indicates whether the byte Stream should be left open when the CharStream has finished reading it.</param>
    /// <param name="encoding">The (default) Encoding used for decoding the byte Stream into chars.</param>
    /// <param name="detectEncodingFromByteOrderMarks">Indicates whether the constructor should detect the encoding from a unicode byte-order mark at the beginning of the stream. An encoding detected from a byte-order mark overrides the default encoding.</param>
    /// <param name="blockSize">The number of chars per block. The default is 3×2^16 ≈ 200k.</param>
    /// <param name="blockOverlap">The number of chars at the end of a block that are preserved when reading the next block into the char buffer. It must be less than blockSize/2, but not less than encoding.GetMaxCharCount(1). The default is blockSize/3.</param>
    /// <param name="minRegexSpace">The number of chars that are guaranteed to be visible to a regular expression when it is matched on the stream (assuming there are enough chars remaining in the stream). Must not be greater than blockOverlap. The default is 2/3 of blockOverlap.</param>
    /// <param name="byteBufferLength">The size of the byte buffer used for decoding purposes. The default is 2^12 = 4KB.</param>
    /// <exception cref="ArgumentNullException">stream or encoding is null.</exception>
    /// <exception cref="ArgumentException">stream is not readable.</exception>
    public CharStream(Stream stream, bool leaveOpen,
                      Encoding encoding, bool detectEncodingFromByteOrderMarks,
                      int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
    {
        if (stream == null) throw new ArgumentNullException("stream");
        if (!stream.CanRead) throw new ArgumentException("stream is not readable");
        if (encoding == null) throw new ArgumentNullException("encoding");
        // Forward the caller's leaveOpen choice. Passing a hard-coded false here would make
        // the CharStream close the stream even when the caller asked for it to stay open.
        StreamConstructorContinue(stream, leaveOpen, encoding, detectEncodingFromByteOrderMarks,
                                  blockSize, blockOverlap, minRegexSpace, byteBufferLength);
    }

    /// <summary>Test hook: this flag is modified via reflection in the unit test.
    /// While it is false (the default), StreamConstructorContinue replaces the requested
    /// block size with the length of the allocated buffer in the branch where not all chars
    /// fit into one block; setting it to true keeps the block size as computed.</summary>
    private static bool DoNotRoundUpBlockSizeToSimplifyTesting = false;

    private void StreamConstructorContinue(Stream stream, bool leaveOpen,
                                           Encoding encoding, bool detectEncodingFromByteOrderMarks,
                                           int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
    {
        if (byteBufferLength < MinimumByteBufferLength) byteBufferLength = MinimumByteBufferLength;

        int bytesInStream = -1;
        long streamPosition;
        if (stream.CanSeek) {
            streamPosition = stream.Position;
            long streamLength = stream.Length - streamPosition;
            if (streamLength <= Int32.MaxValue) {
                bytesInStream = (int)streamLength;
                if (bytesInStream < byteBufferLength) byteBufferLength = bytesInStream;
            }
        } else {
            streamPosition = 0;
        }

        byte[] byteBuffer = new byte[byteBufferLength];
        int byteBufferCount = 0;
        do {
            int n = stream.Read(byteBuffer, byteBufferCount, byteBufferLength - byteBufferCount);
            if (n == 0) {
                bytesInStream = byteBufferCount;
                break;
            }
            byteBufferCount += n;
        } while (byteBufferCount < MinimumByteBufferLength);
        streamPosition += byteBufferCount;

        int preambleLength = Helper.DetectPreamble(byteBuffer, byteBufferCount, ref encoding, detectEncodingFromByteOrderMarks);
        bytesInStream -= preambleLength;

        Encoding = encoding;
        Decoder decoder = encoding.GetDecoder();

        // we allow such small block sizes only to simplify testing
        if (blockSize < 8) blockSize = DefaultBlockSize;

        bool allCharsFitIntoOneBlock = false;
        if (bytesInStream >= 0 && bytesInStream/4 <= blockSize) {
            if (bytesInStream != 0) {
                try {
                    int maxCharCount = Encoding.GetMaxCharCount(bytesInStream); // may throw ArgumentOutOfRangeException
                    if (blockSize >= maxCharCount) {
                        allCharsFitIntoOneBlock = true;
                        blockSize = maxCharCount;
                    }
                } catch (ArgumentOutOfRangeException) { }
            } else {
                allCharsFitIntoOneBlock = true;
                blockSize = 0;
            }
        }
        var buffer = StringBuffer.Create(blockSize);
        Debug.Assert(buffer.Length >= blockSize && (blockSize > 0 || buffer.StringPointer == null));
        Buffer = buffer;
        BufferString = buffer.String;
        BufferStringPointer = buffer.StringPointer;
        char* bufferBegin = buffer.StringPointer + buffer.Index;
        try {
            if (allCharsFitIntoOneBlock) {
                int bufferCount = preambleLength == byteBufferCount
                                  ? 0
                                  : ReadAllRemainingCharsFromStream(bufferBegin, buffer.Length, byteBuffer, preambleLength, byteBufferCount, stream, streamPosition, decoder);
                if (!leaveOpen) stream.Close();
                var anchor = Anchor.Create(this);
                this.anchor = anchor;
                anchor->BlockSizeMinusOverlap = bufferCount;
                anchor->EndIndex = bufferCount;
                if (bufferCount != 0) {
                    anchor->BufferBegin = bufferBegin;
                    anchor->BufferEnd = bufferBegin + bufferCount;
                } else {
                    anchor->BufferBegin = null;
                    anchor->BufferEnd = null;
                }
                anchor->Block = 0;
                anchor->LastBlock = 0;
                anchor->CharIndex = 0;
                anchor->CharIndexOffset = 0;
                anchor->CharIndexPlusOffset = 0;
            } else {
                if (!DoNotRoundUpBlockSizeToSimplifyTesting) blockSize = buffer.Length;
                var d = new MultiBlockData();
                Data = d;
                d.Stream = stream;
                d.StreamPosition = streamPosition;
                d.LeaveOpen = leaveOpen;
                d.Decoder = decoder;
                d.ByteBuffer = byteBuffer;
                d.ByteBufferIndex = preambleLength;
                d.ByteBufferCount = byteBufferCount;
                d.MaxCharCountForOneByte = Math.Max(1, Encoding.GetMaxCharCount(1));
                d.SerializableDecoderMembers = GetSerializableDecoderMemberInfo(decoder);
                if (blockSize < 3*d.MaxCharCountForOneByte) blockSize = 3*d.MaxCharCountForOneByte;
                // MaxCharCountForOneByte == the maximum number of overhang chars
                if(    Math.Min(blockOverlap, blockSize - 2*blockOverlap) < d.MaxCharCountForOneByte
                    || blockOverlap >= blockSize/2) blockOverlap = blockSize/3;
                if (minRegexSpace < 0 || minRegexSpace > blockOverlap) minRegexSpace = 2*blockOverlap/3;
                d.BlockSize     = blockSize;
                d.BlockOverlap  = blockOverlap;
                d.RegexSpaceThreshold = bufferBegin + (blockSize - minRegexSpace);
                var anchor = Anchor.Create(this);
                this.anchor = anchor;
                d.anchor = anchor;
                anchor->BlockSizeMinusOverlap = blockSize - blockOverlap;
                anchor->EndIndex = Int64.MaxValue;
                anchor->BufferBegin = bufferBegin;
                anchor->BufferEnd = bufferBegin;
                anchor->Block = -2; // special value recognized by ReadBlock
                anchor->LastBlock = Int32.MaxValue;
                anchor->CharIndex = 0;
                anchor->CharIndexOffset = 0;
                anchor->CharIndexPlusOffset = 0;
                d.Blocks = new List<BlockInfo>();
                // the first block has no overlap with a previous block
                d.Blocks.Add(new BlockInfo(preambleLength, preambleLength, 0, EOS, null, new DecoderState(), null, new Decod…
Tech Fingerprint

.NET Base Class Library
Alerts (12)

'static' Mutable static field detected; use readonly or const to prevent race conditions
87 89 128 169 180 486 709
Complexity hotspot; lines 621 to 624 (total complexity: 18)
621 622 623 624
'new FileStream(' Disposable object detected; wrap in using statement or using declaration to ensure disposal
660