PageRenderTime 91ms CodeModel.GetById 15ms app.highlight 62ms RepoModel.GetById 1ms app.codeStats 1ms

/fparsec/main/FParsecCS/CharStream.cs

http://github.com/sandersn/fing
C# | 2073 lines | 1467 code | 161 blank | 445 comment | 673 complexity | cbb1fa5f9620a84c92c17759d91abb76 MD5 | raw file

Large files are truncated, but you can click here to view the full file

  1// Copyright (c) Stephan Tolksdorf 2007-2009
  2// License: Simplified BSD License. See accompanying documentation.
  3
  4#if !LOW_TRUST
  5
  6using System;
  7using System.IO;
  8using System.Collections.Generic;
  9using System.Text;
 10using System.Text.RegularExpressions;
 11using System.Diagnostics;
 12using System.Reflection;
 13using System.Runtime.Serialization;
 14using System.Runtime.InteropServices;
 15using System.Runtime.CompilerServices;
 16
 17
 18namespace FParsec {
 19/// <summary>Provides access to the char content of a binary Stream (or a String) through
 20/// an iterator-based interface that is especially well suited for parser applications.</summary>
 21public unsafe sealed class CharStream : IDisposable {
 22
 23    // In order to facilitate efficient backtracking we divide the stream into overlapping
 24    // blocks with equal number of chars. The blocks are overlapping, so that
 25    // backtracking over short distances at a block boundary doesn't trigger a reread of the
 26    // previous block.
 27    //
 28    //              Block 0
 29    //
 30    //    -----------------|--------  Block 1
 31    //                       Overlap
 32    //                      --------|--------|--------  Block 2
 33    //                                        Overlap
 34    //                                        --------|--------|--------
 35    //                                                                  (...)
 36    //  a '-' symbolizes a char, a '|' a block boundary.
 37    //
 38    //
 39    // In general there's no fixed relationship between the number of input bytes and the
 40    // number of input chars. Worse, the encoding can be stateful, which makes it necessary
 41    // to persist the decoder state over block boundaries. If we later want to
 42    // be able to reread a certain block, we therefore need to keep record of various
 43    // bits of information describing the state of the input stream at the beginning of a block:
 44
 45    private class BlockInfo {
 46        /// <summary>the byte stream index of the first char in the block after the OverhangCharsAtBlockBegin</summary>
 47        public long ByteIndex;
 48        /// <summary>the value of the CharStream's ByteBufferIndex before the block is read</summary>
 49        public int ByteBufferIndex;
 50
 51        /// <summary>the number of bytes in the stream from ByteIndex to the first char after the OverhangCharsAfterOverlap</summary>
 52        public int NumberOfBytesInOverlap;
 53
 54        /// <summary>the last char in the overlap with the previous block (used for integrity checking)</summary>
 55        public char LastCharInOverlap;
 56
 57        /// <summary>chars at the block begin that were already read together with chars of the last block before the overlap</summary>
 58        public string OverhangCharsAtBlockBegin;
 59        /// <summary>chars after the overlap with the previous block that were already read together with the overlap chars</summary>
 60        public string OverhangCharsAfterOverlap;
 61
 62        public DecoderState DecoderStateAtBlockBegin;
 63        public DecoderState DecoderStateAfterOverlap;
 64
 65
 66        public BlockInfo(long byteIndex, int byteBufferIndex,
 67                         int nBytesInOverlapCount, char lastCharInOverlap,
 68                         string overhangCharsAtBlockBegin, DecoderState decoderStateAtBlockBegin,
 69                         string overhangCharsAfterOverlap, DecoderState decoderStateAfterOverlap)
 70        {
 71            this.ByteIndex = byteIndex;
 72            this.ByteBufferIndex = byteBufferIndex;
 73            this.NumberOfBytesInOverlap = nBytesInOverlapCount;
 74            this.LastCharInOverlap = lastCharInOverlap;
 75            this.OverhangCharsAtBlockBegin = overhangCharsAtBlockBegin;
 76            this.OverhangCharsAfterOverlap = overhangCharsAfterOverlap;
 77            this.DecoderStateAtBlockBegin = decoderStateAtBlockBegin;
 78            this.DecoderStateAfterOverlap = decoderStateAfterOverlap;
 79        }
 80    }
 81
 82    // Unfortunately the Decoder API has no explicit methods for managing the state,
 83    // which forces us to abuse the comparatively inefficient serialization API for this purpose.
 84    // (The absence of explicit state management or at least a deep cloning method in the Decoder interface
 85    // is almost as puzzling as the absence of such methods in System.Random).
 86
 87    private static Dictionary<Type, MemberInfo[]> SerializableMemberInfoCache;
 88
 89    private static MemberInfo[] GetSerializableDecoderMemberInfo(Decoder decoder) {
 90        Type type = decoder.GetType();
 91        if (!type.IsSerializable) return null;
 92        MemberInfo[] smis;
 93        if (SerializableMemberInfoCache == null) {
 94            SerializableMemberInfoCache = new Dictionary<Type,MemberInfo[]>(8);
 95        }
 96        lock (SerializableMemberInfoCache) {
 97            if (!SerializableMemberInfoCache.TryGetValue(type, out smis) ) {
 98                smis = FormatterServices.GetSerializableMembers(type, new StreamingContext(StreamingContextStates.Clone));
 99                SerializableMemberInfoCache.Add(type, smis);
100            }
101        }
102        return smis;
103    }
104
105    private struct DecoderState {
106        private object[] DecoderData;
107
108        public DecoderState(Decoder decoder, MemberInfo[] serializableDecoderMembers) {
109            DecoderData = serializableDecoderMembers != null
110                          ? FormatterServices.GetObjectData(decoder, serializableDecoderMembers)
111                          : null;
112        }
113
114        public void WriteTo(ref Decoder decoder, MemberInfo[] serializableDecoderMembers) {
115            if (DecoderData != null) {
116                //Decoder newDecoder = (Decoder) FormatterServices.GetUninitializedObject(decoder.GetType());
117                //FormatterServices.PopulateObjectMembers(newDecoder, serializableDecoderMembers, DecoderData);
118                //decoder = newDecoder;
119                FormatterServices.PopulateObjectMembers(decoder, serializableDecoderMembers, DecoderData);
120            } else {
121                decoder.Reset();
122            }
123        }
124    }
125
126    private const int DefaultBlockSize = 3*(1 << 16); // 3*2^16 = 200k
127    private const int DefaultByteBufferLength = (1 << 12);
128    private static int MinimumByteBufferLength = 128; // must be larger than longest detectable preamble (we can only guess here)
129    private const char EOS = '\uFFFF';
130
131    // For ease of use, we need the iterators to hold a reference to the CharStream. If we stored
132    // a CharStream reference directly in the iterator, the JIT would emit a call to the write barrier
133    // thunk for each write to the reference field. As we want to use iterators mainly as immutable values,
134    // we need them to be structs for performance reasons, and since structs are constantly copied
135    // by design, we would get frequent write barrier calls*.  Redirecting the CharStream
136    // access through an "Anchor" allows us to relieve the GC from having to keep track of all the
137    // CharStream references in the iterators. The trick is that an Anchor instance does not contain
138    // any reference to a managed object, only a GCHandle to the CharStream and other value type members
139    // important to the Iterators. Because the Anchor struct only has primitive members, we can take
140    // an unmanaged pointer which the GC doesn't need to track. To avoid most GCHandle.Target accesses,
141    // the CharStream stores pieces of information important to the iterators directly in the Anchor.
142    //
143    // * Just to be clear: Write barrier calls are rather cheap (about the cost of a virtual function
144    //   call) and overall FParsec performance is only marginally influenced by this optimization.
145    //   (Each Reply<_,_> value alone currently triggers 2-3 write barrier calls, even when it is
146    //   allocated on the stack and all fields are initialized to 0/null!).
147
148    internal Anchor* anchor; // allocated and assigned during construction,
149                             // freed and set to null during disposal
150
151    /// <summary>Represents the link between a CharStream and its Iterators.
152    /// May be allocated on the unmanaged heap and holds a GCHandle, hence must be properly freed.</summary>
153    internal struct Anchor {
154        public int Block;
155        /// <summary>The index of the last block of the stream, or Int32.MaxValue if the end of stream has not yet been detected.</summary>
156        public int LastBlock;
157        public GCHandle StreamHandle;
158        /// <summary>Begin of the used part of the char buffer (stays constant). Is null if the CharStream is empty.</summary>
159        public char* BufferBegin;
160        /// <summary>End of the used part of the char buffer (varies for a multi-block stream). Is null if the CharStream is empty.</summary>
161        public char* BufferEnd;
162        public long CharIndex;
163        public long CharIndexPlusOffset;
164        public long CharIndexOffset;
165        public long EndIndex;
166        public int BlockSizeMinusOverlap;
167        public bool NeedToFree;
168
169        public static Anchor* Create(CharStream stream) {
170            // We create the anchor instance on the unmanaged heap. An alternative would be to use a
171            // pinned pointer, but that would carry the risk of fragmenting the managed heap
172            // (because an Anchor is a small object that can be long-lived).
173            // (If AllocHGlobal becomes a bottleneck, we could replace it with a pool allocator.)
174            Anchor* p = (Anchor*) Marshal.AllocHGlobal(sizeof(Anchor));
175            p->NeedToFree = true;
176            p->StreamHandle = GCHandle.Alloc(stream, GCHandleType.Normal);
177            return p;
178        }
179
180        public static void Free(Anchor *p) {
181            p->StreamHandle.Free();
182            if (p->NeedToFree) Marshal.FreeHGlobal((IntPtr) p);
183        }
184    }
185
186    /// <summary>The Encoding that is used for decoding the underlying byte stream, or
187    /// System.Text.UnicodeEncoding in case the stream was directly constructed
188    /// from a string.</summary>
189    public  Encoding Encoding { get; private set; }
190
191    // If the CharStream is constructed from a binary stream, we use a managed string as the char
192    // buffer. This allows us to apply regular expressions directly to the input.
193    // In the case of multi-block CharStreams we thus have to mutate the buffer string through pointers.
194    // This is safe as long as we use a newly constructed string and we don't pass a reference
195    // to the internal buffer string to the "outside world". (The one instance where we have to pass
196    // a reference to the buffer string is regex matching. See the docs for Iterator.Match(regex) for more info.)
197    //
198    // Apart from Iter.Match(regex) we access the internal buffer only through a pinned pointer.
199    // This way we avoid the overhead of redundant bounds checking and can support strings, char arrays
200    // and unmanaged char buffers through the same interface. Accessing the buffer through pointers
201    // is also a requirement for accessing the CharStream data through an Anchor pointer (see above).
202    //
203    // Pinning a string or char array makes life more difficult for the GC. However, as long as
204    // the buffer is only short-lived or large enough to be allocated on the large object heap,
205    // there shouldn't be a problem. Furthermore, the buffer strings for CharStreams constructed
206    // from a binary stream are allocated through the StringBuffer interface and hence always live
207    // on the large object heap. Thus, the only scenario to really worry about (and which the
208    // documentation explicitly warns about) is when a large number of small CharStreams
209    // are constructed directly from strings or char arrays and are used for an extended period of time.
210
211    /// <summary>The string holding the char buffer, or null if the buffer is not part of a .NET string.</summary>
212    internal string BufferString;
213    /// <summary>A pointer to the beginning of BufferString, or null if BufferString is null.</summary>
214    internal char* BufferStringPointer;
215
216    /// <summary>Holds the GCHandle for CharStreams directly constructed from strings or char arrays.</summary>
217    private GCHandle BufferHandle;
218    /// <summary>Holds the StringBuffer for CharStreams constructed from a binary stream.</summary>
219    private StringBuffer Buffer;
220
221    private MultiBlockData Data;
222
223    /// <summary>Contains the data and methods needed in case the input byte stream
224    /// is large enough to span multiple blocks of the CharStream.</summary>
225    private class MultiBlockData {
226        public Anchor* anchor;
227
228        public Stream Stream;
 229        // we keep a separate record of the Stream.Position, so that we don't need to require Stream.CanSeek
230        public long StreamPosition;
231        public bool LeaveOpen;
232
233        public int MaxCharCountForOneByte;
234        public Decoder Decoder;
235        public MemberInfo[] SerializableDecoderMembers;
236
237        public int BlockSize;
238        public int BlockOverlap;
239        /// <summary>BufferBegin + BlockSize - minRegexSpace</summary>
240        public char* RegexSpaceThreshold;
241
242        /// <summary>The byte stream index of the first unused byte in the ByteBuffer.</summary>
243        public long ByteIndex { get { return StreamPosition - (ByteBufferCount - ByteBufferIndex); } }
244
245        public List<BlockInfo> Blocks;
246
247        public byte[] ByteBuffer;
248        public int ByteBufferIndex;
249        public int ByteBufferCount;
250
251         /// <summary>Refills the ByteBuffer if no unused byte is remaining.
252        /// Returns the number of unused bytes in the (refilled) ByteBuffer.</summary>
253        private int FillByteBuffer() {
254            int n = ByteBufferCount - ByteBufferIndex;
255            if (n > 0) return n;
256            return ClearAndRefillByteBuffer(0);
257        }
258
259        /// <summary>Refills the ByteBuffer starting at the given index. If the underlying byte
260        /// stream contains enough bytes, the ByteBuffer is filled up to the ByteBuffer.Length.
261        /// Returns the number of bytes available for consumption in the refilled ByteBuffer.</summary>
262        private int ClearAndRefillByteBuffer(int byteBufferIndex) {
263            Debug.Assert(byteBufferIndex >= 0 && byteBufferIndex <= ByteBuffer.Length);
264            // Stream.Read is not guaranteed to use all the provided output buffer, so we need
265            // to call it in a loop when we want to rely on the buffer being fully filled
266            // (unless we reach the end of the stream). Knowing that the buffer always gets
267            // completely filled allows us to calculate the buffer utilization after skipping
268            // a certain number of input bytes. For most streams there will be only one loop
269            // iteration anyway (or two at the end of the stream).
270            int i = byteBufferIndex;
271            int m = ByteBuffer.Length - byteBufferIndex;
272            while (m != 0) {
273                int c = Stream.Read(ByteBuffer, i, m);
274                if (c == 0) break;
275                i += c;
276                m -= c;
277            }
278            int n = i - byteBufferIndex;
279            ByteBufferIndex = byteBufferIndex;
280            ByteBufferCount = byteBufferIndex + n;
281            StreamPosition += n;
282            return n;
283        }
284
285        /// <summary>Reads up to the given maximum number of chars into the given buffer.
286        /// If more than the maximum number of chars have to be read from the stream in order to
287        /// fill the buffer (due to	the way the Decoder API works), the overhang chars are
288        /// returned through the output parameter.
289        /// Returns a pointer to one char after the last char read.</summary>
290        private char* ReadCharsFromStream(char* buffer, int maxCount, out string overhangChars) {
291            Debug.Assert(maxCount >= 0);
292            fixed (byte* byteBuffer = ByteBuffer) {
293                overhangChars = null;
294                try {
295                    while (maxCount >= MaxCharCountForOneByte) {// if maxCount < MaxCharCountForOneByte, Convert could throw
296                        int nBytesInByteBuffer = FillByteBuffer();
297                        bool flush = nBytesInByteBuffer == 0;
298                        int bytesUsed, charsUsed; bool completed = false;
299                        Decoder.Convert(byteBuffer + ByteBufferIndex, nBytesInByteBuffer,
300                                        buffer, maxCount, flush,
301                                        out bytesUsed, out charsUsed, out completed);
302                        ByteBufferIndex += bytesUsed; // GetChars consumed bytesUsed bytes from the byte buffer
303                        buffer += charsUsed;
304                        maxCount -= charsUsed;
305                        if (flush && completed) return buffer;
306                    }
307                    if (maxCount == 0) return buffer;
308
309                    char* cs = stackalloc char[MaxCharCountForOneByte];
310                    for (;;) {
311                        int nBytesInByteBuffer = FillByteBuffer();
312                        bool flush = nBytesInByteBuffer == 0;
313                        int bytesUsed, charsUsed; bool completed;
314                        Decoder.Convert(byteBuffer + ByteBufferIndex, nBytesInByteBuffer,
315                                        cs, MaxCharCountForOneByte, flush,
316                                        out bytesUsed, out charsUsed, out completed);
317                        ByteBufferIndex += bytesUsed;
318                        if (charsUsed > 0) {
319                            int i = 0;
320                            do {
321                                *(buffer++) = cs[i++];
322                                if (--maxCount == 0) {
323                                    if (i < charsUsed) overhangChars = new string(cs, i, charsUsed - i);
324                                    return buffer;
325                                }
326                            } while (i < charsUsed);
327                        }
328                        if (flush && completed) return buffer;
329                    }
330                } catch (DecoderFallbackException e) {
331                    e.Data.Add("Stream.Position", ByteIndex + e.Index);
332                    throw;
333                }
334            }
335        }
336
337        /// <summary> Reads a block of chars (must be different from the current block)
338        /// into the BufferString. Returns a pointer to the first char of the new block,
339        /// or null if no chars could be read.</summary>
340        internal char* ReadBlock(int block) {
341            if (block > anchor->LastBlock) return null;
342            int prevBlock = anchor->Block;
343            if (block == prevBlock) throw new InvalidOperationException();
344            if (SerializableDecoderMembers == null && block > 0) {
345                if (prevBlock > block)
346                    throw new NotSupportedException("The CharStream does not support seeking backwards over ranges longer than the block overlap because the Encoding's Decoder is not serializable.");
347                while (prevBlock + 1 < block) ReadBlock(++prevBlock);
348            }
349
350            BlockInfo bi = Blocks[block];
351            int blockSizeMinusOverlap = BlockSize - BlockOverlap;
352            long charIndex = Math.BigMul(block, blockSizeMinusOverlap);
353            char* bufferBegin = anchor->BufferBegin;
354            char* begin, buffer;
355            int nCharsToRead;
356
357            // fill [0 ... BlockOverlap-1] if block > 0
358            if (prevBlock == block - 1) {
359                MemMove(bufferBegin, bufferBegin + blockSizeMinusOverlap, BlockOverlap*2);
360                Debug.Assert(bufferBegin[BlockOverlap - 1] == bi.LastCharInOverlap);
361                begin = buffer = bufferBegin + BlockOverlap;
362            } else if (prevBlock >= 0) {
363                Stream.Seek(bi.ByteIndex, SeekOrigin.Begin); // will throw if Stream can't seek
364                // now that there was no exception, we can change the state...
365                StreamPosition = bi.ByteIndex;
366                ClearAndRefillByteBuffer(bi.ByteBufferIndex);
367                bi.DecoderStateAtBlockBegin.WriteTo(ref Decoder, SerializableDecoderMembers); // will reset Decoder if block == 0
368                if (prevBlock == block + 1) {
369                    // move the overlap into [BlockSize - BlockOverlap, BlockSize - 1] before it gets overwritten
370                    MemMove(bufferBegin + blockSizeMinusOverlap, bufferBegin, BlockOverlap*2);
371                }
372                begin = buffer = bufferBegin;
373                if (block > 0) {
374                    nCharsToRead = BlockOverlap;
375                    if (bi.OverhangCharsAtBlockBegin != null) {
376                        nCharsToRead -= bi.OverhangCharsAtBlockBegin.Length;
377                        for (int i = 0; i < bi.OverhangCharsAtBlockBegin.Length; ++i)
378                            *(buffer++) = bi.OverhangCharsAtBlockBegin[i];
379                    }
380                    string overhangCharsAfterOverlap;
381                    buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlap);
382                    if (   buffer != bufferBegin + BlockOverlap
383                        || ByteIndex != bi.ByteIndex + bi.NumberOfBytesInOverlap
384                        || *(buffer - 1) != bi.LastCharInOverlap
385                        || overhangCharsAfterOverlap != bi.OverhangCharsAfterOverlap)
386                        throw new IOException("CharStream: stream integrity error");
387                }
388            } else { // ReadBlock was called from the constructor
389                if (block != 0) throw new InvalidOperationException();
390                begin = buffer = bufferBegin;
391            }
392
393            // fill [0            ... BlockSize-BlockOverlap-1] if block == 0
394            // and  [BlockOverlap ... BlockSize-BlockOverlap-1] otherwise
395            if (block == 0) {
396                nCharsToRead = blockSizeMinusOverlap;
397            } else {
398                nCharsToRead = blockSizeMinusOverlap - BlockOverlap;
399                if (bi.OverhangCharsAfterOverlap != null) {
400                    nCharsToRead -= bi.OverhangCharsAfterOverlap.Length;
401                    for (int i = 0; i < bi.OverhangCharsAfterOverlap.Length; ++i)
402                        *(buffer++) = bi.OverhangCharsAfterOverlap[i];
403                }
404            }
405            string overhangCharsAtNextBlockBegin;
406            buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAtNextBlockBegin);
407
408            long byteIndexAtNextBlockBegin = ByteIndex;
409            int byteBufferIndexAtNextBlockBegin = ByteBufferIndex;
410
411            // fill [BlockSize-BlockOverlap ... BlockSize-1]
412            if (block == Blocks.Count - 1) { // next block hasn't yet been read
413                DecoderState decoderStateAtNextBlockBegin = new DecoderState(Decoder, SerializableDecoderMembers);
414                nCharsToRead = BlockOverlap;
415                if (overhangCharsAtNextBlockBegin != null) {
416                    nCharsToRead -= overhangCharsAtNextBlockBegin.Length;
417                    for (int i = 0; i < overhangCharsAtNextBlockBegin.Length; ++i)
418                        *(buffer++) = overhangCharsAtNextBlockBegin[i];
419                }
420                string overhangCharsAfterOverlapWithNextBlock;
421                buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlapWithNextBlock);
422                if (anchor->LastBlock == Int32.MaxValue) { // last block hasn't yet been detected
423                    if (buffer == bufferBegin + BlockSize) {
424                        DecoderState decoderStateAfterOverlapWithNextBlock = new DecoderState(Decoder, SerializableDecoderMembers);
425                        int nBytesInOverlapWithNextBlock = (int)(ByteIndex - byteIndexAtNextBlockBegin);
426                        Blocks.Add(new BlockInfo(byteIndexAtNextBlockBegin, byteBufferIndexAtNextBlockBegin,
427                                                 nBytesInOverlapWithNextBlock, *(buffer - 1),
428                                                 overhangCharsAtNextBlockBegin, decoderStateAtNextBlockBegin,
429                                                 overhangCharsAfterOverlapWithNextBlock, decoderStateAfterOverlapWithNextBlock));
430                    } else { // we reached the end of the stream
431                        anchor->LastBlock = block;
432                        anchor->EndIndex = anchor->CharIndexOffset + charIndex + (buffer - bufferBegin);
433                    }
434                } else if (anchor->EndIndex != anchor->CharIndexOffset + charIndex + (buffer - bufferBegin)) {
435                    throw new IOException("CharStream: stream integrity error");
436                }
437            } else {
438                BlockInfo nbi = Blocks[block + 1];
439                if (buffer != bufferBegin + blockSizeMinusOverlap
440                    || byteIndexAtNextBlockBegin != nbi.ByteIndex
441                    || byteBufferIndexAtNextBlockBegin != nbi.ByteBufferIndex
442                    || overhangCharsAtNextBlockBegin != nbi.OverhangCharsAtBlockBegin)
443                    throw new IOException("CharStream: stream integrity error");
444
445                if (prevBlock != block + 1 || (block == 0 && SerializableDecoderMembers == null)) { // jumping back to block 0 is supported even if the decoder is not serializable
446                    nCharsToRead = BlockOverlap;
447                    if (overhangCharsAtNextBlockBegin != null) {
448                        nCharsToRead -= overhangCharsAtNextBlockBegin.Length;
449                        for (int i = 0; i < overhangCharsAtNextBlockBegin.Length; ++i)
450                            *(buffer++) = overhangCharsAtNextBlockBegin[i];
451                    }
452                    string overhangCharsAfterOverlapWithNextBlock;
453                    buffer = ReadCharsFromStream(buffer, nCharsToRead, out overhangCharsAfterOverlapWithNextBlock);
454                    int nBytesInOverlapWithNextBlock = (int)(ByteIndex - byteIndexAtNextBlockBegin);
455                    if (buffer != bufferBegin + BlockSize
456                        || nBytesInOverlapWithNextBlock != nbi.NumberOfBytesInOverlap
457                        || *(buffer - 1) != nbi.LastCharInOverlap
458                        || overhangCharsAfterOverlapWithNextBlock != nbi.OverhangCharsAfterOverlap)
459                        throw new IOException("CharStream: stream integrity error");
460                } else {
461                    Debug.Assert(bufferBegin[BlockSize - 1] == nbi.LastCharInOverlap);
462                    buffer += BlockOverlap; // we already copied the chars at the beginning of this function
463                    int off = nbi.NumberOfBytesInOverlap - (ByteBufferCount - ByteBufferIndex);
464                    if (off > 0) {
465                        // we wouldn't have gotten here if the Stream didn't support seeking
466                        Stream.Seek(off, SeekOrigin.Current);
467                        StreamPosition += off;
468                        ClearAndRefillByteBuffer(off%ByteBuffer.Length);
469                    } else {
470                        ByteBufferIndex += nbi.NumberOfBytesInOverlap;
471                    }
472                    nbi.DecoderStateAfterOverlap.WriteTo(ref Decoder, SerializableDecoderMembers);
473                }
474            }
475
476            anchor->Block = block;
477            anchor->CharIndex = charIndex;
478            anchor->CharIndexPlusOffset = anchor->CharIndexOffset + charIndex;
479            anchor->BufferEnd = buffer;
480            return begin == buffer ? null : begin;
481        }
482    }
483
484    /// <summary>Reads all remaining chars into the given buffer. If the remaining stream
485    /// content holds more than the given maximum number of chars, an exception will be thrown.</summary>
486    private static int ReadAllRemainingCharsFromStream(char* buffer, int maxCount, byte[] byteBuffer, int byteBufferIndex, int byteBufferCount, Stream stream, long streamPosition, Decoder decoder) {
487        Debug.Assert(maxCount > 0 && byteBufferIndex >= 0 && byteBufferIndex < byteBufferCount);
488        fixed (byte* pByteBuffer = byteBuffer) {
489            bool flush = false;
490            int bufferCount = 0;
491            for (;;) {
492                try {
493                    bufferCount += decoder.GetChars(pByteBuffer + byteBufferIndex, byteBufferCount - byteBufferIndex,
494                                                    buffer + bufferCount, maxCount - bufferCount, flush);
495                } catch (DecoderFallbackException e) {
496                    e.Data.Add("Stream.Position", streamPosition - (byteBufferCount - byteBufferIndex) + e.Index);
497                    throw;
498                }
499                if (flush) break;
500                byteBufferIndex = 0; // GetChars consumed all bytes in the byte buffer
501                byteBufferCount = stream.Read(byteBuffer, 0, byteBuffer.Length);
502                streamPosition += byteBufferCount;
503                flush = byteBufferCount == 0;
504            }
505            return bufferCount;
506        }
507    }
508
509    /// <summary>The current block in BufferString.</summary>
510    private int Block { get { return anchor->Block; } }
511
512    /// <summary>The number of chars in BufferString.</summary>
513    private int BufferCount { get { return PositiveDistance(anchor->BufferBegin, anchor->BufferEnd); } }
514
515    /// <summary>The index of the first char in the stream, i.e. Begin.Index.
516    /// This value is determined by the streamBeginIndex argument of some of the CharStream constructors.
517    /// By default this value is 0.</summary>
518    public long BeginIndex { get { return anchor->CharIndexOffset; } }
519
520    /// <summary>The index of the last char of the stream plus 1,
521    /// or Int64.MaxValue if the end of stream has not yet been detected.</summary>
522    public long EndIndex { get { return anchor->EndIndex; } }
523
524    [Obsolete("CharStream.IndexOffset has been renamed to CharStream.BeginIndex.")]
525    public long IndexOffset { get { return BeginIndex; } }
526
527    [Obsolete("CharStream.EndOfStream has been renamed to CharStream.EndIndex.")]
528    public long EndOfStream { get { return EndIndex; } }
529
// There is intentionally no public constructor that takes only a string,
// since it could be confused with a constructor taking a file path.
/// <summary>Constructs a CharStream over all chars of the given (non-null) string.
/// The first char in the stream is assigned the index 0.</summary>
internal CharStream(string chars) {
    Debug.Assert(chars != null);
    BufferString = chars;
    // Pin the string and remember the handle, so the raw char pointer can be used.
    BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
    char* pinnedChars = (char*)BufferHandle.AddrOfPinnedObject();
    BufferStringPointer = pinnedChars;
    CharConstructorContinue(pinnedChars, chars.Length, 0);
}
/// <summary>Constructs a CharStream from the chars of the string argument in the range
/// from index (inclusive) to index + length (exclusive). The first char in the stream is assigned the index 0.</summary>
/// <exception cref="ArgumentNullException">chars is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0 and index + length ≤ chars.Length.</exception>
public CharStream(string chars, int index, int length)
       : this(chars, index, length, 0) { }
544
/// <summary>Constructs a CharStream from the chars of the string argument in the range
/// from index (inclusive) to index + length (exclusive).
/// The first char in the stream is assigned the index streamBeginIndex.</summary>
/// <exception cref="ArgumentNullException">chars is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0, index + length ≤ chars.Length and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
public CharStream(string chars, int index, int length, long streamBeginIndex) {
    if (chars == null) {
        throw new ArgumentNullException("chars");
    }
    if (index < 0) {
        throw new ArgumentOutOfRangeException("index", "The index is negative.");
    }
    // Written as a subtraction so that index + length can not overflow.
    if (length < 0 || length > chars.Length - index) {
        throw new ArgumentOutOfRangeException("length", "The length is out of range.");
    }
    bool beginIndexIsValid = streamBeginIndex >= 0 && streamBeginIndex < (1L << 60);
    if (!beginIndexIsValid) {
        throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
    }

    BufferString = chars;
    // Pin the string and remember the handle, so the raw char pointer can be used.
    BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
    char* stringStart = (char*)BufferHandle.AddrOfPinnedObject();
    BufferStringPointer = stringStart;
    CharConstructorContinue(stringStart + index, length, streamBeginIndex);
}
560
/// <summary>Constructs a CharStream from the chars of the char array argument in the range
/// from index (inclusive) to index + length (exclusive). The first char in the stream is assigned the index 0.</summary>
/// <exception cref="ArgumentNullException">chars is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0 and index + length ≤ chars.Length.</exception>
public CharStream(char[] chars, int index, int length)
       : this(chars, index, length, 0) { }
565
/// <summary>Constructs a CharStream from the chars of the char array argument in the range
/// from index (inclusive) to index + length (exclusive).
/// The first char in the stream is assigned the index streamBeginIndex.</summary>
/// <exception cref="ArgumentNullException">chars is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: index ≥ 0, length ≥ 0, index + length ≤ chars.Length and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
public CharStream(char[] chars, int index, int length, long streamBeginIndex) {
    if (chars == null) {
        throw new ArgumentNullException("chars");
    }
    if (index < 0) {
        throw new ArgumentOutOfRangeException("index", "The index is negative.");
    }
    // Written as a subtraction so that index + length can not overflow.
    if (length < 0 || length > chars.Length - index) {
        throw new ArgumentOutOfRangeException("length", "The length is out of range.");
    }
    bool beginIndexIsValid = streamBeginIndex >= 0 && streamBeginIndex < (1L << 60);
    if (!beginIndexIsValid) {
        throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
    }

    // Pin the array and remember the handle, so the raw char pointer can be used.
    BufferHandle = GCHandle.Alloc(chars, GCHandleType.Pinned);
    char* firstChar = (char*)BufferHandle.AddrOfPinnedObject() + index;
    CharConstructorContinue(firstChar, length, streamBeginIndex);
}
579
/// <summary>Constructs a CharStream from the length chars that start at the pointer address.
/// The first char in the stream is assigned the index 0.</summary>
/// <exception cref="ArgumentNullException">chars is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">length is negative.</exception>
public CharStream(char* chars, int length)
       : this(chars, length, 0) { }
584
/// <summary>Constructs a CharStream from the length chars that start at the pointer address.
/// The first char in the stream is assigned the index streamBeginIndex.</summary>
/// <exception cref="ArgumentNullException">chars is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">At least one of the following conditions is not satisfied: length ≥ 0 and 0 ≤ streamBeginIndex &lt; 2^60.</exception>
public CharStream(char* chars, int length, long streamBeginIndex) {
    if (chars == null) {
        throw new ArgumentNullException("chars");
    }
    if (length < 0) {
        throw new ArgumentOutOfRangeException("length", "The length is negative.");
    }
    // Reject a length for which chars + length wraps around to a smaller address.
    char* end = unchecked(chars + length);
    if (end < chars) {
        throw new ArgumentOutOfRangeException("length", "The length is out of range.");
    }
    bool beginIndexIsValid = streamBeginIndex >= 0 && streamBeginIndex < (1L << 60);
    if (!beginIndexIsValid) {
        throw new ArgumentOutOfRangeException("streamBeginIndex", "streamBeginIndex must be non-negative and less than 2^60.");
    }

    CharConstructorContinue(chars, length, streamBeginIndex);
}
597
/// <summary>Shared tail of the string, char[] and char* constructors: sets the encoding to
/// Encoding.Unicode, creates the anchor and initializes it for a stream that consists of the
/// single block of length chars starting at bufferBegin.</summary>
/// <param name="bufferBegin">Pointer to the first char; may only be null if length is 0.</param>
/// <param name="length">The number of chars; must not be negative.</param>
/// <param name="streamBeginIndex">The index assigned to the first char; must satisfy 0 ≤ streamBeginIndex &lt; 2^60.</param>
private void CharConstructorContinue(char* bufferBegin, int length, long streamBeginIndex) {
    // The callers have already validated the arguments, hence only a debug check here.
    Debug.Assert(   (bufferBegin != null || length == 0)
                 && length >= 0
                 && bufferBegin <= unchecked(bufferBegin + length)
                 && streamBeginIndex >= 0
                 && streamBeginIndex < (1L << 60));
    Encoding = Encoding.Unicode;
    var newAnchor = Anchor.Create(this);
    anchor = newAnchor;
    if (length == 0) {
        // an empty stream always has null buffer pointers, whatever bufferBegin is
        newAnchor->BufferBegin = null;
        newAnchor->BufferEnd = null;
        newAnchor->BlockSizeMinusOverlap = 0;
    } else {
        newAnchor->BufferBegin = bufferBegin;
        newAnchor->BufferEnd = bufferBegin + length;
        newAnchor->BlockSizeMinusOverlap = length;
    }
    // there is exactly one block, and the anchor starts at its first char
    newAnchor->Block = 0;
    newAnchor->LastBlock = 0;
    newAnchor->CharIndex = 0;
    newAnchor->CharIndexPlusOffset = streamBeginIndex;
    newAnchor->CharIndexOffset = streamBeginIndex;
    newAnchor->EndIndex = streamBeginIndex + length;
}
619
/// <summary>Constructs a CharStream over the length chars starting at begin, using an anchor
/// supplied by the caller instead of creating a new one.</summary>
/// <param name="chars">The string stored in BufferString; may be null, in which case pChars must be null too.</param>
/// <param name="pChars">The pointer stored in BufferStringPointer; must not be greater than begin if chars is not null.</param>
/// <param name="begin">Pointer to the first char of the stream; may only be null if length is 0.</param>
/// <param name="length">The number of chars; must not be negative.</param>
/// <param name="streamIndexOffset">The index assigned to the first char; must satisfy 0 ≤ streamIndexOffset &lt; 2^60.</param>
/// <param name="newUninitializedAnchor">The anchor to use. Its Block, LastBlock and CharIndex members must be 0,
/// NeedToFree must be false and its StreamHandle must not be allocated.</param>
internal CharStream(string chars, char* pChars, char* begin, int length, long streamIndexOffset, Anchor* newUninitializedAnchor) {
    Debug.Assert((chars == null ? pChars == null : pChars <= begin)
                 && (begin != null || length == 0)
                 && length >= 0
                 && begin <= unchecked(begin + length)
                 && streamIndexOffset >= 0
                 && streamIndexOffset < (1L << 60));
    Debug.Assert(newUninitializedAnchor->NeedToFree == false
                 && !newUninitializedAnchor->StreamHandle.IsAllocated
                 && newUninitializedAnchor->Block == 0
                 && newUninitializedAnchor->LastBlock == 0
                 && newUninitializedAnchor->CharIndex == 0);
    BufferString = chars;
    BufferStringPointer = pChars;
    Encoding = Encoding.Unicode;
    var suppliedAnchor = newUninitializedAnchor;
    anchor = suppliedAnchor;
    if (length == 0) {
        // an empty stream always has null buffer pointers, whatever begin is
        suppliedAnchor->BufferBegin = null;
        suppliedAnchor->BufferEnd = null;
        suppliedAnchor->BlockSizeMinusOverlap = 0;
    } else {
        suppliedAnchor->BufferBegin = begin;
        suppliedAnchor->BufferEnd = begin + length;
        suppliedAnchor->BlockSizeMinusOverlap = length;
    }
    // Block, LastBlock and CharIndex are already 0 (see the assert above), so they are not written here.
    suppliedAnchor->CharIndexPlusOffset = streamIndexOffset;
    suppliedAnchor->CharIndexOffset = streamIndexOffset;
    suppliedAnchor->EndIndex = streamIndexOffset + length;
    suppliedAnchor->StreamHandle = GCHandle.Alloc(this, GCHandleType.Normal);
}
644
/// <summary>Constructs a CharStream from the file at the given path.<br/>
/// Delegates to the path constructor with all parameters, passing
/// detectEncodingFromByteOrderMarks = true, blockSize = DefaultBlockSize, blockOverlap = DefaultBlockSize/3,
/// minRegexSpace = ((DefaultBlockSize/3)*2)/3 and byteBufferLength = DefaultByteBufferLength.</summary>
public CharStream(string path, Encoding encoding)
       : this(path,
              encoding,
              true,
              DefaultBlockSize,
              DefaultBlockSize/3,
              ((DefaultBlockSize/3)*2)/3,
              DefaultByteBufferLength) { }
649
/// <summary>Constructs a CharStream from the file at the given path.<br/>
/// Delegates to the path constructor with all parameters, passing
/// blockSize = DefaultBlockSize, blockOverlap = DefaultBlockSize/3,
/// minRegexSpace = ((DefaultBlockSize/3)*2)/3 and byteBufferLength = DefaultByteBufferLength.</summary>
public CharStream(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
       : this(path,
              encoding,
              detectEncodingFromByteOrderMarks,
              DefaultBlockSize,
              DefaultBlockSize/3,
              ((DefaultBlockSize/3)*2)/3,
              DefaultByteBufferLength) { }
654
/// <summary>Constructs a CharStream from the file at the given path.<br/>
/// Opens the file with new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan)
/// and initializes the CharStream from that stream with leaveOpen = false.
/// If the initialization throws, the file stream is disposed before the exception propagates.</summary>
/// <exception cref="ArgumentNullException">encoding is null.</exception>
public CharStream(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks,
                  int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
{
    if (encoding == null) {
        throw new ArgumentNullException("encoding");
    }
    var fileStream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read,
                                    4096, FileOptions.SequentialScan);
    try {
        StreamConstructorContinue(fileStream, false, encoding, detectEncodingFromByteOrderMarks,
                                  blockSize, blockOverlap, minRegexSpace, byteBufferLength);
    } catch {
        // we opened the file ourselves, so we must not leak the handle on failure
        fileStream.Dispose();
        throw;
    }
}
669
/// <summary>Constructs a CharStream from a byte Stream.<br/>
/// Delegates to the Stream constructor with all parameters, passing
/// leaveOpen = false, detectEncodingFromByteOrderMarks = true, blockSize = DefaultBlockSize,
/// blockOverlap = DefaultBlockSize/3, minRegexSpace = ((DefaultBlockSize/3)*2)/3
/// and byteBufferLength = DefaultByteBufferLength.</summary>
public CharStream(Stream stream, Encoding encoding)
       : this(stream, false, encoding, true,
              DefaultBlockSize,
              DefaultBlockSize/3,
              ((DefaultBlockSize/3)*2)/3,
              DefaultByteBufferLength) { }
675
/// <summary>Constructs a CharStream from a byte Stream.<br/>
/// Delegates to the Stream constructor with all parameters, passing
/// detectEncodingFromByteOrderMarks = true, blockSize = DefaultBlockSize,
/// blockOverlap = DefaultBlockSize/3, minRegexSpace = ((DefaultBlockSize/3)*2)/3
/// and byteBufferLength = DefaultByteBufferLength.</summary>
public CharStream(Stream stream, bool leaveOpen, Encoding encoding)
       : this(stream, leaveOpen, encoding, true,
              DefaultBlockSize,
              DefaultBlockSize/3,
              ((DefaultBlockSize/3)*2)/3,
              DefaultByteBufferLength) { }
681
/// <summary>Constructs a CharStream from a byte Stream.<br/>
/// Delegates to the Stream constructor with all parameters, passing
/// blockSize = DefaultBlockSize, blockOverlap = DefaultBlockSize/3,
/// minRegexSpace = ((DefaultBlockSize/3)*2)/3 and byteBufferLength = DefaultByteBufferLength.</summary>
public CharStream(Stream stream, bool leaveOpen, Encoding encoding, bool detectEncodingFromByteOrderMarks)
       : this(stream, leaveOpen, encoding, detectEncodingFromByteOrderMarks,
              DefaultBlockSize,
              DefaultBlockSize/3,
              ((DefaultBlockSize/3)*2)/3,
              DefaultByteBufferLength) { }
687
/// <summary>Constructs a CharStream from a byte Stream.</summary>
/// <param name="stream">The byte stream providing the input.</param>
/// <param name="leaveOpen">Indicates whether the byte Stream should be left open when the CharStream has finished reading it.</param>
/// <param name="encoding">The (default) Encoding used for decoding the byte Stream into chars.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether the constructor should detect the encoding from a unicode byte-order mark at the beginning of the stream. An encoding detected from a byte-order mark overrides the default encoding.</param>
/// <param name="blockSize">The number of chars per block. The default is 3×2^16 ≈ 200k.</param>
/// <param name="blockOverlap">The number of chars at the end of a block that are preserved when reading the next block into the char buffer. It must be less than blockSize/2, but not less than encoding.GetMaxCharCount(1). The default is blockSize/3.</param>
/// <param name="minRegexSpace">The number of chars that are guaranteed to be visible to a regular expression when it is matched on the stream (assuming there are enough chars remaining in the stream). Must not be greater than blockOverlap. The default is 2/3 of blockOverlap.</param>
/// <param name="byteBufferLength">The size of the byte buffer used for decoding purposes. The default is 2^12 = 4KB.</param>
/// <exception cref="ArgumentNullException">stream or encoding is null.</exception>
/// <exception cref="ArgumentException">stream is not readable.</exception>
public CharStream(Stream stream, bool leaveOpen,
                  Encoding encoding, bool detectEncodingFromByteOrderMarks,
                  int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
{
    if (stream == null) throw new ArgumentNullException("stream");
    if (!stream.CanRead) throw new ArgumentException("stream is not readable");
    if (encoding == null) throw new ArgumentNullException("encoding");
    // Forward the caller's leaveOpen choice. Passing a hard-coded false here would make
    // the CharStream treat every stream as its own and ignore the leaveOpen argument.
    StreamConstructorContinue(stream, leaveOpen, encoding, detectEncodingFromByteOrderMarks,
                              blockSize, blockOverlap, minRegexSpace, byteBufferLength);
}
707
708    /// <summary>Test hook that the unit test modifies via reflection. While it is false (the default), StreamConstructorContinue replaces the block size of a multi-block stream with the length of the allocated buffer; setting it to true skips that step.</summary>
709    private static bool DoNotRoundUpBlockSizeToSimplifyTesting = false;
710
711    private void StreamConstructorContinue(Stream stream, bool leaveOpen,
712                                           Encoding encoding, bool detectEncodingFromByteOrderMarks,
713                                           int blockSize, int blockOverlap, int minRegexSpace, int byteBufferLength)
714    {
715        if (byteBufferLength < MinimumByteBufferLength) byteBufferLength = MinimumByteBufferLength;
716
717        int bytesInStream = -1;
718        long streamPosition;
719        if (stream.CanSeek) {
720            streamPosition = stream.Position;
721            long streamLength = stream.Length - streamPosition;
722            if (streamLength <= Int32.MaxValue) {
723                bytesInStream = (int)streamLength;
724                if (bytesInStream < byteBufferLength) byteBufferLength = bytesInStream;
725            }
726        } else {
727            streamPosition = 0;
728        }
729
730        byte[] byteBuffer = new byte[byteBufferLength];
731        int byteBufferCount = 0;
732        do {
733            int n = stream.Read(byteBuffer, byteBufferCount, byteBufferLength - byteBufferCount);
734            if (n == 0) {
735                bytesInStream = byteBufferCount;
736                break;
737            }
738            byteBufferCount += n;
739        } while (byteBufferCount < MinimumByteBufferLength);
740        streamPosition += byteBufferCount;
741
742        int preambleLength = Helper.DetectPreamble(byteBuffer, byteBufferCount, ref encoding, detectEncodingFromByteOrderMarks);
743        bytesInStream -= preambleLength;
744
745        Encoding = encoding;
746        Decoder decoder = encoding.GetDecoder();
747
748        // we allow such small block sizes only to simplify testing
749        if (blockSize < 8) blockSize = DefaultBlockSize;
750
751        bool allCharsFitIntoOneBlock = false;
752        if (bytesInStream >= 0 && bytesInStream/4 <= blockSize) {
753            if (bytesInStream != 0) {
754                try {
755                    int maxCharCount = Encoding.GetMaxCharCount(bytesInStream); // may throw ArgumentOutOfRangeException
756                    if (blockSize >= maxCharCount) {
757                        allCharsFitIntoOneBlock = true;
758                        blockSize = maxCharCount;
759                    }
760                } catch (ArgumentOutOfRangeException) { }
761            } else {
762                allCharsFitIntoOneBlock = true;
763                blockSize = 0;
764            }
765        }
766        var buffer = StringBuffer.Create(blockSize);
767        Debug.Assert(buffer.Length >= blockSize && (blockSize > 0 || buffer.StringPointer == null));
768        Buffer = buffer;
769        BufferString = buffer.String;
770        BufferStringPointer = buffer.StringPointer;
771        char* bufferBegin = buffer.StringPointer + buffer.Index;
772        try {
773            if (allCharsFitIntoOneBlock) {
774                int bufferCount = preambleLength == byteBufferCount
775                                  ? 0
776                                  : ReadAllRemainingCharsFromStream(bufferBegin, buffer.Length, byteBuffer, preambleLength, byteBufferCount, stream, streamPosition, decoder);
777                if (!leaveOpen) stream.Close();
778                var anchor = Anchor.Create(this);
779                this.anchor = anchor;
780                anchor->BlockSizeMinusOverlap = bufferCount;
781                anchor->EndIndex = bufferCount;
782                if (bufferCount != 0) {
783                    anchor->BufferBegin = bufferBegin;
784                    anchor->BufferEnd = bufferBegin + bufferCount;
785                } else {
786                    anchor->BufferBegin = null;
787                    anchor->BufferEnd = null;
788                }
789                anchor->Block = 0;
790                anchor->LastBlock = 0;
791                anchor->CharIndex = 0;
792                anchor->CharIndexOffset = 0;
793                anchor->CharIndexPlusOffset = 0;
794            } else {
795                if (!DoNotRoundUpBlockSizeToSimplifyTesting) blockSize = buffer.Length;
796                var d = new MultiBlockData();
797                Data = d;
798                d.Stream = stream;
799                d.StreamPosition = streamPosition;
800                d.LeaveOpen = leaveOpen;
801                d.Decoder = decoder;
802                d.ByteBuffer = byteBuffer;
803                d.ByteBufferIndex = preambleLength;
804                d.ByteBufferCount = byteBufferCount;
805                d.MaxCharCountForOneByte = Math.Max(1, Encoding.GetMaxCharCount(1));
806                d.SerializableDecoderMembers = GetSerializableDecoderMemberInfo(decoder);
807                if (blockSize < 3*d.MaxCharCountForOneByte) blockSize = 3*d.MaxCharCountForOneByte;
808                // MaxCharCountForOneByte == the maximum number of overhang chars
809                if(    Math.Min(blockOverlap, blockSize - 2*blockOverlap) < d.MaxCharCountForOneByte
810                    || blockOverlap >= blockSize/2) blockOverlap = blockSize/3;
811                if (minRegexSpace < 0 || minRegexSpace > blockOverlap) minRegexSpace = 2*blockOverlap/3;
812                d.BlockSize     = blockSize;
813                d.BlockOverlap  = blockOverlap;
814                d.RegexSpaceThreshold = bufferBegin + (blockSize - minRegexSpace);
815                var anchor = Anchor.Create(this);
816                this.anchor = anchor;
817                d.anchor = anchor;
818                anchor->BlockSizeMinusOverlap = blockSize - blockOverlap;
819                anchor->EndIndex = Int64.MaxValue;
820                anchor->BufferBegin = bufferBegin;
821                anchor->BufferEnd = bufferBegin;
822                anchor->Block = -2; // special value recognized by ReadBlock
823                anchor->LastBlock = Int32.MaxValue;
824                anchor->CharIndex = 0;
825                anchor->CharIndexOffset = 0;
826                anchor->CharIndexPlusOffset = 0;
827                d.Blocks = new List<BlockInfo>();
828                // the first block has no overlap with a previous block
829                d.Blocks.Add(new BlockInfo(preambleLength, preambleLength, 0, EOS, null, new DecoderState(), null, new Decod

Large files files are truncated, but you can click here to view the full file