/NBoilerpipePortable/Util/SGML/SgmlParser.cs
C# | 3222 lines | 2856 code | 121 blank | 245 comment | 356 complexity | 8cdcc85c0288b0626b85742f57e3d3e3 MD5 | raw file
Large files are truncated, but you can click here to view the full file
- /*
- *
- * Copyright (c) 2007-2013 MindTouch. All rights reserved.
- * www.mindtouch.com oss@mindtouch.com
- *
- * For community documentation and downloads visit wiki.developer.mindtouch.com;
- * please review the licensing section.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
- using System;
- using System.Collections;
- using System.Collections.Generic;
- using System.Diagnostics.CodeAnalysis;
- using System.Globalization;
- using System.IO;
- using System.Net;
- using System.Runtime.Serialization;
- using System.Text;
- using System.Xml;
- namespace Sgml {
/// <summary>
/// The exception raised when the SGML source cannot be parsed.
/// </summary>
public class SgmlParseException : Exception
{
    // Context string captured from the entity passed to the (message, Entity) constructor; null otherwise.
    private string m_entityContext;

    /// <summary>
    /// Creates an exception that carries no error details.
    /// </summary>
    public SgmlParseException()
        : base()
    {
    }

    /// <summary>
    /// Creates an exception with a message describing the failure.
    /// </summary>
    /// <param name="message">Text describing what went wrong.</param>
    public SgmlParseException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Creates an exception with a message and records the context of the entity being processed.
    /// </summary>
    /// <param name="message">Text describing what went wrong.</param>
    /// <param name="e">The entity being processed when the failure happened; may be null.</param>
    public SgmlParseException(string message, Entity e)
        : base(message)
    {
        if (e == null)
        {
            return;
        }

        this.m_entityContext = e.Context();
    }

    /// <summary>
    /// Creates an exception with a message that wraps the exception which triggered it.
    /// </summary>
    /// <param name="message">Text describing what went wrong.</param>
    /// <param name="innerException">The underlying exception.</param>
    public SgmlParseException(string message, Exception innerException)
        : base(message, innerException)
    {
    }

    /// <summary>
    /// The entity context captured at construction time, or null when no entity was supplied.
    /// </summary>
    public string EntityContext
    {
        get { return this.m_entityContext; }
    }
}
/// <summary>
/// The kinds of literal text the SgmlParser can hand back.
/// </summary>
public enum LiteralType
{
    /// <summary>
    /// A CDATA text literal.
    /// </summary>
    [SuppressMessage("Microsoft.Naming", "CA1705", Justification = "This capitalisation is appropriate since the value it represents has all upper-case capitalisation.")]
    CDATA,

    /// <summary>
    /// An SDATA entity.
    /// </summary>
    [SuppressMessage("Microsoft.Naming", "CA1705", Justification = "This capitalisation is appropriate since the value it represents has all upper-case capitalisation.")]
    SDATA,

    /// <summary>
    /// The body of a Processing Instruction.
    /// </summary>
    [SuppressMessage("Microsoft.Naming", "CA1705", Justification = "This capitalisation is appropriate since the value it represents has all upper-case capitalisation.")]
    PI
}
/// <summary>
/// An Entity declared in a DTD. An entity is either internal (its text comes from a
/// literal or a caller-supplied <see cref="TextReader"/>) or external (identified by a
/// public id / uri). It also acts as the character source for the parser, tracking
/// line and column while characters are read.
/// </summary>
public class Entity : IDisposable
{
    /// <summary>
    /// The character indicating End Of File.
    /// </summary>
    [SuppressMessage("Microsoft.Naming", "CA1705", Justification = "The capitalisation is correct since EOF is an acronym.")]
    public const char EOF = (char)65535;

    // Text substituted for a numeric character reference that is not a valid Unicode scalar value.
    private const string ReplacementCharacter = "\uFFFD";

    // Largest valid Unicode code point.
    private const int MaxCodePoint = 0x10FFFF;

    // This is the windows-1252 mapping of the code points 0x80 through 0x9f.
    private static readonly int[] CtrlMap = new int[] {
        8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141,
        381, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250,
        339, 157, 382, 376
    };

    private string m_proxy;
    private string m_name;
    private bool m_isInternal;
    private string m_publicId;
    private string m_uri;
    private string m_literal;
    private LiteralType m_literalType;
    private Entity m_parent;
    private bool m_isHtml;
    private int m_line;
    private char m_lastchar;
    private bool m_isWhitespace;
    private Encoding m_encoding;
    private Uri m_resolvedUri;
    private TextReader m_stm;
    private bool m_weOwnTheStream;
    private int m_lineStart;
    private int m_absolutePos;

    /// <summary>
    /// Initialises a new instance of an external Entity declared in a DTD.
    /// </summary>
    /// <param name="name">The name of the entity.</param>
    /// <param name="pubid">The public id of the entity.</param>
    /// <param name="uri">The uri of the entity.</param>
    /// <param name="proxy">The proxy server to use when retrieving any web content.</param>
    public Entity(string name, string pubid, string uri, string proxy)
    {
        m_name = name;
        m_publicId = pubid;
        m_uri = uri;
        m_proxy = proxy;
        m_isHtml = (name != null && StringUtilities.EqualsIgnoreCase(name, "html"));
    }

    /// <summary>
    /// Initialises a new instance of an internal Entity whose content is a literal string.
    /// </summary>
    /// <param name="name">The name of the entity.</param>
    /// <param name="literal">The literal value of the entity.</param>
    public Entity(string name, string literal)
    {
        m_name = name;
        m_literal = literal;
        m_isInternal = true;
    }

    /// <summary>
    /// Initialises a new instance of an internal Entity that reads from a supplied reader.
    /// </summary>
    /// <param name="name">The name of the entity.</param>
    /// <param name="baseUri">The baseUri for the entity to read from the TextReader.</param>
    /// <param name="stm">The TextReader to read the entity from.</param>
    /// <param name="proxy">The proxy server to use when retrieving any web content.</param>
    public Entity(string name, Uri baseUri, TextReader stm, string proxy)
    {
        m_name = name;
        m_isInternal = true;
        m_stm = stm;
        m_resolvedUri = baseUri;
        m_proxy = proxy;
        m_isHtml = string.Equals(name, "html", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// The name of the entity.
    /// </summary>
    public string Name
    {
        get { return m_name; }
    }

    /// <summary>
    /// True if the entity is the html element entity.
    /// </summary>
    public bool IsHtml
    {
        get { return m_isHtml; }
        set { m_isHtml = value; }
    }

    /// <summary>
    /// The public identifier of this entity.
    /// </summary>
    public string PublicId
    {
        get { return m_publicId; }
    }

    /// <summary>
    /// The Uri that is the source for this entity.
    /// </summary>
    public string Uri
    {
        get { return m_uri; }
    }

    /// <summary>
    /// The resolved location of the DTD this entity is from. Falls back to the parent's
    /// resolved location; null when neither this entity nor any ancestor has one.
    /// </summary>
    public Uri ResolvedUri
    {
        get
        {
            if (this.m_resolvedUri != null)
                return this.m_resolvedUri;
            return (m_parent != null) ? m_parent.ResolvedUri : null;
        }
    }

    /// <summary>
    /// Gets the parent Entity of this Entity.
    /// </summary>
    public Entity Parent
    {
        get { return m_parent; }
    }

    /// <summary>
    /// The last character read from the input stream for this entity.
    /// </summary>
    public char Lastchar
    {
        get { return m_lastchar; }
    }

    /// <summary>
    /// The current line number within this entity (starts at 1 once <see cref="Open"/> has run).
    /// </summary>
    public int Line
    {
        get { return m_line; }
    }

    /// <summary>
    /// The position within the current line, derived from the absolute read position.
    /// </summary>
    public int LinePosition
    {
        get { return this.m_absolutePos - this.m_lineStart + 1; }
    }

    /// <summary>
    /// Whether this entity is an internal entity or not.
    /// </summary>
    /// <value>true if this entity is internal, otherwise false.</value>
    public bool IsInternal
    {
        get { return m_isInternal; }
    }

    /// <summary>
    /// The literal value of this entity.
    /// </summary>
    public string Literal
    {
        get { return m_literal; }
    }

    /// <summary>
    /// The <see cref="LiteralType"/> of this entity.
    /// </summary>
    public LiteralType LiteralType
    {
        get { return m_literalType; }
    }

    /// <summary>
    /// Whether the last char read for this entity is a whitespace character.
    /// </summary>
    public bool IsWhitespace
    {
        get { return m_isWhitespace; }
    }

    /// <summary>
    /// The proxy server to use when making web requests to resolve entities.
    /// </summary>
    public string Proxy
    {
        get { return m_proxy; }
    }

    /// <summary>
    /// Gets the character encoding for this entity, or null when none has been recorded.
    /// </summary>
    public Encoding Encoding
    {
        get { return this.m_encoding; }
    }

    /// <summary>
    /// Reads the next character from the underlying reader, updating line/column tracking
    /// and the whitespace flag.
    /// </summary>
    /// <returns>The next character, or <see cref="EOF"/> at end of input. NUL is returned as a space.</returns>
    public char ReadChar()
    {
        // TextReader.Read() yields -1 at end of input, which the cast turns into EOF (0xFFFF).
        char ch = (char)this.m_stm.Read();
        if (ch == 0)
        {
            // convert nulls to whitespace, since they are not valid in XML anyway.
            ch = ' ';
        }
        this.m_absolutePos++;

        bool afterCarriageReturn = (m_lastchar == 0xd);
        if (ch == 0xa)
        {
            // LF (alone or closing a CR LF pair) starts a new line.
            m_isWhitespace = true;
            this.m_lineStart = this.m_absolutePos + 1;
            this.m_line++;
        }
        else if (ch == 0xd)
        {
            // A CR is only counted as a line break once the following character is seen.
            m_isWhitespace = true;
        }
        else
        {
            m_isWhitespace = (ch == ' ' || ch == '\t');
            if (afterCarriageReturn)
            {
                // The previous CR was a bare line terminator.
                this.m_lineStart = this.m_absolutePos;
                m_line++;
            }
        }
        m_lastchar = ch;
        return ch;
    }

    /// <summary>
    /// Begins processing an entity.
    /// </summary>
    /// <param name="parent">The parent of this entity.</param>
    /// <param name="baseUri">The base Uri for processing this entity within.</param>
    /// <exception cref="SgmlParseException">The entity is external and has no uri.</exception>
    /// <exception cref="NotImplementedException">The entity is external; loading external content is not implemented.</exception>
    public void Open(Entity parent, Uri baseUri)
    {
        this.m_parent = parent;
        if (parent != null)
            this.m_isHtml = parent.IsHtml;
        this.m_line = 1;

        if (m_isInternal)
        {
            // A reader supplied through the constructor is kept when there is no literal.
            if (this.m_literal != null)
                this.m_stm = new StringReader(this.m_literal);
            return;
        }

        if (this.m_uri == null)
        {
            this.Error("Unresolvable entity '{0}'", this.m_name);
            return;
        }

        this.m_resolvedUri = (baseUri != null)
            ? new Uri(baseUri, this.m_uri)
            : new Uri(this.m_uri);

        // TODO: fetch the content at m_resolvedUri, wrap it in an HtmlStream, record its
        // Encoding in m_encoding, store it in m_stm and set m_weOwnTheStream = true.
        // The statements that used to follow this throw were unreachable and have been removed.
        throw new NotImplementedException("Loading external entities is not implemented.");
    }

    /// <summary>
    /// Closes the reader from which the entity is being read, but only when this entity
    /// owns it.
    /// </summary>
    public void Close()
    {
        // m_stm may be null (never opened, or already disposed).
        if (this.m_weOwnTheStream && this.m_stm != null)
            this.m_stm.Dispose();
    }

    /// <summary>
    /// Returns the next character after any whitespace, starting from the last character read.
    /// </summary>
    /// <returns>The next character that is not whitespace.</returns>
    public char SkipWhitespace()
    {
        char ch = m_lastchar;
        while (ch != Entity.EOF && (ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t'))
        {
            ch = ReadChar();
        }
        return ch;
    }

    /// <summary>
    /// Scans a token from the input stream and returns the result.
    /// </summary>
    /// <param name="sb">The <see cref="StringBuilder"/> to use to process the token.</param>
    /// <param name="term">A set of characters to look for as terminators for the token.</param>
    /// <param name="nmtoken">true if the token should be a NMToken, otherwise false.</param>
    /// <returns>The scanned token.</returns>
    /// <exception cref="SgmlParseException">A name token contains a character that is not allowed.</exception>
    public string ScanToken(StringBuilder sb, string term, bool nmtoken)
    {
        if (sb == null)
            throw new ArgumentNullException("sb");
        if (term == null)
            throw new ArgumentNullException("term");

        sb.Length = 0;
        char ch = m_lastchar;
        if (nmtoken && ch != '_' && !char.IsLetter(ch))
        {
            throw new SgmlParseException(string.Format(CultureInfo.CurrentUICulture, "Invalid name start character '{0}'", ch));
        }
        while (ch != Entity.EOF && term.IndexOf(ch) < 0)
        {
            bool allowed = !nmtoken || ch == '_' || ch == '.' || ch == '-' || ch == ':' || char.IsLetterOrDigit(ch);
            if (!allowed)
            {
                throw new SgmlParseException(
                    string.Format(CultureInfo.CurrentUICulture, "Invalid name character '{0}'", ch));
            }
            sb.Append(ch);
            ch = ReadChar();
        }
        return sb.ToString();
    }

    /// <summary>
    /// Read a literal from the input stream. Numeric character references are expanded;
    /// any other '&amp;' sequence is copied through unchanged.
    /// </summary>
    /// <param name="sb">The <see cref="StringBuilder"/> to use to build the literal.</param>
    /// <param name="quote">The delimiter for the literal.</param>
    /// <returns>The literal scanned from the input stream.</returns>
    public string ScanLiteral(StringBuilder sb, char quote)
    {
        if (sb == null)
            throw new ArgumentNullException("sb");

        sb.Length = 0;
        char ch = ReadChar();
        while (ch != Entity.EOF && ch != quote)
        {
            if (ch != '&')
            {
                sb.Append(ch);
                ch = ReadChar();
                continue;
            }

            ch = ReadChar();
            if (ch == '#')
            {
                sb.Append(ExpandCharEntity());
                // ExpandCharEntity has already read one character past the reference.
                ch = this.m_lastchar;
            }
            else
            {
                sb.Append('&');
                sb.Append(ch);
                ch = ReadChar();
            }
        }
        ReadChar(); // consume end quote.
        return sb.ToString();
    }

    /// <summary>
    /// Reads input until the end of the input stream or until a string of terminator characters is found.
    /// </summary>
    /// <param name="sb">The <see cref="StringBuilder"/> to use to build the string; may be null to discard the text.</param>
    /// <param name="type">The type of the element being read (only used in reporting errors).</param>
    /// <param name="terminators">The string of terminator characters to look for.</param>
    /// <returns>The string read from the input stream (without the terminator), or an empty string when <paramref name="sb"/> is null.</returns>
    public string ScanToEnd(StringBuilder sb, string type, string terminators)
    {
        if (terminators == null)
            throw new ArgumentNullException("terminators");
        if (sb != null)
            sb.Length = 0;
        int start = m_line;

        // 'state' is the number of leading terminator characters matched so far.
        char ch = ReadChar();
        int state = 0;
        char next = terminators[state];
        while (ch != Entity.EOF)
        {
            if (ch == next)
            {
                state++;
                if (state >= terminators.Length)
                {
                    // found it!
                    break;
                }
                next = terminators[state];
            }
            else if (state > 0)
            {
                // char didn't match, so go back and see how much does still match:
                // find the longest terminator prefix that is a suffix of what we have seen.
                int i = state - 1;
                int newstate = 0;
                while (i >= 0 && newstate == 0)
                {
                    if (terminators[i] == ch)
                    {
                        // character is part of the terminators pattern, ok, so see if we can
                        // match all the way back to the beginning of the pattern.
                        int j = 1;
                        while (i - j >= 0)
                        {
                            if (terminators[i - j] != terminators[state - j])
                                break;
                            j++;
                        }
                        if (j > i)
                        {
                            newstate = i + 1;
                        }
                        else
                        {
                            // This prefix length does not fit; try a shorter one. Without
                            // this step the loop never terminated for patterns such as
                            // "aba" followed by input "abb".
                            i--;
                        }
                    }
                    else
                    {
                        i--;
                    }
                }
                if (sb != null)
                {
                    i = (i < 0) ? 1 : 0;
                    for (int k = 0; k <= state - newstate - i; k++)
                    {
                        sb.Append(terminators[k]);
                    }
                    if (i > 0) // see if we've matched this char or not
                        sb.Append(ch); // if not then append it to buffer.
                }
                state = newstate;
                next = terminators[newstate];
            }
            else if (sb != null)
            {
                sb.Append(ch);
            }
            ch = ReadChar();
        }

        // NOTE(review): ReadChar never returns 0 (NUL is mapped to a space), so this check
        // cannot fire and an unterminated construct is silently accepted at end of input.
        // Left unchanged because callers may rely on that lenient behaviour; confirm
        // before changing the comparison to Entity.EOF.
        if (ch == 0)
            Error(type + " starting on line {0} was never closed", start);

        ReadChar(); // consume last char in termination sequence.
        return (sb != null) ? sb.ToString() : string.Empty;
    }

    /// <summary>
    /// Expands a numeric character reference read from the input stream. The leading
    /// "&amp;#" must already have been consumed.
    /// </summary>
    /// <returns>
    /// The referenced character(s); the raw text consumed when no digits followed "&amp;#";
    /// or U+FFFD when the number is not a valid Unicode scalar value.
    /// </returns>
    /// <exception cref="SgmlParseException">A high surrogate reference is not followed by another character reference.</exception>
    public string ExpandCharEntity()
    {
        string value;
        int v = ReadNumericEntityCode(out value);
        if (v == -1)
        {
            return value;
        }

        // HACK ALERT: IE and Netscape map the unicode characters
        if (this.m_isHtml && v >= 0x80 && v <= 0x9F)
        {
            // This range of control characters is mapped to Windows-1252!
            return Convert.ToChar(CtrlMap[v - 0x80]).ToString();
        }

        if (0xD800 <= v && v <= 0xDBFF)
        {
            // high surrogate: expect a second reference carrying the low surrogate.
            if (m_lastchar != '&')
            {
                Error("Premature {0} parsing surrogate pair", m_lastchar);
            }
            else
            {
                char ch = ReadChar();
                if (ch != '#')
                {
                    Error("Premature {0} parsing surrogate pair", ch);
                }
                else
                {
                    string value2;
                    int v2 = ReadNumericEntityCode(out value2);
                    if (v2 == -1)
                    {
                        return value + ";" + value2;
                    }
                    if (0xDC00 <= v2 && v2 <= 0xDFFF)
                    {
                        // low surrogate
                        v = char.ConvertToUtf32((char)v, (char)v2);
                    }
                    else
                    {
                        // Unpaired high surrogate: replace it and keep the second reference.
                        return ReplacementCharacter + CodePointToString(v2);
                    }
                }
            }
        }

        // NOTE (steveb): we need to use ConvertFromUtf32 to allow for extended numeric encodings
        return CodePointToString(v);
    }

    // Converts a code point to a string, substituting U+FFFD for surrogates and out-of-range
    // values, which char.ConvertFromUtf32 would reject with ArgumentOutOfRangeException.
    private static string CodePointToString(int codePoint)
    {
        bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        if (codePoint < 0 || codePoint > MaxCodePoint || isSurrogate)
        {
            return ReplacementCharacter;
        }
        return char.ConvertFromUtf32(codePoint);
    }

    // Appends one digit to a running value, saturating just above the Unicode range so that
    // very long digit runs cannot overflow int (or wrap around to the -1 sentinel).
    private static int AppendDigit(int current, int radix, int digit)
    {
        long next = ((long)current * radix) + digit;
        return (next > MaxCodePoint) ? MaxCodePoint + 1 : (int)next;
    }

    // Reads the numeric part of a character reference (decimal, or hex after 'x'/'X'),
    // consuming a trailing ';' if present. 'value' receives the raw text consumed.
    // Returns -1 when no digit was found.
    private int ReadNumericEntityCode(out string value)
    {
        int v = 0;
        bool sawDigit = false;
        char ch = ReadChar();
        value = "&#";
        if (ch == 'x' || ch == 'X')
        {
            value += ch;
            for (ch = ReadChar(); ch != Entity.EOF && ch != ';'; ch = ReadChar())
            {
                int p;
                if (ch >= '0' && ch <= '9')
                {
                    p = ch - '0';
                }
                else if (ch >= 'a' && ch <= 'f')
                {
                    p = (ch - 'a') + 10;
                }
                else if (ch >= 'A' && ch <= 'F')
                {
                    p = (ch - 'A') + 10;
                }
                else
                {
                    break; //we must be done!
                }
                sawDigit = true;
                value += ch;
                v = AppendDigit(v, 16, p);
            }
        }
        else
        {
            for (; ch != Entity.EOF && ch != ';'; ch = ReadChar())
            {
                if (ch < '0' || ch > '9')
                {
                    break; // we must be done!
                }
                sawDigit = true;
                value += ch;
                v = AppendDigit(v, 10, ch - '0');
            }
        }
        if (!sawDigit)
        {
            return -1;
        }

        // NOTE(review): ReadChar never returns 0, so this branch cannot fire; a reference
        // cut short by end of input is accepted. Confirm before changing it to Entity.EOF.
        if (ch == 0)
        {
            Error("Premature {0} parsing entity reference", ch);
        }
        else if (ch == ';')
        {
            ReadChar();
        }
        return v;
    }

    /// <summary>
    /// Raise a processing error.
    /// </summary>
    /// <param name="msg">The error message to use in the exception.</param>
    /// <exception cref="SgmlParseException">Always thrown.</exception>
    public void Error(string msg)
    {
        throw new SgmlParseException(msg, this);
    }

    /// <summary>
    /// Raise a processing error.
    /// </summary>
    /// <param name="msg">The error message (a format string) to use in the exception.</param>
    /// <param name="ch">The unexpected character causing the error.</param>
    /// <exception cref="SgmlParseException">Always thrown.</exception>
    public void Error(string msg, char ch)
    {
        string str = (ch == Entity.EOF) ? "EOF" : char.ToString(ch);
        throw new SgmlParseException(string.Format(CultureInfo.CurrentUICulture, msg, str), this);
    }

    /// <summary>
    /// Raise a processing error.
    /// </summary>
    /// <param name="msg">The error message (a format string) to use in the exception.</param>
    /// <param name="x">The value causing the error.</param>
    /// <exception cref="SgmlParseException">Always thrown.</exception>
    public void Error(string msg, int x)
    {
        throw new SgmlParseException(string.Format(CultureInfo.CurrentUICulture, msg, x), this);
    }

    /// <summary>
    /// Raise a processing error.
    /// </summary>
    /// <param name="msg">The error message (a format string) to use in the exception.</param>
    /// <param name="arg">The argument for the error.</param>
    /// <exception cref="SgmlParseException">Always thrown.</exception>
    public void Error(string msg, string arg)
    {
        throw new SgmlParseException(string.Format(CultureInfo.CurrentUICulture, msg, arg), this);
    }

    /// <summary>
    /// Returns a string giving information on how the entity is referenced and declared, walking up the parents until the top level parent entity is found.
    /// </summary>
    /// <returns>Contextual information for the entity.</returns>
    public string Context()
    {
        StringBuilder sb = new StringBuilder();
        for (Entity p = this; p != null; p = p.Parent)
        {
            if (p.m_isInternal)
            {
                sb.Append(string.Format(CultureInfo.InvariantCulture, "\nReferenced on line {0}, position {1} of internal entity '{2}'", p.m_line, p.LinePosition, p.m_name));
                continue;
            }

            // This method runs while an error is being raised, so it must not throw itself:
            // the resolved uri is null for an unresolvable entity and may be relative.
            Uri resolved = p.ResolvedUri;
            string location;
            if (resolved == null)
            {
                location = p.m_uri ?? string.Empty;
            }
            else if (resolved.IsAbsoluteUri)
            {
                location = resolved.AbsolutePath;
            }
            else
            {
                location = resolved.OriginalString;
            }
            sb.Append(string.Format(CultureInfo.InvariantCulture, "\nReferenced on line {0}, position {1} of '{2}' entity at [{3}]", p.m_line, p.LinePosition, p.m_name, location));
        }
        return sb.ToString();
    }

    /// <summary>
    /// Checks whether a token denotes a literal entity or not.
    /// </summary>
    /// <param name="token">The token to check.</param>
    /// <returns>true if the token is "CDATA", "SDATA" or "PI" (ignoring case), otherwise false.</returns>
    public static bool IsLiteralType(string token)
    {
        return string.Equals(token, "CDATA", StringComparison.OrdinalIgnoreCase) ||
               string.Equals(token, "SDATA", StringComparison.OrdinalIgnoreCase) ||
               string.Equals(token, "PI", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Sets the entity to be a literal of the type specified. The comparison ignores case,
    /// matching <see cref="IsLiteralType"/>; any other token leaves the type unchanged.
    /// </summary>
    /// <param name="token">One of "CDATA", "SDATA" or "PI".</param>
    public void SetLiteralType(string token)
    {
        if (string.Equals(token, "CDATA", StringComparison.OrdinalIgnoreCase))
        {
            this.m_literalType = LiteralType.CDATA;
        }
        else if (string.Equals(token, "SDATA", StringComparison.OrdinalIgnoreCase))
        {
            this.m_literalType = LiteralType.SDATA;
        }
        else if (string.Equals(token, "PI", StringComparison.OrdinalIgnoreCase))
        {
            this.m_literalType = LiteralType.PI;
        }
    }

    #region IDisposable Members
    /// <summary>
    /// The finalizer for the Entity class.
    /// </summary>
    ~Entity()
    {
        Dispose(false);
    }

    /// <summary>
    /// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources.
    /// </summary>
    public void Dispose()
    {
        Dispose(true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Disposes the underlying reader when called from user code.
    /// </summary>
    /// <param name="isDisposing">true if this method has been called by user code, false if it has been called through a finalizer.</param>
    protected virtual void Dispose(bool isDisposing)
    {
        if (!isDisposing || m_stm == null)
        {
            return;
        }
        m_stm.Dispose();
        m_stm = null;
    }
    #endregion
}
/// <summary>
/// A <see cref="TextReader"/> over a byte stream that picks its decoder from a byte order
/// mark, an XML declaration or an HTML META tag, falling back to a default encoding.
/// </summary>
internal class HtmlStream : TextReader
{
    private const int BUFSIZE = 16384;
    private const int EOF = -1;

    private Stream stm;
    private byte[] rawBuffer;   // undecoded bytes read from stm
    private int rawPos;         // next undecoded byte in rawBuffer
    private int rawUsed;        // number of valid bytes in rawBuffer
    private Encoding m_encoding;
    private Decoder m_decoder;
    private char[] m_buffer;    // decoded characters
    private int used;           // number of valid chars in m_buffer
    private int pos;            // next unread char in m_buffer

    /// <summary>
    /// Wraps <paramref name="stm"/>, sniffing its encoding from the first block of data.
    /// A non-seekable stream is first copied to memory (and disposed).
    /// </summary>
    /// <param name="stm">The byte stream to decode.</param>
    /// <param name="defaultEncoding">Encoding used when nothing can be detected; UTF-8 when null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="stm"/> is null.</exception>
    public HtmlStream(Stream stm, Encoding defaultEncoding)
    {
        if (stm == null) throw new ArgumentNullException("stm");
        if (defaultEncoding == null) defaultEncoding = Encoding.UTF8; // default is UTF8
        if (!stm.CanSeek)
        {
            // Need to be able to seek to sniff correctly.
            stm = CopyToMemoryStream(stm);
        }
        this.stm = stm;
        rawBuffer = new Byte[BUFSIZE];
        rawUsed = stm.Read(rawBuffer, 0, 4); // maximum byte order mark
        this.m_buffer = new char[BUFSIZE];

        // Check byte order marks
        this.m_decoder = AutoDetectEncoding(rawBuffer, ref rawPos, rawUsed);
        int bom = rawPos;
        if (this.m_decoder == null)
        {
            this.m_decoder = defaultEncoding.GetDecoder();
            // Continue right after the bytes already read (normally offset 4).
            rawUsed += stm.Read(rawBuffer, rawUsed, BUFSIZE - rawUsed);
            DecodeBlock();
            // Now sniff to see if there is an XML declaration or HTML <META> tag.
            Decoder sd = SniffEncoding();
            if (sd != null)
            {
                this.m_decoder = sd;
            }
            else
            {
                // The default decoder has just decoded the sniff block and may hold a
                // partial character; clear it before the stream is decoded again from the start.
                this.m_decoder.Reset();
            }
        }

        // Reset to get ready for Read()
        this.stm.Seek(0, SeekOrigin.Begin);
        this.pos = this.used = 0;
        // skip bom
        if (bom > 0)
        {
            stm.Read(this.rawBuffer, 0, bom);
        }
        this.rawPos = this.rawUsed = 0;
    }

    /// <summary>
    /// The encoding named by a sniffed XML declaration or META tag; null when the decoder
    /// came from a byte order mark or from the default encoding.
    /// </summary>
    public Encoding Encoding
    {
        get { return this.m_encoding; }
    }

    // Copies the whole of 's' into a MemoryStream positioned at 0 and disposes 's'.
    private static Stream CopyToMemoryStream(Stream s)
    {
        int size = 100000; // large heap is more efficient
        byte[] copyBuff = new byte[size];
        int len;
        MemoryStream r = new MemoryStream();
        while ((len = s.Read(copyBuff, 0, size)) > 0)
            r.Write(copyBuff, 0, len);
        r.Seek(0, SeekOrigin.Begin);
        s.Dispose();
        return r;
    }

    // Decodes rawBuffer[rawPos..rawUsed) and appends the characters after any unread ones.
    internal void DecodeBlock()
    {
        // shift current chars to beginning.
        if (pos > 0)
        {
            if (pos < used)
            {
                System.Array.Copy(m_buffer, pos, m_buffer, 0, used - pos);
            }
            used -= pos;
            pos = 0;
        }

        int rawCount = rawUsed - rawPos;
        int len = m_decoder.GetCharCount(rawBuffer, rawPos, rawCount);
        if (m_buffer.Length - used < len)
        {
            char[] newbuf = new char[m_buffer.Length + len];
            System.Array.Copy(m_buffer, 0, newbuf, 0, used);
            m_buffer = newbuf;
        }

        // Write after the unread characters. Writing at 'pos' (always 0 here) would
        // overwrite the characters that were just shifted to the front.
        used += m_decoder.GetChars(rawBuffer, rawPos, rawCount, m_buffer, used);
        rawPos = rawUsed; // consumed the whole buffer!
    }

    // Picks a decoder from the first four bytes: a byte order mark, or the byte pattern of a
    // leading '<'. 'index' is advanced past a byte order mark only, never past a '<',
    // because the caller skips that many bytes before decoding.
    // Returns null when nothing is recognised or fewer than four bytes are available.
    internal static Decoder AutoDetectEncoding(byte[] buffer, ref int index, int length)
    {
        if (4 <= (length - index))
        {
            uint w = (uint)buffer[index + 0] << 24 | (uint)buffer[index + 1] << 16 | (uint)buffer[index + 2] << 8 | (uint)buffer[index + 3];
            // see if it's a 4-byte encoding
            // NOTE(review): these two marks are not the standard UCS-4 byte order marks
            // (00 00 FE FF / FF FE 00 00); kept as found - confirm the intent.
            switch (w)
            {
                case 0xfefffeff:
                    index += 4;
                    return new Ucs4DecoderBigEngian();
                case 0xfffefffe:
                    index += 4;
                    return new Ucs4DecoderLittleEndian();
                case 0x3c000000:
                    // '<' with no byte order mark: same decoder, nothing to skip.
                    return new Ucs4DecoderBigEngian();
                case 0x0000003c:
                    return new Ucs4DecoderLittleEndian();
            }
            w >>= 8;
            if (w == 0xefbbbf)
            {
                index += 3;
                return Encoding.UTF8.GetDecoder();
            }
            w >>= 8;
            switch (w)
            {
                case 0xfeff:
                    index += 2;
                    return Encoding.BigEndianUnicode.GetDecoder();
                case 0xfffe:
                    index += 2;
                    return new UnicodeEncoding(false, false).GetDecoder();
                case 0x3c00:
                    // bytes 3C 00: '<' in little-endian UTF-16, no byte order mark to skip.
                    return new UnicodeEncoding(false, false).GetDecoder();
                case 0x003c:
                    // bytes 00 3C: '<' in big-endian UTF-16, no byte order mark to skip.
                    return Encoding.BigEndianUnicode.GetDecoder();
            }
        }
        return null;
    }

    // Returns the next decoded char, or EOF at the end of the current buffer (never refills).
    private int ReadChar()
    {
        // Read only up to end of current buffer then stop.
        if (pos < used) return m_buffer[pos++];
        return EOF;
    }

    private int PeekChar()
    {
        int ch = ReadChar();
        if (ch != EOF)
        {
            pos--;
        }
        return ch;
    }

    // Consumes 'pattern' if the buffer continues with it. Characters consumed before a
    // mismatch are not restored.
    private bool SniffPattern(string pattern)
    {
        int ch = PeekChar();
        if (ch != pattern[0]) return false;
        for (int i = 0, n = pattern.Length; ch != EOF && i < n; i++)
        {
            ch = ReadChar();
            char m = pattern[i];
            if (ch != m)
            {
                return false;
            }
        }
        return true;
    }

    // Advances past spaces, tabs, CR and LF.
    private void SniffWhitespace()
    {
        char ch = (char)PeekChar();
        while (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n')
        {
            int i = pos;
            ch = (char)ReadChar();
            if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n')
                pos = i; // un-read the first non-whitespace character
        }
    }

    // Reads a quoted literal and returns its content, or null when the next char is not a quote.
    private string SniffLiteral()
    {
        int quoteChar = PeekChar();
        if (quoteChar == '\'' || quoteChar == '"')
        {
            ReadChar();// consume quote char
            int i = this.pos;
            int ch = ReadChar();
            while (ch != EOF && ch != quoteChar)
            {
                ch = ReadChar();
            }
            return (pos > i) ? new string(m_buffer, i, pos - i - 1) : "";
        }
        return null;
    }

    // Reads one attribute and returns its quoted value when its name equals 'name'
    // (ignoring case); otherwise null.
    private string SniffAttribute(string name)
    {
        SniffWhitespace();
        string id = SniffName();
        if (string.Equals(name, id, StringComparison.OrdinalIgnoreCase))
        {
            SniffWhitespace();
            if (SniffPattern("="))
            {
                SniffWhitespace();
                return SniffLiteral();
            }
        }
        return null;
    }

    // Reads one attribute; 'name' receives its name (null when none) and the quoted value,
    // if any, is returned.
    private string SniffAttribute(out string name)
    {
        SniffWhitespace();
        name = SniffName();
        if (name != null)
        {
            SniffWhitespace();
            if (SniffPattern("="))
            {
                SniffWhitespace();
                return SniffLiteral();
            }
        }
        return null;
    }

    // Consumes input up to and including 'term', or to the end of the buffer.
    private void SniffTerminator(string term)
    {
        int ch = ReadChar();
        int i = 0;
        int n = term.Length;
        while (i < n && ch != EOF)
        {
            if (term[i] == ch)
            {
                i++;
                if (i == n) break;
            }
            else
            {
                i = 0; // reset.
            }
            ch = ReadChar();
        }
    }

    // Returns a decoder for 'charsetName' and records the encoding, or null when the name
    // is null or not a known encoding (sniffing is best effort, so failure is not an error).
    private Decoder TryCreateDecoder(string charsetName)
    {
        if (charsetName == null)
        {
            return null;
        }
        try
        {
            Encoding enc = Encoding.GetEncoding(charsetName);
            if (enc != null)
            {
                this.m_encoding = enc;
                return enc.GetDecoder();
            }
        }
        catch (ArgumentException)
        {
            // oh well then.
        }
        return null;
    }

    // Looks for an encoding in a leading XML declaration, then in META tags.
    // Returns null when no usable encoding name is found.
    internal Decoder SniffEncoding()
    {
        if (SniffPattern("<?xml"))
        {
            string version = SniffAttribute("version");
            if (version != null)
            {
                Decoder declared = TryCreateDecoder(SniffAttribute("encoding"));
                if (declared != null)
                {
                    return declared;
                }
                SniffTerminator(">");
            }
        }
        return SniffMeta();
    }

    // Scans the buffered text for a META tag naming a charset: either
    // http-equiv="content-type" with content="...; charset=NAME", or charset="NAME".
    // Attribute values must be quoted to be recognised.
    internal Decoder SniffMeta()
    {
        for (int i = ReadChar(); i != EOF; i = ReadChar())
        {
            if ((char)i != '<')
            {
                continue;
            }
            string name = SniffName();
            if (name == null || !StringUtilities.EqualsIgnoreCase(name, "meta"))
            {
                continue;
            }

            string httpequiv = null;
            string content = null;
            string charset = null;
            while (true)
            {
                string value = SniffAttribute(out name);
                if (name == null)
                    break;
                if (StringUtilities.EqualsIgnoreCase(name, "http-equiv"))
                {
                    httpequiv = value;
                }
                else if (StringUtilities.EqualsIgnoreCase(name, "content"))
                {
                    content = value;
                }
                else if (StringUtilities.EqualsIgnoreCase(name, "charset"))
                {
                    charset = value;
                }
            }

            if (charset == null && content != null && httpequiv != null
                && StringUtilities.EqualsIgnoreCase(httpequiv, "content-type"))
            {
                charset = ExtractCharset(content);
            }

            Decoder found = TryCreateDecoder(charset);
            if (found != null)
            {
                return found;
            }
        }
        return null;
    }

    // Extracts NAME from "...charset=NAME[;...]". Returns null when there is no charset
    // parameter. The parameter name is matched ordinally, ignoring case.
    private static string ExtractCharset(string content)
    {
        int j = content.IndexOf("charset", StringComparison.OrdinalIgnoreCase);
        if (j < 0)
        {
            return null;
        }
        //charset=utf-8
        j = content.IndexOf('=', j);
        if (j < 0)
        {
            return null;
        }
        j++;
        int k = content.IndexOf(';', j);
        if (k < 0) k = content.Length;
        return content.Substring(j, k - j).Trim();
    }

    // Reads a run of letters, digits, '-', '_' and ':'. Returns null at end of buffer or
    // when no such character is present. The last buffered character is never consumed.
    internal string SniffName()
    {
        int c = PeekChar();
        if (c == EOF)
            return null;
        char ch = (char)c;
        int start = pos;
        while (pos < used - 1 && (char.IsLetterOrDigit(ch) || ch == '-' || ch == '_' || ch == ':'))
            ch = m_buffer[++pos];
        if (start == pos)
            return null;
        return new string(m_buffer, start, pos - start);
    }

    [SuppressMessage("Microsoft.Performance", "CA1811", Justification = "Kept for potential future usage.")]
    internal void SkipWhitespace()
    {
        char ch = (char)PeekChar();
        while (pos < used - 1 && (ch == ' ' || ch == '\r' || ch == '\n'))
            ch = m_buffer[++pos];
    }

    [SuppressMessage("Microsoft.Performance", "CA1811", Justification = "Kept for potential future usage.")]
    internal void SkipTo(char what)
    {
        char ch = (char)PeekChar();
        while (pos < used - 1 && (ch != what))
            ch = m_buffer[++pos];
    }

    // Skips to the next '=' and returns the delimited value that follows, or null when the
    // buffer ends first. The character after '=' (and whitespace) is taken as the delimiter.
    [SuppressMessage("Microsoft.Performance", "CA1811", Justification = "Kept for potential future usage.")]
    internal string ParseAttribute()
    {
        SkipTo('=');
        if (pos < used)
        {
            pos++;
            SkipWhitespace();
            if (pos < used)
            {
                char quote = m_buffer[pos];
                pos++;
                int start = pos;
                SkipTo(quote);
                if (pos < used)
                {
                    string result = new string(m_buffer, start, pos - start);
                    pos++;
                    return result;
                }
            }
        }
        return null;
    }

    /// <summary>Returns the next character without consuming it, or -1 at end of stream.</summary>
    public override int Peek()
    {
        int result = Read();
        if (result != EOF)
        {
            pos--;
        }
        return result;
    }

    /// <summary>Reads the next character, refilling from the stream when needed; -1 at end of stream.</summary>
    public override int Read()
    {
        if (pos == used)
        {
            rawUsed = stm.Read(rawBuffer, 0, rawBuffer.Length);
            rawPos = 0;
            if (rawUsed == 0) return EOF;
            DecodeBlock();
        }
        if (pos < used) return m_buffer[pos++];
        return -1;
    }

    /// <summary>
    /// Copies up to <paramref name="length"/> decoded characters into <paramref name="buffer"/>.
    /// </summary>
    /// <returns>The number of characters copied; 0 at end of stream, as TextReader specifies.</returns>
    public override int Read(char[] buffer, int start, int length)
    {
        if (pos == used)
        {
            rawUsed = stm.Read(rawBuffer, 0, rawBuffer.Length);
            rawPos = 0;
            // TextReader.Read(char[], int, int) signals end of stream with 0, not -1.
            if (rawUsed == 0) return 0;
            DecodeBlock();
        }
        if (pos < used)
        {
            length = Math.Min(used - pos, length);
            Array.Copy(this.m_buffer, pos, buffer, start, length);
            pos += length;
            return length;
        }
        return 0;
    }

    /// <summary>Same as <see cref="Read(char[], int, int)"/>; may return fewer than <paramref name="count"/> characters.</summary>
    public override int ReadBlock(char[] data, int index, int count)
    {
        return Read(data, index, count);
    }

    // Read up to end of line, or full buffer, whichever comes first.
    // Only reads from the already-decoded buffer. Stops once i + start reaches 'length'.
    [SuppressMessage("Microsoft.Performance", "CA1811", Justification = "Kept for potential future usage.")]
    public int ReadLine(char[] buffer, int start, int length)
    {
        int i = 0;
        int ch = ReadChar();
        while (ch != EOF)
        {
            buffer[i + start] = (char)ch;
            i++;
            if (i + start == length)
                break; // buffer is full
            if (ch == '\r')
            {
                if (PeekChar() == '\n')
                {
                    ch = ReadChar();
                    buffer[i + start] = (char)ch;
                    i++;
                }
                break;
            }
            else if (ch == '\n')
            {
                break;
            }
            ch = ReadChar();
        }
        return i;
    }

    /// <summary>Reads and returns all remaining characters.</summary>
    public override string ReadToEnd()
    {
        char[] buffer = new char[100000]; // large block heap is more efficient
        int len = 0;
        StringBuilder sb = new StringBuilder();
        while ((len = Read(buffer, 0, buffer.Length)) > 0)
        {
            sb.Append(buffer, 0, len);
        }
        return sb.ToString();
    }

    /// <summary>Disposes the wrapped stream.</summary>
    protected override void Dispose(bool disposing)
    {
        if (stm != null)
            stm.Dispose();
        base.Dispose(disposing);
    }
}
/// <summary>
/// Base class for decoders of 4-byte-per-code-point (UCS-4) input. Bytes that do not
/// complete a code point are carried over in <c>temp</c> until the next call.
/// </summary>
internal abstract class Ucs4Decoder : Decoder
{
    // Bytes of an incomplete code point left over from the previous GetChars call.
    internal byte[] temp = new byte[4];
    // Number of valid bytes in 'temp' (0..3 between calls).
    internal int tempBytes = 0;

    /// <summary>
    /// Returns the number of complete 4-byte groups available, counting carried-over bytes.
    /// </summary>
    public override int GetCharCount(byte[] bytes, int index, int count)
    {
        // NOTE(review): a code point above U+FFFF is reported by GetFullChars as two
        // chars, so this may under-count for such input - confirm callers tolerate that.
        return (count + tempBytes) / 4;
    }

    /// <summary>
    /// Decodes every complete 4-byte group in the given range into <paramref name="chars"/>.
    /// </summary>
    /// <returns>The number of chars written.</returns>
    internal abstract int GetFullChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex);

    /// <summary>
    /// Decodes the given bytes, first completing any code point carried over from the
    /// previous call and finally stashing trailing bytes that do not form a full group.
    /// </summary>
    /// <returns>The number of chars written to <paramref name="chars"/>.</returns>
    public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
    {
        int produced = 0;

        if (tempBytes > 0)
        {
            // Complete the carried-over code point, never reading past the supplied range.
            while (tempBytes < 4 && byteCount > 0)
            {
                temp[tempBytes++] = bytes[byteIndex++];
                byteCount--;
            }
            if (tempBytes < 4)
            {
                return 0; // still incomplete; wait for more input
            }
            produced = GetFullChars(temp, 0, 4, chars, charIndex);
            charIndex += produced;
            tempBytes = 0;
        }

        produced += GetFullChars(bytes, byteIndex, byteCount, chars, charIndex);

        // Save the trailing bytes of the remaining input that do not fill a 4-byte group.
        int leftover = byteCount % 4;
        int tail = byteIndex + byteCount - leftover;
        for (int k = 0; k < leftover; k++)
        {
            temp[tempBytes++] = bytes[tail + k];
        }
        return produced;
    }

    /// <summary>
    /// Returns the UTF-16 high (leading) surrogate for a supplementary code point
    /// (U+10000..U+10FFFF). The low surrogate is <c>0xDC00 | (code &amp; 0x3FF)</c>.
    /// </summary>
    internal static char UnicodeToUTF16(UInt32 code)
    {
        // 0xD7C0 == 0xD800 - (0x10000 >> 10)
        return (char)(0xD7C0 + (code >> 10));
    }
}
/// <summary>
/// UCS-4 decoder that assembles each code point with the first byte as the least
/// significant one.
/// NOTE(review): that is little-endian byte order despite the class name; the name is
/// kept because other code refers to it.
/// </summary>
internal class Ucs4DecoderBigEngian : Ucs4Decoder
{
    /// <summary>
    /// Decodes every complete 4-byte group in the range. Code points above U+FFFF are
    /// written as a UTF-16 surrogate pair (two chars).
    /// </summary>
    /// <returns>The number of chars written to <paramref name="chars"/>.</returns>
    /// <exception cref="SgmlParseException">A value is above U+10FFFF or lies in the surrogate range.</exception>
    internal override int GetFullChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
    {
        int end = byteIndex + byteCount;
        int j = charIndex;
        for (int i = byteIndex; i + 3 < end; i += 4)
        {
            UInt32 code = (UInt32)(((bytes[i + 3]) << 24) | (bytes[i + 2] << 16) | (bytes[i + 1] << 8) | (bytes[i]));
            if (code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF))
            {
                throw new SgmlParseException(string.Format(CultureInfo.CurrentUICulture, "Invalid character 0x{0:x} in encoding", code));
            }

            if (code > 0xFFFF)
            {
                // Supplementary plane: emit both halves of the surrogate pair. Previously
                // only one (incorrect) char was written and the second slot was skipped.
                UInt32 offset = code - 0x10000;
                chars[j++] = (char)(0xD800 + (offset >> 10));
                chars[j++] = (char)(0xDC00 + (offset & 0x3FF));
            }
            else
            {
                chars[j++] = (char)code;
            }
        }
        return j - charIndex;
    }
}
- internal class Ucs4DecoderLittleEndian : Ucs4Decoder {
- internal override int GetFullChars(byte[] bytes, int byteIndex, int byteCount, char[] chars,…
Large files are truncated, but you can click here to view the full file