PageRenderTime 129ms CodeModel.GetById 3ms app.highlight 114ms RepoModel.GetById 1ms app.codeStats 1ms

/parser/html/nsHtml5StreamParser.cpp

http://github.com/zpao/v8monkey
C++ | 1622 lines | 1253 code | 125 blank | 244 comment | 235 complexity | 8a4384fa50368e7683dcce1be9d410ae MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2/* vim: set sw=2 ts=2 et tw=79: */
   3/* ***** BEGIN LICENSE BLOCK *****
   4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   5 *
   6 * The contents of this file are subject to the Mozilla Public License Version
   7 * 1.1 (the "License"); you may not use this file except in compliance with
   8 * the License. You may obtain a copy of the License at
   9 * http://www.mozilla.org/MPL/
  10 *
  11 * Software distributed under the License is distributed on an "AS IS" basis,
  12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13 * for the specific language governing rights and limitations under the
  14 * License.
  15 *
  16 * The Original Code is mozilla.org code.
  17 *
  18 * The Initial Developer of the Original Code is
  19 * Netscape Communications Corporation.
  20 * Portions created by the Initial Developer are Copyright (C) 1998
  21 * the Initial Developer. All Rights Reserved.
  22 *
  23 * Contributor(s):
  24 *   Pierre Phaneuf <pp@ludusdesign.com>
  25 *   Henri Sivonen <hsivonen@iki.fi>
  26 *
  27 * Alternatively, the contents of this file may be used under the terms of
  28 * either of the GNU General Public License Version 2 or later (the "GPL"),
  29 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  30 * in which case the provisions of the GPL or the LGPL are applicable instead
  31 * of those above. If you wish to allow use of your version of this file only
  32 * under the terms of either the GPL or the LGPL, and not to allow others to
  33 * use your version of this file under the terms of the MPL, indicate your
  34 * decision by deleting the provisions above and replace them with the notice
  35 * and other provisions required by the GPL or the LGPL. If you do not delete
  36 * the provisions above, a recipient may use your version of this file under
  37 * the terms of any one of the MPL, the GPL or the LGPL.
  38 *
  39 * ***** END LICENSE BLOCK ***** */
  40
  41#include "nsHtml5StreamParser.h"
  42#include "nsICharsetConverterManager.h"
  43#include "nsICharsetAlias.h"
  44#include "nsServiceManagerUtils.h"
  45#include "nsEncoderDecoderUtils.h"
  46#include "nsContentUtils.h"
  47#include "nsHtml5Tokenizer.h"
  48#include "nsIHttpChannel.h"
  49#include "nsHtml5Parser.h"
  50#include "nsHtml5TreeBuilder.h"
  51#include "nsHtml5AtomTable.h"
  52#include "nsHtml5Module.h"
  53#include "nsHtml5RefPtr.h"
  54#include "nsIScriptError.h"
  55#include "mozilla/Preferences.h"
  56#include "nsHtml5Highlighter.h"
  57#include "expat_config.h"
  58#include "expat.h"
  59#include "nsINestedURI.h"
  60
  61using namespace mozilla;
  62
  63static NS_DEFINE_CID(kCharsetAliasCID, NS_CHARSETALIAS_CID);
  64
// Flush timer delays. The values below are the compiled-in starting values;
// InitializeStatics() attaches both variables to the
// "html5.flushtimer.initialdelay" and "html5.flushtimer.subsequentdelay"
// preferences.
// NOTE(review): the unit is presumably milliseconds -- confirm against the
// code that arms mFlushTimer.
PRInt32 nsHtml5StreamParser::sTimerInitialDelay = 120;
PRInt32 nsHtml5StreamParser::sTimerSubsequentDelay = 120;
  67
  68// static
  69void
  70nsHtml5StreamParser::InitializeStatics()
  71{
  72  Preferences::AddIntVarCache(&sTimerInitialDelay,
  73                              "html5.flushtimer.initialdelay");
  74  Preferences::AddIntVarCache(&sTimerSubsequentDelay,
  75                              "html5.flushtimer.subsequentdelay");
  76}
  77
  78/*
  79 * Note that nsHtml5StreamParser implements cycle collecting AddRef and
  80 * Release. Therefore, nsHtml5StreamParser must never be refcounted from
  81 * the parser thread!
  82 *
  83 * To work around this limitation, runnables posted by the main thread to the
  84 * parser thread hold their reference to the stream parser in an
  85 * nsHtml5RefPtr. Upon creation, nsHtml5RefPtr addrefs the object it holds
  86 * just like a regular nsRefPtr. This is OK, since the creation of the
  87 * runnable and the nsHtml5RefPtr happens on the main thread.
  88 *
  89 * When the runnable is done on the parser thread, the destructor of
  90 * nsHtml5RefPtr runs there. It doesn't call Release on the held object
  91 * directly. Instead, it posts another runnable back to the main thread where
  92 * that runnable calls Release on the wrapped object.
  93 *
  94 * When posting runnables in the other direction, the runnables have to be
  95 * created on the main thread when nsHtml5StreamParser is instantiated and
  96 * held for the lifetime of the nsHtml5StreamParser. This works, because the
 * same runnable can be dispatched multiple times and currently runnables
  98 * posted from the parser thread to main thread don't need to wrap any
  99 * runnable-specific data. (In the other direction, the runnables most notably
 100 * wrap the byte data of the stream.)
 101 */
NS_IMPL_CYCLE_COLLECTING_ADDREF(nsHtml5StreamParser)
NS_IMPL_CYCLE_COLLECTING_RELEASE(nsHtml5StreamParser)

// QueryInterface table: the stream parser answers for nsIStreamListener and
// nsICharsetDetectionObserver and then continues into the cycle collection
// interface map entries.
NS_INTERFACE_TABLE_HEAD(nsHtml5StreamParser)
  NS_INTERFACE_TABLE2(nsHtml5StreamParser, 
                      nsIStreamListener, 
                      nsICharsetDetectionObserver)
  NS_INTERFACE_TABLE_TO_MAP_SEGUE_CYCLE_COLLECTION(nsHtml5StreamParser)
NS_INTERFACE_MAP_END
 111
NS_IMPL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser)

// Unlink: drops the flush timer first and then lets go of the references
// held by the stream parser (observer, request, owner, the two flusher
// runnables, the executor and the charset detector).
NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsHtml5StreamParser)
  tmp->DropTimer();
  NS_IMPL_CYCLE_COLLECTION_UNLINK_NSCOMPTR(mObserver)
  NS_IMPL_CYCLE_COLLECTION_UNLINK_NSCOMPTR(mRequest)
  tmp->mOwner = nsnull;
  tmp->mExecutorFlusher = nsnull;
  tmp->mLoadFlusher = nsnull;
  tmp->mExecutor = nsnull;
  NS_IMPL_CYCLE_COLLECTION_UNLINK_NSCOMPTR(mChardet)
NS_IMPL_CYCLE_COLLECTION_UNLINK_END
 124
// Traverse: reports the edges out of the stream parser to the cycle
// collector. Besides the plain nsCOMPtr members and mOwner, three edges are
// reported on behalf of other objects ("hacks" below): the executor
// references wrapped inside the two flusher runnables and the reference
// back to this object that is attributed to mChardet.
NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsHtml5StreamParser)
  NS_IMPL_CYCLE_COLLECTION_TRAVERSE_NSCOMPTR(mObserver)
  NS_IMPL_CYCLE_COLLECTION_TRAVERSE_NSCOMPTR(mRequest)
  if (tmp->mOwner) {
    NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mOwner");
    cb.NoteXPCOMChild(static_cast<nsIParser*> (tmp->mOwner));
  }
  // hack: count the strongly owned edge wrapped in the runnable
  // (nsHtml5ExecutorFlusher keeps an nsRefPtr to the executor it was
  // constructed with, which is the same object as mExecutor).
  if (tmp->mExecutorFlusher) {
    NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mExecutorFlusher->mExecutor");
    cb.NoteXPCOMChild(static_cast<nsIContentSink*> (tmp->mExecutor));
  }
  // hack: count the strongly owned edge wrapped in the runnable
  // (nsHtml5LoadFlusher likewise keeps an nsRefPtr to the executor).
  if (tmp->mLoadFlusher) {
    NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mLoadFlusher->mExecutor");
    cb.NoteXPCOMChild(static_cast<nsIContentSink*> (tmp->mExecutor));
  }
  // hack: count self if held by mChardet
  // (the constructor passes this object to mChardet->Init() as observer).
  if (tmp->mChardet) {
    NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, 
      "mChardet->mObserver");
    cb.NoteXPCOMChild(static_cast<nsIStreamListener*>(tmp));
  }
NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
 149
 150class nsHtml5ExecutorFlusher : public nsRunnable
 151{
 152  private:
 153    nsRefPtr<nsHtml5TreeOpExecutor> mExecutor;
 154  public:
 155    nsHtml5ExecutorFlusher(nsHtml5TreeOpExecutor* aExecutor)
 156      : mExecutor(aExecutor)
 157    {}
 158    NS_IMETHODIMP Run()
 159    {
 160      mExecutor->RunFlushLoop();
 161      return NS_OK;
 162    }
 163};
 164
 165class nsHtml5LoadFlusher : public nsRunnable
 166{
 167  private:
 168    nsRefPtr<nsHtml5TreeOpExecutor> mExecutor;
 169  public:
 170    nsHtml5LoadFlusher(nsHtml5TreeOpExecutor* aExecutor)
 171      : mExecutor(aExecutor)
 172    {}
 173    NS_IMETHODIMP Run()
 174    {
 175      mExecutor->FlushSpeculativeLoads();
 176      return NS_OK;
 177    }
 178};
 179
/**
 * Constructs the stream parser. Must run on the main thread (asserted).
 *
 * @param aExecutor the tree op executor. Its stage is handed to the tree
 *                  builder (and, in the view source modes, to the
 *                  highlighter), and the executor itself is wrapped in the
 *                  two flusher runnables.
 * @param aOwner    the owning parser; stored in mOwner.
 * @param aMode     the parser mode. It decides which arguments the tree
 *                  builder and tokenizer are constructed with and whether a
 *                  highlighter is attached.
 */
nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
                                         nsHtml5Parser* aOwner,
                                         eParserMode aMode)
  : mFirstBuffer(nsnull) // Will be filled when starting
  , mLastBuffer(nsnull) // Will be filled when starting
  , mExecutor(aExecutor)
  // The tree builder gets the executor's stage as its first argument except
  // in the view source modes, and as its second argument only in NORMAL
  // mode.
  // NOTE(review): the meaning of the two arguments is defined by
  // nsHtml5TreeBuilder, which is not visible here -- confirm.
  , mTreeBuilder(new nsHtml5TreeBuilder((aMode == VIEW_SOURCE_HTML ||
                                         aMode == VIEW_SOURCE_XML) ?
                                             nsnull : mExecutor->GetStage(),
                                         aMode == NORMAL ?
                                             mExecutor->GetStage() : nsnull))
  , mTokenizer(new nsHtml5Tokenizer(mTreeBuilder, aMode == VIEW_SOURCE_XML))
  , mTokenizerMutex("nsHtml5StreamParser mTokenizerMutex")
  , mOwner(aOwner)
  , mSpeculationMutex("nsHtml5StreamParser mSpeculationMutex")
  , mTerminatedMutex("nsHtml5StreamParser mTerminatedMutex")
  , mThread(nsHtml5Module::GetStreamParserThread())
  , mExecutorFlusher(new nsHtml5ExecutorFlusher(aExecutor))
  , mLoadFlusher(new nsHtml5LoadFlusher(aExecutor))
  , mFlushTimer(do_CreateInstance("@mozilla.org/timer;1"))
  , mMode(aMode)
{
  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
  // The flush timer is targeted at the stream parser thread obtained above.
  // NOTE(review): mFlushTimer is used without a null check; this relies on
  // timer creation not failing.
  mFlushTimer->SetTarget(mThread);
  mAtomTable.Init(); // we aren't checking for OOM anyway...
#ifdef DEBUG
  mAtomTable.SetPermittedLookupThread(mThread);
#endif
  mTokenizer->setInterner(&mAtomTable);
  mTokenizer->setEncodingDeclarationHandler(this);

  // In the view source modes a single highlighter is shared between the
  // tokenizer (which owns it) and the tree builder (which does not).
  if (aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML) {
    nsHtml5Highlighter* highlighter =
      new nsHtml5Highlighter(mExecutor->GetStage());
    mTokenizer->EnableViewSource(highlighter); // takes ownership
    mTreeBuilder->EnableViewSource(highlighter); // doesn't own
  }

  // Chardet instantiation adapted from nsDOMFile.
  // Chardet is initialized here even if it turns out to be useless
  // to make the chardet refcount its observer (nsHtml5StreamParser)
  // on the main thread.
  const nsAdoptingCString& detectorName =
    Preferences::GetLocalizedCString("intl.charset.detector");
  if (!detectorName.IsEmpty()) {
    nsCAutoString detectorContractID;
    detectorContractID.AssignLiteral(NS_CHARSET_DETECTOR_CONTRACTID_BASE);
    detectorContractID += detectorName;
    // Feeding the detector is enabled only when the detector named by the
    // pref could actually be instantiated.
    if ((mChardet = do_CreateInstance(detectorContractID.get()))) {
      (void) mChardet->Init(this);
      mFeedChardet = true;
    }
  }

  // There's a zeroing operator new for everything else
}
 236
/**
 * Destructor. Must run on the main thread (asserted). Ends the tokenizer
 * and asserts that the flush timer has already been dropped.
 */
nsHtml5StreamParser::~nsHtml5StreamParser()
{
  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
  mTokenizer->end();
  NS_ASSERTION(!mFlushTimer, "Flush timer was not dropped before dtor!");
#ifdef DEBUG
  // Debug builds clear the members explicitly here.
  // NOTE(review): presumably to make use-after-destruction easier to spot
  // -- confirm.
  mRequest = nsnull;
  mObserver = nsnull;
  mUnicodeDecoder = nsnull;
  mSniffingBuffer = nsnull;
  mMetaScanner = nsnull;
  mFirstBuffer = nsnull;
  mExecutor = nsnull;
  mTreeBuilder = nsnull;
  mTokenizer = nsnull;
  mOwner = nsnull;
#endif
}
 255
 256nsresult
 257nsHtml5StreamParser::GetChannel(nsIChannel** aChannel)
 258{
 259  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
 260  return mRequest ? CallQueryInterface(mRequest, aChannel) :
 261                    NS_ERROR_NOT_AVAILABLE;
 262}
 263
 264NS_IMETHODIMP
 265nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf)
 266{
 267  NS_ASSERTION(IsParserThread(), "Wrong thread!");
 268  if (aConf == eBestAnswer || aConf == eSureAnswer) {
 269    mFeedChardet = false; // just in case
 270    if (HasDecoder()) {
 271      if (mCharset.Equals(aCharset)) {
 272        NS_ASSERTION(mCharsetSource < kCharsetFromAutoDetection,
 273            "Why are we running chardet at all?");
 274        mCharsetSource = kCharsetFromAutoDetection;
 275        mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
 276      } else {
 277        // We've already committed to a decoder. Request a reload from the
 278        // docshell.
 279        nsCAutoString charset(aCharset);
 280        mTreeBuilder->NeedsCharsetSwitchTo(charset, kCharsetFromAutoDetection);
 281        FlushTreeOpsAndDisarmTimer();
 282        Interrupt();
 283      }
 284    } else {
 285      // Got a confident answer from the sniffing buffer. That code will
 286      // take care of setting up the decoder.
 287      mCharset.Assign(aCharset);
 288      mCharsetSource = kCharsetFromAutoDetection;
 289      mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
 290    }
 291  }
 292  return NS_OK;
 293}
 294
 295void
 296nsHtml5StreamParser::SetViewSourceTitle(nsIURI* aURL)
 297{
 298  if (aURL) {
 299    nsCOMPtr<nsIURI> temp;
 300    bool isViewSource;
 301    aURL->SchemeIs("view-source", &isViewSource);
 302    if (isViewSource) {
 303      nsCOMPtr<nsINestedURI> nested = do_QueryInterface(aURL);
 304      nested->GetInnerURI(getter_AddRefs(temp));
 305    } else {
 306      temp = aURL;
 307    }
 308    bool isData;
 309    temp->SchemeIs("data", &isData);
 310    if (isData) {
 311      // Avoid showing potentially huge data: URLs. The three last bytes are
 312      // UTF-8 for an ellipsis.
 313      mViewSourceTitle.AssignLiteral("data:\xE2\x80\xA6");
 314    } else {
 315      temp->GetSpec(mViewSourceTitle);
 316    }
 317  }
 318}
 319
 320nsresult
 321nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const PRUint8* aFromSegment, // can be null
 322                                                                          PRUint32 aCount,
 323                                                                          PRUint32* aWriteCount)
 324{
 325  NS_ASSERTION(IsParserThread(), "Wrong thread!");
 326  nsresult rv = NS_OK;
 327  nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
 328  NS_ENSURE_SUCCESS(rv, rv);
 329  rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
 330  if (rv == NS_ERROR_UCONV_NOCONV) {
 331    mCharset.AssignLiteral("windows-1252"); // lower case is the raw form
 332    mCharsetSource = kCharsetFromWeakDocTypeDefault;
 333    rv = convManager->GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
 334    mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
 335  }
 336  NS_ENSURE_SUCCESS(rv, rv);
 337  mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
 338  return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
 339}
 340
 341nsresult
 342nsHtml5StreamParser::WriteSniffingBufferAndCurrentSegment(const PRUint8* aFromSegment, // can be null
 343                                                          PRUint32 aCount,
 344                                                          PRUint32* aWriteCount)
 345{
 346  NS_ASSERTION(IsParserThread(), "Wrong thread!");
 347  nsresult rv = NS_OK;
 348  if (mSniffingBuffer) {
 349    PRUint32 writeCount;
 350    rv = WriteStreamBytes(mSniffingBuffer, mSniffingLength, &writeCount);
 351    NS_ENSURE_SUCCESS(rv, rv);
 352    mSniffingBuffer = nsnull;
 353  }
 354  mMetaScanner = nsnull;
 355  if (aFromSegment) {
 356    rv = WriteStreamBytes(aFromSegment, aCount, aWriteCount);
 357  }
 358  return rv;
 359}
 360
 361nsresult
 362nsHtml5StreamParser::SetupDecodingFromBom(const char* aCharsetName, const char* aDecoderCharsetName)
 363{
 364  NS_ASSERTION(IsParserThread(), "Wrong thread!");
 365  nsresult rv = NS_OK;
 366  nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
 367  NS_ENSURE_SUCCESS(rv, rv);
 368  rv = convManager->GetUnicodeDecoderRaw(aDecoderCharsetName, getter_AddRefs(mUnicodeDecoder));
 369  NS_ENSURE_SUCCESS(rv, rv);
 370  mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
 371  mCharset.Assign(aCharsetName);
 372  mCharsetSource = kCharsetFromByteOrderMark;
 373  mFeedChardet = false;
 374  mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
 375  mSniffingBuffer = nsnull;
 376  mMetaScanner = nsnull;
 377  mBomState = BOM_SNIFFING_OVER;
 378  return rv;
 379}
 380
 381void
 382nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const PRUint8* aFromSegment,
 383                                                 PRUint32 aCountToSniffingLimit)
 384{
 385  // Avoid underspecified heuristic craziness for XHR
 386  if (mMode == LOAD_AS_DATA) {
 387    return;
 388  }
 389  // Make sure there's enough data. Require room for "<title></title>"
 390  if (mSniffingLength + aCountToSniffingLimit < 30) {
 391    return;
 392  }
 393  // even-numbered bytes tracked at 0, odd-numbered bytes tracked at 1
 394  bool byteZero[2] = { false, false };
 395  bool byteNonZero[2] = { false, false };
 396  PRUint32 i = 0;
 397  if (mSniffingBuffer) {
 398    for (; i < mSniffingLength; ++i) {
 399      if (mSniffingBuffer[i]) {
 400        if (byteNonZero[1 - (i % 2)]) {
 401          return;
 402        }
 403        byteNonZero[i % 2] = true;
 404      } else {
 405        if (byteZero[1 - (i % 2)]) {
 406          return;
 407        }
 408        byteZero[i % 2] = true;
 409      }
 410    }
 411  }
 412  if (aFromSegment) {
 413    for (PRUint32 j = 0; j < aCountToSniffingLimit; ++j) {
 414      if (aFromSegment[j]) {
 415        if (byteNonZero[1 - ((i + j) % 2)]) {
 416          return;
 417        }
 418        byteNonZero[(i + j) % 2] = true;
 419      } else {
 420        if (byteZero[1 - ((i + j) % 2)]) {
 421          return;
 422        }
 423        byteZero[(i + j) % 2] = true;
 424      }
 425    }
 426  }
 427
 428  if (byteNonZero[0]) {
 429    mCharset.Assign("UTF-16LE");
 430  } else {
 431    mCharset.Assign("UTF-16BE");
 432  }
 433  mCharsetSource = kCharsetFromIrreversibleAutoDetection;
 434  mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
 435  mFeedChardet = false;
 436}
 437
 438void
 439nsHtml5StreamParser::SetEncodingFromExpat(const PRUnichar* aEncoding)
 440{
 441  if (aEncoding) {
 442    nsDependentString utf16(aEncoding);
 443    nsCAutoString utf8;
 444    CopyUTF16toUTF8(utf16, utf8);
 445    if (PreferredForInternalEncodingDecl(utf8)) {
 446      mCharset.Assign(utf8);
 447      mCharsetSource = kCharsetFromMetaTag; // closest for XML
 448      return;
 449    }
 450    // else the page declared an encoding Gecko doesn't support and we'd
 451    // end up defaulting to UTF-8 anyway. Might as well fall through here
 452    // right away and let the encoding be set to UTF-8 which we'd default to
 453    // anyway.
 454  }
 455  mCharset.AssignLiteral("UTF-8"); // XML defaults to UTF-8 without a BOM
 456  mCharsetSource = kCharsetFromMetaTag; // means confident
 457}
 458
// A separate user data struct is used instead of passing the
// nsHtml5StreamParser instance as user data in order to avoid including
// expat.h in nsHtml5StreamParser.h. Doing that would cause naming conflicts.
// Using a separate user data struct also avoids bloating nsHtml5StreamParser
// by one pointer.
struct UserData {
  // The expat parser that the handler callbacks stop.
  XML_Parser mExpat;
  // The stream parser that receives the encoding from the XML declaration.
  nsHtml5StreamParser* mStreamParser;
};
 468
 469// Using no-namespace handler callbacks to avoid including expat.h in
 470// nsHtml5StreamParser.h, since doing so would cause naming conclicts.
 471static void
 472HandleXMLDeclaration(void* aUserData,
 473                     const XML_Char* aVersion,
 474                     const XML_Char* aEncoding,
 475                     int aStandalone)
 476{
 477  UserData* ud = static_cast<UserData*>(aUserData);
 478  ud->mStreamParser->SetEncodingFromExpat(
 479      reinterpret_cast<const PRUnichar*>(aEncoding));
 480  XML_StopParser(ud->mExpat, false);
 481}
 482
 483static void
 484HandleStartElement(void* aUserData,
 485                   const XML_Char* aName,
 486                   const XML_Char **aAtts)
 487{
 488  UserData* ud = static_cast<UserData*>(aUserData);
 489  XML_StopParser(ud->mExpat, false);
 490}
 491
 492static void
 493HandleEndElement(void* aUserData,
 494                 const XML_Char* aName)
 495{
 496  UserData* ud = static_cast<UserData*>(aUserData);
 497  XML_StopParser(ud->mExpat, false);
 498}
 499
 500static void
 501HandleComment(void* aUserData,
 502              const XML_Char* aName)
 503{
 504  UserData* ud = static_cast<UserData*>(aUserData);
 505  XML_StopParser(ud->mExpat, false);
 506}
 507
 508static void
 509HandleProcessingInstruction(void* aUserData,
 510                            const XML_Char* aTarget,
 511                            const XML_Char* aData)
 512{
 513  UserData* ud = static_cast<UserData*>(aUserData);
 514  XML_StopParser(ud->mExpat, false);
 515}
 516
/**
 * Concludes encoding sniffing when neither a BOM nor the meta prescan has
 * produced a decoder, decides on a charset, and then sets up decoding and
 * writes the buffered bytes plus the current segment. Parser thread only
 * (asserted).
 *
 * In VIEW_SOURCE_XML mode, expat is run over the sniffed bytes to read the
 * encoding from the XML declaration; UTF-8 is used when that yields nothing.
 *
 * In the other modes the steps are, in order: keep the existing charset if
 * its source is at least kCharsetFromHintPrevDoc; try BOMless UTF-16
 * sniffing; feed the charset detector if feeding is enabled; fall back to
 * windows-1252 if the charset source is still uninitialized.
 *
 * @param aFromSegment          the current segment; can be null.
 * @param aCount                the length of the current segment.
 * @param aWriteCount           out parameter forwarded to the write.
 * @param aCountToSniffingLimit how many bytes of aFromSegment still fall
 *                              within the sniffing limit.
 * @return a failure code from the charset detector, or the result of
 *         SetupDecodingAndWriteSniffingBufferAndCurrentSegment().
 */
nsresult
nsHtml5StreamParser::FinalizeSniffing(const PRUint8* aFromSegment, // can be null
                                      PRUint32 aCount,
                                      PRUint32* aWriteCount,
                                      PRUint32 aCountToSniffingLimit)
{
  NS_ASSERTION(IsParserThread(), "Wrong thread!");
  NS_ASSERTION(mCharsetSource < kCharsetFromMetaTag,
      "Should not finalize sniffing when already confident.");
  if (mMode == VIEW_SOURCE_XML) {
    // Memory functions handed to expat.
    static const XML_Memory_Handling_Suite memsuite =
      {
        (void *(*)(size_t))moz_xmalloc,
        (void *(*)(void *, size_t))moz_xrealloc,
        moz_free
      };

    static const PRUnichar kExpatSeparator[] = { 0xFFFF, '\0' };

    static const PRUnichar kISO88591[] =
        { 'I', 'S', 'O', '-', '8', '8', '5', '9', '-', '1', '\0' };

    UserData ud;
    ud.mStreamParser = this;

    // If we got this far, the stream didn't have a BOM. UTF-16-encoded XML
    // documents MUST begin with a BOM. We don't support EBCDIC and such.
    // Thus, at this point, what we have is garbage or something encoded using
    // a rough ASCII superset. ISO-8859-1 allows us to decode ASCII bytes
    // without throwing errors when bytes have the most significant bit set
    // and without triggering expat's unknown encoding code paths. This is
    // enough to be able to use expat to parse the XML declaration in order
    // to extract the encoding name from it.
    ud.mExpat = XML_ParserCreate_MM(kISO88591, &memsuite, kExpatSeparator);
    // Every handler stops the parse; only the XML declaration handler also
    // reports an encoding to this object (via SetEncodingFromExpat()).
    XML_SetXmlDeclHandler(ud.mExpat, HandleXMLDeclaration);
    XML_SetElementHandler(ud.mExpat, HandleStartElement, HandleEndElement);
    XML_SetCommentHandler(ud.mExpat, HandleComment);
    XML_SetProcessingInstructionHandler(ud.mExpat, HandleProcessingInstruction);
    XML_SetUserData(ud.mExpat, static_cast<void*>(&ud));

    XML_Status status = XML_STATUS_OK;

    // aFromSegment points to the data obtained from the current network
    // event. mSniffingBuffer (if it exists) contains the data obtained before
    // the current event. Thus, mSniffingLength bytes of mSniffingBuffer
    // followed by aCountToSniffingLimit bytes from aFromSegment are the
    // first 1024 bytes of the file (or the file as a whole if the file is
    // 1024 bytes long or shorter). Thus, we parse both buffers, but if the
    // first call succeeds already, we skip parsing the second buffer.
    if (mSniffingBuffer) {
      status = XML_Parse(ud.mExpat,
                         reinterpret_cast<const char*>(mSniffingBuffer.get()),
                         mSniffingLength,
                         false);
    }
    // mCharsetSource reaching kCharsetFromMetaTag means the declaration
    // handler has already run during the first XML_Parse() call.
    if (status == XML_STATUS_OK &&
        mCharsetSource < kCharsetFromMetaTag &&
        aFromSegment) {
      status = XML_Parse(ud.mExpat,
                         reinterpret_cast<const char*>(aFromSegment),
                         aCountToSniffingLimit,
                         false);
    }
    XML_ParserFree(ud.mExpat);

    if (mCharsetSource < kCharsetFromMetaTag) {
      // Failed to get an encoding from the XML declaration. XML defaults
      // confidently to UTF-8 in this case.
      // It is also possible that the document has an XML declaration that is
      // longer than 1024 bytes, but that case is not worth worrying about.
      mCharset.AssignLiteral("UTF-8");
      mCharsetSource = kCharsetFromMetaTag; // means confident
    }

    return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
                                                                aCount,
                                                                aWriteCount);
  }

  // meta scan failed.
  if (mCharsetSource >= kCharsetFromHintPrevDoc) {
    // The charset already in mCharset is used as is; the detector is not
    // consulted.
    mFeedChardet = false;
    return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
  }
  // Check for BOMless UTF-16 with Basic
  // Latin content for compat with IE. See bug 631751.
  SniffBOMlessUTF16BasicLatin(aFromSegment, aCountToSniffingLimit);
  // the charset may have been set now
  // maybe try chardet now; 
  if (mFeedChardet) {
    bool dontFeed;
    nsresult rv;
    // Feed the bytes buffered before the current segment first.
    if (mSniffingBuffer) {
      rv = mChardet->DoIt((const char*)mSniffingBuffer.get(), mSniffingLength, &dontFeed);
      mFeedChardet = !dontFeed;
      NS_ENSURE_SUCCESS(rv, rv);
    }
    if (mFeedChardet && aFromSegment) {
      rv = mChardet->DoIt((const char*)aFromSegment,
                          // Avoid buffer boundary-dependent behavior when
                          // reparsing is forbidden. If reparse is forbidden,
                          // act as if we only saw the first 1024 bytes.
                          // When reparsing isn't forbidden, buffer boundaries
                          // can have an effect on whether the page is loaded
                          // once or twice. :-(
                          mReparseForbidden ? aCountToSniffingLimit : aCount,
                          &dontFeed);
      mFeedChardet = !dontFeed;
      NS_ENSURE_SUCCESS(rv, rv);
    }
    if (mFeedChardet && (!aFromSegment || mReparseForbidden)) {
      // mReparseForbidden is checked so that we get to use the sniffing
      // buffer with the best guess so far if we aren't allowed to guess
      // better later.
      mFeedChardet = false;
      rv = mChardet->Done();
      NS_ENSURE_SUCCESS(rv, rv);
    }
    // fall thru; callback may have changed charset  
  }
  if (mCharsetSource == kCharsetUninitialized) {
    // Hopefully this case is never needed, but dealing with it anyway
    mCharset.AssignLiteral("windows-1252");
    mCharsetSource = kCharsetFromWeakDocTypeDefault;
    mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
  } else if (mMode == LOAD_AS_DATA &&
             mCharsetSource == kCharsetFromWeakDocTypeDefault) {
    NS_ASSERTION(mReparseForbidden, "Reparse should be forbidden for XHR");
    NS_ASSERTION(!mFeedChardet, "Should not feed chardet for XHR");
    NS_ASSERTION(mCharset.EqualsLiteral("UTF-8"),
                 "XHR should default to UTF-8");
    // Now mark charset source as non-weak to signal that we have a decision
    mCharsetSource = kCharsetFromDocTypeDefault;
    mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
  }
  return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
}
 654
/**
 * Consumes incoming bytes while the encoding is still being sniffed. Parser
 * thread only (asserted).
 *
 * The method first advances the BOM state machine over the segment. If a
 * complete UTF-16LE, UTF-16BE or UTF-8 BOM is recognized, decoding is set
 * up from the BOM and the bytes following the BOM are written right away.
 *
 * Otherwise, in the NORMAL, VIEW_SOURCE_HTML and LOAD_AS_DATA modes, the
 * meta scanner is run over the segment. If the total number of bytes seen
 * reaches NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE without the meta scan
 * producing a decoder, sniffing is finalized. If the limit has not been
 * reached, the segment is appended to the sniffing buffer for later.
 *
 * @param aFromSegment the bytes of the current segment.
 * @param aCount       the length of the current segment.
 * @param aWriteCount  out parameter that receives the number of bytes of
 *                     the segment accounted for on the paths that set it
 *                     here; on the other paths it is forwarded to the write
 *                     methods.
 * @return NS_ERROR_OUT_OF_MEMORY if the sniffing buffer cannot be
 *         allocated, a failure code from decoder setup or writing, or a
 *         success code.
 */
nsresult
nsHtml5StreamParser::SniffStreamBytes(const PRUint8* aFromSegment,
                                      PRUint32 aCount,
                                      PRUint32* aWriteCount)
{
  NS_ASSERTION(IsParserThread(), "Wrong thread!");
  nsresult rv = NS_OK;
  PRUint32 writeCount;
  // BOM state machine. mBomState is a member, so a BOM split across
  // segments continues from the state left by the previous call.
  for (PRUint32 i = 0; i < aCount && mBomState != BOM_SNIFFING_OVER; i++) {
    switch (mBomState) {
      case BOM_SNIFFING_NOT_STARTED:
        NS_ASSERTION(i == 0, "Bad BOM sniffing state.");
        switch (*aFromSegment) {
          case 0xEF:
            mBomState = SEEN_UTF_8_FIRST_BYTE;
            break;
          case 0xFF:
            mBomState = SEEN_UTF_16_LE_FIRST_BYTE;
            break;
          case 0xFE:
            mBomState = SEEN_UTF_16_BE_FIRST_BYTE;
            break;
          default:
            mBomState = BOM_SNIFFING_OVER;
            break;
        }
        break;
      case SEEN_UTF_16_LE_FIRST_BYTE:
        if (aFromSegment[i] == 0xFE) {
          rv = SetupDecodingFromBom("UTF-16", "UTF-16LE"); // upper case is the raw form
          NS_ENSURE_SUCCESS(rv, rv);
          // Write what follows the BOM; the BOM bytes in this segment are
          // counted as consumed in *aWriteCount.
          PRUint32 count = aCount - (i + 1);
          rv = WriteStreamBytes(aFromSegment + (i + 1), count, &writeCount);
          NS_ENSURE_SUCCESS(rv, rv);
          *aWriteCount = writeCount + (i + 1);
          return rv;
        }
        mBomState = BOM_SNIFFING_OVER;
        break;
      case SEEN_UTF_16_BE_FIRST_BYTE:
        if (aFromSegment[i] == 0xFF) {
          rv = SetupDecodingFromBom("UTF-16", "UTF-16BE"); // upper case is the raw form
          NS_ENSURE_SUCCESS(rv, rv);
          PRUint32 count = aCount - (i + 1);
          rv = WriteStreamBytes(aFromSegment + (i + 1), count, &writeCount);
          NS_ENSURE_SUCCESS(rv, rv);
          *aWriteCount = writeCount + (i + 1);
          return rv;
        }
        mBomState = BOM_SNIFFING_OVER;
        break;
      case SEEN_UTF_8_FIRST_BYTE:
        if (aFromSegment[i] == 0xBB) {
          mBomState = SEEN_UTF_8_SECOND_BYTE;
        } else {
          mBomState = BOM_SNIFFING_OVER;
        }
        break;
      case SEEN_UTF_8_SECOND_BYTE:
        if (aFromSegment[i] == 0xBF) {
          rv = SetupDecodingFromBom("UTF-8", "UTF-8"); // upper case is the raw form
          NS_ENSURE_SUCCESS(rv, rv);
          PRUint32 count = aCount - (i + 1);
          rv = WriteStreamBytes(aFromSegment + (i + 1), count, &writeCount);
          NS_ENSURE_SUCCESS(rv, rv);
          *aWriteCount = writeCount + (i + 1);
          return rv;
        }
        mBomState = BOM_SNIFFING_OVER;
        break;
      default:
        mBomState = BOM_SNIFFING_OVER;
        break;
    }
  }
  // if we get here, there either was no BOM or the BOM sniffing isn't complete yet
  
  // The meta scanner is created lazily and only in the modes that use it.
  if (!mMetaScanner && (mMode == NORMAL ||
                        mMode == VIEW_SOURCE_HTML ||
                        mMode == LOAD_AS_DATA)) {
    mMetaScanner = new nsHtml5MetaScanner();
  }
  
  if (mSniffingLength + aCount >= NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE) {
    // this is the last buffer
    // Only the part of the segment that fits under the sniffing limit is
    // shown to the meta scanner.
    PRUint32 countToSniffingLimit =
        NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE - mSniffingLength;
    if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
      nsHtml5ByteReadable readable(aFromSegment, aFromSegment +
          countToSniffingLimit);
      mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
      if (mUnicodeDecoder) {
        mUnicodeDecoder->SetInputErrorBehavior(
            nsIUnicodeDecoder::kOnError_Recover);
        // meta scan successful
        mCharsetSource = kCharsetFromMetaPrescan;
        mFeedChardet = false;
        mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
        mMetaScanner = nsnull;
        return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount,
            aWriteCount);
      }
    }
    return FinalizeSniffing(aFromSegment, aCount, aWriteCount,
        countToSniffingLimit);
  }

  // not the last buffer
  if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
    nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
    mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
    if (mUnicodeDecoder) {
      // meta scan successful
      mUnicodeDecoder->SetInputErrorBehavior(
          nsIUnicodeDecoder::kOnError_Recover);
      mCharsetSource = kCharsetFromMetaPrescan;
      mFeedChardet = false;
      mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
      mMetaScanner = nsnull;
      return WriteSniffingBufferAndCurrentSegment(aFromSegment, 
                                                  aCount,
                                                  aWriteCount);
    }
  }

  // No decision yet: stash the segment in the sniffing buffer (allocated
  // fallibly on first use). The copy fits because the total is below the
  // buffer size on this path.
  if (!mSniffingBuffer) {
    const mozilla::fallible_t fallible = mozilla::fallible_t();
    mSniffingBuffer = new (fallible)
      PRUint8[NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE];
    if (!mSniffingBuffer) {
      return NS_ERROR_OUT_OF_MEMORY;
    }
  }
  memcpy(mSniffingBuffer + mSniffingLength, aFromSegment, aCount);
  mSniffingLength += aCount;
  *aWriteCount = aCount;
  return NS_OK;
}
 793
 794nsresult
 795nsHtml5StreamParser::WriteStreamBytes(const PRUint8* aFromSegment,
 796                                      PRUint32 aCount,
 797                                      PRUint32* aWriteCount)
 798{
 799  NS_ASSERTION(IsParserThread(), "Wrong thread!");
 800  // mLastBuffer always points to a buffer of the size
 801  // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE.
 802  if (mLastBuffer->getEnd() == NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE) {
 803    nsRefPtr<nsHtml5OwningUTF16Buffer> newBuf =
 804      nsHtml5OwningUTF16Buffer::FalliblyCreate(
 805        NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
 806    if (!newBuf) {
 807      return NS_ERROR_OUT_OF_MEMORY;
 808    }
 809    mLastBuffer = (mLastBuffer->next = newBuf.forget());
 810  }
 811  PRInt32 totalByteCount = 0;
 812  for (;;) {
 813    PRInt32 end = mLastBuffer->getEnd();
 814    PRInt32 byteCount = aCount - totalByteCount;
 815    PRInt32 utf16Count = NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE - end;
 816
 817    NS_ASSERTION(utf16Count, "Trying to convert into a buffer with no free space!");
 818    // byteCount may be zero to force the decoder to output a pending surrogate
 819    // pair.
 820
 821    nsresult convResult = mUnicodeDecoder->Convert((const char*)aFromSegment, &byteCount, mLastBuffer->getBuffer() + end, &utf16Count);
 822
 823    end += utf16Count;
 824    mLastBuffer->setEnd(end);
 825    totalByteCount += byteCount;
 826    aFromSegment += byteCount;
 827
 828    NS_ASSERTION(end <= NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE,
 829        "The Unicode decoder wrote too much data.");
 830    NS_ASSERTION(byteCount >= -1, "The decoder consumed fewer than -1 bytes.");
 831
 832    if (NS_FAILED(convResult)) {
 833      // Using the more generic NS_FAILED test above in case there are still
 834      // decoders around that don't use NS_ERROR_ILLEGAL_INPUT properly.
 835      NS_ASSERTION(convResult == NS_ERROR_ILLEGAL_INPUT,
 836          "The decoder signaled an error other than NS_ERROR_ILLEGAL_INPUT.");
 837
 838      // There's an illegal byte in the input. It's now the responsibility
 839      // of this calling code to output a U+FFFD REPLACEMENT CHARACTER and
 840      // reset the decoder.
 841
 842      if (totalByteCount < (PRInt32)aCount) {
 843        // advance over the bad byte
 844        ++totalByteCount;
 845        ++aFromSegment;
 846      } else {
 847        NS_NOTREACHED("The decoder signaled an error but consumed all input.");
 848        // Recovering from this situation in case there are still broken
 849        // decoders, since nsScanner had recovery code, too.
 850        totalByteCount = (PRInt32)aCount;
 851      }
 852
 853      // Emit the REPLACEMENT CHARACTER
 854      mLastBuffer->getBuffer()[end] = 0xFFFD;
 855      ++end;
 856      mLastBuffer->setEnd(end);
 857      if (end == NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE) {
 858        nsRefPtr<nsHtml5OwningUTF16Buffer> newBuf =
 859          nsHtml5OwningUTF16Buffer::FalliblyCreate(
 860            NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
 861        if (!newBuf) {
 862          return NS_ERROR_OUT_OF_MEMORY;
 863        }
 864        mLastBuffer = (mLastBuffer->next = newBuf.forget());
 865      }
 866
 867      mUnicodeDecoder->Reset();
 868      if (totalByteCount == (PRInt32)aCount) {
 869        *aWriteCount = (PRUint32)totalByteCount;
 870        return NS_OK;
 871      }
 872    } else if (convResult == NS_PARTIAL_MORE_OUTPUT) {
 873      nsRefPtr<nsHtml5OwningUTF16Buffer> newBuf =
 874        nsHtml5OwningUTF16Buffer::FalliblyCreate(
 875          NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
 876      if (!newBuf) {
 877        return NS_ERROR_OUT_OF_MEMORY;
 878      }
 879      mLastBuffer = (mLastBuffer->next = newBuf.forget());
 880      // All input may have been consumed if there is a pending surrogate pair
 881      // that doesn't fit in the output buffer. Loop back to push a zero-length
 882      // input to the decoder in that case.
 883    } else {
 884      NS_ASSERTION(totalByteCount == (PRInt32)aCount,
 885          "The Unicode decoder consumed the wrong number of bytes.");
 886      *aWriteCount = (PRUint32)totalByteCount;
 887      return NS_OK;
 888    }
 889  }
 890}
 891
 892// nsIRequestObserver methods:
 893nsresult
 894nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest, nsISupports* aContext)
 895{
 896  NS_PRECONDITION(STREAM_NOT_STARTED == mStreamState,
 897                  "Got OnStartRequest when the stream had already started.");
 898  NS_PRECONDITION(!mExecutor->HasStarted(), 
 899                  "Got OnStartRequest at the wrong stage in the executor life cycle.");
 900  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
 901  if (mObserver) {
 902    mObserver->OnStartRequest(aRequest, aContext);
 903  }
 904  mRequest = aRequest;
 905
 906  mStreamState = STREAM_BEING_READ;
 907
 908  if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
 909    mTokenizer->StartViewSource(NS_ConvertUTF8toUTF16(mViewSourceTitle));
 910  }
 911
 912  // For View Source, the parser should run with scripts "enabled" if a normal
 913  // load would have scripts enabled.
 914  bool scriptingEnabled = mMode == LOAD_AS_DATA ?
 915                                   false : mExecutor->IsScriptEnabled();
 916  mOwner->StartTokenizer(scriptingEnabled);
 917  mTreeBuilder->setScriptingEnabled(scriptingEnabled);
 918  mTokenizer->start();
 919  mExecutor->Start();
 920  mExecutor->StartReadingFromStage();
 921
 922  if (mMode == PLAIN_TEXT) {
 923    mTreeBuilder->StartPlainText();
 924    mTokenizer->StartPlainText();
 925  } else if (mMode == VIEW_SOURCE_PLAIN) {
 926    mTreeBuilder->StartPlainTextViewSource(NS_ConvertUTF8toUTF16(mViewSourceTitle));
 927    mTokenizer->StartPlainText();
 928  }
 929
 930  /*
 931   * If you move the following line, be very careful not to cause 
 932   * WillBuildModel to be called before the document has had its 
 933   * script global object set.
 934   */
 935  mExecutor->WillBuildModel(eDTDMode_unknown);
 936  
 937  nsRefPtr<nsHtml5OwningUTF16Buffer> newBuf =
 938    nsHtml5OwningUTF16Buffer::FalliblyCreate(
 939      NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
 940  if (!newBuf) {
 941    mExecutor->MarkAsBroken(); // marks this stream parser as terminated,
 942                               // which prevents entry to code paths that
 943                               // would use mFirstBuffer or mLastBuffer.
 944    return NS_ERROR_OUT_OF_MEMORY;
 945  }
 946  NS_ASSERTION(!mFirstBuffer, "How come we have the first buffer set?");
 947  NS_ASSERTION(!mLastBuffer, "How come we have the last buffer set?");
 948  mFirstBuffer = mLastBuffer = newBuf;
 949
 950  nsresult rv = NS_OK;
 951
 952  // The line below means that the encoding can end up being wrong if
 953  // a view-source URL is loaded without having the encoding hint from a
 954  // previous normal load in the history.
 955  mReparseForbidden = !(mMode == NORMAL || mMode == PLAIN_TEXT);
 956
 957  nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(mRequest, &rv));
 958  if (NS_SUCCEEDED(rv)) {
 959    nsCAutoString method;
 960    httpChannel->GetRequestMethod(method);
 961    // XXX does Necko have a way to renavigate POST, etc. without hitting
 962    // the network?
 963    if (!method.EqualsLiteral("GET")) {
 964      // This is the old Gecko behavior but the HTML5 spec disagrees.
 965      // Don't reparse on POST.
 966      mReparseForbidden = true;
 967      mFeedChardet = false; // can't restart anyway
 968    }
 969  }
 970
 971  if (mCharsetSource >= kCharsetFromAutoDetection) {
 972    mFeedChardet = false;
 973  }
 974  
 975  if (mCharsetSource <= kCharsetFromMetaPrescan) {
 976    // we aren't ready to commit to an encoding yet
 977    // leave converter uninstantiated for now
 978    return NS_OK;
 979  }
 980  
 981  nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
 982  NS_ENSURE_SUCCESS(rv, rv);
 983  rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
 984  // if we failed to get a decoder, there will be fallback, so don't propagate
 985  //  the error.
 986  if (NS_SUCCEEDED(rv)) {
 987    mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
 988  } else {
 989    mCharsetSource = kCharsetFromWeakDocTypeDefault;
 990  }
 991  return NS_OK;
 992}
 993
 994void
 995nsHtml5StreamParser::DoStopRequest()
 996{
 997  NS_ASSERTION(IsParserThread(), "Wrong thread!");
 998  NS_PRECONDITION(STREAM_BEING_READ == mStreamState,
 999                  "Stream ended without being open.");
1000  mTokenizerMutex.AssertCurrentThreadOwns();
1001
1002  if (IsTerminated()) {
1003    return;
1004  }
1005
1006  mStreamState = STREAM_ENDED;
1007
1008  if (!mUnicodeDecoder) {
1009    PRUint32 writeCount;
1010    if (NS_FAILED(FinalizeSniffing(nsnull, 0, &writeCount, 0))) {
1011      MarkAsBroken();
1012      return;
1013    }
1014  } else if (mFeedChardet) {
1015    mChardet->Done();
1016  }
1017
1018  if (IsTerminatedOrInterrupted()) {
1019    return;
1020  }
1021
1022  ParseAvailableData(); 
1023}
1024
1025class nsHtml5RequestStopper : public nsRunnable
1026{
1027  private:
1028    nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1029  public:
1030    nsHtml5RequestStopper(nsHtml5StreamParser* aStreamParser)
1031      : mStreamParser(aStreamParser)
1032    {}
1033    NS_IMETHODIMP Run()
1034    {
1035      mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1036      mStreamParser->DoStopRequest();
1037      return NS_OK;
1038    }
1039};
1040
1041nsresult
1042nsHtml5StreamParser::OnStopRequest(nsIRequest* aRequest,
1043                             nsISupports* aContext,
1044                             nsresult status)
1045{
1046  NS_ASSERTION(mRequest == aRequest, "Got Stop on wrong stream.");
1047  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1048  if (mObserver) {
1049    mObserver->OnStopRequest(aRequest, aContext, status);
1050  }
1051  nsCOMPtr<nsIRunnable> stopper = new nsHtml5RequestStopper(this);
1052  if (NS_FAILED(mThread->Dispatch(stopper, nsIThread::DISPATCH_NORMAL))) {
1053    NS_WARNING("Dispatching StopRequest event failed.");
1054  }
1055  return NS_OK;
1056}
1057
1058void
1059nsHtml5StreamParser::DoDataAvailable(PRUint8* aBuffer, PRUint32 aLength)
1060{
1061  NS_ASSERTION(IsParserThread(), "Wrong thread!");
1062  NS_PRECONDITION(STREAM_BEING_READ == mStreamState,
1063                  "DoDataAvailable called when stream not open.");
1064  mTokenizerMutex.AssertCurrentThreadOwns();
1065
1066  if (IsTerminated()) {
1067    return;
1068  }
1069
1070  PRUint32 writeCount;
1071  nsresult rv;
1072  if (HasDecoder()) {
1073    if (mFeedChardet) {
1074      bool dontFeed;
1075      mChardet->DoIt((const char*)aBuffer, aLength, &dontFeed);
1076      mFeedChardet = !dontFeed;
1077    }
1078    rv = WriteStreamBytes(aBuffer, aLength, &writeCount);
1079  } else {
1080    rv = SniffStreamBytes(aBuffer, aLength, &writeCount);
1081  }
1082  if (NS_FAILED(rv)) {
1083    MarkAsBroken();
1084    return;
1085  }
1086  NS_ASSERTION(writeCount == aLength, "Wrong number of stream bytes written/sniffed.");
1087
1088  if (IsTerminatedOrInterrupted()) {
1089    return;
1090  }
1091
1092  ParseAvailableData();
1093
1094  if (mFlushTimerArmed || mSpeculating) {
1095    return;
1096  }
1097
1098  mFlushTimer->InitWithFuncCallback(nsHtml5StreamParser::TimerCallback,
1099                                    static_cast<void*> (this),
1100                                    mFlushTimerEverFired ?
1101                                        sTimerInitialDelay :
1102                                        sTimerSubsequentDelay,
1103                                    nsITimer::TYPE_ONE_SHOT);
1104  mFlushTimerArmed = true;
1105}
1106
1107class nsHtml5DataAvailable : public nsRunnable
1108{
1109  private:
1110    nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1111    nsAutoArrayPtr<PRUint8>            mData;
1112    PRUint32                           mLength;
1113  public:
1114    nsHtml5DataAvailable(nsHtml5StreamParser* aStreamParser,
1115                         PRUint8*             aData,
1116                         PRUint32             aLength)
1117      : mStreamParser(aStreamParser)
1118      , mData(aData)
1119      , mLength(aLength)
1120    {}
1121    NS_IMETHODIMP Run()
1122    {
1123      mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1124      mStreamParser->DoDataAvailable(mData, mLength);
1125      return NS_OK;
1126    }
1127};
1128
1129// nsIStreamListener method:
1130nsresult
1131nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest,
1132                               nsISupports* aContext,
1133                               nsIInputStream* aInStream,
1134                               PRUint32 aSourceOffset,
1135                               PRUint32 aLength)
1136{
1137  if (mExecutor->IsBroken()) {
1138    return NS_ERROR_OUT_OF_MEMORY;
1139  }
1140
1141  NS_ASSERTION(mRequest == aRequest, "Got data on wrong stream.");
1142  PRUint32 totalRead;
1143  const mozilla::fallible_t fallible = mozilla::fallible_t();
1144  nsAutoArrayPtr<PRUint8> data(new (fallible) PRUint8[aLength]);
1145  if (!data) {
1146    mExecutor->MarkAsBroken();
1147    return NS_ERROR_OUT_OF_MEMORY;
1148  }
1149  nsresult rv = aInStream->Read(reinterpret_cast<char*>(data.get()),
1150  aLength, &totalRead);
1151  NS_ENSURE_SUCCESS(rv, rv);
1152  NS_ASSERTION(totalRead <= aLength, "Read more bytes than were available?");
1153  nsCOMPtr<nsIRunnable> dataAvailable = new nsHtml5DataAvailable(this,
1154                                                                 data.forget(),
1155                                                                totalRead);
1156  if (NS_FAILED(mThread->Dispatch(dataAvailable, nsIThread::DISPATCH_NORMAL))) {
1157    NS_WARNING("Dispatching DataAvailable event failed.");
1158  }
1159  return rv;
1160}
1161
1162bool
1163nsHtml5StreamParser::PreferredForInternalEncodingDecl(nsACString& aEncoding)
1164{
1165  nsCAutoString newEncoding(aEncoding);
1166  newEncoding.Trim(" \t\r\n\f");
1167  if (newEncoding.LowerCaseEqualsLiteral("utf-16") ||
1168      newEncoding.LowerCaseEqualsLiteral("utf-16be") ||
1169      newEncoding.LowerCaseEqualsLiteral("utf-16le")) {
1170    newEncoding.Assign("UTF-8");
1171  }
1172
1173  nsresult rv = NS_OK;
1174  nsCOMPtr<nsICharsetAlias> calias(do_GetService(kCharsetAliasCID, &rv));
1175  if (NS_FAILED(rv)) {
1176    NS_NOTREACHED("Charset alias service not available.");
1177    return false;
1178  }
1179  bool eq;
1180  rv = calias->Equals(newEncoding, mCharset, &eq);
1181  if (NS_FAILED(rv)) {
1182    NS_NOTREACHED("Charset name equality check failed.");
1183    return false;
1184  }
1185  if (eq) {
1186    mCharsetSource = kCharsetFromMetaTag; // become confident
1187    mFeedChardet = false; // don't feed chardet when confident
1188    return false;
1189  }
1190  
1191  // XXX check HTML5 non-IANA aliases here
1192  
1193  nsCAutoString preferred;
1194  
1195  rv = calias->GetPreferred(newEncoding, preferred);
1196  if (NS_FAILED(rv)) {
1197    // the encoding name is bogus
1198    return false;
1199  }
1200  
1201  if (preferred.LowerCaseEqualsLiteral("utf-16") ||
1202      preferred.LowerCaseEqualsLiteral("utf-16be") ||
1203      preferred.LowerCaseEqualsLiteral("utf-16le") ||
1204      preferred.LowerCaseEqualsLiteral("utf-7") ||
1205      preferred.LowerCaseEqualsLiteral("jis_x0212-1990") ||
1206      preferred.LowerCaseEqualsLiteral("x-jis0208") ||
1207      preferred.LowerCaseEqualsLiteral("x-imap4-modified-utf7") ||
1208      preferred.LowerCaseEqualsLiteral("x-user-defined")) {
1209    // Not a rough ASCII superset
1210    return false;
1211  }
1212  aEncoding.Assign(preferred);
1213  return true;
1214}
1215
1216bool
1217nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
1218{
1219  // This code needs to stay in sync with
1220  // nsHtml5MetaScanner::tryCharset. Unfortunately, the
1221  // trickery with member fields there leads to some copy-paste reuse. :-(
1222  NS_ASSERTION(IsParserThread(), "Wrong thread!");
1223  if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to "confident" in the HTML5 spec
1224    return false;
1225  }
1226
1227  if (mReparseForbidden) {
1228    return false; // not reparsing even if we wanted to
1229  }
1230
1231  nsCAutoString newEncoding;
1232  CopyUTF16toUTF8(*aEncoding, newEncoding);
1233
1234  if (!PreferredForInternalEncodingDecl(newEncoding)) {
1235    return false;
1236  }
1237
1238  // Avoid having the chardet ask for another restart after this restart
1239  // request.
1240  mFeedChardet = false;
1241  mTreeBuilder->NeedsCharsetSwitchTo(newEncoding, kCharsetFromMetaTag);
1242  FlushTreeOpsAndDisarmTimer();
1243  Interrupt();
1244  // the tree op executor will cause the stream parser to terminate
1245  // if the charset switch request is accepted or it'll uninterrupt 
1246  // if the request failed. Note that if the restart request fails,
1247  // we don't bother trying to make chardet resume. Might as well
1248  // assume that chardet-requested restarts would fail, too.
1249  return true;
1250}
1251
1252void
1253nsHtml5StreamParser::FlushTreeOpsAndDisarmTimer()
1254{
1255  NS_ASSERTION(IsParserThread(), "Wrong thread!");
1256  if (mFlushTimerArmed) {
1257    // avoid calling Cancel if the flush timer isn't armed to avoid acquiring
1258    // a mutex
1259    mFlushTimer->Cancel();
1260    mFlushTimerArmed = false;
1261  }
1262  if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1263    mTokenizer->FlushViewSource();
1264  }
1265  mTreeBuilder->Flush();
1266  if (NS_FAILED(NS_DispatchToMainThread(mExecutorFlusher))) {
1267    NS_WARNING("failed to dispatch executor flush event");
1268  }
1269}
1270
1271void
1272nsHtml5StreamParser::ParseAvailableData()
1273{
1274  NS_ASSERTION(IsParserThread(), "Wrong thread!");
1275  mTokenizerMutex.AssertCurrentThreadOwns();
1276
1277  if (IsTerminatedOrInterrupted()) {
1278    return;
1279  }
1280  
1281  for (;;) {
1282    if (!mFirstBuffer->hasMore()) {
1283      if (mFirstBuffer == mLastBuffer) {
1284        switch (mStreamState) {
1285          case STREAM_BEING_READ:
1286            // never release the last buffer.
1287            if (!mSpeculating) {
1288              // reuse buffer space if not speculating
1289              mFirstBuffer->setStart(0);
1290              mFirstBuffer->setEnd(0);
1291            }
1292            mTreeBuilder->FlushLoads();
1293            // Dispatch this runnable unconditionally, because the loads
1294            // that need flushing may have been flushed earlier even if the
1295            // flush right above here did nothing.
1296            if (NS_FAILED(NS_DispatchToMainThread(mLoadFlusher))) {
1297              NS_WARNING("failed to dispatch load flush event");
1298            }
1299            return; // no more data for now but expecting more
1300          case STREAM_ENDED:
1301            if (mAtEOF) {
1302              return;
1303            }
1304            mAtEOF = true;
1305            mTokenizer->eof();
1306            mTreeBuilder->StreamEnded();
1307            if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1308              mTokenizer->EndViewSource();
1309            }
1310            FlushTreeOpsAndDisarmTimer();
1311            return; // no more data and not expecting more
1312          default:
1313            NS_NOTREACHED("It should be impossible to reach this.");
1314            return;
1315        }
1316      }
1317      mFirstBuffer = mFirstBuffer->next;
1318      continue;
1319    }
1320
1321    // now we have a non-empty buffer
1322    mFirstBuffer->adjust(mLastWasCR);
1323    mLastWasCR = false;
1324    if (mFirstBuffer->hasMore()) {
1325      mLastWasCR = mTokenizer->tokenizeBuffer(mFirstBuffer);
1326      // At this point, internalEncodingDeclaration() may have called 
1327      // Terminate, but that never happens together with script.
1328      // Can't assert that here, though, because it's possible that the main
1329      // thread has called Terminate() while this thread was parsing.
1330      if (mMode == NORMAL && mTreeBuilder->HasScript()) {
1331        mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
1332        nsHtml5Speculation* speculation = 
1333          new nsHtml5Speculation(mFirstBuffer,
1334                                 mFirstBuffer->getStart(),
1335                                 mTokenizer->getLineNumber(),
1336                                 mTreeBuilder->newSnapshot());
1337        mTreeBuilder->AddSnapshotToScript(speculation->GetSnapshot(), 
1338                                          speculation->GetStartLineNumber());
1339        FlushTreeOpsAndDisarmTimer();
1340        mTreeBuilder->SetOpSink(speculation);
1341        mSpeculations.AppendElement(speculation); // adopts the pointer
1342        mSpeculating = true;
1343      }
1344      if (IsTerminatedOrInterrupted()) {
1345        return;
1346      }
1347    }
1348    continue;
1349  }
1350}
1351
1352class nsHtml5StreamParserContinuation : public nsRunnable
1353{
1354private:
1355  nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1356public:
1357  nsHtml5StreamParserContinuation(nsHtml5StreamParser* aStreamParser)
1358    : mStreamParser(aStreamParser)
1359  {}
1360  NS_IMETHODIMP Run()
1361  {
1362    mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1363    mStreamParser->Uninterrupt();
1364    mStreamParser->ParseAvailableData();
1365    return NS_OK;
1366  }
1367};
1368
1369void
1370nsHtml5StreamParser::ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer, 
1371                                          nsHtml5TreeBuilder* aTreeBuilder,
1372                                          bool aLastWasCR)
1373{
1374  NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1375  NS_ASSERTION(!(mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML),
1376      "ContinueAfterScripts called in view source mode!");
1377  if (mExecutor->IsBroken()) {
1378    return;
1379  }
1380  #ifdef DEBUG
1381    mExecutor->AssertStageEmpty();
1382  #endif
1383  bool speculationFailed = false;
1384  {
1385    mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
1386    if (mSpeculations.IsEmpty()) {
1387      NS_NOTREACHED("ContinueAfterScripts called without speculations.");
1388      return;
1389    }
1390    nsHtml5Speculation* speculation = mSpec

Large files are truncated, but you can click here to view the full file