diff options
Diffstat (limited to 'parser/htmlparser/nsScanner.cpp')
-rw-r--r-- | parser/htmlparser/nsScanner.cpp | 409 |
1 files changed, 409 insertions, 0 deletions
diff --git a/parser/htmlparser/nsScanner.cpp b/parser/htmlparser/nsScanner.cpp new file mode 100644 index 0000000000..620764590a --- /dev/null +++ b/parser/htmlparser/nsScanner.cpp @@ -0,0 +1,409 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=2 sw=2 et tw=78: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +//#define __INCREMENTAL 1 + +#include "mozilla/Attributes.h" +#include "mozilla/DebugOnly.h" + +#include "nsScanner.h" +#include "nsDebug.h" +#include "nsReadableUtils.h" +#include "nsIInputStream.h" +#include "nsIFile.h" +#include "nsUTF8Utils.h" // for LossyConvertEncoding +#include "nsCRT.h" +#include "nsParser.h" +#include "nsCharsetSource.h" + +#include "mozilla/dom/EncodingUtils.h" + +using mozilla::dom::EncodingUtils; + +nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars) : + mChars(aTerminateChars), mFilter(char16_t(~0)) // All bits set +{ + // Build filter that will be used to filter out characters with + // bits that none of the terminal chars have. This works very well + // because terminal chars often have only the last 4-6 bits set and + // normal ascii letters have bit 7 set. Other letters have even higher + // bits set. + + // Calculate filter + const char16_t *current = aTerminateChars; + char16_t terminalChar = *current; + while (terminalChar) { + mFilter &= ~terminalChar; + ++current; + terminalChar = *current; + } +} + +/** + * Use this constructor if you want i/o to be based on + * a single string you hand in during construction. + * This short cut was added for Javascript. + * + * @update gess 5/12/98 + * @param aMode represents the parser mode (nav, other) + * @return + */ +nsScanner::nsScanner(const nsAString& anHTMLString) +{ + MOZ_COUNT_CTOR(nsScanner); + + mSlidingBuffer = nullptr; + if (AppendToBuffer(anHTMLString)) { + mSlidingBuffer->BeginReading(mCurrentPosition); + } else { + /* XXX see hack below, re: bug 182067 */ + memset(&mCurrentPosition, 0, sizeof(mCurrentPosition)); + mEndPosition = mCurrentPosition; + } + mMarkPosition = mCurrentPosition; + mIncremental = false; + mUnicodeDecoder = nullptr; + mCharsetSource = kCharsetUninitialized; +} + +/** + * Use this constructor if you want i/o to be based on strings + * the scanner receives. If you pass a null filename, you + * can still provide data to the scanner via append. + */ +nsScanner::nsScanner(nsString& aFilename, bool aCreateStream) + : mFilename(aFilename) +{ + MOZ_COUNT_CTOR(nsScanner); + NS_ASSERTION(!aCreateStream, "This is always true."); + + mSlidingBuffer = nullptr; + + // XXX This is a big hack. We need to initialize the iterators to something. + // What matters is that mCurrentPosition == mEndPosition, so that our methods + // believe that we are at EOF (see bug 182067). We null out mCurrentPosition + // so that we have some hope of catching null pointer dereferences associated + // with this hack. --darin + memset(&mCurrentPosition, 0, sizeof(mCurrentPosition)); + mMarkPosition = mCurrentPosition; + mEndPosition = mCurrentPosition; + + mIncremental = true; + + mUnicodeDecoder = nullptr; + mCharsetSource = kCharsetUninitialized; + // XML defaults to UTF-8 and about:blank is UTF-8, too. + SetDocumentCharset(NS_LITERAL_CSTRING("UTF-8"), kCharsetFromDocTypeDefault); +} + +nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSource) +{ + if (aSource < mCharsetSource) // priority is lower than the current one + return NS_OK; + + mCharsetSource = aSource; + + nsCString charsetName; + mozilla::DebugOnly<bool> valid = + EncodingUtils::FindEncodingForLabel(aCharset, charsetName); + MOZ_ASSERT(valid, "Should never call with a bogus aCharset."); + + if (!mCharset.IsEmpty() && charsetName.Equals(mCharset)) { + return NS_OK; // no difference, don't change it + } + + // different, need to change it + + mCharset.Assign(charsetName); + + mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset); + mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal); + + return NS_OK; +} + + +/** + * default destructor + * + * @update gess 3/25/98 + * @param + * @return + */ +nsScanner::~nsScanner() { + + delete mSlidingBuffer; + + MOZ_COUNT_DTOR(nsScanner); +} + +/** + * Resets current offset position of input stream to marked position. + * This allows us to back up to this point if the need should arise, + * such as when tokenization gets interrupted. + * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST! + * + * @update gess 5/12/98 + * @param + * @return + */ +void nsScanner::RewindToMark(void){ + if (mSlidingBuffer) { + mCurrentPosition = mMarkPosition; + } +} + + +/** + * Records current offset position in input stream. This allows us + * to back up to this point if the need should arise, such as when + * tokenization gets interrupted. + * + * @update gess 7/29/98 + * @param + * @return + */ +int32_t nsScanner::Mark() { + int32_t distance = 0; + if (mSlidingBuffer) { + nsScannerIterator oldStart; + mSlidingBuffer->BeginReading(oldStart); + + distance = Distance(oldStart, mCurrentPosition); + + mSlidingBuffer->DiscardPrefix(mCurrentPosition); + mSlidingBuffer->BeginReading(mCurrentPosition); + mMarkPosition = mCurrentPosition; + } + + return distance; +} + +/** + * Insert data to our underlying input buffer as + * if it were read from an input stream. + * + * @update harishd 01/12/99 + * @return error code + */ +bool nsScanner::UngetReadable(const nsAString& aBuffer) { + if (!mSlidingBuffer) { + return false; + } + + mSlidingBuffer->UngetReadable(aBuffer,mCurrentPosition); + mSlidingBuffer->BeginReading(mCurrentPosition); // Insertion invalidated our iterators + mSlidingBuffer->EndReading(mEndPosition); + + return true; +} + +/** + * Append data to our underlying input buffer as + * if it were read from an input stream. + * + * @update gess4/3/98 + * @return error code + */ +nsresult nsScanner::Append(const nsAString& aBuffer) { + if (!AppendToBuffer(aBuffer)) + return NS_ERROR_OUT_OF_MEMORY; + return NS_OK; +} + +/** + * + * + * @update gess 5/21/98 + * @param + * @return + */ +nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen) +{ + nsresult res = NS_OK; + if (mUnicodeDecoder) { + int32_t unicharBufLen = 0; + + nsresult rv = mUnicodeDecoder->GetMaxLength(aBuffer, aLen, &unicharBufLen); + if (NS_WARN_IF(NS_FAILED(rv))) { + return rv; + } + + nsScannerString::Buffer* buffer = nsScannerString::AllocBuffer(unicharBufLen + 1); + NS_ENSURE_TRUE(buffer,NS_ERROR_OUT_OF_MEMORY); + char16_t *unichars = buffer->DataStart(); + + int32_t totalChars = 0; + int32_t unicharLength = unicharBufLen; + + do { + int32_t srcLength = aLen; + res = mUnicodeDecoder->Convert(aBuffer, &srcLength, unichars, &unicharLength); + + totalChars += unicharLength; + // Continuation of failure case + if(NS_FAILED(res)) { + // if we failed, we consume one byte, replace it with the replacement + // character and try the conversion again. + + // This is only needed because some decoders don't follow the + // nsIUnicodeDecoder contract: they return a failure when *aDestLength + // is 0 rather than the correct NS_OK_UDEC_MOREOUTPUT. See bug 244177 + if ((unichars + unicharLength) >= buffer->DataEnd()) { + NS_ERROR("Unexpected end of destination buffer"); + break; + } + + // Since about:blank is empty, this line runs only for XML. Use a + // character that's illegal in XML instead of U+FFFD in order to make + // expat flag the error. + unichars[unicharLength++] = 0xFFFF; + + unichars = unichars + unicharLength; + unicharLength = unicharBufLen - (++totalChars); + + mUnicodeDecoder->Reset(); + + if(((uint32_t) (srcLength + 1)) > aLen) { + srcLength = aLen; + } + else { + ++srcLength; + } + + aBuffer += srcLength; + aLen -= srcLength; + } + } while (NS_FAILED(res) && (aLen > 0)); + + buffer->SetDataLength(totalChars); + // Don't propagate return code of unicode decoder + // since it doesn't reflect on our success or failure + // - Ref. bug 87110 + res = NS_OK; + if (!AppendToBuffer(buffer)) + res = NS_ERROR_OUT_OF_MEMORY; + } + else { + NS_WARNING("No decoder found."); + res = NS_ERROR_FAILURE; + } + + return res; +} + +/** + * retrieve next char from scanners internal input stream + * + * @update gess 3/25/98 + * @param + * @return error code reflecting read status + */ +nsresult nsScanner::GetChar(char16_t& aChar) { + if (!mSlidingBuffer || mCurrentPosition == mEndPosition) { + aChar = 0; + return kEOF; + } + + aChar = *mCurrentPosition++; + + return NS_OK; +} + + +void nsScanner::BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd) +{ + aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd); +} + +void nsScanner::CurrentPosition(nsScannerIterator& aPosition) +{ + aPosition = mCurrentPosition; +} + +void nsScanner::EndReading(nsScannerIterator& aPosition) +{ + aPosition = mEndPosition; +} + +void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate) +{ + if (mSlidingBuffer) { + mCurrentPosition = aPosition; + if (aTerminate && (mCurrentPosition == mEndPosition)) { + mMarkPosition = mCurrentPosition; + mSlidingBuffer->DiscardPrefix(mCurrentPosition); + } + } +} + +bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf) +{ + if (!mSlidingBuffer) { + mSlidingBuffer = new nsScannerString(aBuf); + if (!mSlidingBuffer) + return false; + mSlidingBuffer->BeginReading(mCurrentPosition); + mMarkPosition = mCurrentPosition; + mSlidingBuffer->EndReading(mEndPosition); + } + else { + mSlidingBuffer->AppendBuffer(aBuf); + if (mCurrentPosition == mEndPosition) { + mSlidingBuffer->BeginReading(mCurrentPosition); + } + mSlidingBuffer->EndReading(mEndPosition); + } + + return true; +} + +/** + * call this to copy bytes out of the scanner that have not yet been consumed + * by the tokenization process. + * + * @update gess 5/12/98 + * @param aCopyBuffer is where the scanner buffer will be copied to + * @return true if OK or false on OOM + */ +bool nsScanner::CopyUnusedData(nsString& aCopyBuffer) { + if (!mSlidingBuffer) { + aCopyBuffer.Truncate(); + return true; + } + + nsScannerIterator start, end; + start = mCurrentPosition; + end = mEndPosition; + + return CopyUnicodeTo(start, end, aCopyBuffer); +} + +/** + * Retrieve the name of the file that the scanner is reading from. + * In some cases, it's just a given name, because the scanner isn't + * really reading from a file. + * + * @update gess 5/12/98 + * @return + */ +nsString& nsScanner::GetFilename(void) { + return mFilename; +} + +/** + * Conduct self test. Actually, selftesting for this class + * occurs in the parser selftest. + * + * @update gess 3/25/98 + * @param + * @return + */ + +void nsScanner::SelfTest(void) { +#ifdef _DEBUG +#endif +} |