diff options
Diffstat (limited to 'parser/html/javasrc/MetaScanner.java')
-rw-r--r-- | parser/html/javasrc/MetaScanner.java | 854 |
1 files changed, 854 insertions, 0 deletions
diff --git a/parser/html/javasrc/MetaScanner.java b/parser/html/javasrc/MetaScanner.java new file mode 100644 index 0000000000..be9aabfe33 --- /dev/null +++ b/parser/html/javasrc/MetaScanner.java @@ -0,0 +1,854 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * Copyright (c) 2008-2015 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.impl; + +import java.io.IOException; + +import nu.validator.htmlparser.annotation.Auto; +import nu.validator.htmlparser.annotation.Inline; +import nu.validator.htmlparser.common.ByteReadable; + +import org.xml.sax.SAXException; + +public abstract class MetaScanner { + + /** + * Constant for "charset". + */ + private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' }; + + /** + * Constant for "content". + */ + private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' }; + + /** + * Constant for "http-equiv". + */ + private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q', + 'u', 'i', 'v' }; + + /** + * Constant for "content-type". + */ + private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n', + 't', '-', 't', 'y', 'p', 'e' }; + + private static final int NO = 0; + + private static final int M = 1; + + private static final int E = 2; + + private static final int T = 3; + + private static final int A = 4; + + private static final int DATA = 0; + + private static final int TAG_OPEN = 1; + + private static final int SCAN_UNTIL_GT = 2; + + private static final int TAG_NAME = 3; + + private static final int BEFORE_ATTRIBUTE_NAME = 4; + + private static final int ATTRIBUTE_NAME = 5; + + private static final int AFTER_ATTRIBUTE_NAME = 6; + + private static final int BEFORE_ATTRIBUTE_VALUE = 7; + + private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8; + + private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9; + + private static final int ATTRIBUTE_VALUE_UNQUOTED = 10; + + private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11; + + private static final int MARKUP_DECLARATION_OPEN = 13; + + private static final int MARKUP_DECLARATION_HYPHEN = 14; + + private static final int COMMENT_START = 15; + + private static final int COMMENT_START_DASH = 16; + + private static final int COMMENT = 17; + + private static final int COMMENT_END_DASH = 18; + + private static final int COMMENT_END = 19; + + private static final int SELF_CLOSING_START_TAG = 20; + + private static final int HTTP_EQUIV_NOT_SEEN = 0; + + private static final int HTTP_EQUIV_CONTENT_TYPE = 1; + + private static final int HTTP_EQUIV_OTHER = 2; + + /** + * The data source. + */ + protected ByteReadable readable; + + /** + * The state of the state machine that recognizes the tag name "meta". + */ + private int metaState = NO; + + /** + * The current position in recognizing the attribute name "content". + */ + private int contentIndex = Integer.MAX_VALUE; + + /** + * The current position in recognizing the attribute name "charset". + */ + private int charsetIndex = Integer.MAX_VALUE; + + /** + * The current position in recognizing the attribute name "http-equive". + */ + private int httpEquivIndex = Integer.MAX_VALUE; + + /** + * The current position in recognizing the attribute value "content-type". + */ + private int contentTypeIndex = Integer.MAX_VALUE; + + /** + * The tokenizer state. + */ + protected int stateSave = DATA; + + /** + * The currently filled length of strBuf. + */ + private int strBufLen; + + /** + * Accumulation buffer for attribute values. + */ + private @Auto char[] strBuf; + + private String content; + + private String charset; + + private int httpEquivState; + + // CPPONLY: private TreeBuilder treeBuilder; + + public MetaScanner( + // CPPONLY: TreeBuilder tb + ) { + this.readable = null; + this.metaState = NO; + this.contentIndex = Integer.MAX_VALUE; + this.charsetIndex = Integer.MAX_VALUE; + this.httpEquivIndex = Integer.MAX_VALUE; + this.contentTypeIndex = Integer.MAX_VALUE; + this.stateSave = DATA; + this.strBufLen = 0; + this.strBuf = new char[36]; + this.content = null; + this.charset = null; + this.httpEquivState = HTTP_EQUIV_NOT_SEEN; + // CPPONLY: this.treeBuilder = tb; + } + + @SuppressWarnings("unused") private void destructor() { + Portability.releaseString(content); + Portability.releaseString(charset); + } + + // [NOCPP[ + + /** + * Reads a byte from the data source. + * + * -1 means end. + * @return + * @throws IOException + */ + protected int read() throws IOException { + return readable.readByte(); + } + + // ]NOCPP] + + // WARNING When editing this, makes sure the bytecode length shown by javap + // stays under 8000 bytes! + /** + * The runs the meta scanning algorithm. + */ + protected final void stateLoop(int state) + throws SAXException, IOException { + int c = -1; + boolean reconsume = false; + stateloop: for (;;) { + switch (state) { + case DATA: + dataloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + switch (c) { + case -1: + break stateloop; + case '<': + state = MetaScanner.TAG_OPEN; + break dataloop; // FALL THROUGH continue + // stateloop; + default: + continue; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case TAG_OPEN: + tagopenloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case 'm': + case 'M': + metaState = M; + state = MetaScanner.TAG_NAME; + break tagopenloop; + // continue stateloop; + case '!': + state = MetaScanner.MARKUP_DECLARATION_OPEN; + continue stateloop; + case '?': + case '/': + state = MetaScanner.SCAN_UNTIL_GT; + continue stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + default: + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { + metaState = NO; + state = MetaScanner.TAG_NAME; + break tagopenloop; + // continue stateloop; + } + state = MetaScanner.DATA; + reconsume = true; + continue stateloop; + } + } + // FALL THROUGH DON'T REORDER + case TAG_NAME: + tagnameloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; + break tagnameloop; + // continue stateloop; + case '/': + state = MetaScanner.SELF_CLOSING_START_TAG; + continue stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + case 'e': + case 'E': + if (metaState == M) { + metaState = E; + } else { + metaState = NO; + } + continue; + case 't': + case 'T': + if (metaState == E) { + metaState = T; + } else { + metaState = NO; + } + continue; + case 'a': + case 'A': + if (metaState == T) { + metaState = A; + } else { + metaState = NO; + } + continue; + default: + metaState = NO; + continue; + } + } + // FALLTHRU DON'T REORDER + case BEFORE_ATTRIBUTE_NAME: + beforeattributenameloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + /* + * Consume the next input character: + */ + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + continue; + case '/': + state = MetaScanner.SELF_CLOSING_START_TAG; + continue stateloop; + case '>': + if (handleTag()) { + break stateloop; + } + state = DATA; + continue stateloop; + case 'c': + case 'C': + contentIndex = 0; + charsetIndex = 0; + httpEquivIndex = Integer.MAX_VALUE; + contentTypeIndex = Integer.MAX_VALUE; + state = MetaScanner.ATTRIBUTE_NAME; + break beforeattributenameloop; + case 'h': + case 'H': + contentIndex = Integer.MAX_VALUE; + charsetIndex = Integer.MAX_VALUE; + httpEquivIndex = 0; + contentTypeIndex = Integer.MAX_VALUE; + state = MetaScanner.ATTRIBUTE_NAME; + break beforeattributenameloop; + default: + contentIndex = Integer.MAX_VALUE; + charsetIndex = Integer.MAX_VALUE; + httpEquivIndex = Integer.MAX_VALUE; + contentTypeIndex = Integer.MAX_VALUE; + state = MetaScanner.ATTRIBUTE_NAME; + break beforeattributenameloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case ATTRIBUTE_NAME: + attributenameloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + state = MetaScanner.AFTER_ATTRIBUTE_NAME; + continue stateloop; + case '/': + state = MetaScanner.SELF_CLOSING_START_TAG; + continue stateloop; + case '=': + strBufLen = 0; + contentTypeIndex = 0; + state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; + break attributenameloop; + // continue stateloop; + case '>': + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + default: + if (metaState == A) { + if (c >= 'A' && c <= 'Z') { + c += 0x20; + } + if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) { + ++contentIndex; + } else { + contentIndex = Integer.MAX_VALUE; + } + if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) { + ++charsetIndex; + } else { + charsetIndex = Integer.MAX_VALUE; + } + if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) { + ++httpEquivIndex; + } else { + httpEquivIndex = Integer.MAX_VALUE; + } + } + continue; + } + } + // FALLTHRU DON'T REORDER + case BEFORE_ATTRIBUTE_VALUE: + beforeattributevalueloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + continue; + case '"': + state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED; + break beforeattributevalueloop; + // continue stateloop; + case '\'': + state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED; + continue stateloop; + case '>': + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + default: + handleCharInAttributeValue(c); + state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED; + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case ATTRIBUTE_VALUE_DOUBLE_QUOTED: + attributevaluedoublequotedloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + switch (c) { + case -1: + break stateloop; + case '"': + handleAttributeValue(); + state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; + break attributevaluedoublequotedloop; + // continue stateloop; + default: + handleCharInAttributeValue(c); + continue; + } + } + // FALLTHRU DON'T REORDER + case AFTER_ATTRIBUTE_VALUE_QUOTED: + afterattributevaluequotedloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; + continue stateloop; + case '/': + state = MetaScanner.SELF_CLOSING_START_TAG; + break afterattributevaluequotedloop; + // continue stateloop; + case '>': + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + default: + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; + reconsume = true; + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case SELF_CLOSING_START_TAG: + c = read(); + switch (c) { + case -1: + break stateloop; + case '>': + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + default: + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; + reconsume = true; + continue stateloop; + } + // XXX reorder point + case ATTRIBUTE_VALUE_UNQUOTED: + for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + + case '\u000C': + handleAttributeValue(); + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; + continue stateloop; + case '>': + handleAttributeValue(); + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + default: + handleCharInAttributeValue(c); + continue; + } + } + // XXX reorder point + case AFTER_ATTRIBUTE_NAME: + for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + continue; + case '/': + handleAttributeValue(); + state = MetaScanner.SELF_CLOSING_START_TAG; + continue stateloop; + case '=': + strBufLen = 0; + contentTypeIndex = 0; + state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; + continue stateloop; + case '>': + handleAttributeValue(); + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + case 'c': + case 'C': + contentIndex = 0; + charsetIndex = 0; + state = MetaScanner.ATTRIBUTE_NAME; + continue stateloop; + default: + contentIndex = Integer.MAX_VALUE; + charsetIndex = Integer.MAX_VALUE; + state = MetaScanner.ATTRIBUTE_NAME; + continue stateloop; + } + } + // XXX reorder point + case MARKUP_DECLARATION_OPEN: + markupdeclarationopenloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.MARKUP_DECLARATION_HYPHEN; + break markupdeclarationopenloop; + // continue stateloop; + default: + state = MetaScanner.SCAN_UNTIL_GT; + reconsume = true; + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case MARKUP_DECLARATION_HYPHEN: + markupdeclarationhyphenloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.COMMENT_START; + break markupdeclarationhyphenloop; + // continue stateloop; + default: + state = MetaScanner.SCAN_UNTIL_GT; + reconsume = true; + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case COMMENT_START: + commentstartloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.COMMENT_START_DASH; + continue stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + default: + state = MetaScanner.COMMENT; + break commentstartloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case COMMENT: + commentloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.COMMENT_END_DASH; + break commentloop; + // continue stateloop; + default: + continue; + } + } + // FALLTHRU DON'T REORDER + case COMMENT_END_DASH: + commentenddashloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.COMMENT_END; + break commentenddashloop; + // continue stateloop; + default: + state = MetaScanner.COMMENT; + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case COMMENT_END: + for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + case '-': + continue; + default: + state = MetaScanner.COMMENT; + continue stateloop; + } + } + // XXX reorder point + case COMMENT_START_DASH: + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.COMMENT_END; + continue stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + default: + state = MetaScanner.COMMENT; + continue stateloop; + } + // XXX reorder point + case ATTRIBUTE_VALUE_SINGLE_QUOTED: + for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + switch (c) { + case -1: + break stateloop; + case '\'': + handleAttributeValue(); + state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; + continue stateloop; + default: + handleCharInAttributeValue(c); + continue; + } + } + // XXX reorder point + case SCAN_UNTIL_GT: + for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + switch (c) { + case -1: + break stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + default: + continue; + } + } + } + } + stateSave = state; + } + + private void handleCharInAttributeValue(int c) { + if (metaState == A) { + if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) { + addToBuffer(c); + } else if (httpEquivIndex == HTTP_EQUIV.length) { + if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) { + ++contentTypeIndex; + } else { + contentTypeIndex = Integer.MAX_VALUE; + } + } + } + } + + @Inline private int toAsciiLowerCase(int c) { + if (c >= 'A' && c <= 'Z') { + return c + 0x20; + } + return c; + } + + /** + * Adds a character to the accumulation buffer. + * @param c the character to add + */ + private void addToBuffer(int c) { + if (strBufLen == strBuf.length) { + char[] newBuf = new char[strBuf.length + (strBuf.length << 1)]; + System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); + strBuf = newBuf; + } + strBuf[strBufLen++] = (char)c; + } + + /** + * Attempts to extract a charset name from the accumulation buffer. + * @return <code>true</code> if successful + * @throws SAXException + */ + private void handleAttributeValue() throws SAXException { + if (metaState != A) { + return; + } + if (contentIndex == CONTENT.length && content == null) { + content = Portability.newStringFromBuffer(strBuf, 0, strBufLen + // CPPONLY: , treeBuilder + ); + return; + } + if (charsetIndex == CHARSET.length && charset == null) { + charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen + // CPPONLY: , treeBuilder + ); + return; + } + if (httpEquivIndex == HTTP_EQUIV.length + && httpEquivState == HTTP_EQUIV_NOT_SEEN) { + httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE + : HTTP_EQUIV_OTHER; + return; + } + } + + private boolean handleTag() throws SAXException { + boolean stop = handleTagInner(); + Portability.releaseString(content); + content = null; + Portability.releaseString(charset); + charset = null; + httpEquivState = HTTP_EQUIV_NOT_SEEN; + return stop; + } + + private boolean handleTagInner() throws SAXException { + if (charset != null && tryCharset(charset)) { + return true; + } + if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) { + String extract = TreeBuilder.extractCharsetFromContent(content + // CPPONLY: , treeBuilder + ); + if (extract == null) { + return false; + } + boolean success = tryCharset(extract); + Portability.releaseString(extract); + return success; + } + return false; + } + + /** + * Tries to switch to an encoding. + * + * @param encoding + * @return <code>true</code> if successful + * @throws SAXException + */ + protected abstract boolean tryCharset(String encoding) throws SAXException; + +} |