| /* |
| * Copyright (C) 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.google.streamhtmlparser.impl; |
| |
| import com.google.common.collect.Maps; |
| import com.google.streamhtmlparser.ExternalState; |
| import com.google.streamhtmlparser.JavascriptParser; |
| import com.google.streamhtmlparser.util.HtmlUtils; |
| import com.google.streamhtmlparser.util.JavascriptTokenBuffer; |
| |
| import java.util.Map; |
| |
| /** |
| * <p>Many comments copied almost verbatim from the original C version. |
| */ |
| public class JavascriptParserImpl extends GenericParser |
| implements JavascriptParser { |
| |
| final static InternalState JS_TEXT; |
| final static InternalState JS_Q; |
| final static InternalState JS_Q_E; |
| final static InternalState JS_DQ; |
| final static InternalState JS_DQ_E; |
| final static InternalState JS_SLASH; |
| final static InternalState JS_REGEXP_SLASH; |
| final static InternalState JS_REGEXP; |
| final static InternalState JS_REGEXP_BRK; |
| final static InternalState JS_REGEXP_BRK_E; |
| final static InternalState JS_REGEXP_E; |
| final static InternalState JS_COM_LN; |
| final static InternalState JS_COM_ML; |
| final static InternalState JS_COM_ML_CLOSE; |
| final static InternalState JS_COM_AFTER; |
| |
| static { |
| JS_TEXT = InternalState.getInstanceJavascript("JS_TEXT"); |
| JS_Q = InternalState.getInstanceJavascript("JS_Q"); |
| JS_Q_E = InternalState.getInstanceJavascript("JS_Q_E"); |
| JS_DQ = InternalState.getInstanceJavascript("JS_DQ"); |
| JS_DQ_E = InternalState.getInstanceJavascript("JS_DQ_E"); |
| JS_SLASH = InternalState.getInstanceJavascript("JS_SLASH"); |
| JS_REGEXP = InternalState.getInstanceJavascript("JS_REGEXP"); |
| JS_REGEXP_SLASH = InternalState.getInstanceJavascript("JS_REGEXP_SLASH"); |
| JS_REGEXP_E = InternalState.getInstanceJavascript("JS_REGEXP_E"); |
| JS_REGEXP_BRK = InternalState.getInstanceJavascript("JS_REGEXP_BRK"); |
| JS_REGEXP_BRK_E = InternalState.getInstanceJavascript("JS_REGEXP_BRK_E"); |
| JS_COM_LN = InternalState.getInstanceJavascript("COMMENT_LN"); |
| JS_COM_ML = InternalState.getInstanceJavascript("COMMENT_ML"); |
| JS_COM_ML_CLOSE = InternalState.getInstanceJavascript("COMMENT_ML_CLOSE"); |
| JS_COM_AFTER = InternalState.getInstanceJavascript("COMMENT_AFTER"); |
| } |
| |
| private static final Map<InternalState, ExternalState> STATE_MAPPING = |
| Maps.newHashMap(); |
| static { |
| initializeStateMapping(); |
| } |
| |
| private static final ParserStateTable STATE_TABLE = new ParserStateTable(); |
| static { |
| initializeParserStateTable(); |
| } |
| |
| private final JavascriptTokenBuffer ccBuffer; |
| |
| /** |
| * Creates a {@code JavascriptParserImpl} object. |
| */ |
| public JavascriptParserImpl() { |
| super(STATE_TABLE, STATE_MAPPING, JS_TEXT); |
| ccBuffer = new JavascriptTokenBuffer(); |
| } |
| |
| /** |
| * Creates a {@code JavascriptParserImpl} object that is a copy |
| * of the one provided. |
| * |
| * @param aJavascriptParserImpl the {@code JavascriptParserImpl} to copy |
| */ |
| public JavascriptParserImpl(JavascriptParserImpl aJavascriptParserImpl) { |
| super(aJavascriptParserImpl); |
| ccBuffer = new JavascriptTokenBuffer(aJavascriptParserImpl.ccBuffer); |
| } |
| |
| @Override |
| public void reset() { |
| super.reset(); |
| currentState = JS_TEXT; |
| } |
| |
| @Override |
| protected InternalState handleEnterState(InternalState currentState, |
| InternalState expectedNextState, |
| char input) { |
| InternalState nextState = expectedNextState; |
| if (currentState == JS_SLASH) { |
| nextState = enterStateJsSlash(currentState, input); |
| } else if (currentState == JS_COM_AFTER) { |
| enterStateJsCommentAfter(); |
| } |
| return nextState; |
| } |
| |
| @Override |
| protected InternalState handleExitState(InternalState currentState, |
| InternalState expectedNextState, |
| char input) { |
| // Nothing to do - no handlers for exit states |
| return expectedNextState; |
| } |
| |
| @Override |
| protected InternalState handleInState(InternalState currentState, |
| char input) { |
| if (currentState == JS_TEXT) { |
| inStateJsText(input); |
| } |
| return currentState; |
| } |
| |
| /** |
| * Called every time we find a slash ('/') character in the javascript |
| * text (except for slashes that close comments or regexp literals). |
| * |
| * <p>Comment copied verbatim from the corresponding C-version. |
| * |
| * <p>Implements the logic to figure out if this slash character is a |
| * division operator or if it opens a regular expression literal. |
| * This is heavily inspired by the syntactic resynchronization |
| * for javascript 2.0: |
| * |
| * <p>When we receive a '/', we look at the previous non space character |
| * to figure out if it's the ending of a punctuator that can precede a |
| * regexp literal, in which case we assume the current '/' is part of a |
| * regular expression literal (or the opening of a javascript comment, |
| * but that part is dealt with in the state machine). The exceptions to |
| * this are unary operators, so we look back a second character to rule |
| * out '++' and '--'. |
| * |
| * <p> Although it is not straightforward to figure out if the binary |
| * operator is a postfix of the previous expression or a prefix of the |
| * regular expression, we rule out the later as it is an uncommon practice. |
| * |
| * <p>If we ruled out the previous token to be a valid regexp preceding |
| * punctuator, we extract the last identifier in the buffer and match |
| * against a list of keywords that are known to precede expressions in |
| * the grammar. If we get a match on any of these keywords, then we are |
| * opening a regular expression, if not, then we have a division operator. |
| * |
| * <p>Known cases that are accepted by the grammar but we handle |
| * differently, although I (falmeida) don't believe there is a |
| * legitimate usage for those: |
| * Division of a regular expression: var result = /test/ / 5; |
| * Prefix unary increment of a regular expression: var result = ++/test/; |
| * Division of an object literal: { a: 1 } /x/.exec('x'); |
| * |
| * @param state being entered to |
| * @param input character being processed |
| * @return state next state to go to, may be the same as the one we |
| * were called with |
| * |
| * <a>http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html> |
| * Syntactic Resynchronization</a> |
| */ |
| private InternalState enterStateJsSlash(InternalState state, char input) { |
| |
| InternalState nextState = state; |
| int position = -1; |
| |
| // Consume the last whitespace |
| if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(position))) { |
| --position; |
| } |
| |
| switch (ccBuffer.getChar(position)) { |
| // Ignore unary increment |
| case '+': |
| if (ccBuffer.getChar(position - 1) != '+') { |
| nextState = JS_REGEXP_SLASH; |
| } |
| break; |
| case '-': |
| // Ignore unary decrement |
| if (ccBuffer.getChar(position - 1) != '-') { |
| nextState = JS_REGEXP_SLASH; |
| } |
| break; |
| // List of punctuator endings except ), ], }, + and - * |
| case '=': |
| case '<': |
| case '>': |
| case '&': |
| case '|': |
| case '!': |
| case '%': |
| case '*': |
| case '/': |
| case ',': |
| case ';': |
| case '?': |
| case ':': |
| case '^': |
| case '~': |
| case '{': |
| case '(': |
| case '[': |
| case '}': |
| case '\0': |
| nextState = JS_REGEXP_SLASH; |
| break; |
| default: |
| String lastIdentifier = ccBuffer.getLastIdentifier(); |
| if (lastIdentifier != null && HtmlUtils |
| .isJavascriptRegexpPrefix(lastIdentifier)) { |
| nextState = JS_REGEXP_SLASH; |
| } |
| } |
| ccBuffer.appendChar(input); |
| return nextState; |
| } |
| |
| /** |
| * Called at the end of a javascript comment. |
| * |
| * <p>When we open a comment, the initial '/' was inserted into the ring |
| * buffer, but it is not a token and should be considered whitespace |
| * for parsing purposes. |
| * |
| * <p>When we first saw the '/' character, we didn't yet know if it was |
| * the beginning of a comment, a division operator, or a regexp. |
| * |
| * <p>In this function we just replace the inital '/' with a whitespace |
| * character, unless we had a preceding whitespace character, in which |
| * case we just remove the '/'. This is needed to ensure all spaces in |
| * the buffer are correctly folded. |
| */ |
| private void enterStateJsCommentAfter() { |
| if (HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(-2))) { |
| ccBuffer.popChar(); |
| } else { |
| ccBuffer.setChar(-1, ' '); |
| } |
| } |
| |
| private void inStateJsText(char input) { |
| ccBuffer.appendChar(input); |
| } |
| |
| // ======================================================= // |
| // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE. // |
| // ======================================================= // |
| |
| private static void registerMapping(InternalState internalState, |
| ExternalState externalState) { |
| STATE_MAPPING.put(internalState, externalState); |
| } |
| |
| private static void initializeStateMapping() { |
| // Each parser implementation must map the error state appropriately. |
| registerMapping(InternalState.INTERNAL_ERROR_STATE, |
| JavascriptParser.STATE_ERROR); |
| |
| registerMapping(JS_TEXT, JavascriptParser.STATE_TEXT); |
| registerMapping(JS_Q, JavascriptParser.STATE_Q); |
| registerMapping(JS_Q_E, JavascriptParser.STATE_Q); |
| registerMapping(JS_DQ, JavascriptParser.STATE_DQ); |
| registerMapping(JS_DQ_E, JavascriptParser.STATE_DQ); |
| registerMapping(JS_SLASH, JavascriptParser.STATE_TEXT); |
| registerMapping(JS_REGEXP_SLASH, JavascriptParser.STATE_TEXT); |
| registerMapping(JS_REGEXP, JavascriptParser.STATE_REGEXP); |
| registerMapping(JS_REGEXP_BRK,JavascriptParser.STATE_REGEXP); |
| registerMapping(JS_REGEXP_BRK_E, JavascriptParser.STATE_REGEXP); |
| registerMapping(JS_REGEXP_E,JavascriptParser.STATE_REGEXP); |
| registerMapping(JS_COM_LN, JavascriptParser.STATE_COMMENT); |
| registerMapping(JS_COM_ML, JavascriptParser.STATE_COMMENT); |
| registerMapping(JS_COM_ML_CLOSE, JavascriptParser.STATE_COMMENT); |
| registerMapping(JS_COM_AFTER, JavascriptParser.STATE_TEXT); |
| } |
| |
| private static void registerTransition(String expression, |
| InternalState source, |
| InternalState to) { |
| // It seems to silly to go through a StateTableTransition here |
| // but it adds extra data checking. |
| StateTableTransition stt = new StateTableTransition(expression, |
| source, to); |
| STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(), |
| stt.getTo()); |
| } |
| |
| private static void initializeParserStateTable() { |
| registerTransition("[:default:]", JS_COM_AFTER, JS_TEXT); |
| registerTransition("/", JS_COM_AFTER, JS_SLASH); |
| registerTransition("\"", JS_COM_AFTER, JS_DQ); |
| registerTransition("\'", JS_COM_AFTER, JS_Q); |
| registerTransition("[:default:]", JS_COM_ML_CLOSE, JS_COM_ML); |
| registerTransition("/", JS_COM_ML_CLOSE,JS_COM_AFTER); |
| registerTransition("[:default:]", JS_COM_ML, JS_COM_ML); |
| registerTransition("*", JS_COM_ML, JS_COM_ML_CLOSE); |
| registerTransition("[:default:]", JS_COM_LN,JS_COM_LN); |
| registerTransition("\n", JS_COM_LN,JS_COM_AFTER); |
| registerTransition("[:default:]", JS_REGEXP_E, JS_REGEXP); |
| registerTransition("[:default:]", JS_REGEXP_BRK_E, JS_REGEXP_BRK); |
| registerTransition("[:default:]", JS_REGEXP_BRK, JS_REGEXP_BRK); |
| registerTransition("]", JS_REGEXP_BRK, JS_REGEXP); |
| registerTransition("\\", JS_REGEXP_BRK, JS_REGEXP_BRK_E); |
| registerTransition("[:default:]", JS_REGEXP, JS_REGEXP); |
| registerTransition("/", JS_REGEXP, JS_TEXT); |
| registerTransition("[", JS_REGEXP, JS_REGEXP_BRK); |
| registerTransition("\\", JS_REGEXP, JS_REGEXP_E); |
| registerTransition("[:default:]", JS_REGEXP_SLASH, JS_REGEXP); |
| registerTransition("[", JS_REGEXP_SLASH, JS_REGEXP_BRK); |
| registerTransition("\\", JS_REGEXP_SLASH, JS_REGEXP_E); |
| registerTransition("*", JS_REGEXP_SLASH, JS_COM_ML); |
| registerTransition("/", JS_REGEXP_SLASH, JS_COM_LN); |
| registerTransition("[:default:]", JS_SLASH, JS_TEXT); |
| registerTransition("*", JS_SLASH, JS_COM_ML); |
| registerTransition("/", JS_SLASH, JS_COM_LN); |
| registerTransition("[:default:]", JS_DQ_E,JS_DQ); |
| registerTransition("[:default:]", JS_DQ,JS_DQ); |
| registerTransition("\"", JS_DQ, JS_TEXT); |
| registerTransition("\\", JS_DQ, JS_DQ_E); |
| registerTransition("[:default:]", JS_Q_E,JS_Q); |
| registerTransition("[:default:]", JS_Q,JS_Q); |
| registerTransition("\'", JS_Q, JS_TEXT); |
| registerTransition("\\", JS_Q, JS_Q_E); |
| registerTransition("[:default:]", JS_TEXT, JS_TEXT); |
| registerTransition("/", JS_TEXT, JS_SLASH); |
| registerTransition("\"", JS_TEXT, JS_DQ); |
| registerTransition("\'", JS_TEXT, JS_Q); |
| } |
| } |