/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| |
| package com.google.streamhtmlparser.impl; |
| |
| import com.google.common.base.Preconditions; |
| import com.google.streamhtmlparser.ExternalState; |
| import com.google.streamhtmlparser.Parser; |
| import com.google.streamhtmlparser.ParseException; |
| import com.google.streamhtmlparser.util.HtmlUtils; |
| |
| import java.util.Map; |
| |
| /** |
| * An implementation of the {@code Parser} interface that is common to both |
| * {@code HtmlParser} and {@code JavascriptParser}. |
| * |
| * <p>Provides methods for parsing input and ensuring that all in-state, |
| * entering-a-state and exiting-a-state callbacks are invoked as appropriate. |
| * |
| * <p>This class started as abstract but it was found better for testing to |
| * make it instantiatable so that the parsing logic can be tested with dummy |
| * state transitions. |
| */ |
| public class GenericParser implements Parser { |
| |
| protected final ParserStateTable parserStateTable; |
| protected final Map<InternalState, ExternalState> intToExtStateTable; |
| protected final InternalState initialState; |
| protected InternalState currentState; |
| protected int lineNumber; |
| protected int columnNumber; |
| |
| protected GenericParser(ParserStateTable parserStateTable, |
| Map<InternalState, ExternalState> intToExtStateTable, |
| InternalState initialState) { |
| this.parserStateTable = parserStateTable; |
| this.intToExtStateTable = intToExtStateTable; |
| this.initialState = initialState; |
| this.currentState = initialState; |
| this.lineNumber = 1; |
| this.columnNumber = 1; |
| } |
| |
| /** |
| * Constructs a generic parser that is an exact copy of the |
| * one given. Note that here too, data structures that do not |
| * change are shallow-copied (parser state table and state mappings). |
| * |
| * @param aGenericParser the {@code GenericParser} to copy |
| */ |
| protected GenericParser(GenericParser aGenericParser) { |
| parserStateTable = aGenericParser.parserStateTable; |
| intToExtStateTable = aGenericParser.intToExtStateTable; |
| initialState = aGenericParser.initialState; |
| currentState = aGenericParser.currentState; |
| lineNumber = aGenericParser.lineNumber; |
| columnNumber = aGenericParser.columnNumber; |
| } |
| |
| /** |
| * Tell the parser to process the provided {@code String}. This is just a |
| * convenience method that wraps over {@link Parser#parse(char)}. |
| * @param input the {@code String} to parse |
| * @throws ParseException if an unrecoverable error occurred during parsing |
| */ |
| @Override |
| public void parse(String input) throws ParseException { |
| for (int i = 0; i < input.length(); i++) |
| parse(input.charAt(i)); |
| } |
| |
| /** |
| * Main loop for parsing of input. |
| * |
| * <p>Absent any callbacks defined, this function simply determines the |
| * next state to switch to based on the <code>ParserStateTable</code> which is |
| * derived from a state-machine configuration file in the original C++ parser. |
| * |
| * <p>However some states have specific callbacks defined which when |
| * receiving specific characters may decide to overwrite the next state to |
| * go to. Hence the next state is a function both of the main state table |
| * in {@code ParserStateTable} as well as specific run-time information |
| * from the callback functions. |
| * |
| * <p>Also note that the callbacks are called in a proper sequence, |
| * first the exit-state one then the enter-state one and finally the |
| * in-state one. Changing the order may result in a functional change. |
| * |
| * @param input the input character to parse (process) |
| * @throws ParseException if an unrecoverable error occurred during parsing |
| */ |
| @Override |
| public void parse(char input) throws ParseException { |
| InternalState nextState = |
| parserStateTable.getNextState(currentState, input); |
| |
| if (nextState == InternalState.INTERNAL_ERROR_STATE) { |
| String errorMsg = |
| String.format("Unexpected character '%s' in int_state '%s' " + |
| "(ext_state '%s')", |
| HtmlUtils.encodeCharForAscii(input), |
| currentState.getName(), getState().getName()); |
| currentState = InternalState.INTERNAL_ERROR_STATE; |
| throw new ParseException(this, errorMsg); |
| } |
| |
| if (currentState != nextState) { |
| nextState = handleExitState(currentState, nextState, input); |
| } |
| if (currentState != nextState) { |
| nextState = handleEnterState(nextState, nextState, input); |
| } |
| nextState = handleInState(nextState, input); |
| currentState = nextState; |
| record(input); |
| |
| columnNumber++; |
| if (input == '\n') { |
| lineNumber++; |
| columnNumber = 1; |
| } |
| } |
| |
| /** |
| * Return the current state of the parser. |
| */ |
| @Override |
| public ExternalState getState() { |
| if (!intToExtStateTable.containsKey(currentState)) { |
| throw new NullPointerException("Did not find external state mapping " + |
| "For internal state: " + currentState); |
| } |
| return intToExtStateTable.get(currentState); |
| } |
| |
| /** |
| * Reset the parser back to its initial default state. |
| */ |
| @Override |
| public void reset() { |
| currentState = initialState; |
| lineNumber = 1; |
| columnNumber = 1; |
| } |
| |
| /** |
| * Sets the current line number which is returned during error messages. |
| */ |
| @Override |
| public void setLineNumber(int lineNumber) { |
| this.lineNumber = lineNumber; |
| } |
| |
| /** |
| * Returns the current line number. |
| */ |
| @Override |
| public int getLineNumber() { |
| return lineNumber; |
| } |
| |
| /** |
| * Sets the current column number which is returned during error messages. |
| */ |
| @Override |
| public void setColumnNumber(int columnNumber) { |
| this.columnNumber = columnNumber; |
| } |
| |
| /** |
| * Returns the current column number. |
| */ |
| @Override |
| public int getColumnNumber() { |
| return columnNumber; |
| } |
| |
| InternalState getCurrentInternalState() { |
| return currentState; |
| } |
| |
| protected void setNextState(InternalState nextState) throws ParseException { |
| Preconditions.checkNotNull(nextState); // Developer error if it triggers. |
| |
| /* We are not actually parsing hence providing |
| * a null char to the event handlers. |
| */ |
| // TODO: Complicated logic to follow in C++ but clean it up. |
| final char nullChar = '\0'; |
| |
| if (currentState != nextState) { |
| nextState = handleExitState(currentState, nextState, nullChar); |
| } |
| if (currentState != nextState) { |
| handleEnterState(nextState, nextState, nullChar); |
| } |
| currentState = nextState; |
| } |
| |
| /** |
| * Invoked when the parser enters a new state. |
| * |
| * @param currentState the current state of the parser |
| * @param expectedNextState the next state according to the |
| * state table definition |
| * @param input the last character parsed |
| * @return the state to change to, could be the same as the |
| * {@code expectedNextState} provided |
| * @throws ParseException if an unrecoverable error occurred during parsing |
| */ |
| protected InternalState handleEnterState(InternalState currentState, |
| InternalState expectedNextState, |
| char input) throws ParseException { |
| return expectedNextState; |
| } |
| |
| /** |
| * Invoked when the parser exits a state. |
| * |
| * @param currentState the current state of the parser |
| * @param expectedNextState the next state according to the |
| * state table definition |
| * @param input the last character parsed |
| * @return the state to change to, could be the same as the |
| * {@code expectedNextState} provided |
| * @throws ParseException if an unrecoverable error occurred during parsing |
| */ |
| protected InternalState handleExitState(InternalState currentState, |
| InternalState expectedNextState, |
| char input) throws ParseException { |
| return expectedNextState; |
| } |
| |
| /** |
| * Invoked for each character read when no state change occured. |
| * |
| * @param currentState the current state of the parser |
| * @param input the last character parsed |
| * @return the state to change to, could be the same as the |
| * {@code expectedNextState} provided |
| * @throws ParseException if an unrecoverable error occurred during parsing |
| */ |
| protected InternalState handleInState(InternalState currentState, |
| char input) throws ParseException { |
| return currentState; |
| } |
| |
| /** |
| * Perform some processing on the given character. Derived classes |
| * may override this method in order to perform additional logic |
| * on every processed character beyond the logic defined in |
| * state transitions. |
| * |
| * @param input the input character to operate on |
| */ |
| protected void record(char input) { } |
| } |