// blob: aa0f5d3ddf4a4f502b6f72e81605dcd85a206000 [file] [log] [blame]
/*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.impl;
import com.google.common.base.Preconditions;
import com.google.streamhtmlparser.ExternalState;
import com.google.streamhtmlparser.Parser;
import com.google.streamhtmlparser.ParseException;
import com.google.streamhtmlparser.util.HtmlUtils;
import java.util.Map;
/**
* An implementation of the {@code Parser} interface that is common to both
* {@code HtmlParser} and {@code JavascriptParser}.
*
* <p>Provides methods for parsing input and ensuring that all in-state,
* entering-a-state and exiting-a-state callbacks are invoked as appropriate.
*
* <p>This class started as abstract but it was found better for testing to
* make it instantiatable so that the parsing logic can be tested with dummy
* state transitions.
*/
public class GenericParser implements Parser {

  /** Transition table consulted for every input character. */
  protected final ParserStateTable parserStateTable;

  /** Lookup used by {@link #getState()} to expose internal states. */
  protected final Map<InternalState, ExternalState> intToExtStateTable;

  /** State the parser starts in and returns to on {@link #reset()}. */
  protected final InternalState initialState;

  /** Internal state the parser is currently in. */
  protected InternalState currentState;

  /** Line counter; starts at 1 and is bumped on every {@code '\n'} parsed. */
  protected int lineNumber;

  /** Column counter; starts at 1 and returns to 1 after every {@code '\n'}. */
  protected int columnNumber;

  /**
   * Creates a parser positioned at {@code initialState}, line 1, column 1.
   * The table and the map are stored by reference, not copied.
   *
   * @param parserStateTable the transition table to drive parsing with
   * @param intToExtStateTable mapping from internal to external states
   * @param initialState the state to start in and to reset to
   */
  protected GenericParser(ParserStateTable parserStateTable,
                          Map<InternalState, ExternalState> intToExtStateTable,
                          InternalState initialState) {
    this.parserStateTable = parserStateTable;
    this.intToExtStateTable = intToExtStateTable;
    this.initialState = initialState;
    this.currentState = initialState;
    this.lineNumber = 1;
    this.columnNumber = 1;
  }

  /**
   * Creates a parser that mirrors the given one: same current state and
   * same line/column position. The state table, the state mapping and the
   * initial state are shared with the original by reference.
   *
   * @param aGenericParser the {@code GenericParser} to copy
   */
  protected GenericParser(GenericParser aGenericParser) {
    this(aGenericParser.parserStateTable,
         aGenericParser.intToExtStateTable,
         aGenericParser.initialState);
    // The delegated constructor starts from scratch; now overlay the
    // mutable position of the parser being copied.
    currentState = aGenericParser.currentState;
    lineNumber = aGenericParser.lineNumber;
    columnNumber = aGenericParser.columnNumber;
  }

  /**
   * Feeds every character of {@code input}, in order, to
   * {@link #parse(char)}.
   *
   * @param input the {@code String} to parse
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  @Override
  public void parse(String input) throws ParseException {
    final int length = input.length();
    for (int index = 0; index < length; index++) {
      parse(input.charAt(index));
    }
  }

  /**
   * Processes a single character.
   *
   * <p>The candidate next state is looked up in the
   * {@code ParserStateTable}. If the table answers with
   * {@code InternalState.INTERNAL_ERROR_STATE}, the parser moves to that
   * state and a {@code ParseException} is thrown; no callback runs and the
   * line/column position is left untouched.
   *
   * <p>Otherwise the callbacks run in a fixed order, each one able to
   * replace the candidate state with its return value:
   * <ol>
   * <li>{@link #handleExitState}, only if the candidate differs from the
   *     current state;
   * <li>{@link #handleEnterState}, only if the (possibly replaced)
   *     candidate still differs from the current state;
   * <li>{@link #handleInState}, always.
   * </ol>
   * The resulting state becomes the current state, {@link #record(char)}
   * is invoked, and the line/column position is advanced. Changing the
   * callback order may result in a functional change.
   *
   * @param input the input character to parse (process)
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  @Override
  public void parse(char input) throws ParseException {
    InternalState target = parserStateTable.getNextState(currentState, input);

    if (target == InternalState.INTERNAL_ERROR_STATE) {
      // Format the message before switching state so that it reports the
      // state in which the offending character was seen.
      String description = String.format(
          "Unexpected character '%s' in int_state '%s' (ext_state '%s')",
          HtmlUtils.encodeCharForAscii(input),
          currentState.getName(),
          getState().getName());
      currentState = InternalState.INTERNAL_ERROR_STATE;
      throw new ParseException(this, description);
    }

    if (currentState != target) {
      target = handleExitState(currentState, target, input);
    }
    // Checked again on purpose: the exit callback may have redirected the
    // transition back to the current state.
    if (currentState != target) {
      // The state being entered is passed for both state arguments.
      target = handleEnterState(target, target, input);
    }
    target = handleInState(target, input);

    currentState = target;
    record(input);
    advancePosition(input);
  }

  /**
   * Moves the line/column position past {@code input}: a newline starts
   * the next line at column 1, anything else advances the column by one.
   */
  private void advancePosition(char input) {
    if (input == '\n') {
      lineNumber++;
      columnNumber = 1;
    } else {
      columnNumber++;
    }
  }

  /**
   * Returns the external state mapped to the parser's current internal
   * state.
   *
   * @throws NullPointerException if the mapping has no entry for the
   *     current internal state
   */
  @Override
  public ExternalState getState() {
    if (intToExtStateTable.containsKey(currentState)) {
      return intToExtStateTable.get(currentState);
    }
    throw new NullPointerException(
        "Did not find external state mapping For internal state: "
        + currentState);
  }

  /**
   * Puts the parser back in its initial state at line 1, column 1.
   */
  @Override
  public void reset() {
    lineNumber = 1;
    columnNumber = 1;
    currentState = initialState;
  }

  /**
   * Overrides the line number tracked by this parser.
   */
  @Override
  public void setLineNumber(int lineNumber) {
    this.lineNumber = lineNumber;
  }

  /**
   * Returns the line number tracked by this parser.
   */
  @Override
  public int getLineNumber() {
    return lineNumber;
  }

  /**
   * Overrides the column number tracked by this parser.
   */
  @Override
  public void setColumnNumber(int columnNumber) {
    this.columnNumber = columnNumber;
  }

  /**
   * Returns the column number tracked by this parser.
   */
  @Override
  public int getColumnNumber() {
    return columnNumber;
  }

  /** Returns the current internal state (package-private accessor). */
  InternalState getCurrentInternalState() {
    return currentState;
  }

  /**
   * Forces the parser into {@code nextState} without consuming input.
   *
   * <p>The exit and enter callbacks are given the same chance to run as in
   * {@link #parse(char)}, with {@code '\0'} standing in for the input
   * character. {@link #handleInState} and {@link #record(char)} are not
   * invoked and the line/column position does not change.
   *
   * @param nextState the state to switch to; must not be {@code null}
   * @throws ParseException if a callback reports an unrecoverable error
   */
  protected void setNextState(InternalState nextState) throws ParseException {
    Preconditions.checkNotNull(nextState);  // Developer error if it triggers.

    // No character is being parsed, so the callbacks get a null char.
    // TODO: Complicated logic to follow in C++ but clean it up.
    final char noInput = '\0';
    InternalState target = nextState;

    if (currentState != target) {
      target = handleExitState(currentState, target, noInput);
    }
    if (currentState != target) {
      // NOTE(review): unlike parse(char), the state returned by the enter
      // callback is discarded here - confirm this is intended.
      handleEnterState(target, target, noInput);
    }
    currentState = target;
  }

  /**
   * Callback run when the parser is about to enter a different state.
   * The default implementation accepts the transition unchanged.
   *
   * <p>Note that the callers in this class pass the state being entered
   * as both {@code currentState} and {@code expectedNextState}.
   *
   * @param currentState as invoked from this class, the state being
   *     entered
   * @param expectedNextState the state being entered
   * @param input the last character parsed
   * @return the state to change to, could be the same as the
   *     {@code expectedNextState} provided
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  protected InternalState handleEnterState(InternalState currentState,
                                           InternalState expectedNextState,
                                           char input) throws ParseException {
    return expectedNextState;
  }

  /**
   * Callback run when the parser is about to leave its current state.
   * The default implementation accepts the transition unchanged.
   *
   * @param currentState the state being left
   * @param expectedNextState the state the parser is about to move to
   * @param input the last character parsed
   * @return the state to change to, could be the same as the
   *     {@code expectedNextState} provided
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  protected InternalState handleExitState(InternalState currentState,
                                          InternalState expectedNextState,
                                          char input) throws ParseException {
    return expectedNextState;
  }

  /**
   * Callback run by {@link #parse(char)} for every character that does not
   * lead to the error state, after any exit/enter callbacks have run.
   * The default implementation keeps the state it is given.
   *
   * @param currentState the state selected for this character so far
   * @param input the last character parsed
   * @return the state to change to, could be the same as the
   *     {@code currentState} provided
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  protected InternalState handleInState(InternalState currentState,
                                        char input) throws ParseException {
    return currentState;
  }

  /**
   * Hook invoked by {@link #parse(char)} once per successfully processed
   * character, after the current state has been updated. Does nothing by
   * default; derived classes may override it to perform additional logic
   * beyond what the state transitions define.
   *
   * @param input the input character to operate on
   */
  protected void record(char input) {
    // Intentionally empty.
  }
}