| /* |
| * Copyright (C) 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.google.streamhtmlparser; |
| |
| /** |
| * Methods exposed for HTML parsing of text to facilitate implementation |
| * of Automatic context-aware escaping. The HTML parser also embeds a |
| * Javascript parser for processing Javascript fragments. In the future, |
| * it will also embed other specific parsers and hence most likely remain |
| * the main interface to callers of this package. |
| * |
| * <p>Note: These are the exact methods exposed in the original C++ Parser. The |
| * names are simply modified to conform to Java. |
| */ |
| public interface HtmlParser extends Parser { |
| |
| /** |
| * The Parser Mode requested for parsing a given template. |
| * Currently we support: |
| * <ul> |
| * <li>{@code HTML} for HTML templates. |
| * <li>{@code JS} for javascript templates. |
| * <li>{@code CSS} for Cascading Style-Sheets templates. |
| * <li>{@code HTML_IN_TAG} for HTML templates that consist only of |
| * HTML attribute name and value pairs. This is typically the case for |
| * a template that is being included from a parent template where the |
| * parent template contains the start and the closing of the HTML tag. |
| * This is a special mode, for standard HTML templates please use |
| * {@link #HTML}. |
| * An example of such as template is: |
| * <p><code>class="someClass" target="_blank"</code></p> |
| * <p>Which could be included from a parent template that contains |
| * an anchor tag, say:</p> |
| * <p><code><a href="/bla" ["INCLUDED_TEMPLATE"]></code></p> |
| * </ul> |
| */ |
| public enum Mode { |
| HTML, |
| JS, |
| CSS, |
| HTML_IN_TAG |
| } |
| |
| /** |
| * Indicates the type of HTML attribute that the parser is currently in or |
| * {@code NONE} if the parser is not currently in an attribute. |
| * {@code URI} is for attributes taking a URI such as "href" and "src". |
| * {@code JS} is for attributes taking javascript such as "onclick". |
| * {@code STYLE} is for the "style" attribute. |
| * All other attributes fall under {@code REGULAR}. |
| * |
| * Returned by {@link HtmlParser#getAttributeType()} |
| */ |
| public enum ATTR_TYPE { |
| NONE, |
| REGULAR, |
| URI, |
| JS, |
| STYLE |
| } |
| |
| /** |
| * All the states in which the parser can be. These are external states. |
| * The parser has many more internal states that are not exposed and which |
| * are instead mapped to one of these external ones. |
| * {@code STATE_TEXT} the parser is in HTML proper. |
| * {@code STATE_TAG} the parser is inside an HTML tag name. |
| * {@code STATE_COMMENT} the parser is inside an HTML comment. |
| * {@code STATE_ATTR} the parser is inside an HTML attribute name. |
| * {@code STATE_VALUE} the parser is inside an HTML attribute value. |
| * {@code STATE_JS_FILE} the parser is inside javascript code. |
| * {@code STATE_CSS_FILE} the parser is inside CSS code. |
| * |
| * <p>All these states map exactly to those exposed in the C++ (original) |
| * version of the HtmlParser. |
| */ |
| public final static ExternalState STATE_TEXT = |
| new ExternalState("STATE_TEXT"); |
| public final static ExternalState STATE_TAG = |
| new ExternalState("STATE_TAG"); |
| public final static ExternalState STATE_COMMENT = |
| new ExternalState("STATE_COMMENT"); |
| public final static ExternalState STATE_ATTR = |
| new ExternalState("STATE_ATTR"); |
| public final static ExternalState STATE_VALUE = |
| new ExternalState("STATE_VALUE"); |
| public final static ExternalState STATE_JS_FILE = |
| new ExternalState("STATE_JS_FILE"); |
| public final static ExternalState STATE_CSS_FILE = |
| new ExternalState("STATE_CSS_FILE"); |
| |
| /** |
| * Returns {@code true} if the parser is currently processing Javascript. |
| * Such is the case if and only if, the parser is processing an attribute |
| * that takes Javascript, a Javascript script block or the parser |
| * is (re)set with {@link Mode#JS}. |
| * |
| * @return {@code true} if the parser is processing Javascript, |
| * {@code false} otherwise |
| */ |
| public boolean inJavascript(); |
| |
| /** |
| * Returns {@code true} if the parser is currently processing |
| * a Javascript litteral that is quoted. The caller will typically |
| * invoke this method after determining that the parser is processing |
| * Javascript. Knowing whether the element is quoted or not helps |
| * determine which escaping to apply to it when needed. |
| * |
| * @return {@code true} if and only if the parser is inside a quoted |
| * Javascript literal |
| */ |
| public boolean isJavascriptQuoted(); |
| |
| |
| /** |
| * Returns {@code true} if and only if the parser is currently within |
| * an attribute, be it within the attribute name or the attribute value. |
| * |
| * @return {@code true} if and only if inside an attribute |
| */ |
| public boolean inAttribute(); |
| |
| /** |
| * Returns {@code true} if and only if the parser is currently within |
| * a CSS context. A CSS context is one of the below: |
| * <ul> |
| * <li>Inside a STYLE tag. |
| * <li>Inside a STYLE attribute. |
| * <li>Inside a CSS file when the parser was reset in the CSS mode. |
| * </ul> |
| * |
| * @return {@code true} if and only if the parser is inside CSS |
| */ |
| public boolean inCss(); |
| |
| /** |
| * Returns the type of the attribute that the parser is in |
| * or {@code ATTR_TYPE.NONE} if we are not parsing an attribute. |
| * The caller will typically invoke this method after determining |
| * that the parser is processing an attribute. |
| * |
| * <p>This is useful to determine which escaping to apply based |
| * on the type of value this attribute expects. |
| * |
| * @return type of the attribute |
| * @see HtmlParser.ATTR_TYPE |
| */ |
| public ATTR_TYPE getAttributeType(); |
| |
| /** |
| * Returns {@code true} if and only if the parser is currently within |
| * an attribute value and that attribute value is quoted. |
| * |
| * @return {@code true} if and only if the attribute value is quoted |
| */ |
| public boolean isAttributeQuoted(); |
| |
| |
| /** |
| * Returns the name of the HTML tag if the parser is currently within one. |
| * Note that the name may be incomplete if the parser is currently still |
| * parsing the name. Returns an empty {@code String} if the parser is not |
| * in a tag as determined by {@code getCurrentExternalState}. |
| * |
| * @return the name of the HTML tag or an empty {@code String} if we are |
| * not within an HTML tag |
| */ |
| public String getTag(); |
| |
| /** |
| * Returns the name of the HTML attribute the parser is currently processing. |
| * If the parser is still parsing the name, then the returned name |
| * may be incomplete. Returns an empty {@code String} if the parser is not |
| * in an attribute as determined by {@code getCurrentExternalState}. |
| * |
| * @return the name of the HTML attribute or an empty {@code String} |
| * if we are not within an HTML attribute |
| */ |
| public String getAttribute(); |
| |
| /** |
| * Returns the value of an HTML attribute if the parser is currently |
| * within one. If the parser is currently parsing the value, the returned |
| * value may be incomplete. The caller will typically first determine |
| * that the parser is processing a value by calling |
| * {@code getCurrentExternalState}. |
| * |
| * @return the value, could be an empty {@code String} if the parser is not |
| * in an HTML attribute value |
| */ |
| public String getValue(); |
| |
| /** |
| * Returns the current position of the parser within the HTML attribute |
| * value, zero being the position of the first character in the value. |
| * The caller will typically first determine that the parser is |
| * processing a value by calling {@link #getState()}. |
| * |
| * @return the index or zero if the parser is not processing a value |
| */ |
| public int getValueIndex(); |
| |
| /** |
| * Returns {@code true} if and only if the current position of the parser is |
| * at the start of a URL HTML attribute value. This is the case when the |
| * following three conditions are all met: |
| * <p> |
| * <ol> |
| * <li>The parser is in an HTML attribute value. |
| * <li>The HTML attribute expects a URL, as determined by |
| * {@link #getAttributeType()} returning {@code .ATTR_TYPE#URI}. |
| * <li>The parser has not yet seen any characters from that URL. |
| * </ol> |
| * |
| * <p> This method may be used by an Html Sanitizer or an Auto-Escape system |
| * to determine whether to validate the URL for well-formedness and validate |
| * the scheme of the URL (e.g. {@code HTTP}, {@code HTTPS}) is safe. |
| * In particular, it is recommended to use this method instead of |
| * checking that {@link #getValueIndex()} is {@code 0} to support attribute |
| * types where the URL does not start at index zero, such as the |
| * {@code content} attribute of the {@code meta} HTML tag. |
| * |
| * @return {@code true} if and only if the parser is at the start of the URL |
| */ |
| public boolean isUrlStart(); |
| |
| /** |
| * Resets the state of the parser, allowing for reuse of the |
| * {@code HtmlParser} object. |
| * |
| * <p>See the {@link HtmlParser.Mode} enum for information on all |
| * the valid modes. |
| * |
| * @param mode is an enum representing the high-level state of the parser |
| */ |
| public void resetMode(HtmlParser.Mode mode); |
| |
| /** |
| * A specialized directive to tell the parser there is some content |
| * that will be inserted here but that it will not get to parse. Used |
| * by the template system that may not be able to give some content |
| * to the parser but wants it to know there typically will be content |
| * inserted at that point. This is a hint used in corner cases within |
| * parsing of HTML attribute names and values where content we do not |
| * get to see could affect our parsing and alter our current state. |
| * |
| * <p>Returns {@code false} if and only if the parser encountered |
| * a fatal error which prevents it from continuing further parsing. |
| * |
| * <p>Note: The return value is different from the C++ Parser which |
| * always returns {@code true} but in my opinion makes more sense. |
| * |
| * @throws ParseException if an unrecoverable error occurred during parsing |
| */ |
| public void insertText() throws ParseException; |
| |
| /** |
| * Returns the state the Javascript parser is in. |
| * |
| * <p>See {@link JavascriptParser} for more information on the valid |
| * external states. The caller will typically first determine that the |
| * parser is processing Javascript and then invoke this method to |
| * obtain more fine-grained state information. |
| * |
| * @return external state of the javascript parser |
| */ |
| public ExternalState getJavascriptState(); |
| } |