| /* |
| * Copyright (C) 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.google.streamhtmlparser.util; |
| |
| import com.google.common.base.Preconditions; |
| import com.google.common.collect.ImmutableMap; |
| |
| import java.util.Map; |
| |
| /** |
| * <p>Decodes (unescapes) HTML entities with the complication that these |
| * are received one character at a time hence must be stored temporarily. |
| * Also, we may receive some "junk" characters before the actual |
| * entity which we will discard. |
| * |
| * <p>This class is designed to be 100% compatible with the corresponding |
| * logic in the C-version of the |
| * {@link com.google.security.streamhtmlparser.HtmlParser}, found |
| * in <code>htmlparser.c</code>. There are however a few intentional |
| * differences outlines below: |
| * <ul> |
| * <li>We accept lower and upper-case hex NCRs, the C-version |
| * accepts only lower-case ones. |
| * <li>The output on some invalid inputs may be different. This is |
| * currently in the process of consolidation with Filipe. |
| * <li>The API is a bit different, I find this one better suited |
| * for Java. In particular, the C method <code>processChar</code> |
| * returns the output {@code String} whereas in Java, we return |
| * a status code and then provide the {@code String} in a separate |
| * method <code>getEntity</code>. It is cleaner as it avoids the |
| * need to return empty {@code String}s during incomplete processing. |
| * </ul> |
| * |
| * <p>Valid HTML entities have one of the following three forms: |
| * <ul> |
| * <li><code>&dd;</code> where dd is a number in decimal (base 10) form. |
| * <li><code>&x|Xyy;</code> where yy is a hex-number (base 16). |
| * <li><code>&<html-entity>;</code> where |
| * <code><html-entity></code> is one of <code>lt</code>, |
| * <code>gt</code>, <code>amp</code>, <code>quot</code> or |
| * <code>apos</code>. |
| * </ul> |
| * |
| * <p>A <code>reset</code> method is provided to facilitate object re-use. |
| */ |
| public class EntityResolver { |
| |
| /** |
| * Returned in <code>processChar</code> method. |
| * <p> |
| * <ul> |
| * <li><code>NOT_STARTED</code> indicates we are still processing |
| * trailing characters before the start of an entity. |
| * The caller may want to save the characters it provided us. |
| * <li><code>IN_PROGRESS</code> indicates we are currently processing |
| * characters part of an entity. |
| * <li><code>COMPLETED</code> indicates we have finished processing |
| * an entity. The caller can then invoke <code>getEntity</code> |
| * then re-set the object for future re-use. |
| * </ul> |
| */ |
| public enum Status { |
| NOT_STARTED("Not Started"), |
| IN_PROGRESS("In Progress"), |
| COMPLETED("Completed"); |
| |
| private final String message; |
| |
| private Status(String message) { |
| this.message = message; |
| } |
| |
| /** |
| * Returns a brief description of the {@code Status} for |
| * debugging purposes. The format of the returned {@code String} |
| * is not fully specified nor guaranteed to remain the same. |
| */ |
| @Override |
| public String toString() { |
| return message; |
| } |
| } |
| |
| /** |
| * How many characters to store as we are processing an entity. Once we |
| * reach that size, we know the entity is definitely invalid. The size |
| * is higher than needed but keeping it as-is for compatibility with |
| * the C-version. |
| */ |
| private static final int MAX_ENTITY_SIZE = 10; |
| |
| /** |
| * Map containing the recognized HTML entities and their decoded values. |
| * The trailing ';' is not included in the key but it is accounted for. |
| */ |
| private static final Map<String, String> HTML_ENTITIES_MAP = |
| new ImmutableMap.Builder<String, String>() |
| .put("<", "<") |
| .put(">", ">") |
| .put("&", "&") |
| .put("&apos", "'") |
| .build(); |
| |
| /** Storage for received until characters until an HTML entity is complete. */ |
| private final StringBuilder sb; |
| |
| /** |
| * Indicates the state we are in. see {@link EntityResolver.Status}. |
| */ |
| private Status status; |
| private String entity; |
| |
| /** |
| * Constructs an entity resolver that is initially empty and |
| * with status {@code NOT_STARTED}, see {@link EntityResolver.Status}. |
| * |
| */ |
| public EntityResolver() { |
| sb = new StringBuilder(); |
| status = Status.NOT_STARTED; |
| entity = ""; |
| } |
| |
| /** |
| * Constructs an entity resolver that is an exact copy of |
| * the one provided. In particular it has the same contents |
| * and status. |
| * |
| * @param aEntityResolver the entity resolver to copy |
| */ |
| public EntityResolver(EntityResolver aEntityResolver) { |
| sb = new StringBuilder(); |
| sb.replace(0, sb.length(), aEntityResolver.sb.toString()); |
| entity = aEntityResolver.entity; |
| status = aEntityResolver.status; |
| } |
| |
| /** |
| * Returns the object to its original state for re-use, deleting any |
| * stored characters that may be present. |
| */ |
| public void reset() { |
| status = Status.NOT_STARTED; |
| sb.setLength(0); |
| entity = ""; |
| } |
| |
| /** |
| * Returns the full state of the <code>StreamEntityResolver</code> |
| * in a human readable form. The format of the returned <code>String</code> |
| * is not specified and is subject to change. |
| * |
| * @return full state of this object |
| */ |
| @Override |
| public String toString() { |
| return String.format("Status: %s; Contents (%d): %s", status.toString(), |
| sb.length(), sb.toString()); |
| } |
| |
| /** |
| * Returns the decoded HTML Entity. Should only be called |
| * after {@code processChar} returned status {@code COMPLETED}. |
| * |
| * @return the decoded HTML Entity or an empty {@code String} if |
| * we were called with any status other than {@code COMPLETED} |
| */ |
| public String getEntity() { |
| return entity; |
| } |
| |
| /** |
| * Processes a character from the input stream and decodes any html entities |
| * from that processed input stream. |
| * |
| * @param input the {@code char} to process |
| * @return the processed {@code String}. Typically returns an empty |
| * {@code String} while awaiting for more characters to complete |
| * processing of the entity. |
| */ |
| public Status processChar(char input) { |
| // Developer error if the precondition fails. |
| Preconditions.checkState(status != Status.NOT_STARTED || sb.length() == 0); |
| if (status == Status.NOT_STARTED) { |
| if (input == '&') { |
| sb.append(input); |
| status = Status.IN_PROGRESS; |
| } |
| } else if (status == Status.IN_PROGRESS) { |
| if ((input == ';') || (HtmlUtils.isHtmlSpace(input))) { |
| status = Status.COMPLETED; |
| entity = convertEntity(input); |
| } else { |
| if (sb.length() < MAX_ENTITY_SIZE) { |
| sb.append(input); |
| } else { |
| status = Status.COMPLETED; |
| entity = uncovertedInput(input); |
| } |
| } |
| } else { |
| // Status.COMPLETED, ignore character, do nothing. |
| } |
| return status; |
| } |
| |
| /** |
| * Performs the decoding of a complete HTML entity and saves the |
| * result back into the buffer. |
| * <a href="http://www.w3.org/TR/REC-html40/charset.html#h-5.3.1"> |
| * Numeric Character References</a> |
| * |
| * @param terminator the last character read, unused on successful |
| * conversions since it is the end delimiter of the entity |
| * @return The decoded entity or the original input if we could not decode it. |
| */ |
| private String convertEntity(char terminator) { |
| // Developer error if the buffer was empty or does not start with '&'. |
| Preconditions.checkArgument(sb.length() > 0); |
| Preconditions.checkArgument(sb.charAt(0) == '&'); |
| |
| if (sb.length() > 1) { |
| if (sb.charAt(1) == '#') { |
| if (sb.length() <= 2) { // Error => return content as-is. |
| return uncovertedInput(terminator); |
| } |
| try { |
| if ((sb.charAt(2) == 'x') || (sb.charAt(2) == 'X')) { // Hex NCR |
| return new String(Character.toChars( |
| Integer.parseInt(sb.substring(3), 16))); |
| } else { // Decimal NCR |
| return new String(Character.toChars( |
| Integer.parseInt(sb.substring(2)))); |
| } |
| } catch (NumberFormatException e) { |
| return uncovertedInput(terminator); |
| } |
| } |
| |
| // See if it matches any of the few recognized entities. |
| String key = sb.toString(); |
| if (HTML_ENTITIES_MAP.containsKey(key)) { |
| return HTML_ENTITIES_MAP.get(key); |
| } |
| } |
| // Covers the case of a lonely '&' given or valid/invalid unknown entities. |
| return uncovertedInput(terminator); |
| } |
| |
| private String uncovertedInput(char terminator) { |
| return String.format("%s%c", sb.toString(), terminator); |
| } |
| } |