// blob: 1905b5e44b77e1e1a33bbff6eaa0ddbfec6b0b6d [file] [log] [blame]
/*
* Copyright (C) 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers;
import com.google.i18n.phonenumbers.PhoneNumberUtil.Leniency;
import com.google.i18n.phonenumbers.Phonenumber.PhoneNumber;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A stateful class that finds and extracts telephone numbers from {@linkplain CharSequence text}.
 * Instances can be created using the {@linkplain PhoneNumberUtil#findNumbers factory methods} in
 * {@link PhoneNumberUtil}.
 *
 * <p>Vanity numbers (phone numbers using alphabetic digits such as <tt>1-800-SIX-FLAGS</tt>) are
 * not found.
 *
 * <p>This class is not thread-safe.
 *
 * @author Tom Hofmann
 */
final class PhoneNumberMatcher implements Iterator<PhoneNumberMatch> {
  /**
   * The phone number pattern used by {@link #find}, similar to
   * {@code PhoneNumberUtil.VALID_PHONE_NUMBER}, but with the following differences:
   * <ul>
   *   <li>All captures are limited in order to place an upper bound to the text matched by the
   *       pattern.
   *     <ul>
   *       <li>Leading punctuation / plus signs are limited.
   *       <li>Consecutive occurrences of punctuation are limited.
   *       <li>Number of digits is limited.
   *     </ul>
   *   <li>No whitespace is allowed at the start or end.
   *   <li>No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently supported.
   * </ul>
   */
  private static final Pattern PATTERN;

  /**
   * A phone number pattern that does not allow whitespace as punctuation. This pattern is only
   * used in a second attempt to find a phone number occurring in the context of other numbers,
   * such as when the preceding or following token is a zip code.
   */
  private static final Pattern INNER;

  /**
   * Matches strings that look like publication pages. Example:
   * <pre>Computing Complete Answers to Queries in the Presence of Limited Access Patterns.
   * Chen Li. VLDB J. 12(3): 211-227 (2003).</pre>
   *
   * The string "211-227 (2003)" is not a telephone number.
   */
  private static final Pattern PUB_PAGES =
      Pattern.compile("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}");

  static {
    /* Builds the PATTERN and INNER regular expression patterns. The building blocks below
     * exist to make the patterns more easily understood. */

    /* Limit on the number of leading (plus) characters. */
    String leadLimit = limit(0, 2);
    /* Limit on the number of consecutive punctuation characters. */
    String punctuationLimit = limit(0, 4);
    /* The maximum number of digits allowed in a digit-separated block. As we allow all digits in
     * a single block, set high enough to accommodate the entire national number and the
     * international country code. */
    int digitBlockLimit =
        PhoneNumberUtil.MAX_LENGTH_FOR_NSN + PhoneNumberUtil.MAX_LENGTH_COUNTRY_CODE;
    /* Limit on the number of blocks separated by punctuation. Uses digitBlockLimit since some
     * formats use spaces to separate each digit. */
    String blockLimit = limit(0, digitBlockLimit);

    /* Same as PhoneNumberUtil.VALID_PUNCTUATION but without space characters. */
    String nonSpacePunctuationChars = removeSpace(PhoneNumberUtil.VALID_PUNCTUATION);
    /* A punctuation sequence without white space. */
    String nonSpacePunctuation = "[" + nonSpacePunctuationChars + "]" + punctuationLimit;
    /* A punctuation sequence allowing white space. */
    String punctuation = "[" + PhoneNumberUtil.VALID_PUNCTUATION + "]" + punctuationLimit;
    /* A digits block without punctuation. */
    String digitSequence = "\\p{Nd}" + limit(1, digitBlockLimit);
    /* Punctuation that may be at the start of a phone number - brackets and plus signs. */
    String leadClass = "[(\\[" + PhoneNumberUtil.PLUS_CHARS + "]";

    /* Phone number pattern allowing optional punctuation. */
    PATTERN = Pattern.compile(
        "(?:" + leadClass + punctuation + ")" + leadLimit +
        digitSequence + "(?:" + punctuation + digitSequence + ")" + blockLimit +
        "(?:" + PhoneNumberUtil.KNOWN_EXTN_PATTERNS + ")?",
        PhoneNumberUtil.REGEX_FLAGS);

    /* Phone number pattern with no whitespace allowed. */
    INNER = Pattern.compile(
        leadClass + leadLimit +
        digitSequence + "(?:" + nonSpacePunctuation + digitSequence + ")" + blockLimit,
        PhoneNumberUtil.REGEX_FLAGS);
  }

  /**
   * Returns a regular expression quantifier with an upper and lower limit.
   *
   * @param lower the minimum number of repetitions, must be {@code >= 0}
   * @param upper the maximum number of repetitions, must be {@code > 0} and {@code >= lower}
   * @return the quantifier in the form <code>{lower,upper}</code>
   * @throws IllegalArgumentException if the bounds violate the constraints above
   */
  private static String limit(int lower, int upper) {
    if ((lower < 0) || (upper <= 0) || (upper < lower)) {
      throw new IllegalArgumentException(
          "Invalid quantifier bounds: lower=" + lower + ", upper=" + upper);
    }
    return "{" + lower + "," + upper + "}";
  }

  /**
   * Returns a copy of {@code characters} with any {@linkplain Character#isSpaceChar space}
   * characters removed.
   */
  private static String removeSpace(String characters) {
    StringBuilder builder = new StringBuilder(characters.length());
    int i = 0;
    // Walk by code point rather than by char so that supplementary characters stay intact.
    while (i < characters.length()) {
      int codePoint = characters.codePointAt(i);
      if (!Character.isSpaceChar(codePoint)) {
        builder.appendCodePoint(codePoint);
      }
      i += Character.charCount(codePoint);
    }
    return builder.toString();
  }

  /** The potential states of a PhoneNumberMatcher. */
  private enum State {
    NOT_READY, READY, DONE
  }

  /** The phone number utility. */
  private final PhoneNumberUtil util;
  /** The text searched for phone numbers. */
  private final CharSequence text;
  /**
   * The region (country) to assume for phone numbers without an international prefix, possibly
   * null.
   */
  private final String preferredRegion;
  /** The degree of validation requested. */
  private final Leniency leniency;
  /**
   * The maximum number of retries after matching an invalid number. Decremented by {@link #find}
   * and {@link #extractInnerMatch} each time a candidate is rejected.
   */
  private long maxTries;

  /** The iteration tristate. */
  private State state = State.NOT_READY;
  /** The last successful match, null unless in {@link State#READY}. */
  private PhoneNumberMatch lastMatch = null;
  /** The next index to start searching at. Undefined in {@link State#DONE}. */
  private int searchIndex = 0;

  /**
   * Creates a new instance. See the factory methods in {@link PhoneNumberUtil} on how to obtain a
   * new instance.
   *
   * @param util      the phone number util to use
   * @param text      the character sequence that we will search, null for no text
   * @param country   the ISO 3166-1 two-letter country code indicating the country to assume for
   *                  phone numbers not written in international format (with a leading plus, or
   *                  with the international dialing prefix of the specified region). May be null
   *                  or "ZZ" if only numbers with a leading plus should be considered.
   * @param leniency  the leniency to use when evaluating candidate phone numbers
   * @param maxTries  the maximum number of invalid numbers to try before giving up on the text.
   *                  This is to cover degenerate cases where the text has a lot of false
   *                  positives in it. Must be {@code >= 0}.
   * @throws NullPointerException if {@code util} or {@code leniency} is null
   * @throws IllegalArgumentException if {@code maxTries} is negative
   */
  PhoneNumberMatcher(PhoneNumberUtil util, CharSequence text, String country, Leniency leniency,
      long maxTries) {
    if (util == null) {
      throw new NullPointerException("util must not be null");
    }
    if (leniency == null) {
      throw new NullPointerException("leniency must not be null");
    }
    if (maxTries < 0) {
      throw new IllegalArgumentException("maxTries must be >= 0, but was " + maxTries);
    }
    this.util = util;
    // A null text is treated as empty, so iteration simply yields no matches.
    this.text = (text != null) ? text : "";
    this.preferredRegion = country;
    this.leniency = leniency;
    this.maxTries = maxTries;
  }

  /**
   * Returns whether another phone number match is available. The search for the next match is
   * performed lazily on the first call after construction or after {@link #next}; repeated calls
   * without an intervening {@link #next} do not search again.
   */
  @Override
  public boolean hasNext() {
    if (state == State.NOT_READY) {
      lastMatch = find(searchIndex);
      if (lastMatch == null) {
        state = State.DONE;
      } else {
        // Resume the following search right after the match just found.
        searchIndex = lastMatch.end();
        state = State.READY;
      }
    }
    return state == State.READY;
  }

  /**
   * Returns the next phone number match.
   *
   * @throws NoSuchElementException if there are no further matches
   */
  @Override
  public PhoneNumberMatch next() {
    // Check the state and find the next match as a side-effect if necessary.
    if (!hasNext()) {
      throw new NoSuchElementException();
    }

    // Don't retain that memory any longer than necessary.
    PhoneNumberMatch result = lastMatch;
    lastMatch = null;
    state = State.NOT_READY;
    return result;
  }

  /**
   * Attempts to find the next subsequence in the searched sequence on or after {@code index}
   * that represents a phone number. Returns the next match, null if none was found. Every
   * rejected candidate consumes one of the remaining {@link #maxTries}.
   *
   * @param index  the search index to start searching at
   * @return  the phone number match found, null if none can be found
   */
  private PhoneNumberMatch find(int index) {
    Matcher matcher = PATTERN.matcher(text);
    // Matcher.find(int) resets the matcher and restarts the search at the given index.
    while ((maxTries > 0) && matcher.find(index)) {
      int start = matcher.start();
      CharSequence candidate = text.subSequence(start, matcher.end());

      // Check for extra numbers at the end.
      // TODO: This is the place to start when trying to support extraction of multiple phone
      // numbers from split notations (+41 79 123 45 67 / 68).
      candidate = trimAfterFirstMatch(PhoneNumberUtil.SECOND_NUMBER_START_PATTERN, candidate);

      PhoneNumberMatch match = extractMatch(candidate, start);
      if (match != null) {
        return match;
      }

      // Skip past the (possibly trimmed) rejected candidate and try again.
      index = start + candidate.length();
      maxTries--;
    }

    return null;
  }

  /**
   * Trims away any characters after the first match of {@code pattern} in {@code candidate},
   * returning the trimmed version. The candidate is returned unchanged if the pattern does not
   * match.
   */
  private static CharSequence trimAfterFirstMatch(Pattern pattern, CharSequence candidate) {
    Matcher trailingCharsMatcher = pattern.matcher(candidate);
    if (trailingCharsMatcher.find()) {
      candidate = candidate.subSequence(0, trailingCharsMatcher.start());
    }
    return candidate;
  }

  /**
   * Attempts to extract a match from a {@code candidate} character sequence.
   *
   * @param candidate  the candidate text that might contain a phone number
   * @param offset  the offset of {@code candidate} within {@link #text}
   * @return  the match found, null if none can be found
   */
  private PhoneNumberMatch extractMatch(CharSequence candidate, int offset) {
    // Skip a match that is more likely a publication page reference.
    if (PUB_PAGES.matcher(candidate).find()) {
      return null;
    }

    // Try to come up with a valid match given the entire candidate.
    String rawString = candidate.toString();
    PhoneNumberMatch match = parseAndVerify(rawString, offset);
    if (match != null) {
      return match;
    }

    // If that failed, try to find an inner match without white space.
    return extractInnerMatch(rawString, offset);
  }

  /**
   * Attempts to extract a match from {@code candidate} using the {@link #INNER} pattern. Every
   * rejected inner candidate consumes one of the remaining {@link #maxTries}.
   *
   * @param candidate  the candidate text that might contain a phone number
   * @param offset  the offset of {@code candidate} within {@link #text}
   * @return  the match found, null if none can be found
   */
  private PhoneNumberMatch extractInnerMatch(String candidate, int offset) {
    int index = 0;
    Matcher matcher = INNER.matcher(candidate);
    while ((maxTries > 0) && matcher.find(index)) {
      String innerCandidate = candidate.substring(matcher.start(), matcher.end());
      // The reported offset is relative to the full text, not to the outer candidate.
      PhoneNumberMatch match = parseAndVerify(innerCandidate, offset + matcher.start());
      if (match != null) {
        return match;
      }
      maxTries--;
      index = matcher.end();
    }
    return null;
  }

  /**
   * Parses a phone number from the {@code candidate} using {@link PhoneNumberUtil#parse} and
   * verifies it matches the requested {@link #leniency}. If parsing and verification succeed, a
   * corresponding {@link PhoneNumberMatch} is returned, otherwise this method returns null.
   *
   * @param candidate  the candidate match
   * @param offset  the offset of {@code candidate} within {@link #text}
   * @return  the parsed and validated phone number match, or null
   */
  private PhoneNumberMatch parseAndVerify(String candidate, int offset) {
    try {
      PhoneNumber number = util.parse(candidate, preferredRegion);
      if (leniency.verify(number, util)) {
        return new PhoneNumberMatch(offset, candidate, number);
      }
    } catch (NumberParseException ignored) {
      // A candidate that cannot be parsed is simply not a match; the caller moves on to the
      // next candidate.
    }
    return null;
  }

  /**
   * Always throws {@link UnsupportedOperationException} as removal is not supported.
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }
}