blob: ca97a71d4d7fff40e213a66ff109c9f3861d0321 [file] [log] [blame]
/*
* Copyright (C) 2011 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#undef WEBKIT_IMPLEMENTATION
#undef LOG
#include "base/utf_string_conversions.h"
#include "net/base/escape.h"
#include "PhoneEmailDetector.h"
#include "Settings.h"
#include "WebString.h"
#define LOG_TAG "PhoneNumberDetector"
#include <cutils/log.h>
// Template that FindPartialNumber() walks while scanning text for a phone
// number. Each digit in the template is the smallest digit accepted at that
// position (so the first digit must be 2-9 and the fourth 1-9). Spaces and
// punctuation are optional and are skipped when the text does not contain
// them, except that ')' must be present exactly when '(' was seen.
#define PHONE_PATTERN "(200) /-.\\ 100 -. 0000"
// URL scheme prefixes; FindContent() stores one of them in m_prefix and
// GetIntentURL() prepends it to the escaped content text.
static const char kTelSchemaPrefix[] = "tel:";
static const char kEmailSchemaPrefix[] = "mailto:";
// Forward declarations of the scanning helpers defined later in this file.
void FindReset(FindState* state);
void FindResetNumber(FindState* state);
FoundState FindPartialNumber(const UChar* chars, unsigned length,
FindState* s);
struct FindState;
static FoundState FindPartialEMail(const UChar* , unsigned length, FindState* );
// Character-class predicates used by FindPartialEMail().
static bool IsDomainChar(UChar ch);
static bool IsMailboxChar(UChar ch);
// Creates a detector with no match recorded and both detection modes switched
// off. IsEnabled() later replaces the flags with the values from the page
// settings.
PhoneEmailDetector::PhoneEmailDetector()
    : m_foundResult(FOUND_NONE)
{
    // FindContent() reads these flags. Without an initial value they would be
    // indeterminate if FindContent() ran before IsEnabled(), or after
    // IsEnabled() returned early because no settings were available.
    // They are assigned here (not in the initializer list) so the code does
    // not depend on the members' declaration order in the header.
    m_isPhoneDetectionEnabled = false;
    m_isEmailDetectionEnabled = false;
}
// Refreshes the phone/e-mail detection flags from the settings associated
// with |hit_test| and reports whether at least one kind of detection is on.
// Returns false, leaving the flags untouched, when no settings are available.
bool PhoneEmailDetector::IsEnabled(const WebKit::WebHitTestInfo& hit_test)
{
    WebCore::Settings* pageSettings = GetSettings(hit_test);
    if (pageSettings) {
        m_isPhoneDetectionEnabled = pageSettings->formatDetectionTelephone();
        m_isEmailDetectionEnabled = pageSettings->formatDetectionEmail();
        return m_isPhoneDetectionEnabled || m_isEmailDetectionEnabled;
    }
    return false;
}
// Scans [begin, end) first for a phone number and, if no complete number is
// found, for an e-mail address. Each scan runs only when the corresponding
// detection flag is set.
//
// |start_pos| and |end_pos| always receive the offsets currently held in the
// scan state; they are only meaningful when the function returns true.
// m_prefix is set to the "tel:" prefix for a complete phone match and to the
// "mailto:" prefix otherwise.
//
// Returns true when a complete phone number or e-mail address was found.
bool PhoneEmailDetector::FindContent(const string16::const_iterator& begin,
    const string16::const_iterator& end,
    size_t* start_pos,
    size_t* end_pos)
{
    // Phone pass.
    m_foundResult = FOUND_NONE;
    FindReset(&m_findState);
    if (m_isPhoneDetectionEnabled)
        m_foundResult = FindPartialNumber(begin, end - begin, &m_findState);

    const bool isPhoneNumber = (m_foundResult == FOUND_COMPLETE);
    if (isPhoneNumber) {
        m_prefix = kTelSchemaPrefix;
    } else {
        // E-mail pass: start from a clean scan state. If e-mail detection is
        // off, m_foundResult keeps whatever the phone pass produced.
        m_prefix = kEmailSchemaPrefix;
        FindReset(&m_findState);
        if (m_isEmailDetectionEnabled)
            m_foundResult = FindPartialEMail(begin, end - begin, &m_findState);
    }

    *start_pos = m_findState.mStartResult;
    *end_pos = m_findState.mEndResult;
    return m_foundResult == FOUND_COMPLETE;
}
// Returns the UTF-8 text of the last complete match, or an empty string when
// the last FindContent() call did not produce one. For a phone match the text
// is the digit string collected in the scan state; for an e-mail match it is
// the plain text of |range|.
std::string PhoneEmailDetector::GetContentText(const WebKit::WebRange& range)
{
    if (m_foundResult != FOUND_COMPLETE)
        return std::string();

    if (m_prefix == kTelSchemaPrefix)
        return UTF16ToUTF8(m_findState.mStore);

    return UTF16ToUTF8(range.toPlainText());
}
// Builds the intent URL for |content_text|: the scheme prefix chosen by the
// last FindContent() call followed by the query-escaped text.
GURL PhoneEmailDetector::GetIntentURL(const std::string& content_text)
{
    const std::string escapedText = EscapeQueryParamValue(content_text, true);
    return GURL(m_prefix + escapedText);
}
// Returns |state| to its initial condition: every field zeroed, the phone
// pattern/store cursors rewound, and the "current character" set to a space.
void FindReset(FindState* state)
{
    memset(state, 0, sizeof(*state));
    FindResetNumber(state);
    state->mCurrent = ' ';
}
// Restarts phone-number matching in |state|: rewinds the digit store and the
// pattern cursor and forgets any open parenthesis. Other fields are kept.
void FindResetNumber(FindState* state)
{
    state->mStorePtr = state->mStore;
    state->mPattern = const_cast<char*>(PHONE_PATTERN);
    state->mOpenParen = false;
}
// Scans |length| UTF-16 code units starting at |chars| for a phone number that
// fits PHONE_PATTERN, resuming from the pattern/store cursors held in |s|.
//
// Pattern handling:
//   - a digit in the pattern is the minimum digit accepted at that position;
//     accepted digits are appended to the store in |s|;
//   - ' ' in the pattern absorbs any number of spaces in the text;
//   - other punctuation is optional and is skipped when the text lacks it;
//   - ')' must appear exactly when '(' was seen;
//   - the end of the pattern must be followed by a non-digit.
// A mismatch restarts the pattern at the next character, unless
// s->mContinuationNode is set, in which case FOUND_NONE is returned at once.
//
// On return (other than the early FOUND_NONE exits) the cursors are written
// back to |s|, the store is NUL-terminated, s->mStartResult holds the offset
// where the match began and s->mEndResult the offset one past the last digit
// matched during this call (0 if no digit was matched in this call).
//
// Returns FOUND_COMPLETE when the whole pattern was consumed, FOUND_PARTIAL
// when the input ended at a point where the number could continue, and
// FOUND_NONE otherwise.
FoundState FindPartialNumber(const UChar* chars, unsigned length,
    FindState* s)
{
    char* pattern = s->mPattern;
    UChar* store = s->mStorePtr;
    const UChar* start = chars;
    const UChar* end = chars + length;
    const UChar* lastDigit = 0;
    do {
        bool initialized = s->mInitialized;
        while (chars < end) {
            // Until a match has started, remember the two characters that
            // precede the current one; checkMatch inspects them.
            if (initialized == false) {
                s->mBackTwo = s->mBackOne;
                s->mBackOne = s->mCurrent;
            }
            UChar ch = s->mCurrent = *chars;
            do {
                char patternChar = *pattern;
                switch (patternChar) {
                    case '2':
                        // First digit of the number: record where it starts.
                        if (initialized == false) {
                            s->mStartResult = chars - start;
                            initialized = true;
                        }
                        // falls through to the common digit handling
                    case '0':
                    case '1':
                        if (ch < patternChar || ch > '9')
                            goto resetPattern;
                        *store++ = ch;
                        pattern++;
                        lastDigit = chars;
                        goto nextChar;
                    case '\0':
                        // Pattern exhausted: an extra digit means this is not
                        // a phone number after all.
                        if (WTF::isASCIIDigit(ch) == false) {
                            *store = '\0';
                            goto checkMatch;
                        }
                        goto resetPattern;
                    case ' ':
                        // Stay on the pattern's space so that several spaces
                        // in the text are absorbed.
                        if (ch == patternChar)
                            goto nextChar;
                        break;
                    case '(':
                        if (ch == patternChar) {
                            s->mStartResult = chars - start;
                            initialized = true;
                            s->mOpenParen = true;
                        }
                        goto commonPunctuation;
                    case ')':
                        // Parentheses must be balanced: ')' is required if and
                        // only if '(' was seen.
                        if ((ch == patternChar) ^ s->mOpenParen)
                            goto resetPattern;
                        // falls through to the common punctuation handling
                    default:
                    commonPunctuation:
                        if (ch == patternChar) {
                            pattern++;
                            goto nextChar;
                        }
                }
                // Optional pattern character not present in the text: try the
                // next pattern character against the same text character.
            } while (++pattern); // never false
    nextChar:
            chars++;
        }
        break;
resetPattern:
        if (s->mContinuationNode)
            return FOUND_NONE;
        FindResetNumber(s);
        pattern = s->mPattern;
        store = s->mStorePtr;
    } while (++chars < end);
checkMatch:
    // Reject a match that is directly preceded by a digit. A preceding '1' is
    // looked through, so the character before it is tested instead.
    if (WTF::isASCIIDigit(s->mBackOne != '1' ? s->mBackOne : s->mBackTwo)) {
        return FOUND_NONE;
    }
    *store = '\0';
    s->mStorePtr = store;
    s->mPattern = pattern;
    // lastDigit stays null when no digit was matched during this call; avoid
    // doing pointer arithmetic on a null pointer in that case.
    if (lastDigit)
        s->mEndResult = lastDigit - start + 1;
    else
        s->mEndResult = 0;
    char pState = pattern[0];
    // Stopping at the very start of the pattern, or between two digits of the
    // same group, is not a usable partial match.
    return pState == '\0' ? FOUND_COMPLETE : pState == '(' || (WTF::isASCIIDigit(pState) && WTF::isASCIIDigit(pattern[-1])) ?
        FOUND_NONE : FOUND_PARTIAL;
}
// Scans |length| UTF-16 code units starting at |chars| for an e-mail address.
//
// For every '@' the function looks forward for a domain: a letter, then
// domain characters, ending in a top-level domain that is either one of the
// long names in longDomainNames or a two-letter code permitted by
// domainTwoLetter, and that is not followed by further domain characters.
// It then looks backward from the '@' for the mailbox, which must be
// non-empty and must neither start nor end with '.'.
//
// On success s->mStartResult / s->mEndResult receive the offsets of the
// first character of the address and one past its last character.
// s->mEndResult may also be overwritten for a candidate that is later
// rejected by the mailbox checks.
//
// Returns FOUND_COMPLETE when an address was found and FOUND_NONE otherwise;
// despite the name, FOUND_PARTIAL is never returned.
FoundState FindPartialEMail(const UChar* chars, unsigned length,
    FindState* s)
{
    // the following tables were generated by tests/browser/focusNavigation/BrowserDebug.cpp
    // hand-edit at your own risk
    // One entry per first letter; bit n set means 'a' + n is accepted as the
    // second letter of a two-letter top-level domain.
    static const int domainTwoLetter[] = {
        0x02df797c, // a followed by: [cdefgilmnoqrstuwxz]
        0x036e73fb, // b followed by: [abdefghijmnorstvwyz]
        0x03b67ded, // c followed by: [acdfghiklmnorsuvxyz]
        0x02005610, // d followed by: [ejkmoz]
        0x001e00d4, // e followed by: [ceghrstu]
        0x00025700, // f followed by: [ijkmor]
        0x015fb9fb, // g followed by: [abdefghilmnpqrstuwy]
        0x001a3400, // h followed by: [kmnrtu]
        0x000f7818, // i followed by: [delmnoqrst]
        0x0000d010, // j followed by: [emop]
        0x0342b1d0, // k followed by: [eghimnprwyz]
        0x013e0507, // l followed by: [abcikrstuvy]
        0x03fffccd, // m followed by: [acdghklmnopqrstuvwxyz]
        0x0212c975, // n followed by: [acefgilopruz]
        0x00001000, // o followed by: [m]
        0x014e3cf1, // p followed by: [aefghklmnrstwy]
        0x00000001, // q followed by: [a]
        0x00504010, // r followed by: [eouw]
        0x032a7fdf, // s followed by: [abcdeghijklmnortvyz]
        0x026afeec, // t followed by: [cdfghjklmnoprtvwz]
        0x03041441, // u followed by: [agkmsyz]
        0x00102155, // v followed by: [aceginu]
        0x00040020, // w followed by: [fs]
        0x00000000, // x
        0x00180010, // y followed by: [etu]
        0x00401001, // z followed by: [amw]
    };
    // One entry per first letter. Each entry is a sequence of records: a
    // length byte followed by the letters that come after the first letter.
    // The next record's length byte (or the final NUL) ends the comparison.
    static char const* const longDomainNames[] = {
        "\x03" "ero" "\x03" "rpa", // aero, arpa
        "\x02" "iz", // biz
        "\x02" "at" "\x02" "om" "\x03" "oop", // cat, com, coop
        NULL, // d
        "\x02" "du", // edu
        NULL, // f
        "\x02" "ov", // gov
        NULL, // h
        "\x03" "nfo" "\x02" "nt", // info, int
        "\x03" "obs", // jobs
        NULL, // k
        NULL, // l
        "\x02" "il" "\x03" "obi" "\x05" "useum", // mil, mobi, museum
        "\x03" "ame" "\x02" "et", // name, net
        "\x02" "rg", // org
        "\x02" "ro", // pro
        NULL, // q
        NULL, // r
        NULL, // s
        "\x05" "ravel", // travel
        NULL, // u
        NULL, // v
        NULL, // w
        NULL, // x
        NULL, // y
        NULL, // z
    };
    const UChar* start = chars;
    const UChar* end = chars + length;
    while (chars < end) {
        UChar ch = *chars++;
        if (ch != '@')
            continue;
        const UChar* atLocation = chars - 1;
        // '@' is the last character: there is no domain, and reading the next
        // character would run past the buffer.
        if (chars >= end)
            break;
        // search for domain
        ch = *chars++ | 0x20; // convert uppercase to lower
        if (ch < 'a' || ch > 'z')
            continue;
        while (chars < end) {
            ch = *chars++;
            if (IsDomainChar(ch) == false)
                goto nextAt;
            if (ch != '.')
                continue;
            // At least two characters must follow the '.'; checking before
            // the read keeps the access inside the buffer.
            if (end - chars < 2)
                return FOUND_NONE; // only one letter; must be at least two
            UChar firstLetter = *chars++ | 0x20; // first letter of the domain
            firstLetter -= 'a';
            if (firstLetter > 'z' - 'a')
                continue; // non-letter followed '.'
            int secondLetterMask = domainTwoLetter[firstLetter];
            ch = *chars | 0x20; // second letter of the domain
            ch -= 'a';
            // 'z' is a valid second letter (the table has entries such as
            // cz, nz, uz), so only values beyond 'z' are rejected.
            if (ch > 'z' - 'a')
                continue;
            bool secondMatch = (secondLetterMask & 1 << ch) != 0;
            const char* wordMatch = longDomainNames[firstLetter];
            int wordIndex = 0;
            while (wordMatch != NULL) {
                int len = *wordMatch++;
                char match;
                do {
                    match = wordMatch[wordIndex];
                    if (match < 0x20)
                        goto foundDomainStart; // whole long name matched
                    // Stop at the end of the input instead of comparing
                    // against characters beyond it.
                    if (wordIndex >= end - chars || chars[wordIndex] != match)
                        break;
                    wordIndex++;
                } while (true);
                wordMatch += len;
                if (*wordMatch == '\0')
                    break;
                wordIndex = 0;
            }
            if (secondMatch) {
                wordIndex = 1;
        foundDomainStart:
                chars += wordIndex;
                // The top-level domain must end here; if more domain
                // characters follow, keep scanning for a later '.'.
                if (chars < end) {
                    ch = *chars;
                    if (ch != '.') {
                        if (IsDomainChar(ch))
                            goto nextDot;
                    } else if (chars + 1 < end && IsDomainChar(chars[1]))
                        goto nextDot;
                }
                // found domain. Search backwards from '@' for beginning of email address
                s->mEndResult = chars - start;
                chars = atLocation;
                if (chars <= start)
                    goto nextAt;
                ch = *--chars;
                if (ch == '.')
                    goto nextAt; // mailbox can't end in period
                do {
                    if (IsMailboxChar(ch) == false) {
                        chars++;
                        break;
                    }
                    if (chars == start)
                        break;
                    ch = *--chars;
                } while (true);
                UChar firstChar = *chars;
                if (firstChar == '.' || firstChar == '@') // mailbox can't start with period or be empty
                    goto nextAt;
                s->mStartResult = chars - start;
                return FOUND_COMPLETE;
            }
        nextDot:
            ;
        }
nextAt:
        // Resume the search for the next '@' just after the one just examined.
        chars = atLocation + 1;
    }
    return FOUND_NONE;
}
// Returns true when |ch| may appear in a domain name: 0-9, '.', '-', A-Z, a-z.
bool IsDomainChar(UChar ch)
{
    // Bitmap covering the code points from ' ' (0x20) upward, 32 per word.
    static const unsigned kDomainCharBits[] = {0x03ff6000, 0x07fffffe, 0x07fffffe}; // 0-9 . - A-Z a-z
    if (ch < 0x20 || ch > 'z')
        return false;
    const unsigned offset = ch - 0x20;
    return ((kDomainCharBits[offset / 32] >> (offset % 32)) & 1u) != 0;
}
// Returns true when |ch| may appear in the mailbox (local) part of an e-mail
// address.
bool IsMailboxChar(UChar ch)
{
    // According to http://en.wikipedia.org/wiki/Email_address
    // ! # $ % & ' * + - . / 0-9 = ?
    // A-Z ^ _
    // ` a-z { | } ~
    // Bitmap covering the code points from ' ' (0x20) upward, 32 per word.
    static const unsigned kMailboxCharBits[] = {0xa3ffecfa, 0xc7fffffe, 0x7fffffff};
    if (ch < 0x20 || ch > '~')
        return false;
    const unsigned offset = ch - 0x20;
    return ((kMailboxCharBits[offset / 32] >> (offset % 32)) & 1u) != 0;
}