blob: d8c5d277c9c2dc30ea532c5b908b0bbf64a37519 [file] [log] [blame]
/*
* Copyright (C) 2009 Apple Inc. All rights reserved.
* Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "YarrInterpreter.h"
#include "Yarr.h"
#include <wtf/BumpPointerAllocator.h>
#ifndef NDEBUG
#include <stdio.h>
#endif
using namespace WTF;
namespace JSC { namespace Yarr {
class Interpreter {
public:
struct ParenthesesDisjunctionContext;
struct BackTrackInfoPatternCharacter {
uintptr_t matchAmount;
};
struct BackTrackInfoCharacterClass {
uintptr_t matchAmount;
};
struct BackTrackInfoBackReference {
uintptr_t begin; // Not really needed for greedy quantifiers.
uintptr_t matchAmount; // Not really needed for fixed quantifiers.
};
struct BackTrackInfoAlternative {
uintptr_t offset;
};
struct BackTrackInfoParentheticalAssertion {
uintptr_t begin;
};
struct BackTrackInfoParenthesesOnce {
uintptr_t begin;
};
struct BackTrackInfoParenthesesTerminal {
uintptr_t begin;
};
struct BackTrackInfoParentheses {
uintptr_t matchAmount;
ParenthesesDisjunctionContext* lastContext;
};
static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
{
context->next = backTrack->lastContext;
backTrack->lastContext = context;
++backTrack->matchAmount;
}
static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
{
ASSERT(backTrack->matchAmount);
ASSERT(backTrack->lastContext);
backTrack->lastContext = backTrack->lastContext->next;
--backTrack->matchAmount;
}
struct DisjunctionContext
{
DisjunctionContext()
: term(0)
{
}
void* operator new(size_t, void* where)
{
return where;
}
int term;
unsigned matchBegin;
unsigned matchEnd;
uintptr_t frame[1];
};
DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
{
size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
allocatorPool = allocatorPool->ensureCapacity(size);
if (!allocatorPool)
CRASH();
return new(allocatorPool->alloc(size)) DisjunctionContext();
}
void freeDisjunctionContext(DisjunctionContext* context)
{
allocatorPool = allocatorPool->dealloc(context);
}
struct ParenthesesDisjunctionContext
{
ParenthesesDisjunctionContext(int* output, ByteTerm& term)
: next(0)
{
unsigned firstSubpatternId = term.atom.subpatternId;
unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
output[(firstSubpatternId << 1) + i] = -1;
}
new(getDisjunctionContext(term)) DisjunctionContext();
}
void* operator new(size_t, void* where)
{
return where;
}
void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
{
for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
}
DisjunctionContext* getDisjunctionContext(ByteTerm& term)
{
return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
}
ParenthesesDisjunctionContext* next;
int subpatternBackup[1];
};
ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
{
size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
allocatorPool = allocatorPool->ensureCapacity(size);
if (!allocatorPool)
CRASH();
return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
}
void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
{
allocatorPool = allocatorPool->dealloc(context);
}
class InputStream {
public:
InputStream(const UChar* input, unsigned start, unsigned length)
: input(input)
, pos(start)
, length(length)
{
}
void next()
{
++pos;
}
void rewind(unsigned amount)
{
ASSERT(pos >= amount);
pos -= amount;
}
int read()
{
ASSERT(pos < length);
if (pos < length)
return input[pos];
return -1;
}
int readPair()
{
ASSERT(pos + 1 < length);
return input[pos] | input[pos + 1] << 16;
}
int readChecked(int position)
{
ASSERT(position < 0);
ASSERT(static_cast<unsigned>(-position) <= pos);
unsigned p = pos + position;
ASSERT(p < length);
return input[p];
}
int reread(unsigned from)
{
ASSERT(from < length);
return input[from];
}
int prev()
{
ASSERT(!(pos > length));
if (pos && length)
return input[pos - 1];
return -1;
}
unsigned getPos()
{
return pos;
}
void setPos(unsigned p)
{
pos = p;
}
bool atStart()
{
return pos == 0;
}
bool atEnd()
{
return pos == length;
}
bool checkInput(int count)
{
if ((pos + count) <= length) {
pos += count;
return true;
}
return false;
}
void uncheckInput(int count)
{
pos -= count;
}
bool atStart(int position)
{
return (pos + position) == 0;
}
bool atEnd(int position)
{
return (pos + position) == length;
}
bool isNotAvailableInput(int position)
{
return (pos + position) > length;
}
private:
const UChar* input;
unsigned pos;
unsigned length;
};
bool testCharacterClass(CharacterClass* characterClass, int ch)
{
if (ch & 0xFF80) {
for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
if (ch == characterClass->m_matchesUnicode[i])
return true;
for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
return true;
} else {
for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
if (ch == characterClass->m_matches[i])
return true;
for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
return true;
}
return false;
}
bool checkCharacter(int testChar, int inputPosition)
{
return testChar == input.readChecked(inputPosition);
}
bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
{
int ch = input.readChecked(inputPosition);
return (loChar == ch) || (hiChar == ch);
}
bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
{
bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
return invert ? !match : match;
}
bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
{
int matchSize = matchEnd - matchBegin;
if (!input.checkInput(matchSize))
return false;
if (pattern->m_ignoreCase) {
for (int i = 0; i < matchSize; ++i) {
int ch = input.reread(matchBegin + i);
int lo = Unicode::toLower(ch);
int hi = Unicode::toUpper(ch);
if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) {
input.uncheckInput(matchSize);
return false;
}
}
} else {
for (int i = 0; i < matchSize; ++i) {
if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
input.uncheckInput(matchSize);
return false;
}
}
}
return true;
}
bool matchAssertionBOL(ByteTerm& term)
{
return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
}
bool matchAssertionEOL(ByteTerm& term)
{
if (term.inputPosition)
return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
}
bool matchAssertionWordBoundary(ByteTerm& term)
{
bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
bool readIsWordchar;
if (term.inputPosition)
readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
else
readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
bool wordBoundary = prevIsWordchar != readIsWordchar;
return term.invert() ? !wordBoundary : wordBoundary;
}
bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
{
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
switch (term.atom.quantityType) {
case QuantifierFixedCount:
break;
case QuantifierGreedy:
if (backTrack->matchAmount) {
--backTrack->matchAmount;
input.uncheckInput(1);
return true;
}
break;
case QuantifierNonGreedy:
if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
++backTrack->matchAmount;
if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
return true;
}
input.uncheckInput(backTrack->matchAmount);
break;
}
return false;
}
bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
{
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
switch (term.atom.quantityType) {
case QuantifierFixedCount:
break;
case QuantifierGreedy:
if (backTrack->matchAmount) {
--backTrack->matchAmount;
input.uncheckInput(1);
return true;
}
break;
case QuantifierNonGreedy:
if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
++backTrack->matchAmount;
if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
return true;
}
input.uncheckInput(backTrack->matchAmount);
break;
}
return false;
}
bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeCharacterClass);
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
switch (term.atom.quantityType) {
case QuantifierFixedCount: {
for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
return false;
}
return true;
}
case QuantifierGreedy: {
unsigned matchAmount = 0;
while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
input.uncheckInput(1);
break;
}
++matchAmount;
}
backTrack->matchAmount = matchAmount;
return true;
}
case QuantifierNonGreedy:
backTrack->matchAmount = 0;
return true;
}
ASSERT_NOT_REACHED();
return false;
}
bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeCharacterClass);
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
switch (term.atom.quantityType) {
case QuantifierFixedCount:
break;
case QuantifierGreedy:
if (backTrack->matchAmount) {
--backTrack->matchAmount;
input.uncheckInput(1);
return true;
}
break;
case QuantifierNonGreedy:
if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
++backTrack->matchAmount;
if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
return true;
}
input.uncheckInput(backTrack->matchAmount);
break;
}
return false;
}
bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeBackReference);
BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
int matchBegin = output[(term.atom.subpatternId << 1)];
int matchEnd = output[(term.atom.subpatternId << 1) + 1];
// If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
// In this case the result of match is empty string like when it references to a parentheses with zero-width match.
// Eg.: /(a\1)/
if (matchEnd == -1)
return true;
ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
if (matchBegin == matchEnd)
return true;
switch (term.atom.quantityType) {
case QuantifierFixedCount: {
backTrack->begin = input.getPos();
for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
input.setPos(backTrack->begin);
return false;
}
}
return true;
}
case QuantifierGreedy: {
unsigned matchAmount = 0;
while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
++matchAmount;
backTrack->matchAmount = matchAmount;
return true;
}
case QuantifierNonGreedy:
backTrack->begin = input.getPos();
backTrack->matchAmount = 0;
return true;
}
ASSERT_NOT_REACHED();
return false;
}
bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeBackReference);
BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
int matchBegin = output[(term.atom.subpatternId << 1)];
int matchEnd = output[(term.atom.subpatternId << 1) + 1];
ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
if (matchBegin == matchEnd)
return false;
switch (term.atom.quantityType) {
case QuantifierFixedCount:
// for quantityCount == 1, could rewind.
input.setPos(backTrack->begin);
break;
case QuantifierGreedy:
if (backTrack->matchAmount) {
--backTrack->matchAmount;
input.rewind(matchEnd - matchBegin);
return true;
}
break;
case QuantifierNonGreedy:
if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
++backTrack->matchAmount;
return true;
}
input.setPos(backTrack->begin);
break;
}
return false;
}
void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
{
if (term.capture()) {
unsigned subpatternId = term.atom.subpatternId;
output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
}
}
void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
{
unsigned firstSubpatternId = term.atom.subpatternId;
unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
context->restoreOutput(output, firstSubpatternId, count);
}
JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
{
while (backTrack->matchAmount) {
ParenthesesDisjunctionContext* context = backTrack->lastContext;
JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
if (result == JSRegExpMatch)
return JSRegExpMatch;
resetMatches(term, context);
popParenthesesDisjunctionContext(backTrack);
freeParenthesesDisjunctionContext(context);
if (result != JSRegExpNoMatch)
return result;
}
return JSRegExpNoMatch;
}
bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
ASSERT(term.atom.quantityCount == 1);
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
switch (term.atom.quantityType) {
case QuantifierGreedy: {
// set this speculatively; if we get to the parens end this will be true.
backTrack->begin = input.getPos();
break;
}
case QuantifierNonGreedy: {
backTrack->begin = notFound;
context->term += term.atom.parenthesesWidth;
return true;
}
case QuantifierFixedCount:
break;
}
if (term.capture()) {
unsigned subpatternId = term.atom.subpatternId;
output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
}
return true;
}
bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
ASSERT(term.atom.quantityCount == 1);
if (term.capture()) {
unsigned subpatternId = term.atom.subpatternId;
output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
}
if (term.atom.quantityType == QuantifierFixedCount)
return true;
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
return backTrack->begin != input.getPos();
}
bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
ASSERT(term.atom.quantityCount == 1);
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
if (term.capture()) {
unsigned subpatternId = term.atom.subpatternId;
output[(subpatternId << 1)] = -1;
output[(subpatternId << 1) + 1] = -1;
}
switch (term.atom.quantityType) {
case QuantifierGreedy:
// if we backtrack to this point, there is another chance - try matching nothing.
ASSERT(backTrack->begin != notFound);
backTrack->begin = notFound;
context->term += term.atom.parenthesesWidth;
return true;
case QuantifierNonGreedy:
ASSERT(backTrack->begin != notFound);
case QuantifierFixedCount:
break;
}
return false;
}
bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
ASSERT(term.atom.quantityCount == 1);
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
switch (term.atom.quantityType) {
case QuantifierGreedy:
if (backTrack->begin == notFound) {
context->term -= term.atom.parenthesesWidth;
return false;
}
case QuantifierNonGreedy:
if (backTrack->begin == notFound) {
backTrack->begin = input.getPos();
if (term.capture()) {
// Technically this access to inputPosition should be accessing the begin term's
// inputPosition, but for repeats other than fixed these values should be
// the same anyway! (We don't pre-check for greedy or non-greedy matches.)
ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
unsigned subpatternId = term.atom.subpatternId;
output[subpatternId << 1] = input.getPos() + term.inputPosition;
}
context->term -= term.atom.parenthesesWidth;
return true;
}
case QuantifierFixedCount:
break;
}
return false;
}
bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
ASSERT(term.atom.quantityType == QuantifierGreedy);
ASSERT(term.atom.quantityCount == quantifyInfinite);
ASSERT(!term.capture());
BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
backTrack->begin = input.getPos();
return true;
}
bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
// Empty match is a failed match.
if (backTrack->begin == input.getPos())
return false;
// Successful match! Okay, what's next? - loop around and try to match moar!
context->term -= (term.atom.parenthesesWidth + 1);
return true;
}
bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
ASSERT(term.atom.quantityType == QuantifierGreedy);
ASSERT(term.atom.quantityCount == quantifyInfinite);
ASSERT(!term.capture());
// If we backtrack to this point, we have failed to match this iteration of the parens.
// Since this is greedy / zero minimum a failed is also accepted as a match!
context->term += term.atom.parenthesesWidth;
return true;
}
bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
{
// 'Terminal' parentheses are at the end of the regex, and as such a match past end
// should always be returned as a successful match - we should never backtrack to here.
ASSERT_NOT_REACHED();
return false;
}
bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
ASSERT(term.atom.quantityCount == 1);
BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
backTrack->begin = input.getPos();
return true;
}
bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
ASSERT(term.atom.quantityCount == 1);
BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
input.setPos(backTrack->begin);
// We've reached the end of the parens; if they are inverted, this is failure.
if (term.invert()) {
context->term -= term.atom.parenthesesWidth;
return false;
}
return true;
}
bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
ASSERT(term.atom.quantityCount == 1);
// We've failed to match parens; if they are inverted, this is win!
if (term.invert()) {
context->term += term.atom.parenthesesWidth;
return true;
}
return false;
}
bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
ASSERT(term.atom.quantityCount == 1);
BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
input.setPos(backTrack->begin);
context->term -= term.atom.parenthesesWidth;
return false;
}
JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
backTrack->matchAmount = 0;
backTrack->lastContext = 0;
switch (term.atom.quantityType) {
case QuantifierFixedCount: {
// While we haven't yet reached our fixed limit,
while (backTrack->matchAmount < term.atom.quantityCount) {
// Try to do a match, and it it succeeds, add it to the list.
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
if (result == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
// The match failed; try to find an alternate point to carry on from.
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
if (result == JSRegExpNoMatch) {
JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
if (backtrackResult != JSRegExpMatch)
return backtrackResult;
} else
return result;
}
}
ASSERT(backTrack->matchAmount == term.atom.quantityCount);
ParenthesesDisjunctionContext* context = backTrack->lastContext;
recordParenthesesMatch(term, context);
return JSRegExpMatch;
}
case QuantifierGreedy: {
while (backTrack->matchAmount < term.atom.quantityCount) {
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
if (result == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
if (result != JSRegExpNoMatch)
return result;
break;
}
}
if (backTrack->matchAmount) {
ParenthesesDisjunctionContext* context = backTrack->lastContext;
recordParenthesesMatch(term, context);
}
return JSRegExpMatch;
}
case QuantifierNonGreedy:
return JSRegExpMatch;
}
ASSERT_NOT_REACHED();
return JSRegExpErrorNoMatch;
}
// Rules for backtracking differ depending on whether this is greedy or non-greedy.
//
// Greedy matches never should try just adding more - you should already have done
// the 'more' cases. Always backtrack, at least a leetle bit. However cases where
// you backtrack an item off the list needs checking, since we'll never have matched
// the one less case. Tracking forwards, still add as much as possible.
//
// Non-greedy, we've already done the one less case, so don't match on popping.
// We haven't done the one more case, so always try to add that.
//
JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
{
ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
switch (term.atom.quantityType) {
case QuantifierFixedCount: {
ASSERT(backTrack->matchAmount == term.atom.quantityCount);
ParenthesesDisjunctionContext* context = 0;
JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
if (result != JSRegExpMatch)
return result;
// While we haven't yet reached our fixed limit,
while (backTrack->matchAmount < term.atom.quantityCount) {
// Try to do a match, and it it succeeds, add it to the list.
context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
if (result == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
// The match failed; try to find an alternate point to carry on from.
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
if (result == JSRegExpNoMatch) {
JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
if (backtrackResult != JSRegExpMatch)
return backtrackResult;
} else
return result;
}
}
ASSERT(backTrack->matchAmount == term.atom.quantityCount);
context = backTrack->lastContext;
recordParenthesesMatch(term, context);
return JSRegExpMatch;
}
case QuantifierGreedy: {
if (!backTrack->matchAmount)
return JSRegExpNoMatch;
ParenthesesDisjunctionContext* context = backTrack->lastContext;
JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
if (result == JSRegExpMatch) {
while (backTrack->matchAmount < term.atom.quantityCount) {
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
if (parenthesesResult == JSRegExpMatch)
appendParenthesesDisjunctionContext(backTrack, context);
else {
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
if (parenthesesResult != JSRegExpNoMatch)
return parenthesesResult;
break;
}
}
} else {
resetMatches(term, context);
popParenthesesDisjunctionContext(backTrack);
freeParenthesesDisjunctionContext(context);
if (result != JSRegExpNoMatch)
return result;
}
if (backTrack->matchAmount) {
ParenthesesDisjunctionContext* context = backTrack->lastContext;
recordParenthesesMatch(term, context);
}
return JSRegExpMatch;
}
case QuantifierNonGreedy: {
// If we've not reached the limit, try to add one more match.
if (backTrack->matchAmount < term.atom.quantityCount) {
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
if (result == JSRegExpMatch) {
appendParenthesesDisjunctionContext(backTrack, context);
recordParenthesesMatch(term, context);
return JSRegExpMatch;
}
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
if (result != JSRegExpNoMatch)
return result;
}
// Nope - okay backtrack looking for an alternative.
while (backTrack->matchAmount) {
ParenthesesDisjunctionContext* context = backTrack->lastContext;
JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
if (result == JSRegExpMatch) {
// successful backtrack! we're back in the game!
if (backTrack->matchAmount) {
context = backTrack->lastContext;
recordParenthesesMatch(term, context);
}
return JSRegExpMatch;
}
// pop a match off the stack
resetMatches(term, context);
popParenthesesDisjunctionContext(backTrack);
freeParenthesesDisjunctionContext(context);
return result;
}
return JSRegExpNoMatch;
}
}
ASSERT_NOT_REACHED();
return JSRegExpErrorNoMatch;
}
void lookupForBeginChars()
{
int character;
bool firstSingleCharFound;
while (true) {
if (input.isNotAvailableInput(2))
return;
firstSingleCharFound = false;
character = input.readPair();
for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) {
BeginChar bc = pattern->m_beginChars[i];
if (!firstSingleCharFound && bc.value <= 0xFFFF) {
firstSingleCharFound = true;
character &= 0xFFFF;
}
if ((character | bc.mask) == bc.value)
return;
}
input.next();
}
}
#define MATCH_NEXT() { ++context->term; goto matchAgain; }
#define BACKTRACK() { --context->term; goto backtrack; }
#define currentTerm() (disjunction->terms[context->term])
JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false)
{
if (!--remainingMatchCount)
return JSRegExpErrorHitLimit;
if (btrack)
BACKTRACK();
if (pattern->m_containsBeginChars && isBody)
lookupForBeginChars();
context->matchBegin = input.getPos();
context->term = 0;
matchAgain:
ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
switch (currentTerm().type) {
case ByteTerm::TypeSubpatternBegin:
MATCH_NEXT();
case ByteTerm::TypeSubpatternEnd:
context->matchEnd = input.getPos();
return JSRegExpMatch;
case ByteTerm::TypeBodyAlternativeBegin:
MATCH_NEXT();
case ByteTerm::TypeBodyAlternativeDisjunction:
case ByteTerm::TypeBodyAlternativeEnd:
context->matchEnd = input.getPos();
return JSRegExpMatch;
case ByteTerm::TypeAlternativeBegin:
MATCH_NEXT();
case ByteTerm::TypeAlternativeDisjunction:
case ByteTerm::TypeAlternativeEnd: {
int offset = currentTerm().alternative.end;
BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
backTrack->offset = offset;
context->term += offset;
MATCH_NEXT();
}
case ByteTerm::TypeAssertionBOL:
if (matchAssertionBOL(currentTerm()))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeAssertionEOL:
if (matchAssertionEOL(currentTerm()))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeAssertionWordBoundary:
if (matchAssertionWordBoundary(currentTerm()))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypePatternCharacterOnce:
case ByteTerm::TypePatternCharacterFixed: {
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
BACKTRACK();
}
MATCH_NEXT();
}
case ByteTerm::TypePatternCharacterGreedy: {
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
unsigned matchAmount = 0;
while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
input.uncheckInput(1);
break;
}
++matchAmount;
}
backTrack->matchAmount = matchAmount;
MATCH_NEXT();
}
case ByteTerm::TypePatternCharacterNonGreedy: {
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
backTrack->matchAmount = 0;
MATCH_NEXT();
}
case ByteTerm::TypePatternCasedCharacterOnce:
case ByteTerm::TypePatternCasedCharacterFixed: {
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
BACKTRACK();
}
MATCH_NEXT();
}
case ByteTerm::TypePatternCasedCharacterGreedy: {
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
unsigned matchAmount = 0;
while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
input.uncheckInput(1);
break;
}
++matchAmount;
}
backTrack->matchAmount = matchAmount;
MATCH_NEXT();
}
case ByteTerm::TypePatternCasedCharacterNonGreedy: {
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
backTrack->matchAmount = 0;
MATCH_NEXT();
}
case ByteTerm::TypeCharacterClass:
if (matchCharacterClass(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeBackReference:
if (matchBackReference(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParenthesesSubpattern: {
JSRegExpResult result = matchParentheses(currentTerm(), context);
if (result == JSRegExpMatch) {
MATCH_NEXT();
} else if (result != JSRegExpNoMatch)
return result;
BACKTRACK();
}
case ByteTerm::TypeParenthesesSubpatternOnceBegin:
if (matchParenthesesOnceBegin(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParenthesesSubpatternOnceEnd:
if (matchParenthesesOnceEnd(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
if (matchParenthesesTerminalBegin(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
if (matchParenthesesTerminalEnd(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParentheticalAssertionBegin:
if (matchParentheticalAssertionBegin(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParentheticalAssertionEnd:
if (matchParentheticalAssertionEnd(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeCheckInput:
if (input.checkInput(currentTerm().checkInputCount))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeUncheckInput:
input.uncheckInput(currentTerm().checkInputCount);
MATCH_NEXT();
}
// We should never fall-through to here.
ASSERT_NOT_REACHED();
backtrack:
ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
switch (currentTerm().type) {
case ByteTerm::TypeSubpatternBegin:
return JSRegExpNoMatch;
case ByteTerm::TypeSubpatternEnd:
ASSERT_NOT_REACHED();
case ByteTerm::TypeBodyAlternativeBegin:
case ByteTerm::TypeBodyAlternativeDisjunction: {
int offset = currentTerm().alternative.next;
context->term += offset;
if (offset > 0)
MATCH_NEXT();
if (input.atEnd())
return JSRegExpNoMatch;
input.next();
if (pattern->m_containsBeginChars && isBody)
lookupForBeginChars();
context->matchBegin = input.getPos();
if (currentTerm().alternative.onceThrough)
context->term += currentTerm().alternative.next;
MATCH_NEXT();
}
case ByteTerm::TypeBodyAlternativeEnd:
ASSERT_NOT_REACHED();
case ByteTerm::TypeAlternativeBegin:
case ByteTerm::TypeAlternativeDisjunction: {
int offset = currentTerm().alternative.next;
context->term += offset;
if (offset > 0)
MATCH_NEXT();
BACKTRACK();
}
case ByteTerm::TypeAlternativeEnd: {
// We should never backtrack back into an alternative of the main body of the regex.
BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
unsigned offset = backTrack->offset;
context->term -= offset;
BACKTRACK();
}
case ByteTerm::TypeAssertionBOL:
case ByteTerm::TypeAssertionEOL:
case ByteTerm::TypeAssertionWordBoundary:
BACKTRACK();
case ByteTerm::TypePatternCharacterOnce:
case ByteTerm::TypePatternCharacterFixed:
case ByteTerm::TypePatternCharacterGreedy:
case ByteTerm::TypePatternCharacterNonGreedy:
if (backtrackPatternCharacter(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypePatternCasedCharacterOnce:
case ByteTerm::TypePatternCasedCharacterFixed:
case ByteTerm::TypePatternCasedCharacterGreedy:
case ByteTerm::TypePatternCasedCharacterNonGreedy:
if (backtrackPatternCasedCharacter(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeCharacterClass:
if (backtrackCharacterClass(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeBackReference:
if (backtrackBackReference(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParenthesesSubpattern: {
JSRegExpResult result = backtrackParentheses(currentTerm(), context);
if (result == JSRegExpMatch) {
MATCH_NEXT();
} else if (result != JSRegExpNoMatch)
return result;
BACKTRACK();
}
case ByteTerm::TypeParenthesesSubpatternOnceBegin:
if (backtrackParenthesesOnceBegin(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParenthesesSubpatternOnceEnd:
if (backtrackParenthesesOnceEnd(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
if (backtrackParenthesesTerminalBegin(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
if (backtrackParenthesesTerminalEnd(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParentheticalAssertionBegin:
if (backtrackParentheticalAssertionBegin(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeParentheticalAssertionEnd:
if (backtrackParentheticalAssertionEnd(currentTerm(), context))
MATCH_NEXT();
BACKTRACK();
case ByteTerm::TypeCheckInput:
input.uncheckInput(currentTerm().checkInputCount);
BACKTRACK();
case ByteTerm::TypeUncheckInput:
input.checkInput(currentTerm().checkInputCount);
BACKTRACK();
}
ASSERT_NOT_REACHED();
return JSRegExpErrorNoMatch;
}
JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
{
JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
if (result == JSRegExpMatch) {
while (context->matchBegin == context->matchEnd) {
result = matchDisjunction(disjunction, context, true);
if (result != JSRegExpMatch)
return result;
}
return JSRegExpMatch;
}
return result;
}
int interpret()
{
allocatorPool = pattern->m_allocator->startAllocator();
if (!allocatorPool)
CRASH();
for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
output[i] = -1;
DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true);
if (result == JSRegExpMatch) {
output[0] = context->matchBegin;
output[1] = context->matchEnd;
}
freeDisjunctionContext(context);
pattern->m_allocator->stopAllocator();
// RegExp.cpp currently expects all error to be converted to -1.
ASSERT((result == JSRegExpMatch) == (output[0] != -1));
return output[0];
}
Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
: pattern(pattern)
, output(output)
, input(inputChar, start, length)
, allocatorPool(0)
, remainingMatchCount(matchLimit)
{
}
private:
BytecodePattern* pattern;
int* output;
InputStream input;
BumpPointerPool* allocatorPool;
unsigned remainingMatchCount;
};
class ByteCompiler {
struct ParenthesesStackEntry {
unsigned beginTerm;
unsigned savedAlternativeIndex;
ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
: beginTerm(beginTerm)
, savedAlternativeIndex(savedAlternativeIndex)
{
}
};
public:
ByteCompiler(YarrPattern& pattern)
: m_pattern(pattern)
{
m_currentAlternativeIndex = 0;
}
PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
{
regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
emitDisjunction(m_pattern.m_body);
regexEnd();
return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
}
void checkInput(unsigned count)
{
m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
}
void uncheckInput(unsigned count)
{
m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
}
void assertionBOL(int inputPosition)
{
m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
}
void assertionEOL(int inputPosition)
{
m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
}
void assertionWordBoundary(bool invert, int inputPosition)
{
m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
}
void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
{
if (m_pattern.m_ignoreCase) {
UChar lo = Unicode::toLower(ch);
UChar hi = Unicode::toUpper(ch);
if (lo != hi) {
m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
return;
}
}
m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
}
void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
{
m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
}
void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
{
ASSERT(subpatternId);
m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
}
void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
{
int beginTerm = m_bodyDisjunction->terms.size();
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
m_currentAlternativeIndex = beginTerm + 1;
}
void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
{
int beginTerm = m_bodyDisjunction->terms.size();
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
m_currentAlternativeIndex = beginTerm + 1;
}
void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
{
// Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
// then fix this up at the end! - simplifying this should make it much clearer.
// https://bugs.webkit.org/show_bug.cgi?id=50136
int beginTerm = m_bodyDisjunction->terms.size();
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
m_currentAlternativeIndex = beginTerm + 1;
}
void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
{
int beginTerm = m_bodyDisjunction->terms.size();
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
m_currentAlternativeIndex = beginTerm + 1;
}
void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
{
unsigned beginTerm = popParenthesesStack();
closeAlternative(beginTerm + 1);
unsigned endTerm = m_bodyDisjunction->terms.size();
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
bool invert = m_bodyDisjunction->terms[beginTerm].invert();
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
}
unsigned popParenthesesStack()
{
ASSERT(m_parenthesesStack.size());
int stackEnd = m_parenthesesStack.size() - 1;
unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
m_parenthesesStack.shrink(stackEnd);
ASSERT(beginTerm < m_bodyDisjunction->terms.size());
ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
return beginTerm;
}
#ifndef NDEBUG
void dumpDisjunction(ByteDisjunction* disjunction)
{
printf("ByteDisjunction(%p):\n\t", disjunction);
for (unsigned i = 0; i < disjunction->terms.size(); ++i)
printf("{ %d } ", disjunction->terms[i].type);
printf("\n");
}
#endif
void closeAlternative(int beginTerm)
{
int origBeginTerm = beginTerm;
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
int endIndex = m_bodyDisjunction->terms.size();
unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
m_bodyDisjunction->terms.remove(beginTerm);
else {
while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
}
m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
}
}
void closeBodyAlternative()
{
int beginTerm = 0;
int origBeginTerm = 0;
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
int endIndex = m_bodyDisjunction->terms.size();
unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
}
m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
}
void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
{
unsigned beginTerm = popParenthesesStack();
closeAlternative(beginTerm + 1);
unsigned endTerm = m_bodyDisjunction->terms.size();
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
bool capture = parenthesesBegin.capture();
unsigned subpatternId = parenthesesBegin.atom.subpatternId;
unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
m_bodyDisjunction->terms.shrink(beginTerm);
m_allParenthesesInfo.append(parenthesesDisjunction);
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition));
m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
}
void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
{
unsigned beginTerm = popParenthesesStack();
closeAlternative(beginTerm + 1);
unsigned endTerm = m_bodyDisjunction->terms.size();
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
bool capture = m_bodyDisjunction->terms[beginTerm].capture();
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
}
void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
{
unsigned beginTerm = popParenthesesStack();
closeAlternative(beginTerm + 1);
unsigned endTerm = m_bodyDisjunction->terms.size();
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
bool capture = m_bodyDisjunction->terms[beginTerm].capture();
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
}
void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
{
m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
m_bodyDisjunction->terms[0].frameLocation = 0;
m_currentAlternativeIndex = 0;
}
void regexEnd()
{
closeBodyAlternative();
}
void alternativeBodyDisjunction(bool onceThrough)
{
int newAlternativeIndex = m_bodyDisjunction->terms.size();
m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
m_currentAlternativeIndex = newAlternativeIndex;
}
void alternativeDisjunction()
{
int newAlternativeIndex = m_bodyDisjunction->terms.size();
m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
m_currentAlternativeIndex = newAlternativeIndex;
}
void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0, bool isParentheticalAssertion = false)
{
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
PatternAlternative* alternative = disjunction->m_alternatives[alt];
if (alt) {
if (disjunction == m_pattern.m_body)
alternativeBodyDisjunction(alternative->onceThrough());
else
alternativeDisjunction();
}
unsigned minimumSize = alternative->m_minimumSize;
int countToCheck;
if (isParentheticalAssertion && parenthesesInputCountAlreadyChecked > minimumSize)
countToCheck = 0;
else
countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
ASSERT(countToCheck >= 0);
if (countToCheck) {
checkInput(countToCheck);
currentCountAlreadyChecked += countToCheck;
}
for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
PatternTerm& term = alternative->m_terms[i];
switch (term.type) {
case PatternTerm::TypeAssertionBOL:
assertionBOL(term.inputPosition - currentCountAlreadyChecked);
break;
case PatternTerm::TypeAssertionEOL:
assertionEOL(term.inputPosition - currentCountAlreadyChecked);
break;
case PatternTerm::TypeAssertionWordBoundary:
assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked);
break;
case PatternTerm::TypePatternCharacter:
atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
break;
case PatternTerm::TypeCharacterClass:
atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
break;
case PatternTerm::TypeBackReference:
atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
break;
case PatternTerm::TypeForwardReference:
break;
case PatternTerm::TypeParenthesesSubpattern: {
unsigned disjunctionAlreadyCheckedCount = 0;
if (term.quantityCount == 1 && !term.parentheses.isCopy) {
unsigned alternativeFrameLocation = term.frameLocation;
// For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
if (term.quantityType == QuantifierFixedCount)
disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
else
alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation);
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
} else if (term.parentheses.isTerminal) {
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
} else {
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
}
break;
}
case PatternTerm::TypeParentheticalAssertion: {
unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
int positiveInputOffset = currentCountAlreadyChecked - term.inputPosition;
int uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
if (uncheckAmount > 0) {
uncheckInput(uncheckAmount);
currentCountAlreadyChecked -= uncheckAmount;
} else
uncheckAmount = 0;
atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset, true);
atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
if (uncheckAmount) {
checkInput(uncheckAmount);
currentCountAlreadyChecked += uncheckAmount;
}
break;
}
}
}
}
}
private:
YarrPattern& m_pattern;
OwnPtr<ByteDisjunction> m_bodyDisjunction;
unsigned m_currentAlternativeIndex;
Vector<ParenthesesStackEntry> m_parenthesesStack;
Vector<ByteDisjunction*> m_allParenthesesInfo;
};
PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
{
return ByteCompiler(pattern).compile(allocator);
}
int interpret(BytecodePattern* bytecode, const UChar* input, unsigned start, unsigned length, int* output)
{
return Interpreter(bytecode, output, input, start, length).interpret();
}
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
} }