matcher.to) {
matcher.hitEnd = true;
return false;
}
if (atom.match(matcher, i, seq)) {
return next.match(matcher, matcher.last, seq);
}
i += countChars(seq, i, 1);
matcher.first++;
}
}
boolean study(TreeInfo info) {
atom.study(info);
info.maxValid = false;
info.deterministic = false;
return next.study(info);
}
}
static final class Conditional extends Node {
Node cond, yes, not;
Conditional(Node cond, Node yes, Node not) {
this.cond = cond;
this.yes = yes;
this.not = not;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
if (cond.match(matcher, i, seq)) {
return yes.match(matcher, i, seq);
} else {
return not.match(matcher, i, seq);
}
}
boolean study(TreeInfo info) {
int minL = info.minLength;
int maxL = info.maxLength;
boolean maxV = info.maxValid;
info.reset();
yes.study(info);
int minL2 = info.minLength;
int maxL2 = info.maxLength;
boolean maxV2 = info.maxValid;
info.reset();
not.study(info);
info.minLength = minL + Math.min(minL2, info.minLength);
info.maxLength = maxL + Math.max(maxL2, info.maxLength);
info.maxValid = (maxV & maxV2 & info.maxValid);
info.deterministic = false;
return next.study(info);
}
}
/**
* Zero width positive lookahead.
*/
static final class Pos extends Node {
Node cond;
Pos(Node cond) {
this.cond = cond;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int savedTo = matcher.to;
boolean conditionMatched = false;
// Relax transparent region boundaries for lookahead
if (matcher.transparentBounds)
matcher.to = matcher.getTextLength();
try {
conditionMatched = cond.match(matcher, i, seq);
} finally {
// Reinstate region boundaries
matcher.to = savedTo;
}
return conditionMatched && next.match(matcher, i, seq);
}
}
/**
* Zero width negative lookahead.
*/
static final class Neg extends Node {
Node cond;
Neg(Node cond) {
this.cond = cond;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int savedTo = matcher.to;
boolean conditionMatched = false;
// Relax transparent region boundaries for lookahead
if (matcher.transparentBounds)
matcher.to = matcher.getTextLength();
try {
if (i < matcher.to) {
conditionMatched = !cond.match(matcher, i, seq);
} else {
// If a negative lookahead succeeds then more input
// could cause it to fail!
matcher.requireEnd = true;
conditionMatched = !cond.match(matcher, i, seq);
}
} finally {
// Reinstate region boundaries
matcher.to = savedTo;
}
return conditionMatched && next.match(matcher, i, seq);
}
}
/**
* Zero width positive lookbehind.
*/
static class Behind extends Node {
Node cond;
int rmax, rmin;
Behind(Node cond, int rmax, int rmin) {
this.cond = cond;
this.rmax = rmax;
this.rmin = rmin;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int savedFrom = matcher.from;
boolean conditionMatched = false;
int startIndex = (!matcher.transparentBounds) ?
matcher.from : 0;
int from = Math.max(i - rmax, startIndex);
for (int j = i - rmin; j >= from; j--) {
// Relax transparent region boundaries for lookbehind
if (matcher.transparentBounds)
matcher.from = 0;
try {
conditionMatched =
(cond.match(matcher, j, seq) && matcher.last == i);
} finally { // Reinstate region boundaries
matcher.from = savedFrom;
}
if (conditionMatched)
return next.match(matcher, i, seq);
}
return false;
}
}
/**
* Zero width positive lookbehind, including supplementary
* characters or unpaired surrogates.
*/
static final class BehindS extends Behind {
BehindS(Node cond, int rmax, int rmin) {
super(cond, rmax, rmin);
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int rmaxChars = countChars(seq, i, -rmax);
int rminChars = countChars(seq, i, -rmin);
int savedFrom = matcher.from;
int startIndex = (!matcher.transparentBounds) ?
matcher.from : 0;
boolean conditionMatched = false;
int from = Math.max(i - rmaxChars, startIndex);
for (int j = i - rminChars;
j >= from;
j -= j>from ? countChars(seq, j, -1) : 1) {
// Relax transparent region boundaries for lookbehind
if (matcher.transparentBounds)
matcher.from = 0;
try {
conditionMatched =
(cond.match(matcher, j, seq) && matcher.last == i);
} finally { // Reinstate region boundaries
matcher.from = savedFrom;
}
if (conditionMatched)
return next.match(matcher, i, seq);
}
return false;
}
}
/**
* Zero width negative lookbehind.
*/
static class NotBehind extends Node {
Node cond;
int rmax, rmin;
NotBehind(Node cond, int rmax, int rmin) {
this.cond = cond;
this.rmax = rmax;
this.rmin = rmin;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int savedFrom = matcher.from;
boolean conditionMatched = false;
int startIndex = (!matcher.transparentBounds) ?
matcher.from : 0;
int from = Math.max(i - rmax, startIndex);
for (int j = i - rmin; j >= from; j--) {
// Relax transparent region boundaries for lookbehind
if (matcher.transparentBounds)
matcher.from = 0;
try {
conditionMatched =
(cond.match(matcher, j, seq) && matcher.last == i);
} finally { // Reinstate region boundaries
matcher.from = savedFrom;
}
if (conditionMatched)
return false;
}
return next.match(matcher, i, seq);
}
}
/**
* Zero width negative lookbehind, including supplementary
* characters or unpaired surrogates.
*/
static final class NotBehindS extends NotBehind {
NotBehindS(Node cond, int rmax, int rmin) {
super(cond, rmax, rmin);
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int rmaxChars = countChars(seq, i, -rmax);
int rminChars = countChars(seq, i, -rmin);
int savedFrom = matcher.from;
boolean conditionMatched = false;
int startIndex = (!matcher.transparentBounds) ?
matcher.from : 0;
int from = Math.max(i - rmaxChars, startIndex);
for (int j = i - rminChars;
j >= from;
j -= j>from ? countChars(seq, j, -1) : 1) {
// Relax transparent region boundaries for lookbehind
if (matcher.transparentBounds)
matcher.from = 0;
try {
conditionMatched =
(cond.match(matcher, j, seq) && matcher.last == i);
} finally { // Reinstate region boundaries
matcher.from = savedFrom;
}
if (conditionMatched)
return false;
}
return next.match(matcher, i, seq);
}
}
/**
* An object added to the tree when a character class has an additional
* range added to it.
*/
static class Add extends Node {
Node lhs, rhs;
Add(Node lhs, Node rhs) {
this.lhs = lhs;
this.rhs = rhs;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
return ((lhs.match(matcher, i, seq) ||
rhs.match(matcher, i, seq)) &&
next.match(matcher, matcher.last, seq));
}
matcher.hitEnd = true;
return false;
}
boolean study(TreeInfo info) {
boolean maxV = info.maxValid;
boolean detm = info.deterministic;
int minL = info.minLength;
int maxL = info.maxLength;
lhs.study(info);
int minL2 = info.minLength;
int maxL2 = info.maxLength;
info.minLength = minL;
info.maxLength = maxL;
rhs.study(info);
info.minLength = Math.min(minL2, info.minLength);
info.maxLength = Math.max(maxL2, info.maxLength);
info.maxValid = maxV;
info.deterministic = detm;
return next.study(info);
}
}
/**
* An object added to the tree when a character class has another
* nested class in it.
*/
static class Both extends Node {
Node lhs, rhs;
Both(Node lhs, Node rhs) {
this.lhs = lhs;
this.rhs = rhs;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
return ((lhs.match(matcher, i, seq) &&
rhs.match(matcher, i, seq)) &&
next.match(matcher, matcher.last, seq));
}
matcher.hitEnd = true;
return false;
}
boolean study(TreeInfo info) {
boolean maxV = info.maxValid;
boolean detm = info.deterministic;
int minL = info.minLength;
int maxL = info.maxLength;
lhs.study(info);
int minL2 = info.minLength;
int maxL2 = info.maxLength;
info.minLength = minL;
info.maxLength = maxL;
rhs.study(info);
info.minLength = Math.min(minL2, info.minLength);
info.maxLength = Math.max(maxL2, info.maxLength);
info.maxValid = maxV;
info.deterministic = detm;
return next.study(info);
}
}
/**
* An object added to the tree when a character class has a range
* or single subtracted from it.
*/
static final class Sub extends Add {
Sub(Node lhs, Node rhs) {
super(lhs, rhs);
}
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
return !rhs.match(matcher, i, seq)
&& lhs.match(matcher, i, seq)
&& next.match(matcher, matcher.last, seq);
}
matcher.hitEnd = true;
return false;
}
boolean study(TreeInfo info) {
lhs.study(info);
return next.study(info);
}
}
/**
* Handles word boundaries. Includes a field to allow this one class to
* deal with the different types of word boundaries we can match. The word
* characters include underscores, letters, and digits. Non spacing marks
* can are also part of a word if they have a base character, otherwise
* they are ignored for purposes of finding word boundaries.
*/
static final class Bound extends Node {
static int LEFT = 0x1;
static int RIGHT= 0x2;
static int BOTH = 0x3;
static int NONE = 0x4;
int type;
Bound(int n) {
type = n;
}
int check(Matcher matcher, int i, CharSequence seq) {
int ch;
boolean left = false;
int startIndex = matcher.from;
int endIndex = matcher.to;
if (matcher.transparentBounds) {
startIndex = 0;
endIndex = matcher.getTextLength();
}
if (i > startIndex) {
ch = Character.codePointBefore(seq, i);
left = (ch == '_' || Character.isLetterOrDigit(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i-1, seq)));
}
boolean right = false;
if (i < endIndex) {
ch = Character.codePointAt(seq, i);
right = (ch == '_' || Character.isLetterOrDigit(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i, seq)));
} else {
// Tried to access char past the end
matcher.hitEnd = true;
// The addition of another char could wreck a boundary
matcher.requireEnd = true;
}
return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
}
boolean match(Matcher matcher, int i, CharSequence seq) {
return (check(matcher, i, seq) & type) > 0
&& next.match(matcher, i, seq);
}
}
/**
* Non spacing marks only count as word characters in bounds calculations
* if they have a base character.
*/
private static boolean hasBaseCharacter(Matcher matcher, int i,
CharSequence seq)
{
int start = (!matcher.transparentBounds) ?
matcher.from : 0;
for (int x=i; x >= start; x--) {
int ch = Character.codePointAt(seq, x);
if (Character.isLetterOrDigit(ch))
return true;
if (Character.getType(ch) == Character.NON_SPACING_MARK)
continue;
return false;
}
return false;
}
/**
* Attempts to match a slice in the input using the Boyer-Moore string
* matching algorithm. The algorithm is based on the idea that the
* pattern can be shifted farther ahead in the search text if it is
* matched right to left.
*
* The pattern is compared to the input one character at a time, from
* the rightmost character in the pattern to the left. If the characters
* all match the pattern has been found. If a character does not match,
* the pattern is shifted right a distance that is the maximum of two
* functions, the bad character shift and the good suffix shift. This
* shift moves the attempted match position through the input more
* quickly than a naive one position at a time check.
*
* The bad character shift is based on the character from the text that
* did not match. If the character does not appear in the pattern, the
* pattern can be shifted completely beyond the bad character. If the
* character does occur in the pattern, the pattern can be shifted to
* line the pattern up with the next occurrence of that character.
*
* The good suffix shift is based on the idea that some subset on the right
* side of the pattern has matched. When a bad character is found, the
* pattern can be shifted right by the pattern length if the subset does
* not occur again in pattern, or by the amount of distance to the
* next occurrence of the subset in the pattern.
*
* Boyer-Moore search methods adapted from code by Amy Yu.
*/
static class BnM extends Node {
int[] buffer;
int[] lastOcc;
int[] optoSft;
/**
* Pre calculates arrays needed to generate the bad character
* shift and the good suffix shift. Only the last seven bits
* are used to see if chars match; This keeps the tables small
* and covers the heavily used ASCII range, but occasionally
* results in an aliased match for the bad character shift.
*/
static Node optimize(Node node) {
if (!(node instanceof Slice)) {
return node;
}
int[] src = ((Slice) node).buffer;
int patternLength = src.length;
// The BM algorithm requires a bit of overhead;
// If the pattern is short don't use it, since
// a shift larger than the pattern length cannot
// be used anyway.
if (patternLength < 4) {
return node;
}
int i, j, k;
int[] lastOcc = new int[128];
int[] optoSft = new int[patternLength];
// Precalculate part of the bad character shift
// It is a table for where in the pattern each
// lower 7-bit value occurs
for (i = 0; i < patternLength; i++) {
lastOcc[src[i]&0x7F] = i + 1;
}
// Precalculate the good suffix shift
// i is the shift amount being considered
NEXT: for (i = patternLength; i > 0; i--) {
// j is the beginning index of suffix being considered
for (j = patternLength - 1; j >= i; j--) {
// Testing for good suffix
if (src[j] == src[j-i]) {
// src[j..len] is a good suffix
optoSft[j-1] = i;
} else {
// No match. The array has already been
// filled up with correct values before.
continue NEXT;
}
}
// This fills up the remaining of optoSft
// any suffix can not have larger shift amount
// then its sub-suffix. Why???
while (j > 0) {
optoSft[--j] = i;
}
}
// Set the guard value because of unicode compression
optoSft[patternLength-1] = 1;
if (node instanceof SliceS)
return new BnMS(src, lastOcc, optoSft, node.next);
return new BnM(src, lastOcc, optoSft, node.next);
}
BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {
this.buffer = src;
this.lastOcc = lastOcc;
this.optoSft = optoSft;
this.next = next;
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int[] src = buffer;
int patternLength = src.length;
int last = matcher.to - patternLength;
// Loop over all possible match positions in text
NEXT: while (i <= last) {
// Loop over pattern from right to left
for (int j = patternLength - 1; j >= 0; j--) {
int ch = seq.charAt(i+j);
if (ch != src[j]) {
// Shift search to the right by the maximum of the
// bad character shift and the good suffix shift
i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);
continue NEXT;
}
}
// Entire pattern matched starting at i
matcher.first = i;
boolean ret = next.match(matcher, i + patternLength, seq);
if (ret) {
matcher.first = i;
matcher.groups[0] = matcher.first;
matcher.groups[1] = matcher.last;
return true;
}
i++;
}
// BnM is only used as the leading node in the unanchored case,
// and it replaced its Start() which always searches to the end
// if it doesn't find what it's looking for, so hitEnd is true.
matcher.hitEnd = true;
return false;
}
boolean study(TreeInfo info) {
info.minLength += buffer.length;
info.maxValid = false;
return next.study(info);
}
}
/**
* Supplementary support version of BnM(). Unpaired surrogates are
* also handled by this class.
*/
static final class BnMS extends BnM {
int lengthInChars;
BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {
super(src, lastOcc, optoSft, next);
for (int x = 0; x < buffer.length; x++) {
lengthInChars += Character.charCount(buffer[x]);
}
}
boolean match(Matcher matcher, int i, CharSequence seq) {
int[] src = buffer;
int patternLength = src.length;
int last = matcher.to - lengthInChars;
// Loop over all possible match positions in text
NEXT: while (i <= last) {
// Loop over pattern from right to left
int ch;
for (int j = countChars(seq, i, patternLength), x = patternLength - 1;
j > 0; j -= Character.charCount(ch), x--) {
ch = Character.codePointBefore(seq, i+j);
if (ch != src[x]) {
// Shift search to the right by the maximum of the
// bad character shift and the good suffix shift
int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]);
i += countChars(seq, i, n);
continue NEXT;
}
}
// Entire pattern matched starting at i
matcher.first = i;
boolean ret = next.match(matcher, i + lengthInChars, seq);
if (ret) {
matcher.first = i;
matcher.groups[0] = matcher.first;
matcher.groups[1] = matcher.last;
return true;
}
i += countChars(seq, i, 1);
}
matcher.hitEnd = true;
return false;
}
}
/**
* Node class for matching characters in a Unicode block
*/
static final class UBlock extends Node {
Character.UnicodeBlock block;
boolean complementMe = false;
UBlock() {
}
UBlock(Character.UnicodeBlock block, boolean not) {
this.block = block;
this.complementMe = not;
}
Node dup(boolean not) {
if (not)
return new UBlock(block, !complementMe);
else
return new UBlock(block, complementMe);
}
boolean match(Matcher matcher, int i, CharSequence seq) {
if (complementMe)
return notMatch(matcher, i, seq);
if (i < matcher.to) {
int ch = Character.codePointAt(seq, i);
return (block == Character.UnicodeBlock.of(ch) &&
(next.match(matcher, i+Character.charCount(ch), seq)));
}
matcher.hitEnd = true;
return false;
}
boolean notMatch(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
int ch = Character.codePointAt(seq, i);
return (block != Character.UnicodeBlock.of(ch) &&
(next.match(matcher, i+Character.charCount(ch), seq)));
}
matcher.hitEnd = true;
return false;
}
boolean study(TreeInfo info) {
info.minLength++;
info.maxLength++;
return next.study(info);
}
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/**
* This must be the very first initializer.
*/
static Node accept = new Node();
static Node lastAccept = new LastNode();
static class categoryNames {
static HashMap cMap = new HashMap();
static {
cMap.put("Cn", new Category(1<<0)); // UNASSIGNED
cMap.put("Lu", new Category(1<<1)); // UPPERCASE_LETTER
cMap.put("Ll", new Category(1<<2)); // LOWERCASE_LETTER
cMap.put("Lt", new Category(1<<3)); // TITLECASE_LETTER
cMap.put("Lm", new Category(1<<4)); // MODIFIER_LETTER
cMap.put("Lo", new Category(1<<5)); // OTHER_LETTER
cMap.put("Mn", new Category(1<<6)); // NON_SPACING_MARK
cMap.put("Me", new Category(1<<7)); // ENCLOSING_MARK
cMap.put("Mc", new Category(1<<8)); // COMBINING_SPACING_MARK
cMap.put("Nd", new Category(1<<9)); // DECIMAL_DIGIT_NUMBER
cMap.put("Nl", new Category(1<<10)); // LETTER_NUMBER
cMap.put("No", new Category(1<<11)); // OTHER_NUMBER
cMap.put("Zs", new Category(1<<12)); // SPACE_SEPARATOR
cMap.put("Zl", new Category(1<<13)); // LINE_SEPARATOR
cMap.put("Zp", new Category(1<<14)); // PARAGRAPH_SEPARATOR
cMap.put("Cc", new Category(1<<15)); // CNTRL
cMap.put("Cf", new Category(1<<16)); // FORMAT
cMap.put("Co", new Category(1<<18)); // PRIVATE USE
cMap.put("Cs", new Category(1<<19)); // SURROGATE
cMap.put("Pd", new Category(1<<20)); // DASH_PUNCTUATION
cMap.put("Ps", new Category(1<<21)); // START_PUNCTUATION
cMap.put("Pe", new Category(1<<22)); // END_PUNCTUATION
cMap.put("Pc", new Category(1<<23)); // CONNECTOR_PUNCTUATION
cMap.put("Po", new Category(1<<24)); // OTHER_PUNCTUATION
cMap.put("Sm", new Category(1<<25)); // MATH_SYMBOL
cMap.put("Sc", new Category(1<<26)); // CURRENCY_SYMBOL
cMap.put("Sk", new Category(1<<27)); // MODIFIER_SYMBOL
cMap.put("So", new Category(1<<28)); // OTHER_SYMBOL
cMap.put("L", new Category(0x0000003E)); // LETTER
cMap.put("M", new Category(0x000001C0)); // MARK
cMap.put("N", new Category(0x00000E00)); // NUMBER
cMap.put("Z", new Category(0x00007000)); // SEPARATOR
cMap.put("C", new Category(0x000D8000)); // CONTROL
cMap.put("P", new Category(0x01F00000)); // PUNCTUATION
cMap.put("S", new Category(0x1E000000)); // SYMBOL
cMap.put("LD", new Category(0x0000023E)); // LETTER_OR_DIGIT
cMap.put("L1", new Range(0x000000FF)); // Latin-1
cMap.put("all", new All()); // ALL
cMap.put("ASCII", new Range(0x0000007F)); // ASCII
cMap.put("Alnum", new Ctype(ASCII.ALNUM)); // Alphanumeric characters
cMap.put("Alpha", new Ctype(ASCII.ALPHA)); // Alphabetic characters
cMap.put("Blank", new Ctype(ASCII.BLANK)); // Space and tab characters
cMap.put("Cntrl", new Ctype(ASCII.CNTRL)); // Control characters
cMap.put("Digit", new Range(('0'<<16)|'9')); // Numeric characters
cMap.put("Graph", new Ctype(ASCII.GRAPH)); // printable and visible
cMap.put("Lower", new Range(('a'<<16)|'z')); // Lower-case alphabetic
cMap.put("Print", new Range(0x0020007E)); // Printable characters
cMap.put("Punct", new Ctype(ASCII.PUNCT)); // Punctuation characters
cMap.put("Space", new Ctype(ASCII.SPACE)); // Space characters
cMap.put("Upper", new Range(('A'<<16)|'Z')); // Upper-case alphabetic
cMap.put("XDigit", new Ctype(ASCII.XDIGIT)); // hexadecimal digits
cMap.put("javaLowerCase", new JavaLowerCase());
cMap.put("javaUpperCase", new JavaUpperCase());
cMap.put("javaTitleCase", new JavaTitleCase());
cMap.put("javaDigit", new JavaDigit());
cMap.put("javaDefined", new JavaDefined());
cMap.put("javaLetter", new JavaLetter());
cMap.put("javaLetterOrDigit", new JavaLetterOrDigit());
cMap.put("javaJavaIdentifierStart", new JavaJavaIdentifierStart());
cMap.put("javaJavaIdentifierPart", new JavaJavaIdentifierPart());
cMap.put("javaUnicodeIdentifierStart", new JavaUnicodeIdentifierStart());
cMap.put("javaUnicodeIdentifierPart", new JavaUnicodeIdentifierPart());
cMap.put("javaIdentifierIgnorable", new JavaIdentifierIgnorable());
cMap.put("javaSpaceChar", new JavaSpaceChar());
cMap.put("javaWhitespace", new JavaWhitespace());
cMap.put("javaISOControl", new JavaISOControl());
cMap.put("javaMirrored", new JavaMirrored());
}
}
}