package com.sun.org.apache.regexp.internal;
/*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Group.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* .
*
*/
import com.sun.org.apache.regexp.internal.RE;
import java.util.Hashtable;
/**
* A regular expression compiler class. This class compiles a pattern string into a
* regular expression program interpretable by the RE evaluator class. The 'recompile'
* command line tool uses this compiler to pre-compile regular expressions for use
* with RE. For a description of the syntax accepted by RECompiler and what you can
* do with regular expressions, see the documentation for the RE matcher class.
*
* @see RE
* @see recompile
*
* @author Jonathan Locke
* @version $Id: RECompiler.java,v 1.2 2000/05/14 21:04:17 jon Exp $
*/
public class RECompiler
{
// The compiled program
char[] instruction; // The compiled RE 'program' instruction buffer
int lenInstruction; // The amount of the program buffer currently in use
// Input state for compiling regular expression
String pattern; // Input string
int len; // Length of the pattern string
int idx; // Current input index into ac
int parens; // Total number of paren pairs
// Node flags
static final int NODE_NORMAL = 0; // No flags (nothing special)
static final int NODE_NULLABLE = 1; // True if node is potentially null
static final int NODE_TOPLEVEL = 2; // True if top level expr
// Special types of 'escapes'
static final char ESC_MASK = 0xfff0; // Escape complexity mask
static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
// {m,n} stacks
static final int maxBrackets = 10; // Maximum number of bracket pairs
static int brackets = 0; // Number of bracket sets
static int[] bracketStart = null; // Starting point
static int[] bracketEnd = null; // Ending point
static int[] bracketMin = null; // Minimum number of matches
static int[] bracketOpt = null; // Additional optional matches
static final int bracketUnbounded = -1; // Unbounded value
static final int bracketFinished = -2; // Unbounded value
// Lookup table for POSIX character class names
static Hashtable hashPOSIX = new Hashtable();
static
{
hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM));
hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA));
hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK));
hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL));
hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT));
hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH));
hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER));
hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT));
hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT));
hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE));
hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER));
hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT));
hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART));
}
/**
* Constructor. Creates (initially empty) storage for a regular expression program.
*/
public RECompiler()
{
// Start off with a generous, yet reasonable, initial size
instruction = new char[128];
lenInstruction = 0;
}
/**
* Ensures that n more characters can fit in the program buffer.
* If n more can't fit, then the size is doubled until it can.
* @param n Number of additional characters to ensure will fit.
*/
void ensure(int n)
{
// Get current program length
int curlen = instruction.length;
// If the current length + n more is too much
if (lenInstruction + n >= curlen)
{
// Double the size of the program array until n more will fit
while (lenInstruction + n >= curlen)
{
curlen *= 2;
}
// Allocate new program array and move data into it
char[] newInstruction = new char[curlen];
System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
instruction = newInstruction;
}
}
/**
* Emit a single character into the program stream.
* @param c Character to add
*/
void emit(char c)
{
// Make room for character
ensure(1);
// Add character
instruction[lenInstruction++] = c;
}
/**
* Inserts a node with a given opcode and opdata at insertAt. The node relative next
* pointer is initialized to 0.
* @param opcode Opcode for new node
* @param opdata Opdata for new node (only the low 16 bits are currently used)
* @param insertAt Index at which to insert the new node in the program
*/
void nodeInsert(char opcode, int opdata, int insertAt)
{
// Make room for a new node
ensure(RE.nodeSize);
// Move everything from insertAt to the end down nodeSize elements
System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
instruction[insertAt + RE.offsetOpcode] = opcode;
instruction[insertAt + RE.offsetOpdata] = (char)opdata;
instruction[insertAt + RE.offsetNext] = 0;
lenInstruction += RE.nodeSize;
}
/**
* Appends a node to the end of a node chain
* @param node Start of node chain to traverse
* @param pointTo Node to have the tail of the chain point to
*/
void setNextOfEnd(int node, int pointTo)
{
// Traverse the chain until the next offset is 0
int next;
while ((next = instruction[node + RE.offsetNext]) != 0)
{
node += next;
}
// Point the last node in the chain to pointTo.
instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
}
/**
* Adds a new node
* @param opcode Opcode for node
* @param opdata Opdata for node (only the low 16 bits are currently used)
* @return Index of new node in program
*/
int node(char opcode, int opdata)
{
// Make room for a new node
ensure(RE.nodeSize);
// Add new node at end
instruction[lenInstruction + RE.offsetOpcode] = opcode;
instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
instruction[lenInstruction + RE.offsetNext] = 0;
lenInstruction += RE.nodeSize;
// Return index of new node
return lenInstruction - RE.nodeSize;
}
/**
* Throws a new internal error exception
* @exception Error Thrown in the event of an internal error.
*/
void internalError() throws Error
{
throw new Error("Internal error!");
}
/**
* Throws a new syntax error exception
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
void syntaxError(String s) throws RESyntaxException
{
throw new RESyntaxException(s);
}
/**
* Allocate storage for brackets only as needed
*/
void allocBrackets()
{
// Allocate bracket stacks if not already done
if (bracketStart == null)
{
// Allocate storage
bracketStart = new int[maxBrackets];
bracketEnd = new int[maxBrackets];
bracketMin = new int[maxBrackets];
bracketOpt = new int[maxBrackets];
// Initialize to invalid values
for (int i = 0; i < maxBrackets; i++)
{
bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
}
}
}
/**
* Match bracket {m,n} expression put results in bracket member variables
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
void bracket() throws RESyntaxException
{
// Current character must be a '{'
if (idx >= len || pattern.charAt(idx++) != '{')
{
internalError();
}
// Next char must be a digit
if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
{
syntaxError("Expected digit");
}
// Get min ('m' of {m,n}) number
StringBuffer number = new StringBuffer();
while (idx < len && Character.isDigit(pattern.charAt(idx)))
{
number.append(pattern.charAt(idx++));
}
try
{
bracketMin[brackets] = Integer.parseInt(number.toString());
}
catch (NumberFormatException e)
{
syntaxError("Expected valid number");
}
// If out of input, fail
if (idx >= len)
{
syntaxError("Expected comma or right bracket");
}
// If end of expr, optional limit is 0
if (pattern.charAt(idx) == '}')
{
idx++;
bracketOpt[brackets] = 0;
return;
}
// Must have at least {m,} and maybe {m,n}.
if (idx >= len || pattern.charAt(idx++) != ',')
{
syntaxError("Expected comma");
}
// If out of input, fail
if (idx >= len)
{
syntaxError("Expected comma or right bracket");
}
// If {m,} max is unlimited
if (pattern.charAt(idx) == '}')
{
idx++;
bracketOpt[brackets] = bracketUnbounded;
return;
}
// Next char must be a digit
if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
{
syntaxError("Expected digit");
}
// Get max number
number.setLength(0);
while (idx < len && Character.isDigit(pattern.charAt(idx)))
{
number.append(pattern.charAt(idx++));
}
try
{
bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
}
catch (NumberFormatException e)
{
syntaxError("Expected valid number");
}
// Optional repetitions must be > 0
if (bracketOpt[brackets] <= 0)
{
syntaxError("Bad range");
}
// Must have close brace
if (idx >= len || pattern.charAt(idx++) != '}')
{
syntaxError("Missing close brace");
}
}
/**
* Match an escape sequence. Handles quoted chars and octal escapes as well
* as normal escape characters. Always advances the input stream by the
* right amount. This code "understands" the subtle difference between an
* octal escape and a backref. You can access the type of ESC_CLASS or
* ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
* @return ESC_* code or character if simple escape
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
char escape() throws RESyntaxException
{
// "Shouldn't" happen
if (pattern.charAt(idx) != '\\')
{
internalError();
}
// Escape shouldn't occur as last character in string!
if (idx + 1 == len)
{
syntaxError("Escape terminates string");
}
// Switch on character after backslash
idx += 2;
char escapeChar = pattern.charAt(idx - 1);
switch (escapeChar)
{
case RE.E_BOUND:
case RE.E_NBOUND:
return ESC_COMPLEX;
case RE.E_ALNUM:
case RE.E_NALNUM:
case RE.E_SPACE:
case RE.E_NSPACE:
case RE.E_DIGIT:
case RE.E_NDIGIT:
return ESC_CLASS;
case 'u':
case 'x':
{
// Exact required hex digits for escape type
int hexDigits = (escapeChar == 'u' ? 4 : 2);
// Parse up to hexDigits characters from input
int val = 0;
for ( ; idx < len && hexDigits-- > 0; idx++)
{
// Get char
char c = pattern.charAt(idx);
// If it's a hexadecimal digit (0-9)
if (c >= '0' && c <= '9')
{
// Compute new value
val = (val << 4) + c - '0';
}
else
{
// If it's a hexadecimal letter (a-f)
c = Character.toLowerCase(c);
if (c >= 'a' && c <= 'f')
{
// Compute new value
val = (val << 4) + (c - 'a') + 10;
}
else
{
// If it's not a valid digit or hex letter, the escape must be invalid
// because hexDigits of input have not been absorbed yet.
syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
}
}
}
return (char)val;
}
case 't':
return '\t';
case 'n':
return '\n';
case 'r':
return '\r';
case 'f':
return '\f';
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
// An octal escape starts with a 0 or has two digits in a row
if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
{
// Handle \nnn octal escapes
int val = escapeChar - '0';
if (idx < len && Character.isDigit(pattern.charAt(idx)))
{
val = ((val << 3) + (pattern.charAt(idx++) - '0'));
if (idx < len && Character.isDigit(pattern.charAt(idx)))
{
val = ((val << 3) + (pattern.charAt(idx++) - '0'));
}
}
return (char)val;
}
// It's actually a backreference (\[1-9]), not an escape
return ESC_BACKREF;
default:
// Simple quoting of a character
return escapeChar;
}
}
/**
* Compile a character class
* @return Index of class node
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int characterClass() throws RESyntaxException
{
// Check for bad calling or empty class
if (pattern.charAt(idx) != '[')
{
internalError();
}
// Check for unterminated or empty class
if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
{
syntaxError("Empty or unterminated class");
}
// Check for POSIX character class
if (idx < len && pattern.charAt(idx) == ':')
{
// Skip colon
idx++;
// POSIX character classes are denoted with lowercase ASCII strings
int idxStart = idx;
while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
{
idx++;
}
// Should be a ":]" to terminate the POSIX character class
if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
{
// Get character class
String charClass = pattern.substring(idxStart, idx);
// Select the POSIX class id
Character i = (Character)hashPOSIX.get(charClass);
if (i != null)
{
// Move past colon and right bracket
idx += 2;
// Return new POSIX character class node
return node(RE.OP_POSIXCLASS, i.charValue());
}
syntaxError("Invalid POSIX character class '" + charClass + "'");
}
syntaxError("Invalid POSIX character class syntax");
}
// Try to build a class. Create OP_ANYOF node
int ret = node(RE.OP_ANYOF, 0);
// Parse class declaration
char CHAR_INVALID = Character.MAX_VALUE;
char last = CHAR_INVALID;
char simpleChar = 0;
boolean include = true;
boolean definingRange = false;
int idxFirst = idx;
char rangeStart = Character.MIN_VALUE;
char rangeEnd;
RERange range = new RERange();
while (idx < len && pattern.charAt(idx) != ']')
{
switchOnCharacter:
// Switch on character
switch (pattern.charAt(idx))
{
case '^':
include = !include;
if (idx == idxFirst)
{
range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
}
idx++;
continue;
case '\\':
{
// Escape always advances the stream
char c;
switch (c = escape ())
{
case ESC_COMPLEX:
case ESC_BACKREF:
// Word boundaries and backrefs not allowed in a character class!
syntaxError("Bad character class");
case ESC_CLASS:
// Classes can't be an endpoint of a range
if (definingRange)
{
syntaxError("Bad character class");
}
// Handle specific type of class (some are ok)
switch (pattern.charAt(idx - 1))
{
case RE.E_NSPACE:
case RE.E_NDIGIT:
case RE.E_NALNUM:
syntaxError("Bad character class");
case RE.E_SPACE:
range.include('\t', include);
range.include('\r', include);
range.include('\f', include);
range.include('\n', include);
range.include('\b', include);
range.include(' ', include);
break;
case RE.E_ALNUM:
range.include('a', 'z', include);
range.include('A', 'Z', include);
range.include('_', include);
// Fall through!
case RE.E_DIGIT:
range.include('0', '9', include);
break;
}
// Make last char invalid (can't be a range start)
last = CHAR_INVALID;
break;
default:
// Escape is simple so treat as a simple char
simpleChar = c;
break switchOnCharacter;
}
}
continue;
case '-':
// Start a range if one isn't already started
if (definingRange)
{
syntaxError("Bad class range");
}
definingRange = true;
// If no last character, start of range is 0
rangeStart = (last == CHAR_INVALID ? 0 : last);
// Premature end of range. define up to Character.MAX_VALUE
if ((idx + 1) < len && pattern.charAt(++idx) == ']')
{
simpleChar = Character.MAX_VALUE;
break;
}
continue;
default:
simpleChar = pattern.charAt(idx++);
break;
}
// Handle simple character simpleChar
if (definingRange)
{
// if we are defining a range make it now
rangeEnd = simpleChar;
// Actually create a range if the range is ok
if (rangeStart >= rangeEnd)
{
syntaxError("Bad character class");
}
range.include(rangeStart, rangeEnd, include);
// We are done defining the range
last = CHAR_INVALID;
definingRange = false;
}
else
{
// If simple character and not start of range, include it
if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-')
{
range.include(simpleChar, include);
}
last = simpleChar;
}
}
// Shouldn't be out of input
if (idx == len)
{
syntaxError("Unterminated character class");
}
// Absorb the ']' end of class marker
idx++;
// Emit character class definition
instruction[ret + RE.offsetOpdata] = (char)range.num;
for (int i = 0; i < range.num; i++)
{
emit((char)range.minRange[i]);
emit((char)range.maxRange[i]);
}
return ret;
}
/**
* Absorb an atomic character string. This method is a little tricky because
* it can un-include the last character of string if a closure operator follows.
* This is correct because *+? have higher precedence than concatentation (thus
* ABC* means AB(C*) and NOT (ABC)*).
* @return Index of new atom node
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int atom() throws RESyntaxException
{
// Create a string node
int ret = node(RE.OP_ATOM, 0);
// Length of atom
int lenAtom = 0;
// Loop while we've got input
atomLoop:
while (idx < len)
{
// Is there a next char?
if ((idx + 1) < len)
{
char c = pattern.charAt(idx + 1);
// If the next 'char' is an escape, look past the whole escape
if (pattern.charAt(idx) == '\\')
{
int idxEscape = idx;
escape();
if (idx < len)
{
c = pattern.charAt(idx);
}
idx = idxEscape;
}
// Switch on next char
switch (c)
{
case '{':
case '?':
case '*':
case '+':
// If the next character is a closure operator and our atom is non-empty, the
// current character should bind to the closure operator rather than the atom
if (lenAtom != 0)
{
break atomLoop;
}
}
}
// Switch on current char
switch (pattern.charAt(idx))
{
case ']':
case '^':
case '$':
case '.':
case '[':
case '(':
case ')':
case '|':
break atomLoop;
case '{':
case '?':
case '*':
case '+':
// We should have an atom by now
if (lenAtom == 0)
{
// No atom before closure
syntaxError("Missing operand to closure");
}
break atomLoop;
case '\\':
{
// Get the escaped character (advances input automatically)
int idxBeforeEscape = idx;
char c = escape();
// Check if it's a simple escape (as opposed to, say, a backreference)
if ((c & ESC_MASK) == ESC_MASK)
{
// Not a simple escape, so backup to where we were before the escape.
idx = idxBeforeEscape;
break atomLoop;
}
// Add escaped char to atom
emit(c);
lenAtom++;
}
break;
default:
// Add normal character to atom
emit(pattern.charAt(idx++));
lenAtom++;
break;
}
}
// This "shouldn't" happen
if (lenAtom == 0)
{
internalError();
}
// Emit the atom length into the program
instruction[ret + RE.offsetOpdata] = (char)lenAtom;
return ret;
}
/**
* Match a terminal node.
* @param flags Flags
* @return Index of terminal node (closeable)
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int terminal(int[] flags) throws RESyntaxException
{
switch (pattern.charAt(idx))
{
case RE.OP_EOL:
case RE.OP_BOL:
case RE.OP_ANY:
return node(pattern.charAt(idx++), 0);
case '[':
return characterClass();
case '(':
return expr(flags);
case ')':
syntaxError("Unexpected close paren");
case '|':
internalError();
case ']':
syntaxError("Mismatched class");
case 0:
syntaxError("Unexpected end of input");
case '?':
case '+':
case '{':
case '*':
syntaxError("Missing operand to closure");
case '\\':
{
// Don't forget, escape() advances the input stream!
int idxBeforeEscape = idx;
// Switch on escaped character
switch (escape())
{
case ESC_CLASS:
case ESC_COMPLEX:
flags[0] &= ~NODE_NULLABLE;
return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
case ESC_BACKREF:
{
char backreference = (char)(pattern.charAt(idx - 1) - '0');
if (parens <= backreference)
{
syntaxError("Bad backreference");
}
flags[0] |= NODE_NULLABLE;
return node(RE.OP_BACKREF, backreference);
}
default:
// We had a simple escape and we want to have it end up in
// an atom, so we back up and fall though to the default handling
idx = idxBeforeEscape;
flags[0] &= ~NODE_NULLABLE;
break;
}
}
}
// Everything above either fails or returns.
// If it wasn't one of the above, it must be the start of an atom.
flags[0] &= ~NODE_NULLABLE;
return atom();
}
/**
* Compile a possibly closured terminal
* @param flags Flags passed by reference
* @return Index of closured node
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int closure(int[] flags) throws RESyntaxException
{
// Before terminal
int idxBeforeTerminal = idx;
// Values to pass by reference to terminal()
int[] terminalFlags = { NODE_NORMAL };
// Get terminal symbol
int ret = terminal(terminalFlags);
// Or in flags from terminal symbol
flags[0] |= terminalFlags[0];
// Advance input, set NODE_NULLABLE flag and do sanity checks
if (idx >= len)
{
return ret;
}
boolean greedy = true;
char closureType = pattern.charAt(idx);
switch (closureType)
{
case '?':
case '*':
// The current node can be null
flags[0] |= NODE_NULLABLE;
case '+':
// Eat closure character
idx++;
case '{':
// Don't allow blantant stupidity
int opcode = instruction[ret + RE.offsetOpcode];
if (opcode == RE.OP_BOL || opcode == RE.OP_EOL)
{
syntaxError("Bad closure operand");
}
if ((terminalFlags[0] & NODE_NULLABLE) != 0)
{
syntaxError("Closure operand can't be nullable");
}
break;
}
// If the next character is a '?', make the closure non-greedy (reluctant)
if (idx < len && pattern.charAt(idx) == '?')
{
idx++;
greedy = false;
}
if (greedy)
{
// Actually do the closure now
switch (closureType)
{
case '{':
{
// We look for our bracket in the list
boolean found = false;
int i;
allocBrackets();
for (i = 0; i < brackets; i++)
{
if (bracketStart[i] == idx)
{
found = true;
break;
}
}
// If its not in the list we parse the {m,n}
if (!found)
{
if (brackets >= maxBrackets)
{
syntaxError("Too many bracketed closures (limit is 10)");
}
bracketStart[brackets] = idx;
bracket();
bracketEnd[brackets] = idx;
i = brackets++;
}
// If there's a min, rewind stream and reparse
if (--bracketMin[i] > 0)
{
// Rewind stream and run it through again
idx = idxBeforeTerminal;
break;
}
// Do the right thing for maximum ({m,})
if (bracketOpt[i] == bracketFinished)
{
// Drop through now and closure expression.
// We are done with the {m,} expr, so skip rest
closureType = '*';
bracketOpt[i] = 0;
idx = bracketEnd[i];
}
else
if (bracketOpt[i] == bracketUnbounded)
{
idx = idxBeforeTerminal;
bracketOpt[i] = bracketFinished;
break;
}
else
if (bracketOpt[i]-- > 0)
{
// Drop through to optionally close and then 'play it again sam!'
idx = idxBeforeTerminal;
closureType = '?';
}
else
{
// We are done. skip the rest of {m,n} expr
idx = bracketEnd[i];
break;
}
}
// Fall through!
case '?':
case '*':
if (!greedy)
{
break;
}
if (closureType == '?')
{
// X? is compiled as (X|)
nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); // inserted branch to option
int nothing = node (RE.OP_NOTHING, 0); // which is OP_NOTHING
setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
}
if (closureType == '*')
{
// X* is compiled as (X{gotoX}|)
nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option
setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
setNextOfEnd(ret + RE.nodeSize, ret); // the start again
setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
}
break;
case '+':
{
// X+ is compiled as X({gotoX}|)
int branch;
branch = node(RE.OP_BRANCH, 0); // a new branch
setNextOfEnd(ret, branch); // is added to the end of X
setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
}
break;
}
}
else
{
// Add end after closured subexpr
setNextOfEnd(ret, node(RE.OP_END, 0));
// Actually do the closure now
switch (closureType)
{
case '?':
nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
break;
case '*':
nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
break;
case '+':
nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
break;
}
// Point to the expr after the closure
setNextOfEnd(ret, lenInstruction);
}
return ret;
}
/**
* Compile one branch of an or operator (implements concatenation)
* @param flags Flags passed by reference
* @return Pointer to branch node
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int branch(int[] flags) throws RESyntaxException
{
// Get each possibly closured piece and concat
int node;
int ret = node(RE.OP_BRANCH, 0);
int chain = -1;
int[] closureFlags = new int[1];
boolean nullable = true;
while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')')
{
// Get new node
closureFlags[0] = NODE_NORMAL;
node = closure(closureFlags);
if (closureFlags[0] == NODE_NORMAL)
{
nullable = false;
}
// If there's a chain, append to the end
if (chain != -1)
{
setNextOfEnd(chain, node);
}
// Chain starts at current
chain = node;
}
// If we don't run loop, make a nothing node
if (chain == -1)
{
node(RE.OP_NOTHING, 0);
}
// Set nullable flag for this branch
if (nullable)
{
flags[0] |= NODE_NULLABLE;
}
return ret;
}
/**
* Compile an expression with possible parens around it. Paren matching
* is done at this level so we can tie the branch tails together.
* @param flags Flag value passed by reference
* @return Node index of expression in instruction array
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
*/
int expr(int[] flags) throws RESyntaxException
{
// Create open paren node unless we were called from the top level (which has no parens)
boolean paren = false;
int ret = -1;
int closeParens = parens;
if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(')
{
idx++;
paren = true;
ret = node(RE.OP_OPEN, parens++);
}
flags[0] &= ~NODE_TOPLEVEL;
// Create a branch node
int branch = branch(flags);
if (ret == -1)
{
ret = branch;
}
else
{
setNextOfEnd(ret, branch);
}
// Loop through branches
while (idx < len && pattern.charAt(idx) == '|')
{
idx++;
branch = branch(flags);
setNextOfEnd(ret, branch);
}
// Create an ending node (either a close paren or an OP_END)
int end;
if (paren)
{
if (idx < len && pattern.charAt(idx) == ')')
{
idx++;
}
else
{
syntaxError("Missing close paren");
}
end = node(RE.OP_CLOSE, closeParens);
}
else
{
end = node(RE.OP_END, 0);
}
// Append the ending node to the ret nodelist
setNextOfEnd(ret, end);
// Hook the ends of each branch to the end node
for (int next = -1, i = ret; next != 0; next = instruction[i + RE.offsetNext], i += next)
{
// If branch, make the end of the branch's operand chain point to the end node.
if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH)
{
setNextOfEnd(i + RE.nodeSize, end);
}
}
// Return the node list
return ret;
}
/**
* Compiles a regular expression pattern into a program runnable by the pattern
* matcher class 'RE'.
* @param pattern Regular expression pattern to compile (see RECompiler class
* for details).
* @return A compiled regular expression program.
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
* @see RECompiler
* @see RE
*/
public REProgram compile(String pattern) throws RESyntaxException
{
// Initialize variables for compilation
this.pattern = pattern; // Save pattern in instance variable
len = pattern.length(); // Precompute pattern length for speed
idx = 0; // Set parsing index to the first character
lenInstruction = 0; // Set emitted instruction count to zero
parens = 1; // Set paren level to 1 (the implicit outer parens)
brackets = 0; // No bracketed closures yet
// Initialize pass by reference flags value
int[] flags = { NODE_TOPLEVEL };
// Parse expression
expr(flags);
// Should be at end of input
if (idx != len)
{
if (pattern.charAt(idx) == ')')
{
syntaxError("Unmatched close paren");
}
syntaxError("Unexpected input remains");
}
// Return the result
char[] ins = new char[lenInstruction];
System.arraycopy(instruction, 0, ins, 0, lenInstruction);
return new REProgram(ins);
}
/**
* Local, nested class for maintaining character ranges for character classes.
*/
class RERange
{
int size = 16; // Capacity of current range arrays
int[] minRange = new int[size]; // Range minima
int[] maxRange = new int[size]; // Range maxima
int num = 0; // Number of range array elements in use
/**
* Deletes the range at a given index from the range lists
* @param index Index of range to delete from minRange and maxRange arrays.
*/
void delete(int index)
{
// Return if no elements left or index is out of range
if (num == 0 || index >= num)
{
return;
}
// Move elements down
while (index++ < num)
{
if (index - 1 >= 0)
{
minRange[index-1] = minRange[index];
maxRange[index-1] = maxRange[index];
}
}
// One less element now
num--;
}
/**
* Merges a range into the range list, coalescing ranges if possible.
* @param min Minimum end of range
* @param max Maximum end of range
*/
void merge(int min, int max)
{
// Loop through ranges
for (int i = 0; i < num; i++)
{
// Min-max is subsumed by minRange[i]-maxRange[i]
if (min >= minRange[i] && max <= maxRange[i])
{
return;
}
// Min-max subsumes minRange[i]-maxRange[i]
else if (min <= minRange[i] && max >= maxRange[i])
{
delete(i);
merge(min, max);
return;
}
// Min is in the range, but max is outside
else if (min >= minRange[i] && min <= maxRange[i])
{
delete(i);
min = minRange[i];
merge(min, max);
return;
}
// Max is in the range, but min is outside
else if (max >= minRange[i] && max <= maxRange[i])
{
delete(i);
max = maxRange[i];
merge(min, max);
return;
}
}
// Must not overlap any other ranges
if (num >= size)
{
size *= 2;
int[] newMin = new int[size];
int[] newMax = new int[size];
System.arraycopy(minRange, 0, newMin, 0, num);
System.arraycopy(maxRange, 0, newMax, 0, num);
minRange = newMin;
maxRange = newMax;
}
minRange[num] = min;
maxRange[num] = max;
num++;
}
/**
* Removes a range by deleting or shrinking all other ranges
* @param min Minimum end of range
* @param max Maximum end of range
*/
void remove(int min, int max)
{
// Loop through ranges
for (int i = 0; i < num; i++)
{
// minRange[i]-maxRange[i] is subsumed by min-max
if (minRange[i] >= min && maxRange[i] <= max)
{
delete(i);
i--;
return;
}
// min-max is subsumed by minRange[i]-maxRange[i]
else if (min >= minRange[i] && max <= maxRange[i])
{
int minr = minRange[i];
int maxr = maxRange[i];
delete(i);
if (minr < min - 1)
{
merge(minr, min - 1);
}
if (max + 1 < maxr)
{
merge(max + 1, maxr);
}
return;
}
// minRange is in the range, but maxRange is outside
else if (minRange[i] >= min && minRange[i] <= max)
{
minRange[i] = max + 1;
return;
}
// maxRange is in the range, but minRange is outside
else if (maxRange[i] >= min && maxRange[i] <= max)
{
maxRange[i] = min - 1;
return;
}
}
}
/**
* Includes (or excludes) the range from min to max, inclusive.
* @param min Minimum end of range
* @param max Maximum end of range
* @param include True if range should be included. False otherwise.
*/
void include(int min, int max, boolean include)
{
if (include)
{
merge(min, max);
}
else
{
remove(min, max);
}
}
/**
* Includes a range with the same min and max
* @param minmax Minimum and maximum end of range (inclusive)
* @param include True if range should be included. False otherwise.
*/
void include(char minmax, boolean include)
{
include(minmax, minmax, include);
}
}
}