Parser.java
package com.github.dakusui.jcunitx.regex;
import com.github.dakusui.jcunitx.exceptions.InvalidTestException;
import com.github.dakusui.jcunitx.utils.StringUtils;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.github.dakusui.jcunitx.regex.Parser.Type.ALT;
import static com.github.dakusui.jcunitx.regex.Parser.Type.CAT;
import static com.github.dakusui.jcunitx.utils.Checks.checkcond;
import static com.github.dakusui.jcunitx.utils.Checks.checknotnull;
import static java.lang.String.format;
import static java.util.Arrays.asList;
/**
* A parser that interprets a regular expression and generates a `Expr` object, in which
* the regular expression's structure is stored.
*/
public class Parser {
private static final Pattern QUANTIFIER_PATTERN = Pattern.compile("^\\{([0-9]+),([0-9]+)}");
/*
* We can implement the same mechanism by considering a white space a 'concatenation operator',
* but it increases number of factors and constraints generated. And it results
* in poorer performance.
* Instead, treat white spaces within a word just as part of the word, and after reverse
* regex generation finishes, JCUnit will split into pieces.
*
* See RegexComposer, too.
*/
private static final Pattern LEAF_PATTERN = Pattern.compile("^\\s*[A-Za-z_]([A-Za-z_0-9 ]*[A-Za-z_0-9])?");
private final Expr.Factory exprFactory;
public Parser() {
this.exprFactory = new Expr.Factory();
}
public static List<String> preprocess(String input) {
List<String> ret = new LinkedList<>();
List<String> read = new LinkedList<>();
preprocess(read, ret, tokenizer(input), true);
return ret;
}
@SuppressWarnings("ConstantConditions")
private static void preprocess(List<String> read, List<String> output, Iterator<String> input, boolean topLevel) {
Type type = null;
List<String> work = new LinkedList<>();
try {
PreprocessingState state = PreprocessingState.I;
while (input.hasNext() && state != PreprocessingState.T) {
String cur = input.next();
read.add(cur);
SymbolType symbolType = SymbolType.determine(cur);
switch (state) {
case I:
switch (symbolType) {
case OPEN:
preprocess(read, work, input, false);
state = PreprocessingState.I_T;
break;
case WORD:
work.add(cur);
state = PreprocessingState.I_T;
break;
default:
throw syntaxError(cur, read);
}
break;
case I_T:
switch (symbolType) {
case CHOICE:
state = PreprocessingState.ALT_I;
type = ALT;
break;
case WORD:
state = PreprocessingState.CAT_T;
type = CAT;
work.add(cur);
break;
case OPEN:
state = PreprocessingState.CAT_T;
type = CAT;
preprocess(read, work, input, false);
break;
case CLOSE:
if (!topLevel) {
state = PreprocessingState.T;
break;
}
default:
throw syntaxError(cur, read);
}
break;
case ALT_I:
switch (symbolType) {
case OPEN:
preprocess(read, work, input, false);
state = PreprocessingState.ALT_T;
break;
case WORD:
work.add(cur);
state = PreprocessingState.ALT_T;
break;
default:
throw syntaxError(cur, read);
}
break;
case ALT_T:
switch (symbolType) {
case CHOICE:
state = PreprocessingState.ALT_I;
break;
case CLOSE:
state = PreprocessingState.T;
break;
default:
throw syntaxError(cur, read);
}
break;
case CAT_T:
switch (symbolType) {
case OPEN:
// Re-assigning the same value for the clarity's sake.
state = PreprocessingState.CAT_T;
preprocess(read, work, input, false);
break;
case WORD:
work.add(cur);
break;
case CLOSE:
state = PreprocessingState.T;
break;
default:
throw syntaxError(cur, read);
}
break;
case CAT_R:
if (symbolType == SymbolType.CLOSE) {
state = PreprocessingState.CAT_T;
} else {
throw syntaxError(cur, read);
}
break;
case T:
throw syntaxError(cur, read);
}
}
if (topLevel && input.hasNext()) {
throw syntaxError(input.next(), work);
}
if (!asList(PreprocessingState.I_T, PreprocessingState.T, PreprocessingState.ALT_T, PreprocessingState.CAT_T).contains(state)) {
throw inputShouldNotEndHere(state);
}
} finally {
work.add(0, (type == null ? CAT : type).toString());
work.add(")");
output.addAll(work);
}
}
private static RuntimeException syntaxError(String token, List<String> work) {
throw new InvalidTestException(
format(
"token '%s' should not come after: '%s'",
token,
StringUtils.join("", work.subList(0, Math.max(0, work.size() - 1))))
);
}
private static RuntimeException inputShouldNotEndHere(PreprocessingState state) {
throw new InvalidTestException(format("Input should not end here: '%s'", state));
}
private static Iterator<String> tokenizer(final String input) {
return new Iterator<String>() {
String[] nextToken = nextToken(input);
@Override
public boolean hasNext() {
return nextToken != null;
}
@Override
public String next() {
if (!hasNext())
throw new NoSuchElementException();
try {
return nextToken[0];
} finally {
nextToken = nextToken(nextToken[1]);
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
private static String[] nextToken(String input) {
if (input.isEmpty())
return null;
char first = input.charAt(0);
if (first == '(' || first == ')' || first == '|') {
return new String[] { input.substring(0, 1), input.substring(1) };
}
{
Matcher m = LEAF_PATTERN.matcher(input);
if (m.find()) {
/*
String matchedPart = m.group(0);
String work = matchedPart.contains(" ") ?
matchedPart.substring(0, matchedPart.indexOf(" ")) :
matchedPart;
return new String[] { work, input.substring(work.length()).trim() };
*/
//return new String[] { m.group(0), input.substring(m.group(0).length()).trim() };
//String matchedPart = m.group(0);
if (m.group(0).contains(" ")) {
String rest = input.replaceFirst("^[^ \t]*\\s+", "");
String head = input.substring(0, input.length() - rest.length());
return new String[] { head.trim(), rest };
}
return new String[] { m.group(0), input.substring(m.group(0).length()) };
}
}
{
Matcher m = QUANTIFIER_PATTERN.matcher(input);
if (m.find()) {
return new String[] { m.group(0), input.substring(m.group(0).length()) };
}
}
throw new InvalidTestException(format("Syntax error: Unparsable: '%s' did neither match '%s' nor '%s'", input, LEAF_PATTERN, QUANTIFIER_PATTERN));
}
private static String head(List<String> tokens) {
if (tokens.isEmpty()) {
return null;
}
return tokens.get(0);
}
private static List<String> tail(List<String> tokens) {
return tokens.subList(1, tokens.size());
}
public Expr parse(String regex) {
return parse(preprocess(regex));
}
private Expr parse(List<String> tokens) {
Context result = readTerm(tokens);
checkcond(result.tokens.isEmpty(), "Syntax error: unparsed=%s", result.tokens);
return result.expr;
}
private Context readTerm(List<String> tokens) {
String head = head(tokens);
Context ret;
if (ALT.asString().equals(head)) {
ret = readAlt(tail(tokens));
} else if (CAT.asString().equals(head)) {
ret = readCat(tail(tokens));
} else {
ret = readLeaf(tokens);
}
String nextHead;
if (ret.tokens != null && (nextHead = head(ret.tokens)) != null) {
Matcher m;
if ((m = QUANTIFIER_PATTERN.matcher(nextHead)).find()) {
ret = new Context(
this.exprFactory.rep(ret.expr, Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2))),
tail(ret.tokens));
}
}
return ret;
}
private Context readLeaf(List<String> tokens) {
String head = head(tokens);
if (head == null)
return Context.LAST;
return new Context(this.exprFactory.leaf(head), tail(tokens));
}
private Context readAlt(List<String> tokens) {
List<Expr> work = new LinkedList<>();
for (Context context = readTerm(tokens); context.hasNext(); context = readTerm(tokens)) {
work.add(context.expr);
tokens = context.tokens;
if (")".equals(head(tokens))) {
tokens = tail(tokens);
break;
}
}
if (work.size() == 1) {
return new Context(work.get(0), tokens);
}
return new Context(this.exprFactory.alt(work), tokens);
}
private Context readCat(List<String> tokens) {
List<Expr> work = new LinkedList<>();
for (Context context = readTerm(tokens); context.hasNext(); context = readTerm(tokens)) {
work.add(context.expr);
tokens = context.tokens;
if (")".equals(head(tokens))) {
tokens = tail(tokens);
break;
}
}
return new Context(this.exprFactory.cat(work), tokens);
}
enum Type {
CAT("*"),
/**
* Alternative
*/
ALT("+"),
;
final String value;
Type(String value) {
this.value = value;
}
public String asString() {
return "(" + this.value;
}
public String toString() {
return asString();
}
}
private enum SymbolType {
OPEN,
WORD,
CHOICE,
CLOSE;
static SymbolType determine(String cur) {
checknotnull(cur);
if ("(".equals(cur)) {
return OPEN;
} else if ("|".equals(cur)) {
return CHOICE;
} else if (")".equals(cur)) {
return CLOSE;
}
return WORD;
}
}
private enum PreprocessingState {
I, I_R, I_T,
ALT_I, ALT_R, ALT_T,
CAT_R, CAT_T,
T
}
static class Context {
static final Context LAST = new Context(null, null);
final Expr expr;
final List<String> tokens;
Context(Expr expr, List<String> tokens) {
this.expr = expr;
this.tokens = tokens;
}
boolean hasNext() {
return this.expr != null;
}
}
}