Mercurial > projects > ldc

diff lphobos/std/regexp.d @ 473:373489eeaf90
Applied downs' lphobos update
author: Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date: Mon, 04 Aug 2008 19:28:49 +0200
children: 88e23f8c2354
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lphobos/std/regexp.d	Mon Aug 04 19:28:49 2008 +0200
@@ -0,0 +1,3208 @@
+
+// Regular Expressions
+
+/*
+ *  Copyright (C) 2000-2005 by Digital Mars, www.digitalmars.com
+ *  Written by Walter Bright
+ *
+ *  This software is provided 'as-is', without any express or implied
+ *  warranty. In no event will the authors be held liable for any damages
+ *  arising from the use of this software.
+ *
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *
+ *  o  The origin of this software must not be misrepresented; you must not
+ *     claim that you wrote the original software. If you use this software
+ *     in a product, an acknowledgment in the product documentation would be
+ *     appreciated but is not required.
+ *  o  Altered source versions must be plainly marked as such, and must not
+ *     be misrepresented as being the original software.
+ *  o  This notice may not be removed or altered from any source
+ *     distribution.
+ */
+
+/* NOTE: This file has been patched from the original DMD distribution to
+   work with the GDC compiler.
+
+   Modified by David Friedman, September 2004
+*/
+
+/**********************************************
+ * $(LINK2 http://www.digitalmars.com/ctg/regular.html, Regular expressions)
+ * are a powerful method of string pattern matching.
+ * The regular expression
+ * language used is the same as that commonly used, however, some of the very
+ * advanced forms may behave slightly differently.
+ *
+ * std.regexp is designed to work only with valid UTF strings as input.
+ * To validate untrusted input, use std.utf.validate().
+ *
+ * In the following guide, $(I pattern)[] refers to a
+ * $(LINK2 http://www.digitalmars.com/ctg/regular.html, regular expression).
+ * The $(I attributes)[] refers to
+	a string controlling the interpretation
+	of the regular expression.
+	It consists of a sequence of one or more
+	of the following characters:
+
+	<table border=1 cellspacing=0 cellpadding=5>
+	<caption>Attribute Characters</caption>
+	$(TR $(TH Attribute) $(TH Action))
+	<tr>
+	$(TD $(B g))
+	$(TD global; repeat over the whole input string)
+	</tr>
+	<tr>
+	$(TD $(B i))
+	$(TD case insensitive)
+	</tr>
+	<tr>
+	$(TD $(B m))
+	$(TD treat as multiple lines separated by newlines)
+	</tr>
+	</table>
+ *
+ * The $(I format)[] string has the formatting characters:
+ *
+ *	<table border=1 cellspacing=0 cellpadding=5>
+	<caption>Formatting Characters</caption>
+	$(TR $(TH Format) $(TH Replaced With))
+	$(TR
+	$(TD $(B $$))	$(TD $)
+	)
+	$(TR
+	$(TD $(B $&))	$(TD The matched substring.)
+	)
+	$(TR
+	$(TD $(B $`))	$(TD The portion of string that precedes the matched substring.)
+	)
+	$(TR
+	$(TD $(B $'))	$(TD The portion of string that follows the matched substring.)
+	)
+	$(TR
+	$(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n)
+			is a single digit 1-9
+			and $$(I n) is not followed by a decimal digit.)
+	)
+	$(TR
+	$(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn)
+			is a two-digit decimal
+			number 01-99.
+			If $(I nn)th capture is undefined or more than the number
+			of parenthesized subexpressions, use the empty
+			string instead.)
+	)
+	</table>
+
+ *	Any other $ are left as is.
+ *
+ * References:
+ *	$(LINK2 http://en.wikipedia.org/wiki/Regular_expressions, Wikipedia)
+ * Macros:
+ *	WIKI = StdRegexp
+ *	DOLLAR = $
+ */
+
+/*
+	Escape sequences:
+
+	\nnn starts out a 1, 2 or 3 digit octal sequence,
+	where n is an octal digit. If nnn is larger than
+	0377, then the 3rd digit is not part of the sequence
+	and is not consumed.
+	For maximal portability, use exactly 3 digits.
+
+	\xXX starts out a 1 or 2 digit hex sequence. X
+	is a hex character. If the first character after the \x
+	is not a hex character, the value of the sequence is 'x'
+	and the XX are not consumed.
+	For maximal portability, use exactly 2 digits.
+
+	\uUUUU is a unicode sequence. There are exactly
+	4 hex characters after the \u, if any are not, then
+	the value of the sequence is 'u', and the UUUU are not
+	consumed.
+
+	Character classes:
+
+	[a-b], where a is greater than b, will produce
+	an error.
+
+	References:
+
+	http://www.unicode.org/unicode/reports/tr18/
+ */
+
+module std.regexp;
+
+//debug = regexp;		// uncomment to turn on debugging printf's
+
+private
+{
+    import std.c.stdio;
+    import std.c.stdlib;
+    import std.c.string;
+    import std.stdio;
+    import std.string;
+    import std.ctype;
+    import std.outbuffer;
+    import std.bitarray;
+    import std.utf;
+    import std.intrinsic;
+}
+
+/** Regular expression to extract an _email address */
+const char[] email =
+    r"[a-zA-Z]([.]?([[a-zA-Z0-9_]-]+)*)?@([[a-zA-Z0-9_]\-_]+\.)+[a-zA-Z]{2,6}";
+
+/** Regular expression to extract a _url */
+const char[] url = r"(([h|H][t|T]|[f|F])[t|T][p|P]([s|S]?)\:\/\/|~/|/)?([\w]+:\w+@)?(([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?)?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?([,]\w+)*((\?\w+=\w+)?(&\w+=\w+)*([,]\w*)*)?";
+
+/************************************
+ * One of these gets thrown on compilation errors
+ */
+
+class RegExpException : Exception
+{
+    this(char[] msg)
+    {
+	super(msg);
+    }
+}
+
+struct regmatch_t
+{
+    int rm_so;			// index of start of match
+    int rm_eo;			// index past end of match
+}
+
+private alias char rchar;	// so we can make a wchar version
+
+/******************************************************
+ * Search string for matches with regular expression
+ * pattern with attributes.
+ * Replace each match with string generated from format.
+ * Params:
+ *	string = String to search.
+ *	pattern = Regular expression pattern.
+ *	format = Replacement string format.
+ *	attributes = Regular expression attributes.
+ * Returns:
+ *	the resulting string
+ * Example:
+ *	Replace the letters 'a' with the letters 'ZZ'.
+ * ---
+ * s = "Strap a rocket engine on a chicken."
+ * sub(s, "a", "ZZ")        // result: StrZZp a rocket engine on a chicken.
+ * sub(s, "a", "ZZ", "g")   // result: StrZZp ZZ rocket engine on ZZ chicken.
+ * ---
+ *	The replacement format can reference the matches using
+ *	the $&amp;, $$, $', $`, $0 .. $99 notation:
+ * ---
+ * sub(s, "[ar]", "[$&]", "g") // result: St[r][a]p [a] [r]ocket engine on [a] chi
+ * ---
+ */
+
+char[] sub(char[] string, char[] pattern, char[] format, char[] attributes = null)
+{
+    auto r = new RegExp(pattern, attributes);
+    auto result = r.replace(string, format);
+    delete r;
+    return result;
+}
+
+unittest
+{
+    debug(regexp) printf("regexp.sub.unittest\n");
+
+    char[] r = sub("hello", "ll", "ss");
+    assert(r == "hesso");
+}
+
+/*******************************************************
+ * Search string for matches with regular expression
+ * pattern with attributes.
+ * Pass each match to delegate dg.
+ * Replace each match with the return value from dg.
+ * Params:
+ *	string = String to search.
+ *	pattern = Regular expression pattern.
+ *	dg = Delegate
+ *	attributes = Regular expression attributes.
+ * Returns: the resulting string.
+ * Example:
+ * Capitalize the letters 'a' and 'r':
+ * ---
+ * s = "Strap a rocket engine on a chicken.";
+ * sub(s, "[ar]",
+ *    delegate char[] (RegExp m)
+ *    {
+ *         return toupper(m.match(0));
+ *    },
+ *    "g");    // result: StRAp A Rocket engine on A chicken.
+ * ---
+ */
+
+char[] sub(char[] string, char[] pattern, char[] delegate(RegExp) dg, char[] attributes = null)
+{
+    auto r = new RegExp(pattern, attributes);
+    rchar[] result;
+    int lastindex;
+    int offset;
+
+    result = string;
+    lastindex = 0;
+    offset = 0;
+    while (r.test(string, lastindex))
+    {
+	int so = r.pmatch[0].rm_so;
+	int eo = r.pmatch[0].rm_eo;
+
+	rchar[] replacement = dg(r);
+
+	// Optimize by using std.string.replace if possible - Dave Fladebo
+	rchar[] slice = result[offset + so .. offset + eo];
+	if (r.attributes & RegExp.REA.global &&		// global, so replace all
+	    !(r.attributes & RegExp.REA.ignoreCase) &&	// not ignoring case
+	    !(r.attributes & RegExp.REA.multiline) &&	// not multiline
+	    pattern == slice)				// simple pattern (exact match, no special characters) 
+	{
+	    debug(regexp)
+		printf("pattern: %.*s, slice: %.*s, replacement: %.*s\n",
+		    cast(int) pattern.length, pattern.ptr,
+		    cast(int) (eo-so), result.ptr + offset,
+		    cast(int) replacement.length, replacement.ptr);
+	    result = std.string.replace(result,slice,replacement);
+	    break;
+	}
+
+	result = replaceSlice(result, result[offset + so .. offset + eo], replacement);
+
+	if (r.attributes & RegExp.REA.global)
+	{
+	    offset += replacement.length - (eo - so);
+
+	    if (lastindex == eo)
+		lastindex++;		// always consume some source
+	    else
+		lastindex = eo;
+	}
+	else
+	    break;
+    }
+    delete r;
+
+    return result;
+}
+
+unittest
+{
+    debug(regexp) printf("regexp.sub.unittest\n");
+
+    char[] foo(RegExp r) { return "ss"; }
+
+    char[] r = sub("hello", "ll", delegate char[](RegExp r) { return "ss"; });
+    assert(r == "hesso");
+
+    r = sub("hello", "l", delegate char[](RegExp r) { return "l"; }, "g");
+    assert(r == "hello");
+
+    auto s = sub("Strap a rocket engine on a chicken.",
+		 "[ar]",
+	         delegate char[] (RegExp m)
+	         {
+		    return std.string.toupper(m.match(0));
+	         },
+	         "g");
+    assert(s == "StRAp A Rocket engine on A chicken.");
+}
+
+
+/*************************************************
+ * Search string[] for first match with pattern[] with attributes[].
+ * Params:
+ *	string = String to search.
+ *	pattern = Regular expression pattern.
+ *	attributes = Regular expression attributes.
+ * Returns:
+ *	index into string[] of match if found, -1 if no match.
+ * Example:
+ * ---
+ * auto s = "abcabcabab";
+ * std.regexp.find(s, "b");    // match, returns 1
+ * std.regexp.find(s, "f");    // no match, returns -1
+ * ---
+ */
+
+int find(rchar[] string, char[] pattern, char[] attributes = null)
+{
+    int i = -1;
+
+    auto r = new RegExp(pattern, attributes);
+    if (r.test(string))
+    {
+	i = r.pmatch[0].rm_so;
+    }
+    delete r;
+    return i;
+}
+
+unittest
+{
+    debug(regexp) printf("regexp.find.unittest\n");
+
+    int i;
+    i = find("xabcy", "abc");
+    assert(i == 1);
+    i = find("cba", "abc");
+    assert(i == -1);
+}
+
+
+
+/*************************************************
+ * Search string[] for last match with pattern[] with attributes[].
+ * Params:
+ *	string = String to search.
+ *	pattern = Regular expression pattern.
+ *	attributes = Regular expression attributes.
+ * Returns:
+ *	index into string[] of match if found, -1 if no match.
+ * Example:
+ * ---
+ * auto s = "abcabcabab";
+ * std.regexp.find(s, "b");    // match, returns 9
+ * std.regexp.find(s, "f");    // no match, returns -1
+ * ---
+ */
+
+int rfind(rchar[] string, char[] pattern, char[] attributes = null)
+{
+    int i = -1;
+    int lastindex = 0;
+
+    auto r = new RegExp(pattern, attributes);
+    while (r.test(string, lastindex))
+    {   int eo = r.pmatch[0].rm_eo;
+	i = r.pmatch[0].rm_so;
+	if (lastindex == eo)
+	    lastindex++;		// always consume some source
+	else
+	    lastindex = eo;
+    }
+    delete r;
+    return i;
+}
+
+unittest
+{
+    int i;
+
+    debug(regexp) printf("regexp.rfind.unittest\n");
+    i = rfind("abcdefcdef", "c");
+    assert(i == 6);
+    i = rfind("abcdefcdef", "cd");
+    assert(i == 6);
+    i = rfind("abcdefcdef", "x");
+    assert(i == -1);
+    i = rfind("abcdefcdef", "xy");
+    assert(i == -1);
+    i = rfind("abcdefcdef", "");
+    assert(i == 10);
+}
+
+
+/********************************************
+ * Split string[] into an array of strings, using the regular
+ * expression pattern[] with attributes[] as the separator.
+ * Params:
+ *	string = String to search.
+ *	pattern = Regular expression pattern.
+ *	attributes = Regular expression attributes.
+ * Returns:
+ * 	array of slices into string[]
+ * Example:
+ * ---
+ * foreach (s; split("abcabcabab", "C.", "i"))
+ * {
+ *     writefln("s = '%s'", s);
+ * }
+ * // Prints:
+ * // s = 'ab'
+ * // s = 'b'
+ * // s = 'bab'
+ * ---
+ */
+
+char[][] split(char[] string, char[] pattern, char[] attributes = null)
+{
+    auto r = new RegExp(pattern, attributes);
+    auto result = r.split(string);
+    delete r;
+    return result;
+}
+
+unittest
+{
+    debug(regexp) printf("regexp.split.unittest()\n");
+    char[][] result;
+
+    result = split("ab", "a*");
+    assert(result.length == 2);
+    assert(result[0] == "");
+    assert(result[1] == "b");
+
+    foreach (i, s; split("abcabcabab", "C.", "i"))
+    {
+	writefln("s[%d] = '%s'", i, s);
+	if (i == 0) assert(s == "ab");
+	else if (i == 1) assert(s == "b");
+	else if (i == 2) assert(s == "bab");
+	else assert(0);
+    }
+}
+
+/****************************************************
+ * Search string[] for first match with pattern[] with attributes[].
+ * Params:
+ *	string = String to search.
+ *	pattern = Regular expression pattern.
+ *	attributes = Regular expression attributes.
+ * Returns:
+ *	corresponding RegExp if found, null if not.
+ * Example:
+ * ---
+ * import std.stdio;
+ * import std.regexp;
+ *
+ * void main()
+ * {
+ *     if (auto m = std.regexp.search("abcdef", "c"))
+ *     {
+ *         writefln("%s[%s]%s", m.pre, m.match(0), m.post);
+ *     }
+ * }
+ * // Prints:
+ * // ab[c]def
+ * ---
+ */
+
+RegExp search(char[] string, char[] pattern, char[] attributes = null)
+{
+    auto r = new RegExp(pattern, attributes);
+
+    if (r.test(string))
+    {
+    }
+    else
+    {	delete r;
+	r = null;
+    }
+    return r;
+}
+
+unittest
+{
+    debug(regexp) printf("regexp.string.unittest()\n");
+
+    if (auto m = std.regexp.search("abcdef", "c()"))
+    {
+	auto result = std.string.format("%s[%s]%s", m.pre, m.match(0), m.post);
+	assert(result == "ab[c]def");
+	assert(m.match(1) == null);
+	assert(m.match(2) == null);
+    }
+    else
+	assert(0);
+
+    if (auto n = std.regexp.search("abcdef", "g"))
+    {
+	assert(0);
+    }
+}
+
+
+/* ********************************* RegExp ******************************** */
+
+/*****************************
+ * RegExp is a class to handle regular expressions.
+ *
+ * It is the core foundation for adding powerful string pattern matching
+ * capabilities to programs like grep, text editors, awk, sed, etc.
+ */
+class RegExp
+{
+    /*****
+     * Construct a RegExp object. Compile pattern
+     * with <i>attributes</i> into
+     * an internal form for fast execution.
+     * Params:
+     *	pattern = regular expression
+     *  attributes = _attributes
+     * Throws: RegExpException if there are any compilation errors.
+     * Example:
+     *  Declare two variables and assign to them a RegExp object:
+     * ---
+     * auto r = new RegExp("pattern");
+     * auto s = new RegExp(r"p[1-5]\s*");
+     * ---
+     */
+    public this(rchar[] pattern, rchar[] attributes = null)
+    {
+	pmatch = (&gmatch)[0 .. 1];
+	compile(pattern, attributes);
+    }
+
+    /*****
+     * Generate instance of RegExp.
+     * Params:
+     *	pattern = regular expression
+     *  attributes = _attributes
+     * Throws: RegExpException if there are any compilation errors.
+     * Example:
+     *  Declare two variables and assign to them a RegExp object:
+     * ---
+     * auto r = RegExp("pattern");
+     * auto s = RegExp(r"p[1-5]\s*");
+     * ---
+     */
+    public static RegExp opCall(rchar[] pattern, rchar[] attributes = null)
+    {
+	return new RegExp(pattern, attributes);
+    }
+
+    unittest
+    {
+	debug(regexp) printf("regexp.opCall.unittest()\n");
+	auto r1 = RegExp("hello", "m");
+	char[] msg;
+	try
+	{
+	    auto r2 = RegExp("hello", "q");
+	    assert(0);
+	}
+	catch (RegExpException ree)
+	{
+	    msg = ree.toString();
+	    //writefln("message: %s", ree);
+	}
+	assert(msg == "unrecognized attribute");
+    }
+
+    /************************************
+     * Set up for start of foreach loop.
+     * Returns:
+     *	search() returns instance of RegExp set up to _search string[].
+     * Example:
+     * ---
+     * import std.stdio;
+     * import std.regexp;
+     *
+     * void main()
+     * {
+     *     foreach(m; RegExp("ab").search("abcabcabab"))
+     *     {
+     *         writefln("%s[%s]%s", m.pre, m.match(0), m.post);
+     *     }
+     * }
+     * // Prints:
+     * // [ab]cabcabab
+     * // abc[ab]cabab
+     * // abcabc[ab]ab
+     * // abcabcab[ab]
+     * ---
+     */
+
+    public RegExp search(rchar[] string)
+    {
+	input = string;
+	pmatch[0].rm_eo = 0;
+	return this;
+    }
+
+    /** ditto */
+    public int opApply(int delegate(inout RegExp) dg)
+    {
+	int result;
+	RegExp r = this;
+
+	while (test())
+	{
+	    result = dg(r);
+	    if (result)
+		break;
+	}
+
+	return result;
+    }
+
+    unittest
+    {
+	debug(regexp) printf("regexp.search.unittest()\n");
+
+	int i;
+	foreach(m; RegExp("ab").search("abcabcabab"))
+	{
+	    auto s = std.string.format("%s[%s]%s", m.pre, m.match(0), m.post);
+	    if (i == 0) assert(s == "[ab]cabcabab");
+	    else if (i == 1) assert(s == "abc[ab]cabab");
+	    else if (i == 2) assert(s == "abcabc[ab]ab");
+	    else if (i == 3) assert(s == "abcabcab[ab]");
+	    else assert(0);
+	    i++;
+	}
+    }
+
+    /******************
+     * Retrieve match n.
+     *
+     * n==0 means the matched substring, n>0 means the
+     * n'th parenthesized subexpression.
+     * if n is larger than the number of parenthesized subexpressions,
+     * null is returned.
+     */
+    public char[] match(size_t n)
+    {
+	if (n >= pmatch.length)
+	    return null;
+	else
+	{   size_t rm_so, rm_eo;
+	    rm_so = pmatch[n].rm_so;
+	    rm_eo = pmatch[n].rm_eo;
+	    if (rm_so == rm_eo)
+		return null;
+	    return input[rm_so .. rm_eo];
+	}
+    }
+
+    /*******************
+     * Return the slice of the input that precedes the matched substring.
+     */
+    public char[] pre()
+    {
+	return input[0 .. pmatch[0].rm_so];
+    }
+
+    /*******************
+     * Return the slice of the input that follows the matched substring.
+     */
+    public char[] post()
+    {
+	return input[pmatch[0].rm_eo .. $];
+    }
+
+    uint re_nsub;		// number of parenthesized subexpression matches
+    regmatch_t[] pmatch;	// array [re_nsub + 1]
+
+    rchar[] input;		// the string to search
+
+    // per instance:
+
+    rchar[] pattern;		// source text of the regular expression
+
+    rchar[] flags;		// source text of the attributes parameter
+
+    int errors;
+
+    uint attributes;
+
+    enum REA
+    {
+	global		= 1,	// has the g attribute
+	ignoreCase	= 2,	// has the i attribute
+	multiline	= 4,	// if treat as multiple lines separated
+				// by newlines, or as a single line
+	dotmatchlf	= 8,	// if . matches \n
+    }
+
+
+private:
+    size_t src;			// current source index in input[]
+    size_t src_start;		// starting index for match in input[]
+    size_t p;			// position of parser in pattern[]
+    regmatch_t gmatch;		// match for the entire regular expression
+				// (serves as storage for pmatch[0])
+
+    ubyte[] program;		// pattern[] compiled into regular expression program
+    OutBuffer buf;
+
+
+
+
+/******************************************/
+
+// Opcodes
+
+enum : ubyte
+{
+    REend,		// end of program
+    REchar,		// single character
+    REichar,		// single character, case insensitive
+    REdchar,		// single UCS character
+    REidchar,		// single wide character, case insensitive
+    REanychar,		// any character
+    REanystar,		// ".*"
+    REstring,		// string of characters
+    REistring,		// string of characters, case insensitive
+    REtestbit,		// any in bitmap, non-consuming
+    REbit,		// any in the bit map
+    REnotbit,		// any not in the bit map
+    RErange,		// any in the string
+    REnotrange,		// any not in the string
+    REor,		// a | b
+    REplus,		// 1 or more
+    REstar,		// 0 or more
+    REquest,		// 0 or 1
+    REnm,		// n..m
+    REnmq,		// n..m, non-greedy version
+    REbol,		// beginning of line
+    REeol,		// end of line
+    REparen,		// parenthesized subexpression
+    REgoto,		// goto offset
+
+    REwordboundary,
+    REnotwordboundary,
+    REdigit,
+    REnotdigit,
+    REspace,
+    REnotspace,
+    REword,
+    REnotword,
+    REbackref,
+};
+
+// BUG: should this include '$'?
+private int isword(dchar c) { return isalnum(c) || c == '_'; }
+
+private uint inf = ~0u;
+
+/* ********************************
+ * Throws RegExpException on error
+ */
+
+public void compile(rchar[] pattern, rchar[] attributes)
+{
+    //printf("RegExp.compile('%.*s', '%.*s')\n", pattern, attributes);
+
+    this.attributes = 0;
+    foreach (rchar c; attributes)
+    {   REA att;
+
+	switch (c)
+	{
+	    case 'g': att = REA.global;		break;
+	    case 'i': att = REA.ignoreCase;	break;
+	    case 'm': att = REA.multiline;	break;
+	    default:
+		error("unrecognized attribute");
+		return;
+	}
+	if (this.attributes & att)
+	{   error("redundant attribute");
+	    return;
+	}
+	this.attributes |= att;
+    }
+
+    input = null;
+
+    this.pattern = pattern;
+    this.flags = attributes;
+
+    uint oldre_nsub = re_nsub;
+    re_nsub = 0;
+    errors = 0;
+
+    buf = new OutBuffer();
+    buf.reserve(pattern.length * 8);
+    p = 0;
+    parseRegexp();
+    if (p < pattern.length)
+    {	error("unmatched ')'");
+    }
+    optimize();
+    program = buf.data;
+    buf.data = null;
+    delete buf;
+
+    if (re_nsub > oldre_nsub)
+    {
+	if (pmatch.ptr is &gmatch)
+	    pmatch = null;
+	pmatch.length = re_nsub + 1;
+    }
+    pmatch[0].rm_so = 0;
+    pmatch[0].rm_eo = 0;
+}
+
+/********************************************
+ * Split string[] into an array of strings, using the regular
+ * expression as the separator.
+ * Returns:
+ * 	array of slices into string[]
+ */
+
+public rchar[][] split(rchar[] string)
+{
+    debug(regexp) printf("regexp.split()\n");
+
+    rchar[][] result;
+
+    if (string.length)
+    {
+	int p = 0;
+	int q;
+	for (q = p; q != string.length;)
+	{
+	    if (test(string, q))
+	    {	int e;
+
+		q = pmatch[0].rm_so;
+		e = pmatch[0].rm_eo;
+		if (e != p)
+		{
+		    result ~= string[p .. q];
+		    for (int i = 1; i < pmatch.length; i++)
+		    {
+			int so = pmatch[i].rm_so;
+			int eo = pmatch[i].rm_eo;
+			if (so == eo)
+			{   so = 0;	// -1 gives array bounds error
+			    eo = 0;
+			}
+			result ~= string[so .. eo];
+		    }
+		    q = p = e;
+		    continue;
+		}
+	    }
+	    q++;
+	}
+	result ~= string[p .. string.length];
+    }
+    else if (!test(string))
+	result ~= string;
+    return result;
+}
+
+unittest
+{
+    debug(regexp) printf("regexp.split.unittest()\n");
+
+    auto r = new RegExp("a*?", null);
+    rchar[][] result;
+    rchar[] j;
+    int i;
+
+    result = r.split("ab");
+
+    assert(result.length == 2);
+    i = std.string.cmp(result[0], "a");
+    assert(i == 0);
+    i = std.string.cmp(result[1], "b");
+    assert(i == 0);
+
+    r = new RegExp("a*", null);
+    result = r.split("ab");
+    assert(result.length == 2);
+    i = std.string.cmp(result[0], "");
+    assert(i == 0);
+    i = std.string.cmp(result[1], "b");
+    assert(i == 0);
+
+    r = new RegExp("<(\\/)?([^<>]+)>", null);
+    result = r.split("a<b>font</b>bar<TAG>hello</TAG>");
+
+    for (i = 0; i < result.length; i++)
+    {
+	//debug(regexp) printf("result[%d] = '%.*s'\n", i, result[i]);
+    }
+
+    j = join(result, ",");
+    //printf("j = '%.*s'\n", j);
+    i = std.string.cmp(j, "a,,b,font,/,b,bar,,TAG,hello,/,TAG,");
+    assert(i == 0);
+
+    r = new RegExp("a[bc]", null);
+    result = r.match("123ab");
+    j = join(result, ",");
+    i = std.string.cmp(j, "ab");
+    assert(i == 0);
+    
+    result = r.match("ac");
+    j = join(result, ",");
+    i = std.string.cmp(j, "ac");
+    assert(i == 0);
+}
+
+/*************************************************
+ * Search string[] for match with regular expression.
+ * Returns:
+ *	index of match if successful, -1 if not found
+ */
+
+public int find(rchar[] string)
+{
+    int i;
+
+    i = test(string);
+    if (i)
+	i = pmatch[0].rm_so;
+    else
+	i = -1;			// no match
+    return i;
+}
+
+//deprecated alias find search;
+
+unittest
+{
+    debug(regexp) printf("regexp.find.unittest()\n");
+
+    int i;
+    RegExp r = new RegExp("abc", null);
+    i = r.find("xabcy");
+    assert(i == 1);
+    i = r.find("cba");
+    assert(i == -1);
+}
+
+
+/*************************************************
+ * Search string[] for match.
+ * Returns:
+ *	If global attribute, return same value as exec(string).
+ *	If not global attribute, return array of all matches.
+ */
+
+public rchar[][] match(rchar[] string)
+{
+    rchar[][] result;
+
+    if (attributes & REA.global)
+    {
+	int lastindex = 0;
+
+	while (test(string, lastindex))
+	{   int eo = pmatch[0].rm_eo;
+
+	    result ~= input[pmatch[0].rm_so .. eo];
+	    if (lastindex == eo)
+		lastindex++;		// always consume some source
+	    else
+		lastindex = eo;
+	}
+    }
+    else
+    {
+	result = exec(string);
+    }
+    return result;
+}
+
+unittest
+{
+    debug(regexp) printf("regexp.match.unittest()\n");
+
+    int i;
+    rchar[][] result;
+    rchar[] j;
+    RegExp r;
+
+    r = new RegExp("a[bc]", null);
+    result = r.match("1ab2ac3");
+    j = join(result, ",");
+    i = std.string.cmp(j, "ab");
+    assert(i == 0);
+
+    r = new RegExp("a[bc]", "g");
+    result = r.match("1ab2ac3");
+    j = join(result, ",");
+    i = std.string.cmp(j, "ab,ac");
+    assert(i == 0);
+}
+
+
+/*************************************************
+ * Find regular expression matches in string[]. Replace those matches
+ * with a new _string composed of format[] merged with the result of the
+ * matches.
+ * If global, replace all matches. Otherwise, replace first match.
+ * Returns: the new _string
+ */
+
+public rchar[] replace(rchar[] string, rchar[] format)
+{
+    rchar[] result;
+    int lastindex;
+    int offset;
+
+    result = string;
+    lastindex = 0;
+    offset = 0;
+    for (;;)
+    {
+	if (!test(string, lastindex))
+	    break;
+
+	int so = pmatch[0].rm_so;
+	int eo = pmatch[0].rm_eo;
+
+	rchar[] replacement = replace(format);
+
+	// Optimize by using std.string.replace if possible - Dave Fladebo
+	rchar[] slice = result[offset + so .. offset + eo];
+	if (attributes & REA.global &&		// global, so replace all
+	   !(attributes & REA.ignoreCase) &&	// not ignoring case
+	   !(attributes & REA.multiline) &&	// not multiline
+	   pattern == slice &&			// simple pattern (exact match, no special characters) 
+	   format == replacement)		// simple format, not $ formats
+	{
+	    debug(regexp)
+		printf("pattern: %.*s, slice: %.*s, format: %.*s, replacement: %.*s\n",
+		    cast(int) pattern.length, pattern.ptr,
+		    cast(int) (eo-so), result.ptr + offset,
+		    cast(int) format.length, format.ptr,
+		    cast(int) replacement.length, replacement.ptr);
+	    result = std.string.replace(result,slice,replacement);
+	    break;
+	}
+
+	result = replaceSlice(result, result[offset + so .. offset + eo], replacement);
+
+	if (attributes & REA.global)
+	{
+	    offset += replacement.length - (eo - so);
+
+	    if (lastindex == eo)
+		lastindex++;		// always consume some source
+	    else
+		lastindex = eo;
+	}
+	else
+	    break;
+    }
+
+    return result;
+}
+
+unittest
+{
+    debug(regexp) printf("regexp.replace.unittest()\n");
+
+    int i;
+    rchar[] result;
+    RegExp r;
+
+    r = new RegExp("a[bc]", "g");
+    result = r.replace("1ab2ac3", "x$&y");
+    i = std.string.cmp(result, "1xaby2xacy3");
+    assert(i == 0);
+
+    r = new RegExp("ab", "g");
+    result = r.replace("1ab2ac3", "xy");
+    i = std.string.cmp(result, "1xy2ac3");
+    assert(i == 0);
+}
+
+
+/*************************************************
+ * Search string[] for match.
+ * Returns:
+ *	array of slices into string[] representing matches
+ */
+
+public rchar[][] exec(rchar[] string)
+{
+    debug(regexp) printf("regexp.exec(string = '%.*s')\n",
+	cast(int) string.length, string.ptr);
+    input = string;
+    pmatch[0].rm_so = 0;
+    pmatch[0].rm_eo = 0;
+    return exec();
+}
+
+/*************************************************
+ * Pick up where last exec(string) or exec() left off,
+ * searching string[] for next match.
+ * Returns:
+ *	array of slices into string[] representing matches
+ */
+
+public rchar[][] exec()
+{
+    if (!test())
+	return null;
+
+    auto result = new rchar[][pmatch.length];
+    for (int i = 0; i < pmatch.length; i++)
+    {
+	if (pmatch[i].rm_so == pmatch[i].rm_eo)
+	    result[i] = null;
+	else
+	    result[i] = input[pmatch[i].rm_so .. pmatch[i].rm_eo];
+    }
+
+    return result;
+}
+
+/************************************************
+ * Search string[] for match.
+ * Returns: 0 for no match, !=0 for match
+ * Example:
+---
+import std.stdio;
+import std.regexp;
+import std.string;
+
+int grep(int delegate(char[]) pred, char[][] list)
+{
+  int count;
+  foreach (s; list)
+  {  if (pred(s))
+       ++count;
+  }
+  return count;
+}
+
+void main()
+{
+  auto x = grep(&RegExp("[Ff]oo").test,
+                std.string.split("mary had a foo lamb"));
+  writefln(x);
+}
+---
+ * which prints: 1
+ */
+
+public int test(rchar[] string)
+{
+    return test(string, 0 /*pmatch[0].rm_eo*/);
+}
+
+/************************************************
+ * Pick up where last test(string) or test() left off, and search again.
+ * Returns: 0 for no match, !=0 for match
+ */
+
+public int test()
+{
+    return test(input, pmatch[0].rm_eo);
+}
+
+/************************************************
+ * Test string[] starting at startindex against regular expression.
+ * Returns: 0 for no match, !=0 for match
+ */
+
+public int test(char[] string, int startindex)
+{
+    char firstc;
+    uint si;
+
+    input = string;
+    debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %d)\n",
+	cast(int) input.length, input.ptr, startindex);
+    pmatch[0].rm_so = 0;
+    pmatch[0].rm_eo = 0;
+    if (startindex < 0 || startindex > input.length)
+    {
+	return 0;			// fail
+    }
+    //debug(regexp) printProgram(program);
+
+    // First character optimization
+    firstc = 0;
+    if (program[0] == REchar)
+    {
+	firstc = program[1];
+	if (attributes & REA.ignoreCase && isalpha(firstc))
+	    firstc = 0;
+    }
+
+    for (si = startindex; ; si++)
+    {
+	if (firstc)
+	{
+	    if (si == input.length)
+		break;			// no match
+	    if (input[si] != firstc)
+	    {
+		si++;
+		if (!chr(si, firstc))	// if first character not found
+		    break;		// no match
+	    }
+	}
+	for (int i = 0; i < re_nsub + 1; i++)
+	{
+	    pmatch[i].rm_so = -1;
+	    pmatch[i].rm_eo = -1;
+	}
+	src_start = src = si;
+	if (trymatch(0, program.length))
+	{
+	    pmatch[0].rm_so = si;
+	    pmatch[0].rm_eo = src;
+	    //debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo);
+	    return 1;
+	}
+	// If possible match must start at beginning, we are done
+	if (program[0] == REbol || program[0] == REanystar)
+	{
+	    if (attributes & REA.multiline)
+	    {
+		// Scan for the next \n
+		if (!chr(si, '\n'))
+		    break;		// no match if '\n' not found
+	    }
+	    else
+		break;
+	}
+	if (si == input.length)
+	    break;
+	//debug(regexp) printf("Starting new try: '%.*s'\n", input[si + 1 .. input.length]);
+    }
+    return 0;		// no match
+}
+
+int chr(inout uint si, rchar c)
+{
+    for (; si < input.length; si++)
+    {
+	if (input[si] == c)
+	    return 1;
+    }
+    return 0;
+}
+
+
+void printProgram(ubyte[] prog)
+{
+  //debug(regexp)
+  {
+    uint pc;
+    uint len;
+    uint n;
+    uint m;
+    ushort *pu;
+    uint *puint;
+    ubyte[] s;
+
+    printf("printProgram()\n");
+    for (pc = 0; pc < prog.length; )
+    {
+	printf("%3d: ", pc);
+
+	//printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq);
+	switch (prog[pc])
+	{
+	    case REchar:
+		printf("\tREchar '%c'\n", prog[pc + 1]);
+		pc += 1 + char.sizeof;
+		break;
+
+	    case REichar:
+		printf("\tREichar '%c'\n", prog[pc + 1]);
+		pc += 1 + char.sizeof;
+		break;
+
+	    case REdchar:
+		printf("\tREdchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
+		pc += 1 + dchar.sizeof;
+		break;
+
+	    case REidchar:
+		printf("\tREidchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
+		pc += 1 + dchar.sizeof;
+		break;
+
+	    case REanychar:
+		printf("\tREanychar\n");
+		pc++;
+		break;
+
+	    case REstring:
+		len = *cast(uint *)&prog[pc + 1];
+		s = (&prog[pc + 1 + uint.sizeof])[0 .. len];
+		printf("\tREstring x%x, '%.*s'\n", len,
+		    cast(int) s.length, s.ptr);
+		pc += 1 + uint.sizeof + len * rchar.sizeof;
+		break;
+
+	    case REistring:
+		len = *cast(uint *)&prog[pc + 1];
+		s = (&prog[pc + 1 + uint.sizeof])[0 .. len];
+		printf("\tREistring x%x, '%.*s'\n", len,
+		    cast(int) s.length, s.ptr);
+		pc += 1 + uint.sizeof + len * rchar.sizeof;
+		break;
+
+	    case REtestbit:
+		pu = cast(ushort *)&prog[pc + 1];
+		printf("\tREtestbit %d, %d\n", pu[0], pu[1]);
+		len = pu[1];
+		pc += 1 + 2 * ushort.sizeof + len;
+		break;
+
+	    case REbit:
+		pu = cast(ushort *)&prog[pc + 1];
+		len = pu[1];
+		printf("\tREbit cmax=%02x, len=%d:", pu[0], len);
+		for (n = 0; n < len; n++)
+		    printf(" %02x", prog[pc + 1 + 2 * ushort.sizeof + n]);
+		printf("\n");
+		pc += 1 + 2 * ushort.sizeof + len;
+		break;
+
+	    case REnotbit:
+		pu = cast(ushort *)&prog[pc + 1];
+		printf("\tREnotbit %d, %d\n", pu[0], pu[1]);
+		len = pu[1];
+		pc += 1 + 2 * ushort.sizeof + len;
+		break;
+
+	    case RErange:
+		len = *cast(uint *)&prog[pc + 1];
+		printf("\tRErange %d\n", len);
+		// BUG: REAignoreCase?
+		pc += 1 + uint.sizeof + len;
+		break;
+
+	    case REnotrange:
+		len = *cast(uint *)&prog[pc + 1];
+		printf("\tREnotrange %d\n", len);
+		// BUG: REAignoreCase?
+		pc += 1 + uint.sizeof + len;
+		break;
+
+	    case REbol:
+		printf("\tREbol\n");
+		pc++;
+		break;
+
+	    case REeol:
+		printf("\tREeol\n");
+		pc++;
+		break;
+
+	    case REor:
+		len = *cast(uint *)&prog[pc + 1];
+		printf("\tREor %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
+		pc += 1 + uint.sizeof;
+		break;
+
+	    case REgoto:
+		len = *cast(uint *)&prog[pc + 1];
+		printf("\tREgoto %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
+		pc += 1 + uint.sizeof;
+		break;
+
+	    case REanystar:
+		printf("\tREanystar\n");
+		pc++;
+		break;
+
+	    case REnm:
+	    case REnmq:
+		// len, n, m, ()
+		puint = cast(uint *)&prog[pc + 1];
+		len = puint[0];
+		n = puint[1];
+		m = puint[2];
+		printf("\tREnm%s len=%d, n=%u, m=%u, pc=>%d\n",
+		    (prog[pc] == REnmq) ? cast(char*)"q" : cast(char*)" ",
+		    len, n, m, pc + 1 + uint.sizeof * 3 + len);
+		pc += 1 + uint.sizeof * 3;
+		break;
+
+	    case REparen:
+		// len, n, ()
+		puint = cast(uint *)&prog[pc + 1];
+		len = puint[0];
+		n = puint[1];
+		printf("\tREparen len=%d n=%d, pc=>%d\n", len, n, pc + 1 + uint.sizeof * 2 + len);
+		pc += 1 + uint.sizeof * 2;
+		break;
+
+	    case REend:
+		printf("\tREend\n");
+		return;
+
+	    case REwordboundary:
+		printf("\tREwordboundary\n");
+		pc++;
+		break;
+
+	    case REnotwordboundary:
+		printf("\tREnotwordboundary\n");
+		pc++;
+		break;
+
+	    case REdigit:
+		printf("\tREdigit\n");
+		pc++;
+		break;
+
+	    case REnotdigit:
+		printf("\tREnotdigit\n");
+		pc++;
+		break;
+
+	    case REspace:
+		printf("\tREspace\n");
+		pc++;
+		break;
+
+	    case REnotspace:
+		printf("\tREnotspace\n");
+		pc++;
+		break;
+
+	    case REword:
+		printf("\tREword\n");
+		pc++;
+		break;
+
+	    case REnotword:
+		printf("\tREnotword\n");
+		pc++;
+		break;
+
+	    case REbackref:
+		printf("\tREbackref %d\n", prog[1]);
+		pc += 2;
+		break;
+
+	    default:
+		assert(0);
+	}
+    }
+  }
+}
+
+
+/**************************************************
+ * Match input against a section of the program[].
+ * Returns:
+ *	1 if successful match
+ *	0 no match
+ */
+
+int trymatch(int pc, int pcend)
+{   int srcsave;
+    uint len;
+    uint n;
+    uint m;
+    uint count;
+    uint pop;
+    uint ss;
+    regmatch_t *psave;
+    uint c1;
+    uint c2;
+    ushort* pu;
+    uint* puint;
+
+    debug(regexp)
+    {
+	char[] s = input[src .. input.length];
+	printf("RegExp.trymatch(pc = %d, src = '%.*s', pcend = %d)\n",
+	    pc, cast(int) s.length, s.ptr, pcend);
+    }
+    srcsave = src;
+    psave = null;
+    for (;;)
+    {
+	if (pc == pcend)		// if done matching
+	{   debug(regex) printf("\tprogend\n");
+	    return 1;
+	}
+
+	//printf("\top = %d\n", program[pc]);
+	switch (program[pc])
+	{
+	    case REchar:
+		if (src == input.length)
+		    goto Lnomatch;
+		debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]);
+		if (program[pc + 1] != input[src])
+		    goto Lnomatch;
+		src++;
+		pc += 1 + char.sizeof;
+		break;
+
+	    case REichar:
+		if (src == input.length)
+		    goto Lnomatch;
+		debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]);
+		c1 = program[pc + 1];
+		c2 = input[src];
+		if (c1 != c2)
+		{
+		    if (islower(cast(rchar)c2))
+			c2 = std.ctype.toupper(cast(rchar)c2);
+		    else
+			goto Lnomatch;
+		    if (c1 != c2)
+			goto Lnomatch;
+		}
+		src++;
+		pc += 1 + char.sizeof;
+		break;
+
+	    case REdchar:
+		debug(regexp) printf("\tREdchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
+		if (src == input.length)
+		    goto Lnomatch;
+		if (*(cast(dchar *)&program[pc + 1]) != input[src])
+		    goto Lnomatch;
+		src++;
+		pc += 1 + dchar.sizeof;
+		break;
+
+	    case REidchar:
+		debug(regexp) printf("\tREidchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
+		if (src == input.length)
+		    goto Lnomatch;
+		c1 = *(cast(dchar *)&program[pc + 1]);
+		c2 = input[src];
+		if (c1 != c2)
+		{
+		    if (islower(cast(rchar)c2))
+			c2 = std.ctype.toupper(cast(rchar)c2);
+		    else
+			goto Lnomatch;
+		    if (c1 != c2)
+			goto Lnomatch;
+		}
+		src++;
+		pc += 1 + dchar.sizeof;
+		break;
+
+	    case REanychar:
+		debug(regexp) printf("\tREanychar\n");
+		if (src == input.length)
+		    goto Lnomatch;
+		if (!(attributes & REA.dotmatchlf) && input[src] == cast(rchar)'\n')
+		    goto Lnomatch;
+		src += std.utf.stride(input, src);
+		//src++;
+		pc++;
+		break;
+
+	    case REstring:
+		len = *cast(uint *)&program[pc + 1];
+		debug(regexp)
+		{
+		    char[] s = (&program[pc + 1 + uint.sizeof])[0 .. len];
+		    printf("\tREstring x%x, '%.*s'\n", len,
+			cast(int) s.length, s.ptr);
+		}
+		if (src + len > input.length)
+		    goto Lnomatch;
+		if (memcmp(&program[pc + 1 + uint.sizeof], &input[src], len * rchar.sizeof))
+		    goto Lnomatch;
+		src += len;
+		pc += 1 + uint.sizeof + len * rchar.sizeof;
+		break;
+
+	    case REistring:
+		len = *cast(uint *)&program[pc + 1];
+		debug(regexp)
+		{
+		    char[] s = (&program[pc + 1 + uint.sizeof])[0 .. len];
+		    printf("\tREistring x%x, '%.*s'\n", len,
+			cast(int) s.length, s.ptr);
+		}
+		if (src + len > input.length)
+		    goto Lnomatch;
+		version (Win32)
+		{
+		    if (memicmp(cast(char*)&program[pc + 1 + uint.sizeof], &input[src], len * rchar.sizeof))
+			goto Lnomatch;
+		}
+		else
+		{
+		    if (icmp((cast(char*)&program[pc + 1 + uint.sizeof])[0..len],
+			     input[src .. src + len]))
+			goto Lnomatch;
+		}
+		src += len;
+		pc += 1 + uint.sizeof + len * rchar.sizeof;
+		break;
+
+	    case REtestbit:
+		pu = (cast(ushort *)&program[pc + 1]);
+		debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n",
+		    pu[0], pu[1], input[src], input[src]);
+		if (src == input.length)
+		    goto Lnomatch;
+		len = pu[1];
+		c1 = input[src];
+		//printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7)));
+		if (c1 <= pu[0] &&
+		    !bt(cast(uint*)&(program[pc + 1 + 4]), c1)) // assumes BitArray implementation
+		    goto Lnomatch;
+		pc += 1 + 2 * ushort.sizeof + len;
+		break;
+
+	    case REbit:
+		pu = (cast(ushort *)&program[pc + 1]);
+		debug(regexp) printf("\tREbit %d, %d, '%c'\n",
+		    pu[0], pu[1], input[src]);
+		if (src == input.length)
+		    goto Lnomatch;
+		len = pu[1];
+		c1 = input[src];
+		if (c1 > pu[0])
+		    goto Lnomatch;
+		if (!bt(cast(uint*)&(program[pc + 1 + 4]), c1)) // assumes BitArray implementation
+		    goto Lnomatch;
+		src++;
+		pc += 1 + 2 * ushort.sizeof + len;
+		break;
+
+	    case REnotbit:
+		pu = (cast(ushort *)&program[pc + 1]);
+		debug(regexp) printf("\tREnotbit %d, %d, '%c'\n",
+		    pu[0], pu[1], input[src]);
+		if (src == input.length)
+		    goto Lnomatch;
+		len = pu[1];
+		c1 = input[src];
+		if (c1 <= pu[0] &&
+		    bt(cast(uint*)&(program[pc + 1 + 4]), c1)) // assumes BitArray implementation
+		    goto Lnomatch;
+		src++;
+		pc += 1 + 2 * ushort.sizeof + len;
+		break;
+
+	    case RErange:
+		len = *cast(uint *)&program[pc + 1];
+		debug(regexp) printf("\tRErange %d\n", len);
+		if (src == input.length)
+		    goto Lnomatch;
+		// BUG: REA.ignoreCase?
+		if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) == null)
+		    goto Lnomatch;
+		src++;
+		pc += 1 + uint.sizeof + len;
+		break;
+
+	    case REnotrange:
+		len = *cast(uint *)&program[pc + 1];
+		debug(regexp) printf("\tREnotrange %d\n", len);
+		if (src == input.length)
+		    goto Lnomatch;
+		// BUG: REA.ignoreCase?
+		if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) != null)
+		    goto Lnomatch;
+		src++;
+		pc += 1 + uint.sizeof + len;
+		break;
+
+	    case REbol:
+		debug(regexp) printf("\tREbol\n");
+		if (src == 0)
+		{
+		}
+		else if (attributes & REA.multiline)
+		{
+		    if (input[src - 1] != '\n')
+			goto Lnomatch;
+		}
+		else
+		    goto Lnomatch;
+		pc++;
+		break;
+
+	    case REeol:
+		debug(regexp) printf("\tREeol\n");
+		if (src == input.length)
+		{
+		}
+		else if (attributes & REA.multiline && input[src] == '\n')
+		    src++;
+		else
+		    goto Lnomatch;
+		pc++;
+		break;
+
+	    case REor:
+		len = (cast(uint *)&program[pc + 1])[0];
+		debug(regexp) printf("\tREor %d\n", len);
+		pop = pc + 1 + uint.sizeof;
+		ss = src;
+		if (trymatch(pop, pcend))
+		{
+		    if (pcend != program.length)
+		    {	int s;
+
+			s = src;
+			if (trymatch(pcend, program.length))
+			{   debug(regexp) printf("\tfirst operand matched\n");
+			    src = s;
+			    return 1;
+			}
+			else
+			{
+			    // If second branch doesn't match to end, take first anyway
+			    src = ss;
+			    if (!trymatch(pop + len, program.length))
+			    {
+				debug(regexp) printf("\tfirst operand matched\n");
+				src = s;
+				return 1;
+			    }
+			}
+			src = ss;
+		    }
+		    else
+		    {	debug(regexp) printf("\tfirst operand matched\n");
+			return 1;
+		    }
+		}
+		pc = pop + len;		// proceed with 2nd branch
+		break;
+
+	    case REgoto:
+		debug(regexp) printf("\tREgoto\n");
+		len = (cast(uint *)&program[pc + 1])[0];
+		pc += 1 + uint.sizeof + len;
+		break;
+
+	    case REanystar:
+		debug(regexp) printf("\tREanystar\n");
+		pc++;
+		for (;;)
+		{   int s1;
+		    int s2;
+
+		    s1 = src;
+		    if (src == input.length)
+			break;
+		    if (!(attributes & REA.dotmatchlf) && input[src] == '\n')
+			break;
+		    src++;
+		    s2 = src;
+
+		    // If no match after consumption, but it
+		    // did match before, then no match
+		    if (!trymatch(pc, program.length))
+		    {
+			src = s1;
+			// BUG: should we save/restore pmatch[]?
+			if (trymatch(pc, program.length))
+			{
+			    src = s1;		// no match
+			    break;
+			}
+		    }
+		    src = s2;
+		}
+		break;
+
+	    case REnm:
+	    case REnmq:
+		// len, n, m, ()
+		puint = cast(uint *)&program[pc + 1];
+		len = puint[0];
+		n = puint[1];
+		m = puint[2];
+		debug(regexp) printf("\tREnm%s len=%d, n=%u, m=%u\n", (program[pc] == REnmq) ? cast(char*)"q" : cast(char*)"", len, n, m);
+		pop = pc + 1 + uint.sizeof * 3;
+		for (count = 0; count < n; count++)
+		{
+		    if (!trymatch(pop, pop + len))
+			goto Lnomatch;
+		}
+		if (!psave && count < m)
+		{
+		    //version (Win32)
+			psave = cast(regmatch_t *)alloca((re_nsub + 1) * regmatch_t.sizeof);
+		    //else
+			//psave = new regmatch_t[re_nsub + 1];
+		}
+		if (program[pc] == REnmq)	// if minimal munch
+		{
+		    for (; count < m; count++)
+		    {   int s1;
+
+			memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
+			s1 = src;
+
+			if (trymatch(pop + len, program.length))
+			{
+			    src = s1;
+			    memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
+			    break;
+			}
+
+			if (!trymatch(pop, pop + len))
+			{   debug(regexp) printf("\tdoesn't match subexpression\n");
+			    break;
+			}
+
+			// If source is not consumed, don't
+			// infinite loop on the match
+			if (s1 == src)
+			{   debug(regexp) printf("\tsource is not consumed\n");
+			    break;
+			}
+		    }
+		}
+		else	// maximal munch
+		{
+		    for (; count < m; count++)
+		    {   int s1;
+			int s2;
+
+			memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
+			s1 = src;
+			if (!trymatch(pop, pop + len))
+			{   debug(regexp) printf("\tdoesn't match subexpression\n");
+			    break;
+			}
+			s2 = src;
+
+			// If source is not consumed, don't
+			// infinite loop on the match
+			if (s1 == s2)
+			{   debug(regexp) printf("\tsource is not consumed\n");
+			    break;
+			}
+
+			// If no match after consumption, but it
+			// did match before, then no match
+			if (!trymatch(pop + len, program.length))
+			{
+			    src = s1;
+			    if (trymatch(pop + len, program.length))
+			    {
+				src = s1;		// no match
+				memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
+				break;
+			    }
+			}
+			src = s2;
+		    }
+		}
+		debug(regexp) printf("\tREnm len=%d, n=%u, m=%u, DONE count=%d\n", len, n, m, count);
+		pc = pop + len;
+		break;
+
+	    case REparen:
+		// len, ()
+		debug(regexp) printf("\tREparen\n");
+		puint = cast(uint *)&program[pc + 1];
+		len = puint[0];
+		n = puint[1];
+		pop = pc + 1 + uint.sizeof * 2;
+		ss = src;
+		if (!trymatch(pop, pop + len))
+		    goto Lnomatch;
+		pmatch[n + 1].rm_so = ss;
+		pmatch[n + 1].rm_eo = src;
+		pc = pop + len;
+		break;
+
+	    case REend:
+		debug(regexp) printf("\tREend\n");
+		return 1;		// successful match
+
+	    case REwordboundary:
+		debug(regexp) printf("\tREwordboundary\n");
+		if (src > 0 && src < input.length)
+		{
+		    c1 = input[src - 1];
+		    c2 = input[src];
+		    if (!(
+			  (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
+			  (!isword(cast(rchar)c1) && isword(cast(rchar)c2))
+			 )
+		       )
+			goto Lnomatch;
+		}
+		pc++;
+		break;
+
+	    case REnotwordboundary:
+		debug(regexp) printf("\tREnotwordboundary\n");
+		if (src == 0 || src == input.length)
+		    goto Lnomatch;
+		c1 = input[src - 1];
+		c2 = input[src];
+		if (
+		    (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
+		    (!isword(cast(rchar)c1) && isword(cast(rchar)c2))
+		   )
+		    goto Lnomatch;
+		pc++;
+		break;
+
+	    case REdigit:
+		debug(regexp) printf("\tREdigit\n");
+		if (src == input.length)
+		    goto Lnomatch;
+		if (!isdigit(input[src]))
+		    goto Lnomatch;
+		src++;
+		pc++;
+		break;
+
+	    case REnotdigit:
+		debug(regexp) printf("\tREnotdigit\n");
+		if (src == input.length)
+		    goto Lnomatch;
+		if (isdigit(input[src]))
+		    goto Lnomatch;
+		src++;
+		pc++;
+		break;
+
+	    case REspace:
+		debug(regexp) printf("\tREspace\n");
+		if (src == input.length)
+		    goto Lnomatch;
+		if (!isspace(input[src]))
+		    goto Lnomatch;
+		src++;
+		pc++;
+		break;
+
+	    case REnotspace:
+		debug(regexp) printf("\tREnotspace\n");
+		if (src == input.length)
+		    goto Lnomatch;
+		if (isspace(input[src]))
+		    goto Lnomatch;
+		src++;
+		pc++;
+		break;
+
+	    case REword:
+		debug(regexp) printf("\tREword\n");
+		if (src == input.length)
+		    goto Lnomatch;
+		if (!isword(input[src]))
+		    goto Lnomatch;
+		src++;
+		pc++;
+		break;
+
+	    case REnotword:
+		debug(regexp) printf("\tREnotword\n");
+		if (src == input.length)
+		    goto Lnomatch;
+		if (isword(input[src]))
+		    goto Lnomatch;
+		src++;
+		pc++;
+		break;
+
+	    case REbackref:
+	    {
+		n = program[pc + 1];
+		debug(regexp) printf("\tREbackref %d\n", n);
+
+		int so = pmatch[n + 1].rm_so;
+		int eo = pmatch[n + 1].rm_eo;
+		len = eo - so;
+		if (src + len > input.length)
+		    goto Lnomatch;
+		else if (attributes & REA.ignoreCase)
+		{
+		    if (icmp(input[src .. src + len], input[so .. eo]))
+			goto Lnomatch;
+		}
+		else if (memcmp(&input[src], &input[so], len * rchar.sizeof))
+		    goto Lnomatch;
+		src += len;
+		pc += 2;
+		break;
+	    }
+
+	    default:
+		assert(0);
+	}
+    }
+
+Lnomatch:
+    debug(regexp) printf("\tnomatch pc=%d\n", pc);
+    src = srcsave;
+    return 0;
+}
+
+/* =================== Compiler ================== */
+
+int parseRegexp()
+{   uint offset;
+    uint gotooffset;
+    uint len1;
+    uint len2;
+
+    //printf("parseRegexp() '%.*s'\n", pattern[p .. pattern.length]);
+    offset = buf.offset;
+    for (;;)
+    {
+	assert(p <= pattern.length);
+	if (p == pattern.length)
+	{   buf.write(REend);
+	    return 1;
+	}
+	switch (pattern[p])
+	{
+	    case ')':
+		return 1;
+
+	    case '|':
+		p++;
+		gotooffset = buf.offset;
+		buf.write(REgoto);
+		buf.write(cast(uint)0);
+		len1 = buf.offset - offset;
+		buf.spread(offset, 1 + uint.sizeof);
+		gotooffset += 1 + uint.sizeof;
+		parseRegexp();
+		len2 = buf.offset - (gotooffset + 1 + uint.sizeof);
+		buf.data[offset] = REor;
+		(cast(uint *)&buf.data[offset + 1])[0] = len1;
+		(cast(uint *)&buf.data[gotooffset + 1])[0] = len2;
+		break;
+
+	    default:
+		parsePiece();
+		break;
+	}
+    }
+    assert(0);
+}
+
+int parsePiece()
+{   uint offset;
+    uint len;
+    uint n;
+    uint m;
+    ubyte op;
+    int plength = pattern.length;
+
+    //printf("parsePiece() '%.*s'\n", pattern[p .. pattern.length]);
+    offset = buf.offset;
+    parseAtom();
+    if (p == plength)
+	return 1;
+    switch (pattern[p])
+    {
+	case '*':
+	    // Special optimization: replace .* with REanystar
+	    if (buf.offset - offset == 1 &&
+		buf.data[offset] == REanychar &&
+		p + 1 < plength &&
+		pattern[p + 1] != '?')
+	    {
+		buf.data[offset] = REanystar;
+		p++;
+		break;
+	    }
+
+	    n = 0;
+	    m = inf;
+	    goto Lnm;
+
+	case '+':
+	    n = 1;
+	    m = inf;
+	    goto Lnm;
+
+	case '?':
+	    n = 0;
+	    m = 1;
+	    goto Lnm;
+
+	case '{':	// {n} {n,} {n,m}
+	    p++;
+	    if (p == plength || !isdigit(pattern[p]))
+		goto Lerr;
+	    n = 0;
+	    do
+	    {
+		// BUG: handle overflow
+		n = n * 10 + pattern[p] - '0';
+		p++;
+		if (p == plength)
+		    goto Lerr;
+	    } while (isdigit(pattern[p]));
+	    if (pattern[p] == '}')		// {n}
+	    {	m = n;
+		goto Lnm;
+	    }
+	    if (pattern[p] != ',')
+		goto Lerr;
+	    p++;
+	    if (p == plength)
+		goto Lerr;
+	    if (pattern[p] == /*{*/ '}')	// {n,}
+	    {	m = inf;
+		goto Lnm;
+	    }
+	    if (!isdigit(pattern[p]))
+		goto Lerr;
+	    m = 0;			// {n,m}
+	    do
+	    {
+		// BUG: handle overflow
+		m = m * 10 + pattern[p] - '0';
+		p++;
+		if (p == plength)
+		    goto Lerr;
+	    } while (isdigit(pattern[p]));
+	    if (pattern[p] != /*{*/ '}')
+		goto Lerr;
+	    goto Lnm;
+
+	Lnm:
+	    p++;
+	    op = REnm;
+	    if (p < plength && pattern[p] == '?')
+	    {	op = REnmq;	// minimal munch version
+		p++;
+	    }
+	    len = buf.offset - offset;
+	    buf.spread(offset, 1 + uint.sizeof * 3);
+	    buf.data[offset] = op;
+	    uint* puint = cast(uint *)&buf.data[offset + 1];
+	    puint[0] = len;
+	    puint[1] = n;
+	    puint[2] = m;
+	    break;
+
+	default:
+	    break;
+    }
+    return 1;
+
+Lerr:
+    error("badly formed {n,m}");
+    assert(0);
+}
+
+int parseAtom()
+{   ubyte op;
+    uint offset;
+    rchar c;
+
+    //printf("parseAtom() '%.*s'\n", pattern[p .. pattern.length]);
+    if (p < pattern.length)
+    {
+	c = pattern[p];
+	switch (c)
+	{
+	    case '*':
+	    case '+':
+	    case '?':
+		error("*+? not allowed in atom");
+		p++;
+		return 0;
+
+	    case '(':
+		p++;
+		buf.write(REparen);
+		offset = buf.offset;
+		buf.write(cast(uint)0);		// reserve space for length
+		buf.write(re_nsub);
+		re_nsub++;
+		parseRegexp();
+		*cast(uint *)&buf.data[offset] =
+		    buf.offset - (offset + uint.sizeof * 2);
+		if (p == pattern.length || pattern[p] != ')')
+		{
+		    error("')' expected");
+		    return 0;
+		}
+		p++;
+		break;
+
+	    case '[':
+		if (!parseRange())
+		    return 0;
+		break;
+
+	    case '.':
+		p++;
+		buf.write(REanychar);
+		break;
+
+	    case '^':
+		p++;
+		buf.write(REbol);
+		break;
+
+	    case '$':
+		p++;
+		buf.write(REeol);
+		break;
+
+	    case '\\':
+		p++;
+		if (p == pattern.length)
+		{   error("no character past '\\'");
+		    return 0;
+		}
+		c = pattern[p];
+		switch (c)
+		{
+		    case 'b':    op = REwordboundary;	 goto Lop;
+		    case 'B':    op = REnotwordboundary; goto Lop;
+		    case 'd':    op = REdigit;		 goto Lop;
+		    case 'D':    op = REnotdigit;	 goto Lop;
+		    case 's':    op = REspace;		 goto Lop;
+		    case 'S':    op = REnotspace;	 goto Lop;
+		    case 'w':    op = REword;		 goto Lop;
+		    case 'W':    op = REnotword;	 goto Lop;
+
+		    Lop:
+			buf.write(op);
+			p++;
+			break;
+
+		    case 'f':
+		    case 'n':
+		    case 'r':
+		    case 't':
+		    case 'v':
+		    case 'c':
+		    case 'x':
+		    case 'u':
+		    case '0':
+			c = cast(char)escape();
+			goto Lbyte;
+
+		    case '1': case '2': case '3':
+		    case '4': case '5': case '6':
+		    case '7': case '8': case '9':
+			c -= '1';
+			if (c < re_nsub)
+			{   buf.write(REbackref);
+			    buf.write(cast(ubyte)c);
+			}
+			else
+			{   error("no matching back reference");
+			    return 0;
+			}
+			p++;
+			break;
+
+		    default:
+			p++;
+			goto Lbyte;
+		}
+		break;
+
+	    default:
+		p++;
+	    Lbyte:
+		op = REchar;
+		if (attributes & REA.ignoreCase)
+		{
+		    if (isalpha(c))
+		    {
+			op = REichar;
+			c = cast(char)std.ctype.toupper(c);
+		    }
+		}
+		if (op == REchar && c <= 0xFF)
+		{
+		    // Look ahead and see if we can make this into
+		    // an REstring
+		    int q;
+		    int len;
+
+		    for (q = p; q < pattern.length; ++q)
+		    {	rchar qc = pattern[q];
+
+			switch (qc)
+			{
+			    case '{':
+			    case '*':
+			    case '+':
+			    case '?':
+				if (q == p)
+				    goto Lchar;
+				q--;
+				break;
+
+			    case '(':	case ')':
+			    case '|':
+			    case '[':	case ']':
+			    case '.':	case '^':
+			    case '$':	case '\\':
+			    case '}':
+				break;
+
+			    default:
+				continue;
+			}
+			break;
+		    }
+		    len = q - p;
+		    if (len > 0)
+		    {
+			debug(regexp) printf("writing string len %d, c = '%c', pattern[p] = '%c'\n", len+1, c, pattern[p]);
+			buf.reserve(5 + (1 + len) * rchar.sizeof);
+			buf.write((attributes & REA.ignoreCase) ? REistring : REstring);
+			buf.write(len + 1);
+			buf.write(c);
+			buf.write(pattern[p .. p + len]);
+			p = q;
+			break;
+		    }
+		}
+		if (c >= 0x80)
+		{
+		    // Convert to dchar opcode
+		    op = (op == REchar) ? REdchar : REidchar;
+		    buf.write(op);
+		    buf.write(c);
+		}
+		else
+		{
+		 Lchar:
+		    debug(regexp) printf("It's an REchar '%c'\n", c);
+		    buf.write(op);
+		    buf.write(cast(char)c);
+		}
+		break;
+	}
+    }
+    return 1;
+}
+
+private:
+class Range
+{
+    uint maxc;
+    uint maxb;
+    OutBuffer buf;
+    ubyte* base;
+    BitArray bits;
+
+    this(OutBuffer buf)
+    {
+	this.buf = buf;
+	if (buf.data.length)
+	    this.base = &buf.data[buf.offset];
+    }
+
+    void setbitmax(uint u)
+    {   uint b;
+
+	//printf("setbitmax(x%x), maxc = x%x\n", u, maxc);
+	if (u > maxc)
+	{
+	    maxc = u;
+	    b = u / 8;
+	    if (b >= maxb)
+	    {	uint u2;
+
+		u2 = base ? base - &buf.data[0] : 0;
+		++b;
+		version (BigEndian)
+		{
+		    while (b & (uint.sizeof-1))
+			++b;
+		}
+		
+		buf.fill0(b - maxb);
+		base = &buf.data[u2];
+		maxb = b;
+		// %% moved array recreate out of this condition
+		bits.ptr = cast(uint*)this.base;
+ 	    }
+	    //bits = (cast(bit*)this.base)[0 .. maxc + 1];
+	    bits.len = maxc + 1;
+	}
+    }
+
+    void setbit2(uint u)
+    {
+	setbitmax(u + 1);
+	//printf("setbit2 [x%02x] |= x%02x\n", u >> 3, 1 << (u & 7));
+	bits[u] = 1;
+    }
+
+};
+
+int parseRange()
+{   ubyte op;
+    int c;
+    int c2;
+    uint i;
+    uint cmax;
+    uint offset;
+
+    cmax = 0x7F;
+    p++;
+    op = REbit;
+    if (p == pattern.length)
+	goto Lerr;
+    if (pattern[p] == '^')
+    {   p++;
+	op = REnotbit;
+	if (p == pattern.length)
+	    goto Lerr;
+    }
+    buf.write(op);
+    offset = buf.offset;
+    buf.write(cast(uint)0);		// reserve space for length
+    buf.reserve(128 / 8);
+    auto r = new Range(buf);
+    if (op == REnotbit)
+	r.setbit2(0);
+    switch (pattern[p])
+    {
+	case ']':
+	case '-':
+	    c = pattern[p];
+	    p++;
+	    r.setbit2(c);
+	    break;
+
+	default:
+	    break;
+    }
+
+    enum RS { start, rliteral, dash };
+    RS rs;
+
+    rs = RS.start;
+    for (;;)
+    {
+	if (p == pattern.length)
+	    goto Lerr;
+	switch (pattern[p])
+	{
+	    case ']':
+		switch (rs)
+		{   case RS.dash:
+			r.setbit2('-');
+		    case RS.rliteral:
+			r.setbit2(c);
+			break;
+		    case RS.start:
+			break;
+		    default:
+			assert(0);
+		}
+		p++;
+		break;
+
+	    case '\\':
+		p++;
+		r.setbitmax(cmax);
+		if (p == pattern.length)
+		    goto Lerr;
+		switch (pattern[p])
+		{
+		    case 'd':
+			for (i = '0'; i <= '9'; i++)
+			    r.bits[i] = 1;
+			goto Lrs;
+
+		    case 'D':
+			for (i = 1; i < '0'; i++)
+			    r.bits[i] = 1;
+			for (i = '9' + 1; i <= cmax; i++)
+			    r.bits[i] = 1;
+			goto Lrs;
+
+		    case 's':
+			for (i = 0; i <= cmax; i++)
+			    if (isspace(i))
+				r.bits[i] = 1;
+			goto Lrs;
+
+		    case 'S':
+			for (i = 1; i <= cmax; i++)
+			    if (!isspace(i))
+				r.bits[i] = 1;
+			goto Lrs;
+
+		    case 'w':
+			for (i = 0; i <= cmax; i++)
+			    if (isword(cast(rchar)i))
+				r.bits[i] = 1;
+			goto Lrs;
+
+		    case 'W':
+			for (i = 1; i <= cmax; i++)
+			    if (!isword(cast(rchar)i))
+				r.bits[i] = 1;
+			goto Lrs;
+
+		    Lrs:
+			switch (rs)
+			{   case RS.dash:
+				r.setbit2('-');
+			    case RS.rliteral:
+				r.setbit2(c);
+				break;
+			    default:
+				break;
+			}
+			rs = RS.start;
+			continue;
+
+		    default:
+			break;
+		}
+		c2 = escape();
+		goto Lrange;
+
+	    case '-':
+		p++;
+		if (rs == RS.start)
+		    goto Lrange;
+		else if (rs == RS.rliteral)
+		    rs = RS.dash;
+		else if (rs == RS.dash)
+		{
+		    r.setbit2(c);
+		    r.setbit2('-');
+		    rs = RS.start;
+		}
+		continue;
+
+	    default:
+		c2 = pattern[p];
+		p++;
+	    Lrange:
+		switch (rs)
+		{   case RS.rliteral:
+			r.setbit2(c);
+		    case RS.start:
+			c = c2;
+			rs = RS.rliteral;
+			break;
+
+		    case RS.dash:
+			if (c > c2)
+			{   error("inverted range in character class");
+			    return 0;
+			}
+			r.setbitmax(c2);
+			//printf("c = %x, c2 = %x\n",c,c2);
+			for (; c <= c2; c++)
+			    r.bits[c] = 1;
+			rs = RS.start;
+			break;
+
+		    default:
+			assert(0);
+		}
+		continue;
+	}
+	break;
+    }
+    if (attributes & REA.ignoreCase)
+    {
+	// BUG: what about dchar?
+	r.setbitmax(0x7F);
+	for (c = 'a'; c <= 'z'; c++)
+	{
+	    if (r.bits[c])
+		r.bits[c + 'A' - 'a'] = 1;
+	    else if (r.bits[c + 'A' - 'a'])
+		r.bits[c] = 1;
+	}
+    }
+    //printf("maxc = %d, maxb = %d\n",r.maxc,r.maxb);
+    (cast(ushort *)&buf.data[offset])[0] = cast(ushort)r.maxc;
+    (cast(ushort *)&buf.data[offset])[1] = cast(ushort)r.maxb;
+    return 1;
+
+Lerr:
+    error("invalid range");
+    return 0;
+}
+
+void error(char[] msg)
+{
+    errors++;
+    debug(regexp) printf("error: %.*s\n", cast(int) msg.length, msg.ptr);
+//assert(0);
+//*(char*)0=0;
+    throw new RegExpException(msg);
+}
+
+// p is following the \ char
+int escape()
+in
+{
+    assert(p < pattern.length);
+}
+body
+{   int c;
+    int i;
+    rchar tc;
+
+    c = pattern[p];		// none of the cases are multibyte
+    switch (c)
+    {
+	case 'b':    c = '\b';	break;
+	case 'f':    c = '\f';	break;
+	case 'n':    c = '\n';	break;
+	case 'r':    c = '\r';	break;
+	case 't':    c = '\t';	break;
+	case 'v':    c = '\v';	break;
+
+	// BUG: Perl does \a and \e too, should we?
+
+	case 'c':
+	    ++p;
+	    if (p == pattern.length)
+		goto Lretc;
+	    c = pattern[p];
+	    // Note: we are deliberately not allowing dchar letters
+	    if (!(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')))
+	    {
+	     Lcerr:
+		error("letter expected following \\c");
+		return 0;
+	    }
+	    c &= 0x1F;
+	    break;
+
+	case '0':
+	case '1':
+	case '2':
+	case '3':
+	case '4':
+	case '5':
+	case '6':
+	case '7':
+	    c -= '0';
+	    for (i = 0; i < 2; i++)
+	    {
+		p++;
+		if (p == pattern.length)
+		    goto Lretc;
+		tc = pattern[p];
+		if ('0' <= tc && tc <= '7')
+		{   c = c * 8 + (tc - '0');
+		    // Treat overflow as if last
+		    // digit was not an octal digit
+		    if (c >= 0xFF)
+		    {	c >>= 3;
+			return c;
+		    }
+		}
+		else
+		    return c;
+	    }
+	    break;
+
+	case 'x':
+	    c = 0;
+	    for (i = 0; i < 2; i++)
+	    {
+		p++;
+		if (p == pattern.length)
+		    goto Lretc;
+		tc = pattern[p];
+		if ('0' <= tc && tc <= '9')
+		    c = c * 16 + (tc - '0');
+		else if ('a' <= tc && tc <= 'f')
+		    c = c * 16 + (tc - 'a' + 10);
+		else if ('A' <= tc && tc <= 'F')
+		    c = c * 16 + (tc - 'A' + 10);
+		else if (i == 0)	// if no hex digits after \x
+		{
+		    // Not a valid \xXX sequence
+		    return 'x';
+		}
+		else
+		    return c;
+	    }
+	    break;
+
+	case 'u':
+	    c = 0;
+	    for (i = 0; i < 4; i++)
+	    {
+		p++;
+		if (p == pattern.length)
+		    goto Lretc;
+		tc = pattern[p];
+		if ('0' <= tc && tc <= '9')
+		    c = c * 16 + (tc - '0');
+		else if ('a' <= tc && tc <= 'f')
+		    c = c * 16 + (tc - 'a' + 10);
+		else if ('A' <= tc && tc <= 'F')
+		    c = c * 16 + (tc - 'A' + 10);
+		else
+		{
+		    // Not a valid \uXXXX sequence
+		    p -= i;
+		    return 'u';
+		}
+	    }
+	    break;
+
+	default:
+	    break;
+    }
+    p++;
+Lretc:
+    return c;
+}
+
+/* ==================== optimizer ======================= */
+
+void optimize()
+{   ubyte[] prog;
+
+    debug(regexp) printf("RegExp.optimize()\n");
+    prog = buf.toBytes();
+    for (size_t i = 0; 1;)
+    {
+	//printf("\tprog[%d] = %d, %d\n", i, prog[i], REstring);
+	switch (prog[i])
+	{
+	    case REend:
+	    case REanychar:
+	    case REanystar:
+	    case REbackref:
+	    case REeol:
+	    case REchar:
+	    case REichar:
+	    case REdchar:
+	    case REidchar:
+	    case REstring:
+	    case REistring:
+	    case REtestbit:
+	    case REbit:
+	    case REnotbit:
+	    case RErange:
+	    case REnotrange:
+	    case REwordboundary:
+	    case REnotwordboundary:
+	    case REdigit:
+	    case REnotdigit:
+	    case REspace:
+	    case REnotspace:
+	    case REword:
+	    case REnotword:
+		return;
+
+	    case REbol:
+		i++;
+		continue;
+
+	    case REor:
+	    case REnm:
+	    case REnmq:
+	    case REparen:
+	    case REgoto:
+	    {
+		auto bitbuf = new OutBuffer;
+		auto r = new Range(bitbuf);
+		uint offset;
+
+		offset = i;
+		if (starrchars(r, prog[i .. prog.length]))
+		{
+		    debug(regexp) printf("\tfilter built\n");
+		    buf.spread(offset, 1 + 4 + r.maxb);
+		    buf.data[offset] = REtestbit;
+		    (cast(ushort *)&buf.data[offset + 1])[0] = cast(ushort)r.maxc;
+		    (cast(ushort *)&buf.data[offset + 1])[1] = cast(ushort)r.maxb;
+		    i = offset + 1 + 4;
+		    buf.data[i .. i + r.maxb] = r.base[0 .. r.maxb];
+		}
+		return;
+	    }
+	    default:
+		assert(0);
+	}
+    }
+}
+
+/////////////////////////////////////////
+// OR the leading character bits into r.
+// Limit the character range from 0..7F,
+// trymatch() will allow through anything over maxc.
+// Return 1 if success, 0 if we can't build a filter or
+// if there is no point to one.
+
+int starrchars(Range r, ubyte[] prog)
+{   rchar c;
+    uint maxc;
+    uint maxb;
+    uint len;
+    uint b;
+    uint n;
+    uint m;
+    ubyte* pop;
+
+    //printf("RegExp.starrchars(prog = %p, progend = %p)\n", prog, progend);
+    for (size_t i = 0; i < prog.length;)
+    {
+	switch (prog[i])
+	{
+	    case REchar:
+		c = prog[i + 1];
+		if (c <= 0x7F)
+		    r.setbit2(c);
+		return 1;
+
+	    case REichar:
+		c = prog[i + 1];
+		if (c <= 0x7F)
+		{   r.setbit2(c);
+		    r.setbit2(std.ctype.tolower(cast(rchar)c));
+		}
+		return 1;
+
+	    case REdchar:
+	    case REidchar:
+		return 1;
+
+	    case REanychar:
+		return 0;		// no point
+
+	    case REstring:
+		len = *cast(uint *)&prog[i + 1];
+		assert(len);
+		c = *cast(rchar *)&prog[i + 1 + uint.sizeof];
+		debug(regexp) printf("\tREstring %d, '%c'\n", len, c);
+		if (c <= 0x7F)
+		    r.setbit2(c);
+		return 1;
+
+	    case REistring:
+		len = *cast(uint *)&prog[i + 1];
+		assert(len);
+		c = *cast(rchar *)&prog[i + 1 + uint.sizeof];
+		debug(regexp) printf("\tREistring %d, '%c'\n", len, c);
+		if (c <= 0x7F)
+		{   r.setbit2(std.ctype.toupper(cast(rchar)c));
+		    r.setbit2(std.ctype.tolower(cast(rchar)c));
+		}
+		return 1;
+
+	    case REtestbit:
+	    case REbit:
+		maxc = (cast(ushort *)&prog[i + 1])[0];
+		maxb = (cast(ushort *)&prog[i + 1])[1];
+		if (maxc <= 0x7F)
+		    r.setbitmax(maxc);
+		else
+		    maxb = r.maxb;
+		for (b = 0; b < maxb; b++)
+		    r.base[b] |= prog[i + 1 + 4 + b];
+		return 1;
+
+	    case REnotbit:
+		maxc = (cast(ushort *)&prog[i + 1])[0];
+		maxb = (cast(ushort *)&prog[i + 1])[1];
+		if (maxc <= 0x7F)
+		    r.setbitmax(maxc);
+		else
+		    maxb = r.maxb;
+		for (b = 0; b < maxb; b++)
+		    r.base[b] |= ~prog[i + 1 + 4 + b];
+		return 1;
+
+	    case REbol:
+	    case REeol:
+		return 0;
+
+	    case REor:
+		len = (cast(uint *)&prog[i + 1])[0];
+		return starrchars(r, prog[i + 1 + uint.sizeof .. prog.length]) &&
+		       starrchars(r, prog[i + 1 + uint.sizeof + len .. prog.length]);
+
+	    case REgoto:
+		len = (cast(uint *)&prog[i + 1])[0];
+		i += 1 + uint.sizeof + len;
+		break;
+
+	    case REanystar:
+		return 0;
+
+	    case REnm:
+	    case REnmq:
+		// len, n, m, ()
+		len = (cast(uint *)&prog[i + 1])[0];
+		n   = (cast(uint *)&prog[i + 1])[1];
+		m   = (cast(uint *)&prog[i + 1])[2];
+		pop = &prog[i + 1 + uint.sizeof * 3];
+		if (!starrchars(r, pop[0 .. len]))
+		    return 0;
+		if (n)
+		    return 1;
+		i += 1 + uint.sizeof * 3 + len;
+		break;
+
+	    case REparen:
+		// len, ()
+		len = (cast(uint *)&prog[i + 1])[0];
+		n   = (cast(uint *)&prog[i + 1])[1];
+		pop = &prog[0] + i + 1 + uint.sizeof * 2;
+		return starrchars(r, pop[0 .. len]);
+
+	    case REend:
+		return 0;
+
+	    case REwordboundary:
+	    case REnotwordboundary:
+		return 0;
+
+	    case REdigit:
+		r.setbitmax('9');
+		for (c = '0'; c <= '9'; c++)
+		    r.bits[c] = 1;
+		return 1;
+
+	    case REnotdigit:
+		r.setbitmax(0x7F);
+		for (c = 0; c <= '0'; c++)
+		    r.bits[c] = 1;
+		for (c = '9' + 1; c <= r.maxc; c++)
+		    r.bits[c] = 1;
+		return 1;
+
+	    case REspace:
+		r.setbitmax(0x7F);
+		for (c = 0; c <= r.maxc; c++)
+		    if (isspace(c))
+			r.bits[c] = 1;
+		return 1;
+
+	    case REnotspace:
+		r.setbitmax(0x7F);
+		for (c = 0; c <= r.maxc; c++)
+		    if (!isspace(c))
+			r.bits[c] = 1;
+		return 1;
+
+	    case REword:
+		r.setbitmax(0x7F);
+		for (c = 0; c <= r.maxc; c++)
+		    if (isword(cast(rchar)c))
+			r.bits[c] = 1;
+		return 1;
+
+	    case REnotword:
+		r.setbitmax(0x7F);
+		for (c = 0; c <= r.maxc; c++)
+		    if (!isword(cast(rchar)c))
+			r.bits[c] = 1;
+		return 1;
+
+	    case REbackref:
+		return 0;
+
+	    default:
+		assert(0);
+	}
+    }
+    return 1;
+}
+
+/* ==================== replace ======================= */
+
+/***********************
+ * After a match is found with test(), this function
+ * will take the match results and, using the format
+ * string, generate and return a new string.
+ */
+
+public rchar[] replace(rchar[] format)
+{
+    return replace3(format, input, pmatch[0 .. re_nsub + 1]);
+}
+
+// Static version that doesn't require a RegExp object to be created
+
+public static rchar[] replace3(rchar[] format, rchar[] input, regmatch_t[] pmatch)
+{
+    rchar[] result;
+    uint c2;
+    int rm_so;
+    int rm_eo;
+    int i;
+
+//    printf("replace3(format = '%.*s', input = '%.*s')\n", format, input);
+    result.length = format.length;
+    result.length = 0;
+    for (size_t f = 0; f < format.length; f++)
+    {
+	auto c = format[f];
+      L1:
+	if (c != '$')
+	{
+	    result ~= c;
+	    continue;
+	}
+	++f;
+	if (f == format.length)
+	{
+	    result ~= '$';
+	    break;
+	}
+	c = format[f];
+	switch (c)
+	{
+	    case '&':
+		rm_so = pmatch[0].rm_so;
+		rm_eo = pmatch[0].rm_eo;
+		goto Lstring;
+
+	    case '`':
+		rm_so = 0;
+		rm_eo = pmatch[0].rm_so;
+		goto Lstring;
+
+	    case '\'':
+		rm_so = pmatch[0].rm_eo;
+		rm_eo = input.length;
+		goto Lstring;
+
+	    case '0': case '1': case '2': case '3': case '4':
+	    case '5': case '6': case '7': case '8': case '9':
+		i = c - '0';
+		if (f + 1 == format.length)
+		{
+		    if (i == 0)
+		    {
+			result ~= '$';
+			result ~= c;
+			continue;
+		    }
+		}
+		else
+		{
+		    c2 = format[f + 1];
+		    if (c2 >= '0' && c2 <= '9')
+		    {   i = (c - '0') * 10 + (c2 - '0');
+			f++;
+		    }
+		    if (i == 0)
+		    {
+			result ~= '$';
+			result ~= c;
+			c = cast(char)c2;
+			goto L1;
+		    }
+		}
+
+		if (i < pmatch.length)
+		{   rm_so = pmatch[i].rm_so;
+		    rm_eo = pmatch[i].rm_eo;
+		    goto Lstring;
+		}
+		break;
+
+	    Lstring:
+		if (rm_so != rm_eo)
+		    result ~= input[rm_so .. rm_eo];
+		break;
+
+	    default:
+		result ~= '$';
+		result ~= c;
+		break;
+	}
+    }
+    return result;
+}
+
+/************************************
+ * Like replace(char[] format), but uses old style formatting:
+		<table border=1 cellspacing=0 cellpadding=5>
+		<th>Format
+		<th>Description
+		<tr>
+		<td><b>&</b>
+		<td>replace with the match
+		</tr>
+		<tr>
+		<td><b>\</b><i>n</i>
+		<td>replace with the <i>n</i>th parenthesized match, <i>n</i> is 1..9
+		</tr>
+		<tr>
+		<td><b>\</b><i>c</i>
+		<td>replace with char <i>c</i>.
+		</tr>
+		</table>
+ */
+
+public rchar[] replaceOld(rchar[] format)
+{
+    rchar[] result;
+
+//printf("replace: this = %p so = %d, eo = %d\n", this, pmatch[0].rm_so, pmatch[0].rm_eo);
+//printf("3input = '%.*s'\n", input);
+    result.length = format.length;
+    result.length = 0;
+    for (size_t i; i < format.length; i++)
+    {
+	auto c = format[i];
+	switch (c)
+	{
+	    case '&':
+//printf("match = '%.*s'\n", input[pmatch[0].rm_so .. pmatch[0].rm_eo]);
+		result ~= input[pmatch[0].rm_so .. pmatch[0].rm_eo];
+		break;
+
+	    case '\\':
+		if (i + 1 < format.length)
+		{
+		    c = format[++i];
+		    if (c >= '1' && c <= '9')
+		    {   uint j;
+
+			j = c - '0';
+			if (j <= re_nsub && pmatch[j].rm_so != pmatch[j].rm_eo)
+			    result ~= input[pmatch[j].rm_so .. pmatch[j].rm_eo];
+			break;
+		    }
+		}
+		result ~= c;
+		break;
+
+	    default:
+		result ~= c;
+		break;
+	}
+    }
+    return result;
+}
+
+}
+
+unittest
+{   // Created and placed in public domain by Don Clugston
+
+    auto m = search("aBC r s", `bc\x20r[\40]s`, "i");
+    assert(m.pre=="a");
+    assert(m.match(0)=="BC r s");
+    auto m2 = search("7xxyxxx", `^\d([a-z]{2})\D\1`);
+    assert(m2.match(0)=="7xxyxx");
+    // Just check the parsing.
+    auto m3 = search("dcbxx", `ca|b[\d\]\D\s\S\w-\W]`);
+    auto m4 = search("xy", `[^\ca-\xFa\r\n\b\f\t\v\0123]{2,485}$`);
+    auto m5 = search("xxx", `^^\r\n\b{13,}\f{4}\t\v\u02aF3a\w\W`);
+    auto m6 = search("xxy", `.*y`);
+    assert(m6.match(0)=="xxy");
+    auto m7 = search("QWDEfGH", "(ca|b|defg)+", "i");
+    assert(m7.match(0)=="DEfG");
+    auto m8 = search("dcbxx", `a?\B\s\S`);
+    auto m9 = search("dcbxx", `[-w]`);
+    auto m10 = search("dcbsfd", `aB[c-fW]dB|\d|\D|\u012356|\w|\W|\s|\S`, "i");
+    auto m11 = search("dcbsfd", `[]a-]`);
+    m.replaceOld(`a&b\1c`);
+    m.replace(`a$&b$'$1c`);
+}
author	Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date	Mon, 04 Aug 2008 19:28:49 +0200
parents
children	88e23f8c2354