diff options
Diffstat (limited to 'plugins/SmileyAdd/regexp/WCPattern.h')
-rw-r--r-- | plugins/SmileyAdd/regexp/WCPattern.h | 1663 |
1 files changed, 1663 insertions, 0 deletions
diff --git a/plugins/SmileyAdd/regexp/WCPattern.h b/plugins/SmileyAdd/regexp/WCPattern.h new file mode 100644 index 0000000000..3d52a7fd2e --- /dev/null +++ b/plugins/SmileyAdd/regexp/WCPattern.h @@ -0,0 +1,1663 @@ +#ifndef __WCPATTERN_H__
+#define __WCPATTERN_H__
+
+#ifdef _WIN32
+ #pragma warning(disable:4786)
+#endif
+
+#include "bkstring.h"
+
+#include <vector>
+#include <map>
+
+class WCMatcher;
+class NFAUNode;
+class NFAQuantifierUNode;
+
+/**
+ This pattern class is very similar in functionality to Java's
+ java.util.regex.WCPattern class. The pattern class represents an immutable
+ regular expression object. Instead of having a single object contain both the
+ regular expression object and the matching object, instead the two objects are
+ split apart. The {@link WCMatcher WCMatcher} class represents the maching
+ object.
+
+ The WCPattern class works primarily off of "compiled" patterns. A typical
+ instantiation of a regular expression looks like:
+
+ <pre>
+ WCPattern * p = WCPattern::compile(L"a*b");
+ WCMatcher * m = p->createWCMatcher(L"aaaaaab");
+ if (m->matches()) ...
+ </pre>
+
+ However, if you do not need to use a pattern more than once, it is often times
+ okay to use the WCPattern's static methods insteads. An example looks like this:
+
+ <pre>
+ if (WCPattern::matches(L"a*b", L"aaaab")) { ... }
+ </pre>
+
+ This class does not currently support unicode. The unicode update for this
+ class is coming soon.
+
+ This class is partially immutable. It is completely safe to call createWCMatcher
+ concurrently in different threads, but the other functions (e.g. split) should
+ not be called concurrently on the same <code>WCPattern</code>.
+
+ <table border="0" cellpadding="1" cellspacing="0">
+ <tr align="left" bgcolor="#CCCCFF">
+ <td>
+ <b>Construct</b>
+ </td>
+ <td>
+ <b>Matches</b>
+ </th>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Characters</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x</i></code>
+ </td>
+ <td>
+ The character <code><i>x</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\\</code>
+ </td>
+ <td>
+ The character <code>\</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\0<i>nn</i></code>
+ </td>
+ <td>
+ The character with octal ASCII value <code><i>nn</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\0<i>nnn</i></code>
+ </td>
+ <td>
+ The character with octal ASCII value <code><i>nnn</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\x<i>hh</i></code>
+ </td>
+ <td>
+ The character with hexadecimal ASCII value <code><i>hh</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\t</code>
+ </td>
+ <td>
+ A tab character
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\r</code>
+ </td>
+ <td>
+ A carriage return character
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\n</code>
+ </td>
+ <td>
+ A new-line character
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <b>Character Classes</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[abc]</code>
+ </td>
+ <td>
+ Either <code>a</code>, <code>b</code>, or <code>c</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[^abc]</code>
+ </td>
+ <td>
+ Any character but <code>a</code>, <code>b</code>, or <code>c</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-zA-Z]</code>
+ </td>
+ <td>
+ Any character ranging from <code>a</code> thru <code>z</code>, or
+ <code>A</code> thru <code>Z</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[^a-zA-Z]</code>
+ </td>
+ <td>
+ Any character except those ranging from <code>a</code> thru
+ <code>z</code>, or <code>A</code> thru <code>Z</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a\-z]</code>
+ </td>
+ <td>
+ Either <code>a</code>, <code>-</code>, or <code>z</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-z[A-Z]]</code>
+ </td>
+ <td>
+ Same as <code>[a-zA-Z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-z&&[g-i]]</code>
+ </td>
+ <td>
+ Any character in the intersection of <code>a-z</code> and
+ <code>g-i</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-z&&[^g-i]]</code>
+ </td>
+ <td>
+ Any character in <code>a-z</code> and not in <code>g-i</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Prefefined character classes</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><b>.</b></code>
+ </td>
+ <td>
+ Any character. Multiline matching must be compiled into the pattern for
+ <code><b>.</b></code> to match a <code>\r</code> or a <code>\n</code>.
+ Even if multiline matching is enabled, <code><b>.</b></code> will not
+ match a <code>\r\n</code>, only a <code>\r</code> or a <code>\n</code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\d</code>
+ </td>
+ <td>
+ <code>[0-9]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\D</code>
+ </td>
+ <td>
+ <code>[^\d]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\s</code>
+ </td>
+ <td>
+ <code>[ \t\r\n\x0B]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\S</code>
+ </td>
+ <td>
+ <code>[^\s]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\w</code>
+ </td>
+ <td>
+ <code>[a-zA-Z0-9_]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\W</code>
+ </td>
+ <td>
+ <code>[^\w]</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>POSIX character classes
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Lower}</code>
+ </td>
+ <td>
+ <code>[a-z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Upper}</code>
+ </td>
+ <td>
+ <code>[A-Z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{ASCII}</code>
+ </td>
+ <td>
+ <code>[\x00-\x7F]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Alpha}</code>
+ </td>
+ <td>
+ <code>[a-zA-Z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Digit}</code>
+ </td>
+ <td>
+ <code>[0-9]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Alnum}</code>
+ </td>
+ <td>
+ <code>[\w&&[^_]]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Punct}</code>
+ </td>
+ <td>
+ <code>[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{XDigit}</code>
+ </td>
+ <td>
+ <code>[a-fA-F0-9]</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Boundary Matches</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>^</code>
+ </td>
+ <td>
+ The beginning of a line. Also matches the beginning of input.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>$</code>
+ </td>
+ <td>
+ The end of a line. Also matches the end of input.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\b</code>
+ </td>
+ <td>
+ A word boundary
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\B</code>
+ </td>
+ <td>
+ A non word boundary
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\A</code>
+ </td>
+ <td>
+ The beginning of input
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\G</code>
+ </td>
+ <td>
+ The end of the previous match. Ensures that a "next" match will only
+ happen if it begins with the character immediately following the end of
+ the "current" match.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\Z</code>
+ </td>
+ <td>
+ The end of input. Will also match if there is a single trailing
+ <code>\r\n</code>, a single trailing <code>\r</code>, or a single
+ trailing <code>\n</code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\z</code>
+ </td>
+ <td>
+ The end of input
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Greedy Quantifiers</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x?</i></code>
+ </td>
+ <td>
+ <i>x</i>, either zero times or one time
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x*</i></code>
+ </td>
+ <td>
+ <i>x</i>, zero or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x+</i></code>
+ </td>
+ <td>
+ <i>x</i>, one or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n}</i></code>
+ </td>
+ <td>
+ <i>x</i>, exactly n times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,}</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{,m}</i></code>
+ </td>
+ <td>
+ <i>x</i>, at most <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,m}</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times and at most
+ <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Possessive Quantifiers</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x?+</i></code>
+ </td>
+ <td>
+ <i>x</i>, either zero times or one time
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x*+</i></code>
+ </td>
+ <td>
+ <i>x</i>, zero or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x++</i></code>
+ </td>
+ <td>
+ <i>x</i>, one or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, exactly n times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{,m}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, at most <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,m}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times and at most
+ <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Reluctant Quantifiers</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x??</i></code>
+ </td>
+ <td>
+ <i>x</i>, either zero times or one time
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x*?</i></code>
+ </td>
+ <td>
+ <i>x</i>, zero or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x+?</i></code>
+ </td>
+ <td>
+ <i>x</i>, one or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, exactly n times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{,m}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, at most <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,m}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times and at most
+ <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Operators</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>xy</i></code>
+ </td>
+ <td>
+ <code><i>x</i></code> then <code><i>y</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x</i></code>|<code><i>y</i></code>
+ </td>
+ <td>
+ <code><i>x</i></code> or <code><i>y</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code> as a capturing group
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Quoting</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\Q</code>
+ </td>
+ <td>
+ Nothing, but treat every character (including \s) literally until a
+ matching <code>\E</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\E</code>
+ </td>
+ <td>
+ Nothing, but ends its matching <code>\Q</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Special Constructs</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?:<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, but not as a capturing group
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?=<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via positive lookahead. This means that the
+ expression will match only if it is trailed by <code><i>x</i></code>.
+ It will not "eat" any of the characters matched by
+ <code><i>x</i></code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?!<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via negative lookahead. This means that the
+ expression will match only if it is not trailed by
+ <code><i>x</i></code>. It will not "eat" any of the characters
+ matched by <code><i>x</i></code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?<=<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via positive lookbehind. <code><i>x</i></code>
+ cannot contain any quantifiers.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?<!<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via negative lookbehind. <code><i>x</i></code>
+ cannot contain any quantifiers.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?><i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i>{1}+</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Registered Expression Matching</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>{<i>x</i>}</code>
+ </td>
+ <td>
+ The registered pattern <code><i>x</i></code>
+ </td>
+ </tr>
+ </table>
+
+ <hr>
+
+ <i>Begin Text Extracted And Modified From java.util.regex.WCPattern documentation</i>
+
+ <h4> Backslashes, escapes, and quoting </h4>
+
+ <p> The backslash character (<tt>(wchar_t)'\'</tt>) serves to introduce escaped
+ constructs, as defined in the table above, as well as to quote characters
+ that otherwise would be interpreted as unescaped constructs. Thus the
+ expression <tt>\\</tt> matches a single backslash and <tt>\{</tt> matches a
+ left brace.
+
+ <p> It is an error to use a backslash prior to any alphabetic character that
+ does not denote an escaped construct; these are reserved for future
+ extensions to the regular-expression language. A backslash may be used
+ prior to a non-alphabetic character regardless of whether that character is
+ part of an unescaped construct.
+
+ <p>It is necessary to double backslashes in string literals that represent
+ regular expressions to protect them from interpretation by a compiler. The
+ string literal <tt>"\b"</tt>, for example, matches a single backspace
+ character when interpreted as a regular expression, while
+ <tt>"\\b"</tt> matches a word boundary. The string litera
+ <tt>"\(hello\)"</tt> is illegal and leads to a compile-time error;
+ in order to match the string <tt>(hello)</tt> the string literal
+ <tt>"\\(hello\\)"</tt> must be used.
+
+ <h4> Character Classes </h4>
+
+ <p> Character classes may appear within other character classes, and
+ may be composed by the union operator (implicit) and the intersection
+ operator (<tt>&&</tt>).
+ The union operator denotes a class that contains every character that is
+ in at least one of its operand classes. The intersection operator
+ denotes a class that contains every character that is in both of its
+ operand classes.
+
+ <p> The precedence of character-class operators is as follows, from
+ highest to lowest:
+
+ <blockquote><table border="0" cellpadding="1" cellspacing="0"
+ summary="Precedence of character class operators.">
+
+ <tr><th>1 </th>
+ <td>Literal escape </td>
+ <td><tt>\x</tt></td></tr>
+ <tr><th>2 </th>
+ <td>Range</td>
+ <td><tt>a-z</tt></td></tr>
+ <tr><th>3 </th>
+ <td>Grouping</td>
+ <td><tt>[...]</tt></td></tr>
+ <tr><th>4 </th>
+ <td>Intersection</td>
+ <td><tt>[a-z&&[aeiou]]</tt></td></tr>
+ <tr><th>5 </th>
+ <td>Union</td>
+ <td><tt>[a-e][i-u]<tt></td></tr>
+ </table></blockquote>
+
+ <p> Note that a different set of metacharacters are in effect inside
+ a character class than outside a character class. For instance, the
+ regular expression <tt>.</tt> loses its special meaning inside a
+ character class, while the expression <tt>-</tt> becomes a range
+ forming metacharacter.
+
+ <a name="lt">
+
+ <a name="cg">
+ <h4> Groups and capturing </h4>
+
+ <p> Capturing groups are numbered by counting their opening parentheses from
+ left to right. In the expression <tt>((A)(B(C)))</tt>, for example, there
+ are four such groups: </p>
+
+ <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
+
+ <tr><th>1 </th>
+ <td><tt>((A)(B(C)))</tt></td></tr>
+ <tr><th>2 </th>
+ <td><tt>(A)</tt></td></tr>
+ <tr><th>3 </th>
+ <td><tt>(B(C))</tt></td></tr>
+
+ <tr><th>4 </th>
+ <td><tt>(C)</tt></td></tr>
+ </table></blockquote>
+
+ <p> Group zero always stands for the entire expression.
+
+ <p> Capturing groups are so named because, during a match, each subsequence
+ of the input sequence that matches such a group is saved. The captured
+ subsequence may be used later in the expression, via a back reference, and
+ may also be retrieved from the matcher once the match operation is complete.
+
+ <p> The captured input associated with a group is always the subsequence
+ that the group most recently matched. If a group is evaluated a second time
+ because of quantification then its previously-captured value, if any, will
+ be retained if the second evaluation fails. Matching the string
+ <tt>L"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
+ group two set to <tt>L"b"</tt>. All captured input is discarded at the
+ beginning of each match.
+
+ <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
+ that do not capture text and do not count towards the group total.
+
+
+ <h4> WC support </h4>
+
+ <p> Coming Soon.
+
+ <h4> Comparison to Perl 5 </h4>
+
+ <p>The <code>WCPattern</code> engine performs traditional NFA-based matching
+ with ordered alternation as occurs in Perl 5.
+
+ <p> Perl constructs not supported by this class: </p>
+
+ <ul>
+
+ <li><p> The conditional constructs <tt>(?{</tt><i>X</i><tt>})</tt> and
+ <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,
+ </p></li>
+
+ <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>
+ and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>
+
+ <li><p> The embedded comment syntax <tt>(?#comment)</tt>, and </p></li>
+
+ <li><p> The preprocessing operations <tt>\l</tt> <tt>\u</tt>,
+ <tt>\L</tt>, and <tt>\U</tt>. </p></li>
+
+ <li><p> Embedded flags</p></li>
+
+ </ul>
+
+ <p> Constructs supported by this class but not by Perl: </p>
+
+ <ul>
+
+ <li><p> Possessive quantifiers, which greedily match as much as they can
+ and do not back off, even when doing so would allow the overall match to
+ succeed. </p></li>
+
+ <li><p> Character-class union and intersection as described
+ above.</p></li>
+
+ </ul>
+
+ <p> Notable differences from Perl: </p>
+
+ <ul>
+
+ <li><p> In Perl, <tt>\1</tt> through <tt>\9</tt> are always interpreted
+ as back references; a backslash-escaped number greater than <tt>9</tt> is
+ treated as a back reference if at least that many subexpressions exist,
+ otherwise it is interpreted, if possible, as an octal escape. In this
+ class octal escapes must always begin with a zero. In this class,
+ <tt>\1</tt> through <tt>\9</tt> are always interpreted as back
+ references, and a larger number is accepted as a back reference if at
+ least that many subexpressions exist at that point in the regular
+ expression, otherwise the parser will drop digits until the number is
+ smaller or equal to the existing number of groups or it is one digit.
+ </p></li>
+
+ <li><p> Perl uses the <tt>g</tt> flag to request a match that resumes
+ where the last match left off. This functionality is provided implicitly
+ by the <CODE>WCMatcher</CODE> class: Repeated invocations of the
+ <code>find</code> method will resume where the last match left off,
+ unless the matcher is reset. </p></li>
+
+ <li><p> Perl is forgiving about malformed matching constructs, as in the
+ expression <tt>*a</tt>, as well as dangling brackets, as in the
+ expression <tt>abc]</tt>, and treats them as literals. This
+ class also strict and will not compile a pattern when dangling characters
+ are encountered.</p></li>
+
+ </ul>
+
+
+ <p> For a more precise description of the behavior of regular expression
+ constructs, please see <a href="http://www.oreilly.com/catalog/regex2/">
+ <i>Mastering Regular Expressions, 2nd Edition</i>, Jeffrey E. F. Friedl,
+ O'Reilly and Associates, 2002.</a>
+ </p>
+ <P>
+
+ <i>End Text Extracted And Modified From java.util.regex.WCPattern documentation</i>
+
+ <hr>
+
+ @author Jeffery Stuart
+ @since March 2003, Stable Since November 2004
+ @version 1.07.00
+ @memo A class used to represent "PERL 5"-ish regular expressions
+ */
+class WCPattern
+{
+ friend class WCMatcher;
+ friend class NFAUNode;
+ friend class NFAQuantifierUNode;
+ private:
+ /**
+ This constructor should not be called directly. Those wishing to use the
+ WCPattern class should instead use the {@link compile compile} method.
+
+ @param rhs The pattern to compile
+ @memo Creates a new pattern from the regular expression in <code>rhs</code>.
+ */
+ WCPattern(const bkstring & rhs);
+ protected:
+ /**
+ This currently is not used, so don't try to do anything with it.
+ @memo Holds all the compiled patterns for quick access.
+ */
+ static std::map<bkstring, WCPattern *> compiledWCPatterns;
+ /**
+ Holds all of the registered patterns as strings. Due to certain problems
+ with compilation of patterns, especially with capturing groups, this seemed
+ to be the best way to do it.
+ */
+ static std::map<bkstring, std::pair<bkstring, unsigned long> > registeredWCPatterns;
+ protected:
+ /**
+ Holds all the NFA nodes used. This makes deletion of a pattern, as well as
+ clean-up from an unsuccessful compile much easier and faster.
+ */
+ std::map<NFAUNode*, bool> nodes;
+ /**
+ Used when methods like split are called. The matcher class uses a lot of
+ dynamic memeory, so having an instance increases speedup of certain
+ operations.
+ */
+ WCMatcher * matcher;
+ /**
+ The front node of the NFA.
+ */
+ NFAUNode * head;
+ /**
+ The actual regular expression we rerpesent
+ */
+ bkstring pattern;
+ /**
+ Flag used during compilation. Once the pattern is successfully compiled,
+ <code>error</code> is no longer used.
+ */
+ bool error;
+ /**
+ Used during compilation to keep track of the current index into
+ <code>{@link pattern pattern}<code>. Once the pattern is successfully
+ compiled, <code>error</code> is no longer used.
+ */
+ int curInd;
+ /**
+ The number of capture groups this contains.
+ */
+ int groupCount;
+ /**
+ The number of non-capture groups this contains.
+ */
+ int nonCapGroupCount;
+ /**
+ The flags specified when this was compiled.
+ */
+ unsigned long flags;
+ protected:
+ /**
+ Raises an error during compilation. Compilation will cease at that point
+ and compile will return <code>NULL</code>.
+ */
+ void raiseError();
+ /**
+ Convenience function for registering a node in <code>nodes</code>.
+ @param node The node to register
+ @return The registered node
+ */
+ NFAUNode * registerNode(NFAUNode * node);
+
+ /**
+ Calculates the union of two strings. This function will first sort the
+ strings and then use a simple selection algorithm to find the union.
+ @param s1 The first "class" to union
+ @param s2 The second "class" to union
+ @return A new string containing all unique characters. Each character
+ must have appeared in one or both of <code>s1</code> and
+ <code>s2</code>.
+ */
+ bkstring classUnion (bkstring s1, bkstring s2) const;
+ /**
+ Calculates the intersection of two strings. This function will first sort
+ the strings and then use a simple selection algorithm to find the
+ intersection.
+ @param s1 The first "class" to intersect
+ @param s2 The second "class" to intersect
+ @return A new string containing all unique characters. Each character
+ must have appeared both <code>s1</code> and <code>s2</code>.
+ */
+ bkstring classIntersect (bkstring s1, bkstring s2) const;
+ /**
+ Calculates the negation of a string. The negation is the set of all
+ characters between <code>\x00</code> and <code>\xFF</code> not
+ contained in <code>s1</code>.
+ @param s1 The "class" to be negated.
+ @param s2 The second "class" to intersect
+ @return A new string containing all unique characters. Each character
+ must have appeared both <code>s1</code> and <code>s2</code>.
+ */
+ bkstring classNegate (bkstring s1) const;
+ /**
+ Creates a new "class" representing the range from <code>low</code> thru
+ <code>hi</code>. This function will wrap if <code>low</code> >
+ <code>hi</code>. This is a feature, not a buf. Sometimes it is useful
+ to be able to say [\x70-\x10] instead of [\x70-\x7F\x00-\x10].
+ @param low The beginning character
+ @param hi The ending character
+ @return A new string containing all the characters from low thru hi.
+ */
+ bkstring classCreateRange(wchar_t low, wchar_t hi) const;
+
+ /**
+ Extracts a decimal number from the substring of member-variable
+ <code>{@link pattern pattern}<code> starting at <code>start</code> and
+ ending at <code>end</code>.
+ @param start The starting index in <code>{@link pattern pattern}<code>
+ @param end The last index in <code>{@link pattern pattern}<code>
+ @return The decimal number in <code>{@link pattern pattern}<code>
+ */
+ int getInt(int start, int end);
+ /**
+ Parses a <code>{n,m}</code> string out of the member-variable
+ <code>{@link pattern pattern}<code> stores the result in <code>sNum</code>
+ and <code>eNum</code>.
+ @param sNum Output parameter. The minimum number of matches required
+ by the curly quantifier are stored here.
+ @param eNum Output parameter. The maximum number of matches allowed
+ by the curly quantifier are stored here.
+ @return Success/Failure. Fails when the curly does not have the proper
+ syntax
+ */
+ bool quantifyCurly(int & sNum, int & eNum);
+ /**
+ Tries to quantify the currently parsed group. If the group being parsed
+ is indeed quantified in the member-variable
+ <code>{@link pattern pattern}<code>, then the NFA is modified accordingly.
+ @param start The starting node of the current group being parsed
+ @param stop The ending node of the current group being parsed
+ @param gn The group number of the current group being parsed
+ @return The node representing the starting node of the group. If the
+ group becomes quantified, then this node is not necessarily
+ a GroupHead node.
+ */
+ NFAUNode * quantifyGroup(NFAUNode * start, NFAUNode * stop, const int gn);
+
+ /**
+ Tries to quantify the last parsed expression. If the character was indeed
+ quantified, then the NFA is modified accordingly.
+ @param newNode The recently created expression node
+ @return The node representing the last parsed expression. If the
+ expression was quantified, <code>return value != newNode</code>
+ */
+ NFAUNode * quantify(NFAUNode * newNode);
+ /**
+ Parses the current class being examined in
+ <code>{@link pattern pattern}</code>.
+ @return A string of unique characters contained in the current class being
+ parsed
+ */
+ bkstring parseClass();
+ /**
+ Parses the current POSIX class being examined in
+ <code>{@link pattern pattern}</code>.
+ @return A string of unique characters representing the POSIX class being
+ parsed
+ */
+ bkstring parsePosix();
+ /**
+ Returns a string containing the octal character being parsed
+ @return The string contained the octal value being parsed
+ */
+ bkstring parseOctal();
+ /**
+ Returns a string containing the hex character being parsed
+ @return The string contained the hex value being parsed
+ */
+ bkstring parseHex();
+ /**
+ Returns a new node representing the back reference being parsed
+ @return The new node representing the back reference being parsed
+ */
+ NFAUNode * parseBackref();
+ /**
+ Parses the escape sequence currently being examined. Determines if the
+ escape sequence is a class, a single character, or the beginning of a
+ quotation sequence.
+ @param inv Output parameter. Whether or not to invert the returned class
+ @param quo Output parameter. Whether or not this sequence starts a
+ quotation.
+ @return The characters represented by the class
+ */
+ bkstring parseEscape(bool & inv, bool & quo);
+ /**
+ Parses a supposed registered pattern currently under compilation. If the
+ sequence of characters does point to a registered pattern, then the
+ registered pattern is appended to <code>*end<code>. The registered pattern
+ is parsed with the current compilation flags.
+ @param end The ending node of the thus-far compiled pattern
+ @return The new end node of the current pattern
+ */
+ NFAUNode * parseRegisteredWCPattern(NFAUNode ** end);
+ /**
+ Parses a lookbehind expression. Appends the necessary nodes
+ <code>*end</code>.
+ @param pos Positive or negative look behind
+ @param end The ending node of the current pattern
+ @return The new end node of the current pattern
+ */
+ NFAUNode * parseBehind(const bool pos, NFAUNode ** end);
+ /**
+ Parses the current expression and tacks on nodes until a \E is found.
+ @return The end of the current pattern
+ */
+ NFAUNode * parseQuote();
+ /**
+ Parses <code>{@link pattern pattern}</code>. This function is called
+ recursively when an or (<code>|</code>) or a group is encountered.
+ @param inParen Are we currently parsing inside a group
+ @param inOr Are we currently parsing one side of an or (<code>|</code>)
+ @param end The end of the current expression
+ @return The starting node of the NFA constructed from this parse
+ */
+ NFAUNode * parse(const bool inParen = 0, const bool inOr = 0, NFAUNode ** end = NULL);
+ public:
+ /// We should match regardless of case
+ const static unsigned long CASE_INSENSITIVE;
+ /// We are implicitly quoted
+ const static unsigned long LITERAL;
+ /// @memo We should treat a <code><b>.</b></code> as [\x00-\x7F]
+ const static unsigned long DOT_MATCHES_ALL;
+ /** <code>^</code> and <code>$</code> should anchor to the beginning and
+ ending of lines, not all input
+ */
+ const static unsigned long MULTILINE_MATCHING;
+ /** When enabled, only instances of <code>\n</codes> are recognized as
+ line terminators
+ */
+ const static unsigned long UNIX_LINE_MODE;
+ /// The absolute minimum number of matches a quantifier can match (0)
+ const static int MIN_QMATCH;
+ /// The absolute maximum number of matches a quantifier can match (0x7FFFFFFF)
+ const static int MAX_QMATCH;
+ public:
+ /**
+ Call this function to compile a regular expression into a
+ <code>WCPattern</code> object. Special values can be assigned to
+ <code>mode</code> when certain non-standard behaviors are expected from
+ the <code>WCPattern</code> object.
+ @param pattern The regular expression to compile
+ @param mode A bitwise or of flags signalling what special behaviors are
+ wanted from this <code>WCPattern</code> object
+ @return If successful, <code>compile</code> returns a <code>WCPattern</code>
+ pointer. Upon failure, <code>compile</code> returns
+ <code>NULL</code>
+ */
+ static WCPattern * compile (const bkstring & pattern,
+ const unsigned long mode = 0);
+ /**
+ Dont use this function. This function will compile a pattern, and cache
+ the result. This will eventually be used as an optimization when people
+ just want to call static methods using the same pattern over and over
+ instead of first compiling the pattern and then using the compiled
+ instance for matching.
+ @param pattern The regular expression to compile
+ @param mode A bitwise or of flags signalling what special behaviors are
+ wanted from this <code>WCPattern</code> object
+ @return If successful, <code>compileAndKeep</code> returns a
+ <code>WCPattern</code> pointer. Upon failure, <code>compile</code>
+ returns <code>NULL</code>.
+ */
+ static WCPattern * compileAndKeep (const bkstring & pattern,
+ const unsigned long mode = 0);
+
+ /**
+ Searches through <code>replace</code> and replaces all substrings matched
+ by <code>pattern</code> with <code>str</code>. <code>str</code> may
+ contain backreferences (e.g. <code>\1</code>) to capture groups. A typical
+ invocation looks like:
+ <p>
+ <code>
+ WCPattern::replace(L"(a+)b(c+)", L"abcccbbabcbabc", L"\\2b\\1");
+ </code>
+ <p>
+ which would replace <code>abcccbbabcbabc</code> with
+ <code>cccbabbcbabcba</code>.
+ @param pattern The regular expression
+ @param str The replacement text
+ @param replacementText The string in which to perform replacements
+ @param mode The special mode requested of the <code>WCPattern</code>
+ during the replacement process
+ @return The text with the replacement string substituted where necessary
+ */
+ static bkstring replace (const bkstring & pattern,
+ const bkstring & str,
+ const bkstring & replacementText,
+ const unsigned long mode = 0);
+
+ /**
+ Splits the specified string over occurrences of the specified pattern.
+ Empty strings can be optionally ignored. The number of strings returned is
+ configurable. A typical invocation looks like:
+ <p>
+ <code>
+ bkstring str(strSize, 0);<br>
+ FILE * fp = fopen(fileName, "r");<br>
+ fread((char*)str.data(), strSize * 2, 1, fp);<br>
+ fclose(fp);<br>
+ <br>
+ std::vector<bkstring> lines = WCPattern::split(L"[\r\n]+", str, true);<br>
+ <br>
+ </code>
+
+ @param pattern The regular expression
+ @param replace The string to split
+ @param keepEmptys Whether or not to keep empty strings
+ @param limit The maximum number of splits to make
+ @param mode The special mode requested of the <code>WCPattern</code>
+ during the split process
+ @return All substrings of <code>str</code> split across <code>pattern</code>.
+ */
+ static std::vector<bkstring> split (const bkstring & pattern,
+ const bkstring & str,
+ const bool keepEmptys = 0,
+ const unsigned long limit = 0,
+ const unsigned long mode = 0);
+
+ /**
+ Finds all the instances of the specified pattern within the string. You
+ should be careful to only pass patterns with a minimum length of one. For
+ example, the pattern <code>a*</code> can be matched by an empty string, so
+ instead you should pass <code>a+</code> since at least one character must
+ be matched. A typical invocation of <code>findAll</code> looks like:
+ <p>
+ <code>
+ std::vector<td::string> numbers = WCPattern::findAll(L"\\d+", string);
+ </code>
+ <p>
+
+ @param pattern The pattern for which to search
+ @param str The string to search
+ @param mode The special mode requested of the <code>WCPattern</code>
+ during the find process
+ @return All instances of <code>pattern</code> in <code>str</code>
+ */
+ static std::vector<bkstring> findAll (const bkstring & pattern,
+ const bkstring & str,
+ const unsigned long mode = 0);
+
+ /**
+ Determines if an entire string matches the specified pattern
+
+ @param pattern The pattern for to match
+ @param str The string to match
+ @param mode The special mode requested of the <code>WCPattern</code>
+ during the replacement process
+ @return True if <code>str</code> is recognized by <code>pattern</code>
+ */
+ static bool matches (const bkstring & pattern,
+ const bkstring & str,
+ const unsigned long mode = 0);
+
+ /**
+ Registers a pattern under a specific name for use in later compilations.
+ A typical invocation and later use looks like:
+ <p>
+ <code>
+ WCPattern::registerWCPattern(L"ip", L"(?:\\d{1,3}\\.){3}\\d{1,3}");<br>
+ WCPattern * p1 = WCPattern::compile(L"{ip}:\\d+");<br>
+ WCPattern * p2 = WCPattern::compile(L"Connection from ({ip}) on port \\d+");<br>
+ </code>
+ <p>
+ Multiple calls to <code>registerWCPattern</code> with the same
+ <code>name</code> will result in the pattern getting overwritten.
+
+ @param name The name to give to the pattern
+ @param pattern The pattern to register
+ @param mode Any special flags to use when compiling pattern
+ @return Success/Failure. Fails only if <code>pattern</code> has invalid
+ syntax
+ */
+ static bool registerWCPattern(const bkstring & name,
+ const bkstring & pattern,
+ const unsigned long mode = 0);
+
+ /**
+ Clears the pattern registry
+ */
+ static void unregisterWCPatterns();
+ /**
+ Don't use
+ */
+ static void clearWCPatternCache();
+
+ /**
+ Searches through a string for the <code>n<sup>th</sup></code> match of the
+ given pattern in the string. Match indeces start at zero, not one.
+ A typical invocation looks like this:
+ <p>
+ <code>
+ std::pair<bkstring, int> match = WCPattern::findNthMatch(L"\\d{1,3}", L"192.168.1.101:22", 1);<br>
+ wprintf(L"%s %i\n", match.first.c_str(), match.second);<br>
+ <br>
+ Output: 168 4<br>
+ <br>
+
+ @param pattern The pattern for which to search
+ @param str The string to search
+ @param matchNum Which match to find
+ @param mode Any special flags to use during the matching process
+ @return A string and an integer. The string is the string matched. The
+ integer is the starting location of the matched string in
+ <code>str</code>. You can check for success/failure by making sure
+ that the integer returned is greater than or equal to zero.
+ */
+ static std::pair<bkstring, int> findNthMatch (const bkstring & pattern,
+ const bkstring & str,
+ const int matchNum,
+ const unsigned long mode = 0);
+ public:
+ /**
+ Deletes all NFA nodes allocated during compilation
+ */
+ ~WCPattern();
+
+ bkstring replace (const bkstring & str,
+ const bkstring & replacementText);
+ std::vector<bkstring> split (const bkstring & str, const bool keepEmptys = 0,
+ const unsigned long limit = 0);
+ std::vector<bkstring> findAll (const bkstring & str);
+ bool matches (const bkstring & str);
+ /**
+ Returns the flags used during compilation of this pattern
+ @return The flags used during compilation of this pattern
+ */
+ unsigned long getFlags () const;
+ /**
+ Returns the regular expression this pattern represents
+ @return The regular expression this pattern represents
+ */
+ bkstring getWCPattern () const;
+ /**
+ Creates a matcher object using the specified string and this pattern.
+ @param str The string to match against
+ @return A new matcher using object using this pattern and the specified
+ string
+ */
+ WCMatcher * createWCMatcher (const bkstring & str);
+};
+
+class NFAUNode
+{
+ friend class WCMatcher;
+ public:
+ NFAUNode * next;
+ NFAUNode();
+ virtual ~NFAUNode();
+ virtual void findAllNodes(std::map<NFAUNode*, bool> & soFar);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const = 0;
+ inline virtual bool isGroupHeadNode() const { return false; }
+ inline virtual bool isStartOfInputNode() const { return false; }
+};
+class NFACharUNode : public NFAUNode
+{
+ protected:
+ wchar_t ch;
+ public:
+ NFACharUNode(const wchar_t c);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFACICharUNode : public NFAUNode
+{
+ protected:
+ wchar_t ch;
+ public:
+ NFACICharUNode(const wchar_t c);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAStartUNode : public NFAUNode
+{
+ public:
+ NFAStartUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAEndUNode : public NFAUNode
+{
+ public:
+ NFAEndUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAQuantifierUNode : public NFAUNode
+{
+ public:
+ int min, max;
+ NFAUNode * inner;
+ virtual void findAllNodes(std::map<NFAUNode*, bool> & soFar);
+ NFAQuantifierUNode(WCPattern * pat, NFAUNode * internal,
+ const int minMatch = WCPattern::MIN_QMATCH,
+ const int maxMatch = WCPattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGreedyQuantifierUNode : public NFAQuantifierUNode
+{
+ public:
+ NFAGreedyQuantifierUNode(WCPattern * pat, NFAUNode * internal,
+ const int minMatch = WCPattern::MIN_QMATCH,
+ const int maxMatch = WCPattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ virtual int matchInternal(const bkstring & str, WCMatcher * matcher, const int curInd, const int soFar) const;
+};
+class NFALazyQuantifierUNode : public NFAQuantifierUNode
+{
+ public:
+ NFALazyQuantifierUNode(WCPattern * pat, NFAUNode * internal,
+ const int minMatch = WCPattern::MIN_QMATCH,
+ const int maxMatch = WCPattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAPossessiveQuantifierUNode : public NFAQuantifierUNode
+{
+ public:
+ NFAPossessiveQuantifierUNode(WCPattern * pat, NFAUNode * internal,
+ const int minMatch = WCPattern::MIN_QMATCH,
+ const int maxMatch = WCPattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAAcceptUNode : public NFAUNode
+{
+ public:
+ NFAAcceptUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAClassUNode : public NFAUNode
+{
+ public:
+ bool inv;
+ std::map<wchar_t, bool> vals;
+ NFAClassUNode(const bool invert = 0);
+ NFAClassUNode(const bkstring & clazz, const bool invert);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFACIClassUNode : public NFAUNode
+{
+ public:
+ bool inv;
+ std::map<wchar_t, bool> vals;
+ NFACIClassUNode(const bool invert = 0);
+ NFACIClassUNode(const bkstring & clazz, const bool invert);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFASubStartUNode : public NFAUNode
+{
+ public:
+ NFASubStartUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAOrUNode : public NFAUNode
+{
+ public:
+ NFAUNode * one;
+ NFAUNode * two;
+ NFAOrUNode(NFAUNode * first, NFAUNode * second);
+ virtual void findAllNodes(std::map<NFAUNode*, bool> & soFar);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAQuoteUNode : public NFAUNode
+{
+ public:
+ bkstring qStr;
+ NFAQuoteUNode(const bkstring & quoted);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFACIQuoteUNode : public NFAUNode
+{
+ public:
+ bkstring qStr;
+ NFACIQuoteUNode(const bkstring & quoted);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFALookAheadUNode : public NFAUNode
+{
+ public:
+ bool pos;
+ NFAUNode * inner;
+ NFALookAheadUNode(NFAUNode * internal, const bool positive);
+ virtual void findAllNodes(std::map<NFAUNode*, bool> & soFar);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFALookBehindUNode : public NFAUNode
+{
+ public:
+ bool pos;
+ bkstring mStr;
+ NFALookBehindUNode(const bkstring & str, const bool positive);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAStartOfLineUNode : public NFAUNode
+{
+ public:
+ NFAStartOfLineUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAEndOfLineUNode : public NFAUNode
+{
+ public:
+ NFAEndOfLineUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAReferenceUNode : public NFAUNode
+{
+ public:
+ int gi;
+ NFAReferenceUNode(const int groupIndex);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAStartOfInputUNode : public NFAUNode
+{
+ public:
+ NFAStartOfInputUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ inline virtual bool isStartOfInputNode() const { return false; }
+};
+class NFAEndOfInputUNode : public NFAUNode
+{
+ public:
+ bool term;
+ NFAEndOfInputUNode(const bool lookForTerm);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAWordBoundaryUNode : public NFAUNode
+{
+ public:
+ bool pos;
+ NFAWordBoundaryUNode(const bool positive);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAEndOfMatchUNode : public NFAUNode
+{
+ public:
+ NFAEndOfMatchUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupHeadUNode : public NFAUNode
+{
+ public:
+ int gi;
+ NFAGroupHeadUNode(const int groupIndex);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ inline virtual bool isGroupHeadNode() const { return false; }
+};
+class NFAGroupTailUNode : public NFAUNode
+{
+ public:
+ int gi;
+ NFAGroupTailUNode(const int groupIndex);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupLoopPrologueUNode : public NFAUNode
+{
+ public:
+ int gi;
+ NFAGroupLoopPrologueUNode(const int groupIndex);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupLoopUNode : public NFAUNode
+{
+ public:
+ int gi, min, max, type;
+ NFAUNode * inner;
+ NFAGroupLoopUNode(NFAUNode * internal, const int minMatch,
+ const int maxMatch, const int groupIndex, const int matchType);
+ virtual void findAllNodes(std::map<NFAUNode*, bool> & soFar);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ int matchGreedy(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ int matchLazy(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ int matchPossessive(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+
+#endif
+
|