From a33833212f040272fc6c97047c8cb335b6f5405a Mon Sep 17 00:00:00 2001 From: Vadim Dashevskiy Date: Tue, 24 Jul 2012 06:41:19 +0000 Subject: SimpleAR, SimpleStatusMsg, SmileyAdd, SpellChecker: changed folder structure git-svn-id: http://svn.miranda-ng.org/main/trunk@1149 1316c22d-e87f-b044-9b9b-93d7a3e3ba9c --- plugins/SmileyAdd/src/regexp/WCPattern.h | 1663 ++++++++++++++++++++++++++++++ 1 file changed, 1663 insertions(+) create mode 100644 plugins/SmileyAdd/src/regexp/WCPattern.h (limited to 'plugins/SmileyAdd/src/regexp/WCPattern.h') diff --git a/plugins/SmileyAdd/src/regexp/WCPattern.h b/plugins/SmileyAdd/src/regexp/WCPattern.h new file mode 100644 index 0000000000..3d52a7fd2e --- /dev/null +++ b/plugins/SmileyAdd/src/regexp/WCPattern.h @@ -0,0 +1,1663 @@ +#ifndef __WCPATTERN_H__ +#define __WCPATTERN_H__ + +#ifdef _WIN32 + #pragma warning(disable:4786) +#endif + +#include "bkstring.h" + +#include <vector> +#include <map> + +class WCMatcher; +class NFAUNode; +class NFAQuantifierUNode; + +/** + This pattern class is very similar in functionality to Java's + java.util.regex.WCPattern class. The pattern class represents an immutable + regular expression object. Instead of having a single object contain both the + regular expression object and the matching object, instead the two objects are + split apart. The {@link WCMatcher WCMatcher} class represents the maching + object. + + The WCPattern class works primarily off of "compiled" patterns. A typical + instantiation of a regular expression looks like: + +
+  WCPattern * p = WCPattern::compile(L"a*b");
+  WCMatcher * m = p->createWCMatcher(L"aaaaaab");
+  if (m->matches()) ...
+  
+ + However, if you do not need to use a pattern more than once, it is often times + okay to use the WCPattern's static methods instead. An example looks like this: + +
+  if (WCPattern::matches(L"a*b", L"aaaab")) { ... }
+  
+ + This class does not currently support unicode. The unicode update for this + class is coming soon. + + This class is partially immutable. It is completely safe to call createWCMatcher + concurrently in different threads, but the other functions (e.g. split) should + not be called concurrently on the same WCPattern. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Construct + + Matches + +
+   +
+ Characters +
+ x + + The character x +
+ \\ + + The character \ +
+ \0nn + + The character with octal ASCII value nn +
+ \0nnn + + The character with octal ASCII value nnn +
+ \xhh + + The character with hexadecimal ASCII value hh +
+ \t + + A tab character +
+ \r + + A carriage return character +
+ \n + + A new-line character +
+   +
+ Character Classes +
+ [abc] + + Either a, b, or c +
+ [^abc] + + Any character but a, b, or c +
+ [a-zA-Z] + + Any character ranging from a thru z, or + A thru Z +
+ [^a-zA-Z] + + Any character except those ranging from a thru + z, or A thru Z +
+ [a\-z] + + Either a, -, or z +
+ [a-z[A-Z]] + + Same as [a-zA-Z] +
+ [a-z&&[g-i]] + + Any character in the intersection of a-z and + g-i +
+ [a-z&&[^g-i]] + + Any character in a-z and not in g-i +
+   +
+ Predefined character classes +
+ . + + Any character. Multiline matching must be compiled into the pattern for + . to match a \r or a \n. + Even if multiline matching is enabled, . will not + match a \r\n, only a \r or a \n. +
+ \d + + [0-9] +
+ \D + + [^\d] +
+ \s + + [ \t\r\n\x0B] +
+ \S + + [^\s] +
+ \w + + [a-zA-Z0-9_] +
+ \W + + [^\w] +
+   +
+ POSIX character classes +
+ \p{Lower} + + [a-z] +
+ \p{Upper} + + [A-Z] +
+ \p{ASCII} + + [\x00-\x7F] +
+ \p{Alpha} + + [a-zA-Z] +
+ \p{Digit} + + [0-9] +
+ \p{Alnum} + + [\w&&[^_]] +
+ \p{Punct} + + [!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~] +
+ \p{XDigit} + + [a-fA-F0-9] +
+   +
+ Boundary Matches +
+ ^ + + The beginning of a line. Also matches the beginning of input. +
+ $ + + The end of a line. Also matches the end of input. +
+ \b + + A word boundary +
+ \B + + A non word boundary +
+ \A + + The beginning of input +
+ \G + + The end of the previous match. Ensures that a "next" match will only + happen if it begins with the character immediately following the end of + the "current" match. +
+ \Z + + The end of input. Will also match if there is a single trailing + \r\n, a single trailing \r, or a single + trailing \n. +
+ \z + + The end of input +
+   +
+ Greedy Quantifiers +
+ x? + + x, either zero times or one time +
+ x* + + x, zero or more times +
+ x+ + + x, one or more times +
+ x{n} + + x, exactly n times +
+ x{n,} + + x, at least n times +
+ x{,m} + + x, at most m times +
+ x{n,m} + + x, at least n times and at most + m times +
+   +
+ Possessive Quantifiers +
+ x?+ + + x, either zero times or one time +
+ x*+ + + x, zero or more times +
+ x++ + + x, one or more times +
+ x{n}+ + + x, exactly n times +
+ x{n,}+ + + x, at least n times +
+ x{,m}+ + + x, at most m times +
+ x{n,m}+ + + x, at least n times and at most + m times +
+   +
+ Reluctant Quantifiers +
+ x?? + + x, either zero times or one time +
+ x*? + + x, zero or more times +
+ x+? + + x, one or more times +
+ x{n}? + + x, exactly n times +
+ x{n,}? + + x, at least n times +
+ x{,m}? + + x, at most m times +
+ x{n,m}? + + x, at least n times and at most + m times +
+   +
+ Operators +
+ xy + + x then y +
+ x|y + + x or y +
+ (x) + + x as a capturing group +
+   +
+ Quoting +
+ \Q + + Nothing, but treat every character (including \s) literally until a + matching \E +
+ \E + + Nothing, but ends its matching \Q +
+   +
+ Special Constructs +
+ (?:x) + + x, but not as a capturing group +
+ (?=x) + + x, via positive lookahead. This means that the + expression will match only if it is trailed by x. + It will not "eat" any of the characters matched by + x. +
+ (?!x) + + x, via negative lookahead. This means that the + expression will match only if it is not trailed by + x. It will not "eat" any of the characters + matched by x. +
+ (?<=x) + + x, via positive lookbehind. x + cannot contain any quantifiers. +
+ (?<!x) + + x, via negative lookbehind. x + cannot contain any quantifiers. +
+ (?>x) + + x{1}+ +
+   +
+ Registered Expression Matching +
+ {x} + + The registered pattern x +
+ +
+ + Begin Text Extracted And Modified From java.util.regex.WCPattern documentation + +

Backslashes, escapes, and quoting

+ +

The backslash character ((wchar_t)'\') serves to introduce escaped + constructs, as defined in the table above, as well as to quote characters + that otherwise would be interpreted as unescaped constructs. Thus the + expression \\ matches a single backslash and \{ matches a + left brace. + +

It is an error to use a backslash prior to any alphabetic character that + does not denote an escaped construct; these are reserved for future + extensions to the regular-expression language. A backslash may be used + prior to a non-alphabetic character regardless of whether that character is + part of an unescaped construct. + +

It is necessary to double backslashes in string literals that represent + regular expressions to protect them from interpretation by a compiler. The + string literal "\b", for example, matches a single backspace + character when interpreted as a regular expression, while + "\\b" matches a word boundary. The string literal + "\(hello\)" is illegal and leads to a compile-time error; + in order to match the string (hello) the string literal + "\\(hello\\)" must be used. + +

Character Classes

+ +

Character classes may appear within other character classes, and + may be composed by the union operator (implicit) and the intersection + operator (&&). + The union operator denotes a class that contains every character that is + in at least one of its operand classes. The intersection operator + denotes a class that contains every character that is in both of its + operand classes. + +

The precedence of character-class operators is as follows, from + highest to lowest: + +

+ + + + + + + + + + + + + + + + +
1    Literal escape    \x
2    Rangea-z
3    Grouping[...]
4    Intersection[a-z&&[aeiou]]
5    Union[a-e][i-u]
+ +

Note that a different set of metacharacters are in effect inside + a character class than outside a character class. For instance, the + regular expression . loses its special meaning inside a + character class, while the expression - becomes a range + forming metacharacter. + + + + +

Groups and capturing

+ +

Capturing groups are numbered by counting their opening parentheses from + left to right. In the expression ((A)(B(C))), for example, there + are four such groups:

+ +
+ + + + + + + + + + +
1    ((A)(B(C)))
2    (A)
3    (B(C))
4    (C)
+ +

Group zero always stands for the entire expression. + +

Capturing groups are so named because, during a match, each subsequence + of the input sequence that matches such a group is saved. The captured + subsequence may be used later in the expression, via a back reference, and + may also be retrieved from the matcher once the match operation is complete. + +

The captured input associated with a group is always the subsequence + that the group most recently matched. If a group is evaluated a second time + because of quantification then its previously-captured value, if any, will + be retained if the second evaluation fails. Matching the string + L"aba" against the expression (a(b)?)+, for example, leaves + group two set to L"b". All captured input is discarded at the + beginning of each match. + +

Groups beginning with (? are pure, non-capturing groups + that do not capture text and do not count towards the group total. + + +

WC support

+ +

Coming Soon. + +

Comparison to Perl 5

+ +

The WCPattern engine performs traditional NFA-based matching + with ordered alternation as occurs in Perl 5. + +

Perl constructs not supported by this class:

+ +
    + +
  • The conditional constructs (?{X}) and + (?(condition)X|Y), +

  • + +
  • The embedded code constructs (?{code}) + and (??{code}),

  • + +
  • The embedded comment syntax (?#comment), and

  • + +
  • The preprocessing operations \l \u, + \L, and \U.

  • + +
  • Embedded flags

  • + +
+ +

Constructs supported by this class but not by Perl:

+ +
    + +
  • Possessive quantifiers, which greedily match as much as they can + and do not back off, even when doing so would allow the overall match to + succeed.

  • + +
  • Character-class union and intersection as described + above.

  • + +
+ +

Notable differences from Perl:

+ +
    + +
  • In Perl, \1 through \9 are always interpreted + as back references; a backslash-escaped number greater than 9 is + treated as a back reference if at least that many subexpressions exist, + otherwise it is interpreted, if possible, as an octal escape. In this + class octal escapes must always begin with a zero. In this class, + \1 through \9 are always interpreted as back + references, and a larger number is accepted as a back reference if at + least that many subexpressions exist at that point in the regular + expression, otherwise the parser will drop digits until the number is + smaller or equal to the existing number of groups or it is one digit. +

  • + +
  • Perl uses the g flag to request a match that resumes + where the last match left off. This functionality is provided implicitly + by the WCMatcher class: Repeated invocations of the + find method will resume where the last match left off, + unless the matcher is reset.

  • + +
  • Perl is forgiving about malformed matching constructs, as in the + expression *a, as well as dangling brackets, as in the + expression abc], and treats them as literals. This + class is also strict and will not compile a pattern when dangling characters + are encountered.

  • + +
+ + +

For a more precise description of the behavior of regular expression + constructs, please see + Mastering Regular Expressions, 2nd Edition, Jeffrey E. F. Friedl, + O'Reilly and Associates, 2002. +

+

+ + End Text Extracted And Modified From java.util.regex.WCPattern documentation + +


+ + @author Jeffery Stuart + @since March 2003, Stable Since November 2004 + @version 1.07.00 + @memo A class used to represent "PERL 5"-ish regular expressions + */ +class WCPattern +{ + friend class WCMatcher; + friend class NFAUNode; + friend class NFAQuantifierUNode; + private: + /** + This constructor should not be called directly. Those wishing to use the + WCPattern class should instead use the {@link compile compile} method. + + @param rhs The pattern to compile + @memo Creates a new pattern from the regular expression in rhs. + */ + WCPattern(const bkstring & rhs); + protected: + /** + This currently is not used, so don't try to do anything with it. + @memo Holds all the compiled patterns for quick access. + */ + static std::map compiledWCPatterns; + /** + Holds all of the registered patterns as strings. Due to certain problems + with compilation of patterns, especially with capturing groups, this seemed + to be the best way to do it. + */ + static std::map > registeredWCPatterns; + protected: + /** + Holds all the NFA nodes used. This makes deletion of a pattern, as well as + clean-up from an unsuccessful compile much easier and faster. + */ + std::map nodes; + /** + Used when methods like split are called. The matcher class uses a lot of + dynamic memeory, so having an instance increases speedup of certain + operations. + */ + WCMatcher * matcher; + /** + The front node of the NFA. + */ + NFAUNode * head; + /** + The actual regular expression we rerpesent + */ + bkstring pattern; + /** + Flag used during compilation. Once the pattern is successfully compiled, + error is no longer used. + */ + bool error; + /** + Used during compilation to keep track of the current index into + {@link pattern pattern}. Once the pattern is successfully + compiled, error is no longer used. + */ + int curInd; + /** + The number of capture groups this contains. + */ + int groupCount; + /** + The number of non-capture groups this contains. 
+ */ + int nonCapGroupCount; + /** + The flags specified when this was compiled. + */ + unsigned long flags; + protected: + /** + Raises an error during compilation. Compilation will cease at that point + and compile will return NULL. + */ + void raiseError(); + /** + Convenience function for registering a node in nodes. + @param node The node to register + @return The registered node + */ + NFAUNode * registerNode(NFAUNode * node); + + /** + Calculates the union of two strings. This function will first sort the + strings and then use a simple selection algorithm to find the union. + @param s1 The first "class" to union + @param s2 The second "class" to union + @return A new string containing all unique characters. Each character + must have appeared in one or both of s1 and + s2. + */ + bkstring classUnion (bkstring s1, bkstring s2) const; + /** + Calculates the intersection of two strings. This function will first sort + the strings and then use a simple selection algorithm to find the + intersection. + @param s1 The first "class" to intersect + @param s2 The second "class" to intersect + @return A new string containing all unique characters. Each character + must have appeared both s1 and s2. + */ + bkstring classIntersect (bkstring s1, bkstring s2) const; + /** + Calculates the negation of a string. The negation is the set of all + characters between \x00 and \xFF not + contained in s1. + @param s1 The "class" to be negated. + @param s2 The second "class" to intersect + @return A new string containing all unique characters. Each character + must have appeared both s1 and s2. + */ + bkstring classNegate (bkstring s1) const; + /** + Creates a new "class" representing the range from low thru + hi. This function will wrap if low > + hi. This is a feature, not a buf. Sometimes it is useful + to be able to say [\x70-\x10] instead of [\x70-\x7F\x00-\x10]. 
+ @param low The beginning character + @param hi The ending character + @return A new string containing all the characters from low thru hi. + */ + bkstring classCreateRange(wchar_t low, wchar_t hi) const; + + /** + Extracts a decimal number from the substring of member-variable + {@link pattern pattern} starting at start and + ending at end. + @param start The starting index in {@link pattern pattern} + @param end The last index in {@link pattern pattern} + @return The decimal number in {@link pattern pattern} + */ + int getInt(int start, int end); + /** + Parses a {n,m} string out of the member-variable + {@link pattern pattern} stores the result in sNum + and eNum. + @param sNum Output parameter. The minimum number of matches required + by the curly quantifier are stored here. + @param eNum Output parameter. The maximum number of matches allowed + by the curly quantifier are stored here. + @return Success/Failure. Fails when the curly does not have the proper + syntax + */ + bool quantifyCurly(int & sNum, int & eNum); + /** + Tries to quantify the currently parsed group. If the group being parsed + is indeed quantified in the member-variable + {@link pattern pattern}, then the NFA is modified accordingly. + @param start The starting node of the current group being parsed + @param stop The ending node of the current group being parsed + @param gn The group number of the current group being parsed + @return The node representing the starting node of the group. If the + group becomes quantified, then this node is not necessarily + a GroupHead node. + */ + NFAUNode * quantifyGroup(NFAUNode * start, NFAUNode * stop, const int gn); + + /** + Tries to quantify the last parsed expression. If the character was indeed + quantified, then the NFA is modified accordingly. + @param newNode The recently created expression node + @return The node representing the last parsed expression. 
If the + expression was quantified, return value != newNode + */ + NFAUNode * quantify(NFAUNode * newNode); + /** + Parses the current class being examined in + {@link pattern pattern}. + @return A string of unique characters contained in the current class being + parsed + */ + bkstring parseClass(); + /** + Parses the current POSIX class being examined in + {@link pattern pattern}. + @return A string of unique characters representing the POSIX class being + parsed + */ + bkstring parsePosix(); + /** + Returns a string containing the octal character being parsed + @return The string contained the octal value being parsed + */ + bkstring parseOctal(); + /** + Returns a string containing the hex character being parsed + @return The string contained the hex value being parsed + */ + bkstring parseHex(); + /** + Returns a new node representing the back reference being parsed + @return The new node representing the back reference being parsed + */ + NFAUNode * parseBackref(); + /** + Parses the escape sequence currently being examined. Determines if the + escape sequence is a class, a single character, or the beginning of a + quotation sequence. + @param inv Output parameter. Whether or not to invert the returned class + @param quo Output parameter. Whether or not this sequence starts a + quotation. + @return The characters represented by the class + */ + bkstring parseEscape(bool & inv, bool & quo); + /** + Parses a supposed registered pattern currently under compilation. If the + sequence of characters does point to a registered pattern, then the + registered pattern is appended to *end. The registered pattern + is parsed with the current compilation flags. + @param end The ending node of the thus-far compiled pattern + @return The new end node of the current pattern + */ + NFAUNode * parseRegisteredWCPattern(NFAUNode ** end); + /** + Parses a lookbehind expression. Appends the necessary nodes + *end. 
+ @param pos Positive or negative look behind + @param end The ending node of the current pattern + @return The new end node of the current pattern + */ + NFAUNode * parseBehind(const bool pos, NFAUNode ** end); + /** + Parses the current expression and tacks on nodes until a \E is found. + @return The end of the current pattern + */ + NFAUNode * parseQuote(); + /** + Parses {@link pattern pattern}. This function is called + recursively when an or (|) or a group is encountered. + @param inParen Are we currently parsing inside a group + @param inOr Are we currently parsing one side of an or (|) + @param end The end of the current expression + @return The starting node of the NFA constructed from this parse + */ + NFAUNode * parse(const bool inParen = 0, const bool inOr = 0, NFAUNode ** end = NULL); + public: + /// We should match regardless of case + const static unsigned long CASE_INSENSITIVE; + /// We are implicitly quoted + const static unsigned long LITERAL; + /// @memo We should treat a . as [\x00-\x7F] + const static unsigned long DOT_MATCHES_ALL; + /** ^ and $ should anchor to the beginning and + ending of lines, not all input + */ + const static unsigned long MULTILINE_MATCHING; + /** When enabled, only instances of \n are recognized as + line terminators + */ + const static unsigned long UNIX_LINE_MODE; + /// The absolute minimum number of matches a quantifier can match (0) + const static int MIN_QMATCH; + /// The absolute maximum number of matches a quantifier can match (0x7FFFFFFF) + const static int MAX_QMATCH; + public: + /** + Call this function to compile a regular expression into a + WCPattern object. Special values can be assigned to + mode when certain non-standard behaviors are expected from + the WCPattern object. + @param pattern The regular expression to compile + @param mode A bitwise or of flags signalling what special behaviors are + wanted from this WCPattern object + @return If successful, compile returns a WCPattern + pointer. 
Upon failure, compile returns + NULL + */ + static WCPattern * compile (const bkstring & pattern, + const unsigned long mode = 0); + /** + Dont use this function. This function will compile a pattern, and cache + the result. This will eventually be used as an optimization when people + just want to call static methods using the same pattern over and over + instead of first compiling the pattern and then using the compiled + instance for matching. + @param pattern The regular expression to compile + @param mode A bitwise or of flags signalling what special behaviors are + wanted from this WCPattern object + @return If successful, compileAndKeep returns a + WCPattern pointer. Upon failure, compile + returns NULL. + */ + static WCPattern * compileAndKeep (const bkstring & pattern, + const unsigned long mode = 0); + + /** + Searches through replace and replaces all substrings matched + by pattern with str. str may + contain backreferences (e.g. \1) to capture groups. A typical + invocation looks like: +

+ + WCPattern::replace(L"(a+)b(c+)", L"abcccbbabcbabc", L"\\2b\\1"); + +

+ which would replace abcccbbabcbabc with + cccbabbcbabcba. + @param pattern The regular expression + @param str The replacement text + @param replacementText The string in which to perform replacements + @param mode The special mode requested of the WCPattern + during the replacement process + @return The text with the replacement string substituted where necessary + */ + static bkstring replace (const bkstring & pattern, + const bkstring & str, + const bkstring & replacementText, + const unsigned long mode = 0); + + /** + Splits the specified string over occurrences of the specified pattern. + Empty strings can be optionally ignored. The number of strings returned is + configurable. A typical invocation looks like: +

+ + bkstring str(strSize, 0);
+ FILE * fp = fopen(fileName, "r");
+ fread((char*)str.data(), strSize * 2, 1, fp);
+ fclose(fp);
+
+ std::vector<bkstring> lines = WCPattern::split(L"[\r\n]+", str, true);
+
+
+ + @param pattern The regular expression + @param replace The string to split + @param keepEmptys Whether or not to keep empty strings + @param limit The maximum number of splits to make + @param mode The special mode requested of the WCPattern + during the split process + @return All substrings of str split across pattern. + */ + static std::vector split (const bkstring & pattern, + const bkstring & str, + const bool keepEmptys = 0, + const unsigned long limit = 0, + const unsigned long mode = 0); + + /** + Finds all the instances of the specified pattern within the string. You + should be careful to only pass patterns with a minimum length of one. For + example, the pattern a* can be matched by an empty string, so + instead you should pass a+ since at least one character must + be matched. A typical invocation of findAll looks like: +

+ + std::vector<td::string> numbers = WCPattern::findAll(L"\\d+", string); + +

+ + @param pattern The pattern for which to search + @param str The string to search + @param mode The special mode requested of the WCPattern + during the find process + @return All instances of pattern in str + */ + static std::vector findAll (const bkstring & pattern, + const bkstring & str, + const unsigned long mode = 0); + + /** + Determines if an entire string matches the specified pattern + + @param pattern The pattern for to match + @param str The string to match + @param mode The special mode requested of the WCPattern + during the replacement process + @return True if str is recognized by pattern + */ + static bool matches (const bkstring & pattern, + const bkstring & str, + const unsigned long mode = 0); + + /** + Registers a pattern under a specific name for use in later compilations. + A typical invocation and later use looks like: +

+ + WCPattern::registerWCPattern(L"ip", L"(?:\\d{1,3}\\.){3}\\d{1,3}");
+ WCPattern * p1 = WCPattern::compile(L"{ip}:\\d+");
+ WCPattern * p2 = WCPattern::compile(L"Connection from ({ip}) on port \\d+");
+
+

+ Multiple calls to registerWCPattern with the same + name will result in the pattern getting overwritten. + + @param name The name to give to the pattern + @param pattern The pattern to register + @param mode Any special flags to use when compiling pattern + @return Success/Failure. Fails only if pattern has invalid + syntax + */ + static bool registerWCPattern(const bkstring & name, + const bkstring & pattern, + const unsigned long mode = 0); + + /** + Clears the pattern registry + */ + static void unregisterWCPatterns(); + /** + Don't use + */ + static void clearWCPatternCache(); + + /** + Searches through a string for the nth match of the + given pattern in the string. Match indeces start at zero, not one. + A typical invocation looks like this: +

+ + std::pair<bkstring, int> match = WCPattern::findNthMatch(L"\\d{1,3}", L"192.168.1.101:22", 1);
+ wprintf(L"%s %i\n", match.first.c_str(), match.second);
+
+ Output: 168 4
+
+ + @param pattern The pattern for which to search + @param str The string to search + @param matchNum Which match to find + @param mode Any special flags to use during the matching process + @return A string and an integer. The string is the string matched. The + integer is the starting location of the matched string in + str. You can check for success/failure by making sure + that the integer returned is greater than or equal to zero. + */ + static std::pair findNthMatch (const bkstring & pattern, + const bkstring & str, + const int matchNum, + const unsigned long mode = 0); + public: + /** + Deletes all NFA nodes allocated during compilation + */ + ~WCPattern(); + + bkstring replace (const bkstring & str, + const bkstring & replacementText); + std::vector split (const bkstring & str, const bool keepEmptys = 0, + const unsigned long limit = 0); + std::vector findAll (const bkstring & str); + bool matches (const bkstring & str); + /** + Returns the flags used during compilation of this pattern + @return The flags used during compilation of this pattern + */ + unsigned long getFlags () const; + /** + Returns the regular expression this pattern represents + @return The regular expression this pattern represents + */ + bkstring getWCPattern () const; + /** + Creates a matcher object using the specified string and this pattern. 
+ @param str The string to match against + @return A new matcher using object using this pattern and the specified + string + */ + WCMatcher * createWCMatcher (const bkstring & str); +}; + +class NFAUNode +{ + friend class WCMatcher; + public: + NFAUNode * next; + NFAUNode(); + virtual ~NFAUNode(); + virtual void findAllNodes(std::map & soFar); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const = 0; + inline virtual bool isGroupHeadNode() const { return false; } + inline virtual bool isStartOfInputNode() const { return false; } +}; +class NFACharUNode : public NFAUNode +{ + protected: + wchar_t ch; + public: + NFACharUNode(const wchar_t c); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFACICharUNode : public NFAUNode +{ + protected: + wchar_t ch; + public: + NFACICharUNode(const wchar_t c); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAStartUNode : public NFAUNode +{ + public: + NFAStartUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAEndUNode : public NFAUNode +{ + public: + NFAEndUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAQuantifierUNode : public NFAUNode +{ + public: + int min, max; + NFAUNode * inner; + virtual void findAllNodes(std::map & soFar); + NFAQuantifierUNode(WCPattern * pat, NFAUNode * internal, + const int minMatch = WCPattern::MIN_QMATCH, + const int maxMatch = WCPattern::MAX_QMATCH); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAGreedyQuantifierUNode : public NFAQuantifierUNode +{ + public: + NFAGreedyQuantifierUNode(WCPattern * pat, NFAUNode * internal, + const int minMatch = WCPattern::MIN_QMATCH, + const int maxMatch = WCPattern::MAX_QMATCH); + virtual int match(const bkstring & str, WCMatcher * 
matcher, const int curInd = 0) const; + virtual int matchInternal(const bkstring & str, WCMatcher * matcher, const int curInd, const int soFar) const; +}; +class NFALazyQuantifierUNode : public NFAQuantifierUNode +{ + public: + NFALazyQuantifierUNode(WCPattern * pat, NFAUNode * internal, + const int minMatch = WCPattern::MIN_QMATCH, + const int maxMatch = WCPattern::MAX_QMATCH); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAPossessiveQuantifierUNode : public NFAQuantifierUNode +{ + public: + NFAPossessiveQuantifierUNode(WCPattern * pat, NFAUNode * internal, + const int minMatch = WCPattern::MIN_QMATCH, + const int maxMatch = WCPattern::MAX_QMATCH); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAAcceptUNode : public NFAUNode +{ + public: + NFAAcceptUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAClassUNode : public NFAUNode +{ + public: + bool inv; + std::map vals; + NFAClassUNode(const bool invert = 0); + NFAClassUNode(const bkstring & clazz, const bool invert); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFACIClassUNode : public NFAUNode +{ + public: + bool inv; + std::map vals; + NFACIClassUNode(const bool invert = 0); + NFACIClassUNode(const bkstring & clazz, const bool invert); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFASubStartUNode : public NFAUNode +{ + public: + NFASubStartUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAOrUNode : public NFAUNode +{ + public: + NFAUNode * one; + NFAUNode * two; + NFAOrUNode(NFAUNode * first, NFAUNode * second); + virtual void findAllNodes(std::map & soFar); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; 
+class NFAQuoteUNode : public NFAUNode // node holding a string; presumably matches it literally - TODO confirm
+{
+  public:
+    bkstring qStr; // presumably a copy of the constructor argument quoted
+    NFAQuoteUNode(const bkstring & quoted); // not explicit, so a bkstring converts implicitly to this node type
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFACIQuoteUNode : public NFAUNode // same shape as NFAQuoteUNode; CI presumably means case-insensitive - TODO confirm
+{
+  public:
+    bkstring qStr; // presumably a copy of the constructor argument quoted
+    NFACIQuoteUNode(const bkstring & quoted);
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFALookAheadUNode : public NFAUNode // wraps an inner node plus a polarity flag; name suggests a look-ahead assertion - TODO confirm
+{
+  public:
+    bool pos; // presumably set from positive
+    NFAUNode * inner; // presumably the constructor argument internal
+    NFALookAheadUNode(NFAUNode * internal, const bool positive);
+    virtual void findAllNodes(std::map & soFar); // NOTE(review): std::map has no template arguments here; they look lost in extraction - restore from the upstream header
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFALookBehindUNode : public NFAUNode // holds a string, not a node, plus a polarity flag; name suggests a look-behind assertion - TODO confirm
+{
+  public:
+    bool pos; // presumably set from positive
+    bkstring mStr; // presumably a copy of the constructor argument str
+    NFALookBehindUNode(const bkstring & str, const bool positive);
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAStartOfLineUNode : public NFAUNode // stateless node; name suggests a start-of-line anchor - TODO confirm
+{
+  public:
+    NFAStartOfLineUNode();
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAEndOfLineUNode : public NFAUNode // stateless node; name suggests an end-of-line anchor - TODO confirm
+{
+  public:
+    NFAEndOfLineUNode();
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAReferenceUNode : public NFAUNode // holds a group index; name suggests a back-reference - TODO confirm
+{
+  public:
+    int gi; // presumably set from groupIndex
+    NFAReferenceUNode(const int groupIndex);
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAStartOfInputUNode : public NFAUNode // stateless node; name suggests a start-of-input anchor - TODO confirm
+{
+  public:
+    NFAStartOfInputUNode();
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+    inline virtual bool isStartOfInputNode() const { return false; } // NOTE(review): returns false, the same value as the NFAUNode default, so this override changes nothing - confirm whether true was intended before touching it
+};
+class NFAEndOfInputUNode : public NFAUNode // holds one flag; name suggests an end-of-input anchor - TODO confirm
+{
+  public:
+    bool term; // presumably set from lookForTerm
+    NFAEndOfInputUNode(const bool lookForTerm);
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAWordBoundaryUNode : public NFAUNode // holds a polarity flag; name suggests a word-boundary assertion - TODO confirm
+{
+  public:
+    bool pos; // presumably set from positive
+    NFAWordBoundaryUNode(const bool positive);
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAEndOfMatchUNode : public NFAUNode // stateless node; role not visible in this header - see the .cpp
+{
+  public:
+    NFAEndOfMatchUNode();
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupHeadUNode : public NFAUNode // holds a group index; name suggests the opening of a group - TODO confirm
+{
+  public:
+    int gi; // presumably set from groupIndex
+    NFAGroupHeadUNode(const int groupIndex);
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+    inline virtual bool isGroupHeadNode() const { return false; } // NOTE(review): returns false, the same value as the NFAUNode default, so this override changes nothing - confirm whether true was intended before touching it
+};
+class NFAGroupTailUNode : public NFAUNode // holds a group index; name suggests the close of a group - TODO confirm
+{
+  public:
+    int gi; // presumably set from groupIndex
+    NFAGroupTailUNode(const int groupIndex);
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupLoopPrologueUNode : public NFAUNode // holds a group index; role not visible in this header - see the .cpp
+{
+  public:
+    int gi; // presumably set from groupIndex
+    NFAGroupLoopPrologueUNode(const int groupIndex);
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupLoopUNode : public NFAUNode // wraps an inner node with bounds, a group index and a match type
+{
+  public:
+    int gi, min, max, type; // presumably set from groupIndex, minMatch, maxMatch and matchType - TODO confirm
+    NFAUNode * inner; // presumably the constructor argument internal
+    NFAGroupLoopUNode(NFAUNode * internal, const int minMatch,
+                      const int maxMatch, const int groupIndex, const int matchType); // no default arguments, unlike the NFAQuantifierUNode family
+    virtual void findAllNodes(std::map & soFar); // NOTE(review): std::map template arguments missing here as well
+    virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; // presumably selects one of the three helpers below based on type - TODO confirm
+    int matchGreedy(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; // non-virtual helper
+    int matchLazy(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; // non-virtual helper
+    int matchPossessive(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; // non-virtual helper
+};
+
+#endif // closes the __WCPATTERN_H__ include guard opened at the top of the header
+
-- 
cgit v1.2.3