From a33833212f040272fc6c97047c8cb335b6f5405a Mon Sep 17 00:00:00 2001 From: Vadim Dashevskiy Date: Tue, 24 Jul 2012 06:41:19 +0000 Subject: SimpleAR, SimpleStatusMsg, SmileyAdd, SpellChecker: changed folder structure git-svn-id: http://svn.miranda-ng.org/main/trunk@1149 1316c22d-e87f-b044-9b9b-93d7a3e3ba9c --- plugins/SmileyAdd/src/regexp/Matcher.cpp | 178 +++ plugins/SmileyAdd/src/regexp/Matcher.h | 248 ++++ plugins/SmileyAdd/src/regexp/Pattern.cpp | 1709 +++++++++++++++++++++++++++ plugins/SmileyAdd/src/regexp/Pattern.h | 1663 ++++++++++++++++++++++++++ plugins/SmileyAdd/src/regexp/WCMatcher.cpp | 181 +++ plugins/SmileyAdd/src/regexp/WCMatcher.h | 234 ++++ plugins/SmileyAdd/src/regexp/WCPattern.cpp | 1747 ++++++++++++++++++++++++++++ plugins/SmileyAdd/src/regexp/WCPattern.h | 1663 ++++++++++++++++++++++++++ plugins/SmileyAdd/src/regexp/test.cpp | 38 + 9 files changed, 7661 insertions(+) create mode 100644 plugins/SmileyAdd/src/regexp/Matcher.cpp create mode 100644 plugins/SmileyAdd/src/regexp/Matcher.h create mode 100644 plugins/SmileyAdd/src/regexp/Pattern.cpp create mode 100644 plugins/SmileyAdd/src/regexp/Pattern.h create mode 100644 plugins/SmileyAdd/src/regexp/WCMatcher.cpp create mode 100644 plugins/SmileyAdd/src/regexp/WCMatcher.h create mode 100644 plugins/SmileyAdd/src/regexp/WCPattern.cpp create mode 100644 plugins/SmileyAdd/src/regexp/WCPattern.h create mode 100644 plugins/SmileyAdd/src/regexp/test.cpp (limited to 'plugins/SmileyAdd/src/regexp') diff --git a/plugins/SmileyAdd/src/regexp/Matcher.cpp b/plugins/SmileyAdd/src/regexp/Matcher.cpp new file mode 100644 index 0000000000..ebd1f57850 --- /dev/null +++ b/plugins/SmileyAdd/src/regexp/Matcher.cpp @@ -0,0 +1,178 @@ +#include +#include + +const int Matcher::MATCH_ENTIRE_STRING = 0x01; + +/* + Detailed documentation is provided in this class' header file + + @author Jeffery Stuart + @since November 2004 + @version 1.07.00 +*/ + +Matcher::Matcher(Pattern * pattern, const bkstring & text) +{ + pat = pattern; + str = &text; + gc = pattern->groupCount; + ncgc = -pattern->nonCapGroupCount; + flags = 0; + matchedSomething = false; + starts = new int[gc + ncgc]; + ends = new int[gc + ncgc]; + groups = new int[gc + ncgc]; + groupPos = new int[gc + ncgc]; + groupIndeces = new int[gc + ncgc]; + starts = starts + ncgc; + ends = ends + ncgc; + groups = groups + ncgc; + groupPos = groupPos + ncgc; + groupIndeces = groupIndeces + ncgc; + for (int i = 0; i < gc; ++i) starts[i] = ends[i] = 0; +} +Matcher::~Matcher() +{ + delete [] (starts - ncgc); + delete [] (ends - ncgc); + delete [] (groups - ncgc); + delete [] (groupIndeces - ncgc); + delete [] (groupPos - ncgc); +} +void Matcher::clearGroups() +{ + int i; + lm = 0; + for (i = 0; i < gc; ++i) groups[i] = starts[i] = ends[i] = -1; + for (i = 1; i <= ncgc; ++i) groups[0 - i] = -1; +} +bkstring Matcher::replaceWithGroups(const bkstring & str) +{ + bkstring ret = ""; + + bkstring t = str; + while (t.size() > 0) + { + if (t[0] == '\\') + { + t.erase(0, 1); + if (t.size() == 0) + { + ret += "\\"; + } + else if (t[0] < '0' || t[0] > '9') + { + ret += t[0]; + t.erase(0, 1); + } + else + { + int gn = 0; + while (t.size() > 0 && t[0] >= '0' && t[0] <= '9') + { + gn = gn * 10 + (t[0] - '0'); + t.erase(0, 1); + } + ret += getGroup(gn); + } + } + else + { + ret += t[0]; + t.erase(0, 1); + } + } + + return ret; +} +unsigned long Matcher::getFlags() const +{ + return flags; +} +const bkstring& Matcher::getText() const +{ + return *str; +} + +bool Matcher::matches() +{ + flags = MATCH_ENTIRE_STRING; + matchedSomething = false; + clearGroups(); + lm = 0; + return pat->head->match(*str, this, 0) == (int)str->size(); +} +bool Matcher::findFirstMatch() +{ + starts[0] = 0; + flags = 0; + clearGroups(); + start = 0; + lm = 0; + ends[0] = pat->head->match(*str, this, 0); + if (ends[0] >= 0) + { + matchedSomething = true; + return 1; + } + return 0; +} +bool Matcher::findNextMatch() +{ + int s = starts[0], e = ends[0]; + + if (!matchedSomething) return findFirstMatch(); + if (s == e) ++e; + flags = 0; + clearGroups(); + + starts[0] = e; + if (e >= (int)str->size()) return 0; + start = e; + lm = e; + ends[0] = pat->head->match(*str, this, e); + return ends[0] >= 0; +} +std::vector Matcher::findAll() +{ + std::vector ret; + reset(); + while (findNextMatch()) + { + ret.push_back(getGroup()); + } + return ret; +} + +void Matcher::reset() +{ + lm = 0; + clearGroups(); + matchedSomething = false; +} + +int Matcher::getStartingIndex(const int groupNum) const +{ + if (groupNum < 0 || groupNum >= gc) return -1; + return starts[groupNum]; +} +int Matcher::getEndingIndex(const int groupNum) const +{ + if (groupNum < 0 || groupNum >= gc) return -1; + return ends[groupNum]; +} +bkstring Matcher::getGroup(const int groupNum) const +{ + if (groupNum < 0 || groupNum >= gc) return ""; + if (starts[groupNum] < 0 || ends[groupNum] < 0) return ""; + return str->substr(starts[groupNum], ends[groupNum] - starts[groupNum]); +} +std::vector Matcher::getGroups(const bool includeGroupZero) const +{ + int i, start = (includeGroupZero ? 0 : 1); + std::vector ret; + + for (i = start; i < gc; ++i) ret.push_back(getGroup(i)); + return ret; +} + diff --git a/plugins/SmileyAdd/src/regexp/Matcher.h b/plugins/SmileyAdd/src/regexp/Matcher.h new file mode 100644 index 0000000000..621184d8c2 --- /dev/null +++ b/plugins/SmileyAdd/src/regexp/Matcher.h @@ -0,0 +1,248 @@ +#ifndef __MATCHER_H__ +#define __MATCHER_H__ + +#include "bkstring.h" +#include + +class Vector; +class NFANode; +class NFAStartNode; +class NFAEndNode; +class NFAGroupHeadNode; +class NFAGroupLoopNode; +class NFAGroupLoopPrologueNode; +class NFAGroupTailNode; +class NFALookBehindNode; +class NFAStartOfLineNode; +class NFAEndOfLineNode; +class NFAEndOfMatchNode; +class NFAReferenceNode; +class Pattern; + +/** + A matcher is a non thread-safe object used to scan strings using a given + {@link Pattern Pattern} object. Using a Matcher is the preferred + method for scanning strings. Matchers are not thread-safe. Matchers require + very little dynamic memory, hence one is encouraged to create several + instances of a matcher when necessary as opposed to sharing a single instance + of a matcher. +

+ The most common methods needed by the matcher are matches, + findNextMatch, and getGroup. matches + and findNextMatch both return success or failure, and further + details can be gathered from their documentation. +

+ Unlike Java's Matcher, this class allows you to change the string + you are matching against. This provides a small optimization, since you no + longer need multiple matchers for a single pattern in a single thread. +

+ This class also provides an extremely handy method for replacing text with + captured data via the replaceWithGroups method. A typical + invocation looks like: +

+  char buf[10000];
+  bkstring str = "\\5 (user name \\1) uses \\7 for his/her shell and \\6 is their home directory";
+  FILE * fp = fopen("/etc/passwd", "r");
+  Pattern::registerPattern("entry", "[^:]+");
+  Pattern * p = Pattern::compile("^({entry}):({entry}):({entry}):({entry}):({entry}):({entry}):({entry})$",
+                                 Pattern::MULTILINE_MATCHING | Pattern::UNIX_LINE_MODE);
+  Matcher * m = p->createMatcher("");
+  while (fgets(buf, 9999, fp))
+  {
+    m->setString(buf);
+    if (m->matches())
+    {
+      printf("%s\n", m->replaceWithGroups(str).c_str());
+    }
+  }
+  fclose(fp);
+
+  
+ Calling any of the following functions before first calling + matches, findFirstMatch, or + findNextMatch results in undefined behavior and may cause your + program to crash. + +
    +
  • replaceWithGroups +
  • getStartingIndex
  • +
  • getEndingIndex
  • +
  • getGroup
  • +
  • getGroups
  • +
+ +

+ The function findFirstMatch will attempt to find the first match + in the input string. The same results can be obtained by first calling + reset followed by findNextMatch. +

+ To eliminate the necessity of looping through a string to find all the + matching substrings, findAll was created. The function will find + all matching substrings and return them in a vector. If you need + to examine specific capture groups within the substrings, then this method + should not be used. + + @author Jeffery Stuart + @since March 2003, Stable Since November 2004 + @version 1.07.00 + @memo Mutable object used on instances of a Pattern class + */ +class Matcher +{ + friend class NFANode; + friend class NFAStartNode; + friend class NFAEndNode; + friend class NFAGroupHeadNode; + friend class NFAGroupLoopNode; + friend class NFAGroupLoopPrologueNode; + friend class NFAGroupTailNode; + friend class NFALookBehindNode; + friend class NFAStartOfLineNode; + friend class NFAEndOfLineNode; + friend class NFAEndOfMatchNode; + friend class NFAReferenceNode; + friend class Pattern; + private: + /** + Creates a new matcher object against text using + pattern. + + @param pattern The pattern with which to search + @param text The text in which to search + */ + Matcher(Pattern * pattern, const bkstring & text); + protected: + /// The pattern we use to match + Pattern * pat; + /// The string in which we are matching + const bkstring* str; + /// The starting point of our match + int start; + /// An array of the starting positions for each group + int * starts; + /// An array of the ending positions for each group + int * ends; + /// An array of private data used by NFANodes during matching + int * groups; + /// An array of private data used by NFANodes during matching + int * groupIndeces; + /// An array of private data used by NFANodes during matching + int * groupPos; + /// The ending index of the last match + int lm; + /// The number of capturing groups we have + int gc; + /// The number of non-capturing groups we havew + int ncgc; + /// Whether or not we have matched something (used only by findFirstMatch and findNextMatch) + int matchedSomething; + /// The flags with which we were made + unsigned long flags; + /// Called by reset to clear the group arrays + void clearGroups(); + public: + /// Used internally by match to signify we want the entire string matched + const static int MATCH_ENTIRE_STRING; + public: + /// Cleans up the dynamic memory used by this matcher + ~Matcher(); + /** + Replaces the contents of str with the appropriate captured + text. str should have at least one back reference, otherwise + this function does nothing. + @param str The string in which to replace text + @return A string with all backreferences appropriately replaced + */ + bkstring replaceWithGroups(const bkstring & str); + /** + The flags currently being used by the matcher. + @return Zero + */ + unsigned long getFlags() const; + /** + The text being searched by the matcher. + @return the text being searched by the matcher. + */ + const bkstring& getText() const; + + /** + Scans the string from start to finish for a match. The entire string must + match for this function to return success. Group variables are + appropriately set and can be queried after this function returns. + + @return Success if and only if the entire string matches the pattern + */ + bool matches(); + /** + Scans the string for the first substring matching the pattern. The entire + string does not necessarily have to match for this function to return + success. Group variables are appropriately set and can be queried after + this function returns. + + @return Success if any substring matches the specified pattern + */ + bool findFirstMatch(); + /** + Scans the string for the next substring matching the pattern. If no calls + have been made to findFirstMatch of findNextMatch since the last call to + reset, matches, or setString, then this function's behavior results to + that of findFirstMatch. + + @return Success if another substring can be found that matches the pattern + */ + bool findNextMatch(); + /** + Returns a vector of every substring in order which matches the given + pattern. + + @return Every substring in order which matches the given pattern + */ + std::vector findAll(); + /** + Resets the internal state of the matcher + */ + void reset(); + /** + Same as getText. Left n for backwards compatibilty with old source code + @return Returns the string that is currently being used for matching + */ + inline const bkstring& getString() const { return *str; } + /** + Sets the string to scan + @param newStr The string to scan for subsequent matches + */ + inline void setString(const bkstring & newStr) { str = &newStr; reset(); } + + /** + Returns the starting index of the specified group. + @param groupNum The group to query + @return The starting index of the group if it was matched, -1 for an + invalid group or if the group was not matched + */ + int getStartingIndex(const int groupNum = 0) const; + /** + Returns the ending index of the specified group. + @param groupNum The group to query + @return The ending index of the group if it was matched, -1 for an + invalid group or if the group was not matched + */ + int getEndingIndex(const int groupNum = 0) const; + /** + Returns the specified group. An empty string ("") does not necessarily + mean the group was not matched. A group such as (a*b?) could be matched by + a zero length. If an empty string is returned, getStartingIndex can be + called to determine if the group was actually matched. + @param groupNum The group to query + @return The text of the group + */ + bkstring getGroup(const int groupNum = 0) const; + /** + Returns every capture group in a vector + + @param includeGroupZero Whether or not include capture group zero + @return Every capture group + */ + std::vector getGroups(const bool includeGroupZero = 0) const; +}; + +#endif diff --git a/plugins/SmileyAdd/src/regexp/Pattern.cpp b/plugins/SmileyAdd/src/regexp/Pattern.cpp new file mode 100644 index 0000000000..4487dddd4b --- /dev/null +++ b/plugins/SmileyAdd/src/regexp/Pattern.cpp @@ -0,0 +1,1709 @@ +/** + From the author (Jeff Stuart) + " + Let me start by saying this file is pretty big. If you feel up to it, you can + try making changes yourself, but you would be better off to just email me at + stuart@cs.ucdavis.edu if you think there is a bug, or have something useful you + would like added. This project is very "near and dear" to me, so I am fairly + quick to make bug fixes. The header files for Pattern and Matcher are fairly + well documented and the function names are pretty self-explanatory, but if you + are having any trouble, feel free to email me at stuart@cs.ucdavis.edu. + + If you email me, make sure you put something like C++RE in the subject because + I tend to delete email if I don't recognize the name and the subject is + something like "I Need Your Help" or "Got A Second" or "I Found It". + " + */ + +/* + Detailed documentation is provided in this class' header file + + @author Jeffery Stuart + @since November 2004 + @version 1.07.00 +*/ + +#define to_lower(a) (char)(unsigned)CharLowerA((LPSTR)static_cast(a)) +#define is_alpha IsCharAlphaA + +#ifdef _WIN32 + #pragma warning(push) + #pragma warning(disable:4996) + #define str_icmp lstrcmpiA +#else + #define str_icmp strcasecmp +#endif + +#include +#include +#include +#include + +std::map Pattern::compiledPatterns; +std::map > Pattern::registeredPatterns; + +const int Pattern::MIN_QMATCH = 0x00000000; +const int Pattern::MAX_QMATCH = 0x7FFFFFFF; + +const unsigned long Pattern::CASE_INSENSITIVE = 0x01; +const unsigned long Pattern::LITERAL = 0x02; +const unsigned long Pattern::DOT_MATCHES_ALL = 0x04; +const unsigned long Pattern::MULTILINE_MATCHING = 0x08; +const unsigned long Pattern::UNIX_LINE_MODE = 0x10; + +Pattern::Pattern(const bkstring & rhs) +{ + matcher = NULL; + pattern = rhs; + curInd = 0; + groupCount = 0; + nonCapGroupCount = 0; + error = 0; + head = NULL; +} +// convenience function in case we want to add any extra debugging output +void Pattern::raiseError() +{ +/* switch (pattern[curInd - 1]) + { + case '*': + case ')': + case '+': + case '?': + case ']': + case '}': + fprintf(stderr, "%s\n%*c^\n", pattern.c_str(), curInd - 1, ' '); + fprintf(stderr, "Syntax Error near here. Possible unescaped meta character.\n"); + break; + default: + fprintf(stderr, "%s\n%*c^\n", pattern.c_str(), curInd - 1, ' '); + fprintf(stderr, "Syntax Error near here. \n"); + break; + }*/ + error = 1; +} +NFANode * Pattern::registerNode(NFANode * node) +{ + nodes[node] = 1; + return node; +} + +bkstring Pattern::classUnion (bkstring s1, bkstring s2) const +{ + char out[300]; + std::sort(s1.begin(), s1.end()); + std::sort(s2.begin(), s2.end()); + *std::set_union(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0; + return out; +} +bkstring Pattern::classIntersect (bkstring s1, bkstring s2) const +{ + char out[300]; + std::sort(s1.begin(), s1.end()); + std::sort(s2.begin(), s2.end()); + *std::set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0; + return out; +} +bkstring Pattern::classNegate (bkstring s1) const +{ + char out[300]; + int i, ind = 0; + std::map m; + + for (i = 0; i < (int)s1.size(); ++i) m[s1[i]] = 1; + for (i = 0xFF; i >= 0; --i) if (m.find((char)i) == m.end()) out[ind++] = (char)i; + out[ind] = 0; + return bkstring(out, ind); +} +bkstring Pattern::classCreateRange(char low, char hi) const +{ + char out[300]; + int ind = 0; + while (low != hi) out[ind++] = low++; + out[ind++] = low; + return bkstring(out, ind); +} + +int Pattern::getInt(int start, int end) +{ + int ret = 0; + for (; start <= end; ++start) ret = ret * 10 + (pattern[start] - '0'); + return ret; +} +bool Pattern::quantifyCurly(int & sNum, int & eNum) +{ + bool good = 1; + int i, ci = curInd + 1; + int commaInd = ci, endInd = ci, len = (int)pattern.size(); + sNum = eNum = 0; + + while (endInd < len && pattern[endInd ] != '}') ++endInd; + while (commaInd < endInd && pattern[commaInd] != ',') ++commaInd; + if (endInd >= len) { raiseError(); return 0; } + for (i = ci; good && i < endInd; ++i) if (i != commaInd && !isdigit(pattern[i])) good = 0; + if (!good && commaInd < endInd) { raiseError(); return 0; } + if (!good) return 0; + /* so now everything in here is either a comma (and there is at most one comma) or a digit */ + if (commaInd == ci) // {,*} + { + if (endInd == commaInd + 1) { sNum = MIN_QMATCH; eNum = MAX_QMATCH; } // {,} = * + else { sNum = MIN_QMATCH; eNum = getInt(commaInd + 1, endInd - 1); } // {,+} + } + else if (commaInd == endInd - 1) { sNum = getInt(ci, commaInd - 1); eNum = MAX_QMATCH; } // {+,} + else if (commaInd == endInd) { sNum = getInt(ci, endInd - 1); eNum = sNum; } // {+} + else { sNum = getInt(ci, commaInd - 1); eNum = getInt(commaInd + 1, endInd - 1); } // {+,+} + curInd = endInd + 1; + return 1; +} +NFANode * Pattern::quantifyGroup(NFANode * start, NFANode * stop, const int gn) +{ + NFANode * newNode = NULL; + int type = 0; + + if (curInd < (int)pattern.size()) + { + char ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1]; + switch (pattern[curInd]) + { + case '*': + ++curInd; + switch (ch) + { + case '?': ++curInd; type = 1; break; + case '+': ++curInd; type = 2; break; + } + newNode = registerNode(new NFAGroupLoopPrologueNode(gn)); + newNode->next = registerNode(new NFAGroupLoopNode(start, MIN_QMATCH, MAX_QMATCH, gn, type)); + stop->next = newNode->next; + return newNode; + case '?': + ++curInd; + switch (ch) + { + case '?': ++curInd; type = 1; break; + case '+': ++curInd; type = 2; break; + } + newNode = registerNode(new NFAGroupLoopPrologueNode(gn)); + newNode->next = registerNode(new NFAGroupLoopNode(start, MIN_QMATCH, 1, gn, type)); + stop->next = newNode->next; + return newNode; + case '+': + ++curInd; + switch (ch) + { + case '?': ++curInd; type = 1; break; + case '+': ++curInd; type = 2; break; + } + newNode = registerNode(new NFAGroupLoopPrologueNode(gn)); + newNode->next = registerNode(new NFAGroupLoopNode(start, 1, MAX_QMATCH, gn, type)); + stop->next = newNode->next; + return newNode; + case '{': + { + int s, e; + if (quantifyCurly(s, e)) + { + ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1; + switch (ch) + { + case '?': ++curInd; type = 1; break; + case '+': ++curInd; type = 2; break; + } + newNode = registerNode(new NFAGroupLoopPrologueNode(gn)); + newNode->next = registerNode(new NFAGroupLoopNode(start, s, e, gn, type)); + stop->next = newNode->next; + return newNode; + } + } + default: + break; + } + } + return NULL; +} + +NFANode * Pattern::quantify(NFANode * newNode) +{ + if (curInd < (int)pattern.size()) + { + char ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1]; + switch (pattern[curInd]) + { + case '*': + ++curInd; + switch (ch) + { + case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break; + case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, MIN_QMATCH, MAX_QMATCH)); break; + default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break; + } + break; + case '?': + ++curInd; + switch (ch) + { + case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, MIN_QMATCH, 1)); break; + case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, MIN_QMATCH, 1)); break; + default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, MIN_QMATCH, 1)); break; + } + break; + case '+': + ++curInd; + switch (ch) + { + case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, 1, MAX_QMATCH)); break; + case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, 1, MAX_QMATCH)); break; + default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, 1, MAX_QMATCH)); break; + } + break; + case '{': + { + int s, e; + if (quantifyCurly(s, e)) + { + ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1; + switch (ch) + { + case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, s, e)); break; + case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, s, e)); break; + default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, s, e)); break; + } + } + } + break; + default: + break; + } + } + return newNode; +} +bkstring Pattern::parseClass() +{ + bkstring t, ret = ""; + char ch, c1, c2; + bool inv = 0, neg = 0, quo = 0; + + if (curInd < (int)pattern.size() && pattern[curInd] == '^') + { + ++curInd; + neg = 1; + } + while (curInd < (int)pattern.size() && pattern[curInd] != ']') + { + ch = pattern[curInd++]; + if (ch == '[') + { + t = parseClass(); + ret = classUnion(ret, t); + } + /*else if (ch == '-') + { + raiseError(); + curInd = pattern.size(); + }*/ + else if (ch == '&' && curInd < (int)pattern.size() && pattern[curInd] == '&') + { + if (pattern[++curInd] != '[') + { + raiseError(); + curInd = (int)pattern.size(); + } + else + { + ++curInd; + t = parseClass(); + ret = classIntersect(ret, t); + } + } + else if (ch == '\\') + { + t = parseEscape(inv, quo); + if (quo) + { + raiseError(); + curInd = (int)pattern.size(); + } + else if (inv || t.size() > 1) // cant be part of a range (a-z) + { + if (inv) t = classNegate(t); + ret = classUnion(ret, t); + } + else if (curInd < (int)pattern.size() && pattern[curInd] == '-') // part of a range (a-z) + { + c1 = t[0]; + ++curInd; + if (curInd >= (int)pattern.size()) raiseError(); + else + { + c2 = pattern[curInd++]; + if (c2 == '\\') + { + t = parseEscape(inv, quo); + if (quo) + { + raiseError(); + curInd = (int)pattern.size(); + } + else if (inv || t.size() > 1) raiseError(); + else ret = classUnion(ret, classCreateRange(c1, c2)); + } + else if (c2 == '[' || c2 == ']' || c2 == '-' || c2 == '&') + { + raiseError(); + curInd = (int)pattern.size(); + } + else ret = classUnion(ret, classCreateRange(c1, c2)); + } + } + else + { + ret = classUnion(ret, t); + } + } + else if (curInd < (int)pattern.size() && pattern[curInd] == '-') + { + c1 = ch; + ++curInd; + if (curInd >= (int)pattern.size()) raiseError(); + else + { + c2 = pattern[curInd++]; + if (c2 == '\\') + { + t = parseEscape(inv, quo); + if (quo) + { + raiseError(); + curInd = (int)pattern.size(); + } + else if (inv || t.size() > 1) raiseError(); + else ret = classUnion(ret, classCreateRange(c1, c2)); + } + else if (c2 == '[' || c2 == ']' || c2 == '-' || c2 == '&') + { + raiseError(); + curInd = (int)pattern.size(); + } + else + { + ret = classUnion(ret, classCreateRange(c1, c2)); + } + } + } + else + { + ret += " "; + ret[ret.size() - 1] = ch; + } + } + if (curInd >= (int)pattern.size() || pattern[curInd] != ']') + { + raiseError(); + ret = ""; + } + else + { + ++curInd; + if (neg) ret = classNegate(ret); + } + return ret; +} +bkstring Pattern::parsePosix() +{ + bkstring s7 = pattern.substr(curInd, 7); + if (s7 == "{Lower}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyz"; } + if (s7 == "{Upper}") { curInd += 7; return "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; } + if (s7 == "{Alpha}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; } + if (s7 == "{Digit}") { curInd += 7; return "0123456789"; } + if (s7 == "{Alnum}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; } + if (s7 == "{Punct}") { curInd += 7; return "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; } + if (s7 == "{Graph}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; } + if (s7 == "{Print}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; } + if (s7 == "{Blank}") { curInd += 7; return " \t"; } + if (s7 == "{Space}") { curInd += 7; return " \t\n\x0B\f\r"; } + if (s7 == "{Cntrl}") + { + bkstring::value_type i; + bkstring s = " "; + + for (i = 0; i < 5; ++i) s += s; + s += " "; + for (i = 0; i <= 0x1F; ++i) s[i] = i; + s[0x20] = 0x7F; + curInd += 7; + return s; + } + if (s7 == "{ASCII}") + { + bkstring s(0x80, ' '); + for (int i = 0; i <= 0x7f; ++i) s[i] = (bkstring::value_type)i; + curInd += 7; + return s; + } + if (pattern.substr(curInd, 8) == "{XDigit}") { curInd += 8; return "abcdefABCDEF0123456789"; } + raiseError(); + return ""; +} +NFANode * Pattern::parseBackref() +{ + #define is_dig(x) ((x) >= '0' && (x) <= '9') + #define to_int(x) ((x) - '0') + int ci = curInd; + int oldRef = 0, ref = 0; + + while (ci < (int)pattern.size() && is_dig(pattern[ci]) && (ref < 10 || ref < groupCount)) + { + oldRef = ref; + ref = ref * 10 + to_int(pattern[ci++]); + } + if (ci == (int)pattern.size()) + { + oldRef = ref; + ++ci; + } + if (oldRef < 0 || ci <= curInd) + { + raiseError(); + return registerNode(new NFAReferenceNode(-1)); + } + curInd = ci; + return registerNode(new NFAReferenceNode(ref)); + + #undef is_dig + #undef to_int +} +bkstring Pattern::parseOctal() +{ + #define islowoc(x) ((x) >= '0' && (x) <= '3') + #define isoc(x) ((x) >= '0' && (x) <= '7') + #define fromoc(x) ((x) - '0') + int ci = curInd; + char ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : -1; + char ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : -1; + char ch3 = (ci + 2 < (int)pattern.size()) ? pattern[ci + 2] : -1; + bkstring s = " "; + + if (islowoc(ch1) && isoc(ch2)) + { + curInd += 2; + s[0] = fromoc(ch1) * 8 + fromoc(ch2); + if (isoc(ch3)) + { + ++curInd; + s[0] = s[0] * 8 + fromoc(ch3); + } + } + else if (isoc(ch1) && isoc(ch2)) + { + curInd += 2; + s[0] = fromoc(ch1) * 8 + fromoc(ch2); + } + else raiseError(); + + return s; + #undef islowoc + #undef isoc + #undef fromoc +} +bkstring Pattern::parseHex() +{ + #define to_low(x) (((x) >= 'A' && (x) <= 'Z') ? ((x) - 'A' + 'a') : (x)) + #define is_dig(x) ((x) >= '0' && (x) <= '9') + #define is_hex(x) (is_dig(x) || (to_low(x) >= 'a' && to_low(x) <= 'f')) + #define to_int(x) ((is_dig(x)) ? ((x) - '0') : (to_low(x) - 'a' + 10)) + + int ci = curInd; + char ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : -1; + char ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : -1; + bkstring s = " "; + + if (is_hex(ch1) && is_hex(ch2)) + { + curInd += 2; + s[0] = (to_int(ch1) << 4 & 0xF0) | (to_int(ch2) & 0x0F); + } + + return s; + #undef to_low + #undef is_dig + #undef is_hex + #undef to_int +} +bkstring Pattern::parseEscape(bool & inv, bool & quo) +{ + char ch = pattern[curInd++]; + bkstring classes = ""; + + if (curInd > (int)pattern.size()) + { + raiseError(); + return NULL; + } + + quo = 0; + inv = 0; + switch (ch) + { + case 'p': classes = parsePosix(); break; + case 'P': classes = "!!"; classes += parsePosix(); break; + case 'd': classes = "0123456789"; break; + case 'D': classes = "!!0123456789"; break; + case 's': classes = " \t\r\n\f"; break; + case 'S': classes = "!! \t\r\n\f"; break; + case 'w': classes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; break; + case 'W': classes = "!!abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; break; + case '0': classes = parseOctal(); break; + case 'x': classes = parseHex(); break; + + case 'Q': quo = 1; break; + case 't': classes = "\t"; break; + case 'r': classes = "\r"; break; + case 'n': classes = "\n"; break; + case 'f': classes = "\f"; break; + case 'a': classes = "\a"; break; + case 'e': classes = "\r"; break; + default: classes = " "; classes[0] = ch; break; + } + if (classes.substr(0, 2) == "!!") + { + classes = classes.substr(2); + inv = 1; + } + return classes; +} +NFANode * Pattern::parseRegisteredPattern(NFANode ** end) +{ + int i, j; + bkstring s; + NFANode * ret = NULL; + for (i = curInd; i < (int)pattern.size() && pattern[i] != '}'; ++i) { } + if (pattern[i] != '}') { raiseError(); return NULL; } + if (i == curInd + 1) { raiseError(); return NULL; } // {} + if ( + !( + (pattern[curInd] >= 'a' && pattern[curInd] <= 'z') || + (pattern[curInd] >= 'A' && pattern[curInd] <= 'Z') || + (pattern[curInd] == '_') + ) + ) + { + raiseError(); + return NULL; + } + for (j = curInd; !error && j < i; ++j) + { + if ( + !( + (pattern[j] >= 'a' && pattern[j] <= 'z') || + (pattern[j] >= 'A' && pattern[j] <= 'Z') || + (pattern[j] >= '0' && pattern[j] <= '9') || + (pattern[j] == '_') + ) + ) + { + raiseError(); + return NULL; + } + } + s = pattern.substr(curInd, i - curInd); + if (registeredPatterns.find(s) == registeredPatterns.end()) raiseError(); + else + { + unsigned long oflags = flags; + bkstring op = pattern; + int ci = i + 1; + + pattern = registeredPatterns[s].first; + curInd = 0; + flags = registeredPatterns[s].second; + + --groupCount; + ret = parse(0, 0, end); + + pattern = op; + curInd = ci; + flags = oflags; + } + if (error) { *end = ret = NULL; } + return ret; +} + +// look behind should interpret everything as a literal (except \\) since the +// pattern must have a concrete length +NFANode * Pattern::parseBehind(const bool pos, NFANode ** end) +{ + bkstring t = ""; + while (curInd < (int)pattern.size() && pattern[curInd] != ')') + { + char ch = pattern[curInd++]; + t += " "; + if (ch == '\\') + { + if (curInd + 1 >= (int)pattern.size()) + { + raiseError(); + return *end = registerNode(new NFACharNode(' ')); + } + ch = pattern[curInd++]; + } + t[t.size() - 1] = ch; + } + if (curInd >= (int)pattern.size() || pattern[curInd] != ')') raiseError(); + else ++curInd; + return *end = registerNode(new NFALookBehindNode(t, pos)); +} +NFANode * Pattern::parseQuote() +{ + bool done = 0; + bkstring s = ""; + + while (!done) + { + if (curInd >= (int)pattern.size()) + { + raiseError(); + done = 1; + } + else if (pattern.substr(curInd, 2) == "\\E") + { + curInd += 2; + done = 1; + } + else if (pattern[curInd] == '\\') + { + s += " "; + s[s.size() - 1] = pattern[++curInd]; + ++curInd; + } + else + { + s += " "; + s[s.size() - 1] = pattern[curInd++]; + } + } + if ((flags & Pattern::CASE_INSENSITIVE) != 0) return registerNode(new NFACIQuoteNode(s)); + return registerNode(new NFAQuoteNode(s)); +} +NFANode * Pattern::parse(const bool inParen, const bool inOr, NFANode ** end) +{ + NFANode * start, * cur, * next = NULL; + bkstring t; + int grc = groupCount++; + bool inv, quo; + bool ahead = 0, pos = 0, noncap = 0, indep = 0; + unsigned long oldFlags = flags; + + if (inParen) + { + if (pattern[curInd] == '?') + { + ++curInd; + --groupCount; + if (pattern[curInd] == ':') { noncap = 1; ++curInd; grc = --nonCapGroupCount; } + else if (pattern[curInd] == '=') { ++curInd; ahead = 1; pos = 1; } + else if (pattern[curInd] == '!') { ++curInd; ahead = 1; pos = 0; } + else if (pattern.substr(curInd, 2) == "<=") { curInd += 2; return parseBehind(1, end); } + else if (pattern.substr(curInd, 2) == "') { ++curInd; indep = 1; } + else + { + bool negate = false, done = false; + while (!done) + { + if (curInd >= (int)pattern.size()) + { + raiseError(); + return NULL; + } + else if (negate) + { + switch (pattern[curInd]) + { + case 'i': flags &= ~Pattern::CASE_INSENSITIVE; break; + case 'd': flags &= ~Pattern::UNIX_LINE_MODE; break; + case 'm': flags &= ~Pattern::MULTILINE_MATCHING; break; + case 's': flags &= ~Pattern::DOT_MATCHES_ALL; break; + case ':': done = true; break; + case ')': + ++curInd; + *end = registerNode(new NFALookBehindNode("", true)); + return *end; + case '-': + default: raiseError(); return NULL; + } + } + else + { + switch (pattern[curInd]) + { + case 'i': flags |= Pattern::CASE_INSENSITIVE; break; + case 'd': flags |= Pattern::UNIX_LINE_MODE; break; + case 'm': flags |= Pattern::MULTILINE_MATCHING; break; + case 's': flags |= Pattern::DOT_MATCHES_ALL; break; + case ':': done = true; break; + case '-': negate = true; break; + case ')': + ++curInd; + *end = registerNode(new NFALookBehindNode("", true)); + return *end; + default: raiseError(); return NULL; + } + } + ++curInd; + } + noncap = 1; + grc = --nonCapGroupCount; + } + if (noncap) cur = start = registerNode(new NFAGroupHeadNode(grc)); + else cur = start = registerNode(new NFASubStartNode); + } + else cur = start = registerNode(new NFAGroupHeadNode(grc)); + } + else cur = start = registerNode(new NFASubStartNode); + while (curInd < (int)pattern.size()) + { + char ch = pattern[curInd++]; + + next = NULL; + if (error) return NULL; + switch (ch) + { + case '^': + if ((flags & Pattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAStartOfLineNode); + else next = registerNode(new NFAStartOfInputNode); + break; + case '$': + if ((flags & Pattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAEndOfLineNode); + else next = registerNode(new NFAEndOfInputNode(0)); + break; + case '|': + --groupCount; + cur->next = registerNode(new NFAAcceptNode); + cur = start = registerNode(new NFAOrNode(start, parse(inParen, 1))); + break; + case '\\': + if (curInd < (int)pattern.size()) + { + bool eoi = 0; + switch (pattern[curInd]) + { + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': next = parseBackref(); break; + case 'A': ++curInd; next = registerNode(new NFAStartOfInputNode); break; + case 'B': ++curInd; next = registerNode(new NFAWordBoundaryNode(0)); break; + case 'b': ++curInd; next = registerNode(new NFAWordBoundaryNode(1)); break; + case 'G': ++curInd; next = registerNode(new NFAEndOfMatchNode); break; + case 'Z': eoi = 1; + case 'z': ++curInd; next = registerNode(new NFAEndOfInputNode(eoi)); break; + default: + t = parseEscape(inv, quo); + if (!quo) + { + if (t.size() > 1 || inv) + { + if ((flags & Pattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACIClassNode(t, inv)); + else next = registerNode(new NFAClassNode(t, inv)); + } + else + { + next = registerNode(new NFACharNode(t[0])); + } + } + else + { + next = parseQuote(); + } + } + } + else raiseError(); + break; + case '[': + if ((flags & Pattern::CASE_INSENSITIVE) == 0) + { + NFAClassNode * clazz = new NFAClassNode(); + bkstring s = parseClass(); + for (int i = 0; i < (int)s.size(); ++i) clazz->vals[s[i]] = 1; + next = registerNode(clazz); + } + else + { + NFACIClassNode * clazz = new NFACIClassNode(); + bkstring s = parseClass(); + for (int i = 0; i < (int)s.size(); ++i) clazz->vals[to_lower(s[i])] = 1; + next = registerNode(clazz); + } + break; + case '.': + { + bool useN = 1, useR = 1; + NFAClassNode * clazz = new NFAClassNode(1); + if ((flags & Pattern::UNIX_LINE_MODE) != 0) useR = 0; + if ((flags & Pattern::DOT_MATCHES_ALL) != 0) useN = useR = 0; + if (useN) clazz->vals['\n'] = 1; + if (useR) clazz->vals['\r'] = 1; + next = registerNode(clazz); + } + break; + case '(': + { + NFANode * end, * t1, * t2; + t1 = parse(1, 0, &end); + if (!t1) raiseError(); + else if (t1->isGroupHeadNode() && (t2 = quantifyGroup(t1, end, grc)) != NULL) + { + cur->next = t2; + cur = t2->next; + } + else + { + cur->next = t1; + cur = end; + } + } + break; + case ')': + if (!inParen) raiseError(); + else if (inOr) + { + --curInd; + cur = cur->next = registerNode(new NFAAcceptNode); + flags = oldFlags; + return start; + } + else + { + if (ahead) + { + cur = cur->next = registerNode(new NFAAcceptNode); + flags = oldFlags; + return *end = registerNode(new NFALookAheadNode(start, pos)); + } + else if (indep) + { + cur = cur->next = registerNode(new NFAAcceptNode); + flags = oldFlags; + return *end = registerNode(new NFAPossessiveQuantifierNode(this, start, 1, 1)); + } + else // capping or noncapping, it doesnt matter + { + *end = cur = cur->next = registerNode(new NFAGroupTailNode(grc)); + next = quantifyGroup(start, *end, grc); + if (next) + { + start = next; + *end = next->next; + } + flags = oldFlags; + return start; + } + } + break; + case '{': // registered pattern + cur->next = parseRegisteredPattern(&next); + if (cur->next) cur = next; + break; + case '*': + case '+': + case '?': +// case '}': +// case ']': + raiseError(); + break; + default: + if ((flags & Pattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACICharNode(ch)); + else next = registerNode(new NFACharNode(ch)); + break; + } + if (next) + { + cur = cur->next = quantify(next); + } + } + if (inParen) raiseError(); + else + { + if (inOr) cur = cur->next = registerNode(new NFAAcceptNode); + if (end) *end = cur; + } + + flags = oldFlags; + if (error) return NULL; + + return start; +} + +Pattern * Pattern::compile(const bkstring & pattern, const unsigned long mode) +{ + Pattern * p = new Pattern(pattern); + NFANode * end; + + p->flags = mode; + if ((mode & Pattern::LITERAL) != 0) + { + p->head = p->registerNode(new NFAStartNode); + if ((mode & Pattern::CASE_INSENSITIVE) != 0) p->head->next = p->registerNode(new NFACIQuoteNode(pattern)); + else p->head->next = p->registerNode(new NFAQuoteNode(pattern)); + p->head->next->next = p->registerNode(new NFAEndNode); + } + else + { + p->head = p->parse(0, 0, &end); + if (!p->head) + { + delete p; + p = NULL; + } + else + { + if (!(p->head && p->head->isStartOfInputNode())) + { + NFANode * n = p->registerNode(new NFAStartNode); + n->next = p->head; + p->head = n; + } + end->next = p->registerNode(new NFAEndNode); + } + } + if (p != NULL) + { + p->matcher = new Matcher(p, ""); + } + + return p; +} + +Pattern * Pattern::compileAndKeep(const bkstring & pattern, const unsigned long mode) +{ + Pattern * ret = NULL; + std::map::iterator it = compiledPatterns.find(pattern); + + if (it != compiledPatterns.end()) + { + ret = it->second; + } + else + { + ret = compile(pattern, mode); + compiledPatterns[pattern] = ret; + } + + return ret; +} +bkstring Pattern::replace(const bkstring & pattern, const bkstring & str, + const bkstring & replacementText, const unsigned long mode) +{ + bkstring ret; + Pattern * p = Pattern::compile(pattern, mode); + if (p) + { + ret = p->replace(str, replacementText); + delete p; + } + return ret; +} + +std::vector Pattern::split(const bkstring & pattern, const bkstring & str, const bool keepEmptys, + const unsigned long limit, const unsigned long mode) +{ + std::vector ret; + Pattern * p = Pattern::compile(pattern, mode); + if (p) + { + ret = p->split(str, keepEmptys, limit); + delete p; + } + return ret; +} + +std::vector Pattern::findAll(const bkstring & pattern, const bkstring & str, const unsigned long mode) +{ + std::vector ret; + Pattern * p = Pattern::compile(pattern, mode); + if (p) + { + ret = p->findAll(str); + delete p; + } + return ret; +} + +bool Pattern::matches(const bkstring & pattern, const bkstring & str, const unsigned long mode) +{ + bool ret = 0; + Pattern * p = compile(pattern, mode); + + if (p) + { + ret = p->matches(str); + delete p; + } + + return ret; +} + +bool Pattern::registerPattern(const bkstring & name, const bkstring & pattern, const unsigned long mode) +{ + Pattern * p = Pattern::compile(pattern, mode); + if (!p) return 0; + Pattern::registeredPatterns[name] = std::make_pair(pattern, mode); + delete p; + return 1; +} + +void Pattern::unregisterPatterns() +{ + registeredPatterns.clear(); +} +void Pattern::clearPatternCache() +{ + std::map::iterator it; + for (it = compiledPatterns.begin(); it != compiledPatterns.end(); ++it) + { + delete it->second; + } + compiledPatterns.clear(); +} + +std::pair Pattern::findNthMatch(const bkstring & pattern, const bkstring & str, + const int matchNum, const unsigned long mode) +{ + std::pair ret; + Pattern * p = Pattern::compile(pattern, mode); + + ret.second = -1; + if (p) + { + int i = -1; + p->matcher->setString(str); + while (i < matchNum && p->matcher->findNextMatch()) { ++i; } + if (i == matchNum && p->matcher->getStartingIndex() >= 0) + { + ret.first = p->matcher->getGroup(0); + ret.second = p->matcher->getStartingIndex(); + } + delete p; + } + + return ret; +} + +Pattern::~Pattern() +{ + if (matcher) delete matcher; + for (std::map::iterator it = nodes.begin(); it != nodes.end(); ++it) + { + delete it->first; + } +} +bkstring Pattern::replace(const bkstring & str, const bkstring & replacementText) +{ + int li = 0; + bkstring ret = ""; + + matcher->setString(str); + while (matcher->findNextMatch()) + { + ret += str.substr(li, matcher->getStartingIndex() - li); + ret += matcher->replaceWithGroups(replacementText); + li = matcher->getEndingIndex(); + } + ret += str.substr(li); + + return ret; +} +std::vector Pattern::split(const bkstring & str, const bool keepEmptys, const unsigned long limit) +{ + unsigned long lim = (limit == 0 ? MAX_QMATCH : limit); + int li = 0; + std::vector ret; + + matcher->setString(str); + + while (matcher->findNextMatch() && ret.size() < lim) + { + if (matcher->getStartingIndex() == 0 && keepEmptys) ret.push_back(""); + if ((matcher->getStartingIndex() != matcher->getEndingIndex()) || keepEmptys) + { + if (li != matcher->getStartingIndex() || keepEmptys) + { + ret.push_back(str.substr(li, matcher->getStartingIndex() - li)); + } + li = matcher->getEndingIndex(); + } + } + if (li < (int)str.size()) ret.push_back(str.substr(li)); + + return ret; +} +std::vector Pattern::findAll(const bkstring & str) +{ + matcher->setString(str); + return matcher->findAll(); +} +bool Pattern::matches(const bkstring & str) +{ + matcher->setString(str); + return matcher->matches(); +} +unsigned long Pattern::getFlags() const +{ + return flags; +} +bkstring Pattern::getPattern() const +{ + return pattern; +} +Matcher * Pattern::createMatcher(const bkstring & str) +{ + return new Matcher(this, str); +} + +// NFANode + +NFANode::NFANode() { next = NULL; } +NFANode::~NFANode() { } +void NFANode::findAllNodes(std::map & soFar) +{ + if (soFar.find(this) == soFar.end()) return; + soFar[this] = 1; + if (next) next->findAllNodes(soFar); +} + +// NFACharNode + +NFACharNode::NFACharNode(const char c) { ch = c; } +int NFACharNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (curInd < (int)str.size() && str[curInd] == ch) return next->match(str, matcher, curInd + 1); + return -1; +} + +// NFACICharNode + +NFACICharNode::NFACICharNode(const char c) { ch = to_lower(c); } +int NFACICharNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (curInd < (int)str.size() && to_lower(str[curInd]) == ch) return next->match(str, matcher, curInd + 1); + return -1; +} + +// NFAStartNode + +NFAStartNode::NFAStartNode() { } +int NFAStartNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int ret = -1, ci = curInd; + + matcher->starts[0] = curInd; + if ((matcher->getFlags() & Matcher::MATCH_ENTIRE_STRING) == (unsigned int)Matcher::MATCH_ENTIRE_STRING) + { + if (curInd != 0) + { + matcher->starts[0] = -1; + return -1; + } + return next->match(str, matcher, 0); + } + while ((ret = next->match(str, matcher, ci)) == -1 && ci < (int)str.size()) + { + matcher->clearGroups(); + matcher->starts[0] = ++ci; + } + if (ret < 0) matcher->starts[0] = -1; + return ret; +} + +// NFAEndNode + +NFAEndNode::NFAEndNode() { } +int NFAEndNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + matcher->ends[0] = curInd; + if ((matcher->getFlags() & Matcher::MATCH_ENTIRE_STRING) != 0) + { + if (curInd == (int)str.size()) return curInd; + matcher->ends[0] = -1; + return -1; + } + return curInd; +} + +// NFAQuantifierNode + +void NFAQuantifierNode::findAllNodes(std::map & soFar) +{ + inner->findAllNodes(soFar); + NFANode::findAllNodes(soFar); +} +NFAQuantifierNode::NFAQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch) +{ + inner = internal; + inner->next = pat->registerNode(new NFAAcceptNode); + min = (minMatch < Pattern::MIN_QMATCH) ? Pattern::MIN_QMATCH : minMatch; + max = (maxMatch > Pattern::MAX_QMATCH) ? Pattern::MAX_QMATCH : maxMatch; +} + +int NFAQuantifierNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int i0, i1, i2 = 0; + + i0 = i1 = curInd; + while (i2 < min) + { + + ++i2; + i1 = inner->match(str, matcher, i0); + if (i1 <= i0) return i1; // i1 < i0 means i1 is -1 + i0 = i1; + } + + return i1; +} +// NFAGreedyQuantifierNode + +NFAGreedyQuantifierNode::NFAGreedyQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch) + : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { } +int NFAGreedyQuantifierNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int t = NFAQuantifierNode::match(str, matcher, curInd); + if (t != -1) return matchInternal(str, matcher, t, min); + return t; +} +int NFAGreedyQuantifierNode::matchInternal(const bkstring & str, Matcher * matcher, const int curInd, const int soFar) const +{ + if (soFar >= max) return next->match(str, matcher, curInd); + + int i, j; + + i = inner->match(str, matcher, curInd); + if (i != -1) + { + j = matchInternal(str, matcher, i, soFar + 1); + if (j != -1) return j; + } + return next->match(str, matcher, curInd); +} + +// NFALazyQuantifierNode + +NFALazyQuantifierNode::NFALazyQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch) + : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { } +int NFALazyQuantifierNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int i, j, m = NFAQuantifierNode::match(str, matcher, curInd); + + if (m == -1) return -1; + + for (i = min; i < max; ++i) + { + j = next->match(str, matcher, m); + if (j == -1) + { + j = inner->match(str, matcher, m); + // if j < m, then j is -1, so we bail. + // if j == m, then we would just go and call next->match on the same index, + // but it already failed trying to match right there, so we know we can + // just bail + if (j <= m) return -1; + m = j; + } + else return j; + } + return next->match(str, matcher, m); +} + +// NFAPossessiveQuantifierNode + +NFAPossessiveQuantifierNode::NFAPossessiveQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch) + : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { } +int NFAPossessiveQuantifierNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int i, j, m = NFAQuantifierNode::match(str, matcher, curInd); + + if (m == -1) return -1; + for (i = min; i < max; ++i) + { + j = inner->match(str, matcher, m); + if (j <= m) return next->match(str, matcher, m); + m = j; + } + return next->match(str, matcher, m); +} + +// NFAAcceptNode + +NFAAcceptNode::NFAAcceptNode() { } +int NFAAcceptNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (!next) return curInd; + else return next->match(str, matcher, curInd); +} + +// NFAClassNode + +NFAClassNode::NFAClassNode(const bool invert) +{ + inv = invert; +} +NFAClassNode::NFAClassNode(const bkstring & clazz, const bool invert) +{ + inv = invert; + for (int i = 0; i < (int)clazz.size(); ++i) vals[clazz[i]] = 1; +} +int NFAClassNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (curInd < (int)str.size() && ((vals.find(str[curInd]) != vals.end()) ^ inv)) + { + return next->match(str, matcher, curInd + 1); + } + return -1; +} + +// NFACIClassNode + +NFACIClassNode::NFACIClassNode(const bool invert) +{ + inv = invert; +} +NFACIClassNode::NFACIClassNode(const bkstring & clazz, const bool invert) +{ + inv = invert; + for (int i = 0; i < (int)clazz.size(); ++i) vals[to_lower(clazz[i])] = 1; +} +int NFACIClassNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (curInd < (int)str.size() && ((vals.find(to_lower(str[curInd])) != vals.end()) ^ inv)) + { + return next->match(str, matcher, curInd + 1); + } + return -1; +} + +// NFASubStartNode + +NFASubStartNode::NFASubStartNode() { } +int NFASubStartNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + return next->match(str, matcher, curInd); +} + +// NFAOrNode + +NFAOrNode::NFAOrNode(NFANode * first, NFANode * second) : one(first), two(second) { } +void NFAOrNode::findAllNodes(std::map & soFar) +{ + if (one) one->findAllNodes(soFar); + if (two) two->findAllNodes(soFar); + NFANode::findAllNodes(soFar); +} +int NFAOrNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int ci = one->match(str, matcher, curInd); + + if (ci != -1) ci = next->match(str, matcher, ci); + if (ci != -1) return ci; + if (ci == -1) ci = two->match(str, matcher, curInd); + if (ci != -1) ci = next->match(str, matcher, ci); + return ci; +} + +// NFAQuoteNode + +NFAQuoteNode::NFAQuoteNode(const bkstring & quoted) : qStr(quoted) { } +int NFAQuoteNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (curInd + qStr.size() > str.size()) return -1; + if (str.substr(curInd, qStr.size()) != qStr) return -1; + return next->match(str, matcher, curInd + qStr.size()); +} + +// NFACIQuoteNode + +NFACIQuoteNode::NFACIQuoteNode(const bkstring & quoted) : qStr(quoted) { } +int NFACIQuoteNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (curInd + qStr.size() > str.size()) return -1; + if (str_icmp(str.substr(curInd, qStr.size()).c_str(), qStr.c_str())) return -1; + return next->match(str, matcher, qStr.size()); +} + +// NFALookAheadNode + +NFALookAheadNode::NFALookAheadNode(NFANode * internal, const bool positive) : NFANode(), pos(positive), inner(internal) { } +void NFALookAheadNode::findAllNodes(std::map & soFar) +{ + if (inner) inner->findAllNodes(soFar); + NFANode::findAllNodes(soFar); +} +int NFALookAheadNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + return ((inner->match(str, matcher, curInd) == -1) ^ pos) ? next->match(str, matcher, curInd) : -1; +} + +// NFALookBehindNode + +NFALookBehindNode::NFALookBehindNode(const bkstring & str, const bool positive) : pos(positive), mStr(str) { } +int NFALookBehindNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (pos) + { + if (curInd < (int)mStr.size()) return -1; + if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return next->match(str, matcher, curInd); + } + else + { + if (curInd < (int)mStr.size()) return next->match(str, matcher, curInd); + if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return -1; + return next->match(str, matcher, curInd); + } + return -1; +} + +// NFAStartOfLineNode + +NFAStartOfLineNode::NFAStartOfLineNode() { } +int NFAStartOfLineNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (curInd == 0 || str[curInd - 1] == '\n' || str[curInd - 1] == '\r') + { + return next->match(str, matcher, curInd); + } + return -1; +} + +// NFAEndOfLineNode + +NFAEndOfLineNode::NFAEndOfLineNode() { } +int NFAEndOfLineNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (curInd >= (int)str.size() || str[curInd] == '\n' || str[curInd] == '\r') + { + return next->match(str, matcher, curInd); + } + return -1; +} + +// NFAReferenceNode + +NFAReferenceNode::NFAReferenceNode(const int groupIndex) : gi(groupIndex) { } +int NFAReferenceNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int len = matcher->ends[gi] - matcher->starts[gi]; + int ni = -1; + if (gi < 1 || matcher->ends[gi] < matcher->starts[gi] || len == 0) ni = curInd; + else if (curInd + len > (int)str.size()) return -1; + else if (str.substr(curInd, len) != str.substr(matcher->starts[gi], len)) return -1; + else ni = curInd + len; + + return next->match(str, matcher, ni); +} + +// NFAStartOfInputNode + +NFAStartOfInputNode::NFAStartOfInputNode() { } +int NFAStartOfInputNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (curInd == 0) return next->match(str, matcher, curInd); + return -1; +} + +// NFAEndOfInputNode + +NFAEndOfInputNode::NFAEndOfInputNode(const bool lookForTerm) : term(lookForTerm) { } +int NFAEndOfInputNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int len = (int)str.size(); + if (curInd == len) return next->match(str, matcher, curInd); + else if (term) + { + if (curInd == len - 1 && (str[curInd] == '\r' || str[curInd] == '\n')) + { + return next->match(str, matcher, curInd); + } + else if (curInd == len - 2 && str.substr(curInd, 2) == "\r\n") + { + return next->match(str, matcher, curInd); + } + } + return -1; +} + +// NFAWordBoundaryNode + +NFAWordBoundaryNode::NFAWordBoundaryNode(const bool positive) : pos(positive) { } +int NFAWordBoundaryNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int len = (int)str.size(); + + char c1 = (curInd - 1 < len && curInd > 0) ? str[curInd - 1] : '\n'; + char c2 = (curInd < len) ? str[curInd ] : '\n'; + + if (curInd == len) return next->match(str, matcher, curInd); + bool ok = is_alpha(c1) != is_alpha(c2); + if (ok && pos) return next->match(str, matcher, curInd); + return -1; +} + +// NFAEndOfMatchNode + +NFAEndOfMatchNode::NFAEndOfMatchNode() { } +int NFAEndOfMatchNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + if (curInd == matcher->lm) return next->match(str, matcher, curInd); + return -1; +} + +// NFAGroupHeadNode + +NFAGroupHeadNode::NFAGroupHeadNode(const int groupIndex) : gi(groupIndex) { } +int NFAGroupHeadNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int ret, o = matcher->starts[gi]; + + matcher->starts[gi] = curInd; + ret = next->match(str, matcher, curInd); + if (ret < 0) matcher->starts[gi] = o; + + return ret; +} + +// NFAGroupTailNode + +NFAGroupTailNode::NFAGroupTailNode(const int groupIndex) : gi(groupIndex) { } +int NFAGroupTailNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int ret, o = matcher->ends[gi]; + + matcher->ends[gi] = curInd; + ret = next->match(str, matcher, curInd); + if (ret < 0) matcher->ends[gi] = o; + + return ret; +} + +// NFAGroupLoopPrologueNode + +NFAGroupLoopPrologueNode::NFAGroupLoopPrologueNode(const int groupIndex) : gi(groupIndex) { } +int NFAGroupLoopPrologueNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int ret, o1 = matcher->groups[gi], o2 = matcher->groupPos[gi], o3 = matcher->groupIndeces[gi]; + + matcher->groups[gi] = 0; + matcher->groupPos[gi] = 0; + matcher->groupIndeces[gi] = -1; + ret = next->match(str, matcher, curInd); + if (ret < 0) + { + matcher->groups[gi] = o1; + matcher->groupPos[gi] = o2; + matcher->groupIndeces[gi] = o3; + } + + return ret; +} + +// NFAGroupLoopNode + +NFAGroupLoopNode::NFAGroupLoopNode(NFANode * internal, const int minMatch, const int maxMatch, + const int groupIndex, const int matchType) +{ + inner = internal; + min = minMatch; + max = maxMatch; + gi = groupIndex; + type = matchType; +} +void NFAGroupLoopNode::findAllNodes(std::map & soFar) +{ + if (inner) inner->findAllNodes(soFar); + NFANode::findAllNodes(soFar); +} +int NFAGroupLoopNode::match(const bkstring & str, Matcher * matcher, const int curInd) const +{ + bool b = (curInd > matcher->groupIndeces[gi]); + + if (b && matcher->groups[gi] < min) + { + ++matcher->groups[gi]; + int o = matcher->groupIndeces[gi]; + matcher->groupIndeces[gi] = curInd; + int ret = inner->match(str, matcher, curInd); + if (ret < 0) + { + matcher->groupIndeces[gi] = o; + --matcher->groups[gi]; + } + return ret; + } + else if (!b || matcher->groups[gi] >= max) + { + return next->match(str, matcher, curInd); + } + else + { + switch (type) + { + case 0: return matchGreedy(str, matcher, curInd); + case 1: return matchLazy(str, matcher, curInd); + case 2: return matchPossessive(str, matcher, curInd); + } + } + return -1; +} +int NFAGroupLoopNode::matchGreedy(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int o = matcher->groupIndeces[gi]; // save our info for backtracking + matcher->groupIndeces[gi] = curInd; // move along + ++matcher->groups[gi]; + int ret = inner->match(str, matcher, curInd); // match internally + if (ret < 0) + { // if we failed, then restore info and match next + --matcher->groups[gi]; + matcher->groupIndeces[gi] = o; + ret = next->match(str, matcher, curInd); + } + return ret; +} +int NFAGroupLoopNode::matchLazy(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int ret = next->match(str, matcher, curInd); // be lazy, just go on + if (ret < 0) + { + int o = matcher->groupIndeces[gi]; // save info for backtracking + matcher->groupIndeces[gi] = curInd; // advance our position + ++matcher->groups[gi]; + ret = inner->match(str, matcher, curInd); // match our internal stuff + if (ret < 0) // if we failed, then restore the info + { + --matcher->groups[gi]; + matcher->groupIndeces[gi] = o; + } + } + return ret; +} +int NFAGroupLoopNode::matchPossessive(const bkstring & str, Matcher * matcher, const int curInd) const +{ + int o = matcher->groupIndeces[gi]; // save info for backtracking + matcher->groupPos[gi] = matcher->groups[gi]; // set a flag stating we have matcher at least this much + matcher->groupIndeces[gi] = curInd; // move along + ++matcher->groups[gi]; + int ret = inner->match(str, matcher, curInd); // try and match again + if (ret < 0) + { // if we fail, back off, but to an extent + --matcher->groups[gi]; + matcher->groupIndeces[gi] = o; + if (matcher->groups[gi] == matcher->groupPos[gi]) ret = next->match(str, matcher, curInd); + } + return ret; +} + +#ifdef _WIN32 + #pragma warning(pop) +#endif diff --git a/plugins/SmileyAdd/src/regexp/Pattern.h b/plugins/SmileyAdd/src/regexp/Pattern.h new file mode 100644 index 0000000000..bb16ad90fa --- /dev/null +++ b/plugins/SmileyAdd/src/regexp/Pattern.h @@ -0,0 +1,1663 @@ +#ifndef __PATTERN_H__ +#define __PATTERN_H__ + +#ifdef _WIN32 + #pragma warning(disable:4786) +#endif + +#include +#include + +#include "bkstring.h" + +class Matcher; +class NFANode; +class NFAQuantifierNode; + +/** + This pattern class is very similar in functionality to Java's + java.util.regex.Pattern class. The pattern class represents an immutable + regular expression object. Instead of having a single object contain both the + regular expression object and the matching object, instead the two objects are + split apart. The {@link Matcher Matcher} class represents the maching + object. + + The Pattern class works primarily off of "compiled" patterns. A typical + instantiation of a regular expression looks like: + +

+  Pattern * p = Pattern::compile("a*b");
+  Matcher * m = p->createMatcher("aaaaaab");
+  if (m->matches()) ...
+  
+ + However, if you do not need to use a pattern more than once, it is often times + okay to use the Pattern's static methods insteads. An example looks like this: + +
+  if (Pattern::matches("a*b", "aaaab")) { ... }
+  
+ + This class does not currently support unicode. The unicode update for this + class is coming soon. + + This class is partially immutable. It is completely safe to call createMatcher + concurrently in different threads, but the other functions (e.g. split) should + not be called concurrently on the same Pattern. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Construct + + Matches + +
+   +
+ Characters +
+ x + + The character x +
+ \\ + + The character \ +
+ \0nn + + The character with octal ASCII value nn +
+ \0nnn + + The character with octal ASCII value nnn +
+ \xhh + + The character with hexadecimal ASCII value hh +
+ \t + + A tab character +
+ \r + + A carriage return character +
+ \n + + A new-line character +
+   +
+ Character Classes +
+ [abc] + + Either a, b, or c +
+ [^abc] + + Any character but a, b, or c +
+ [a-zA-Z] + + Any character ranging from a thru z, or + A thru Z +
+ [^a-zA-Z] + + Any character except those ranging from a thru + z, or A thru Z +
+ [a\-z] + + Either a, -, or z +
+ [a-z[A-Z]] + + Same as [a-zA-Z] +
+ [a-z&&[g-i]] + + Any character in the intersection of a-z and + g-i +
+ [a-z&&[^g-i]] + + Any character in a-z and not in g-i +
+   +
+ Prefefined character classes +
+ . + + Any character. Multiline matching must be compiled into the pattern for + . to match a \r or a \n. + Even if multiline matching is enabled, . will not + match a \r\n, only a \r or a \n. +
+ \d + + [0-9] +
+ \D + + [^\d] +
+ \s + + [ \t\r\n\x0B] +
+ \S + + [^\s] +
+ \w + + [a-zA-Z0-9_] +
+ \W + + [^\w] +
+   +
+ POSIX character classes +
+ \p{Lower} + + [a-z] +
+ \p{Upper} + + [A-Z] +
+ \p{ASCII} + + [\x00-\x7F] +
+ \p{Alpha} + + [a-zA-Z] +
+ \p{Digit} + + [0-9] +
+ \p{Alnum} + + [\w&&[^_]] +
+ \p{Punct} + + [!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~] +
+ \p{XDigit} + + [a-fA-F0-9] +
+   +
+ Boundary Matches +
+ ^ + + The beginning of a line. Also matches the beginning of input. +
+ $ + + The end of a line. Also matches the end of input. +
+ \b + + A word boundary +
+ \B + + A non word boundary +
+ \A + + The beginning of input +
+ \G + + The end of the previous match. Ensures that a "next" match will only + happen if it begins with the character immediately following the end of + the "current" match. +
+ \Z + + The end of input. Will also match if there is a single trailing + \r\n, a single trailing \r, or a single + trailing \n. +
+ \z + + The end of input +
+   +
+ Greedy Quantifiers +
+ x? + + x, either zero times or one time +
+ x* + + x, zero or more times +
+ x+ + + x, one or more times +
+ x{n} + + x, exactly n times +
+ x{n,} + + x, at least n times +
+ x{,m} + + x, at most m times +
+ x{n,m} + + x, at least n times and at most + m times +
+   +
+ Possessive Quantifiers +
+ x?+ + + x, either zero times or one time +
+ x*+ + + x, zero or more times +
+ x++ + + x, one or more times +
+ x{n}+ + + x, exactly n times +
+ x{n,}+ + + x, at least n times +
+ x{,m}+ + + x, at most m times +
+ x{n,m}+ + + x, at least n times and at most + m times +
+   +
+ Reluctant Quantifiers +
+ x?? + + x, either zero times or one time +
+ x*? + + x, zero or more times +
+ x+? + + x, one or more times +
+ x{n}? + + x, exactly n times +
+ x{n,}? + + x, at least n times +
+ x{,m}? + + x, at most m times +
+ x{n,m}? + + x, at least n times and at most + m times +
+   +
+ Operators +
+ xy + + x then y +
+ x|y + + x or y +
+ (x) + + x as a capturing group +
+   +
+ Quoting +
+ \Q + + Nothing, but treat every character (including \s) literally until a + matching \E +
+ \E + + Nothing, but ends its matching \Q +
+   +
+ Special Constructs +
+ (?:x) + + x, but not as a capturing group +
+ (?=x) + + x, via positive lookahead. This means that the + expression will match only if it is trailed by x. + It will not "eat" any of the characters matched by + x. +
+ (?!x) + + x, via negative lookahead. This means that the + expression will match only if it is not trailed by + x. It will not "eat" any of the characters + matched by x. +
+ (?<=x) + + x, via positive lookbehind. x + cannot contain any quantifiers. +
+ (?x) + + x, via negative lookbehind. x + cannot contain any quantifiers. +
+ (?>x) + + x{1}+ +
+   +
+ Registered Expression Matching +
+ {x} + + The registered pattern x +
+ +
+ + Begin Text Extracted And Modified From java.util.regex.Pattern documentation + +

Backslashes, escapes, and quoting

+ +

The backslash character ('\') serves to introduce escaped + constructs, as defined in the table above, as well as to quote characters + that otherwise would be interpreted as unescaped constructs. Thus the + expression \\ matches a single backslash and \{ matches a + left brace. + +

It is an error to use a backslash prior to any alphabetic character that + does not denote an escaped construct; these are reserved for future + extensions to the regular-expression language. A backslash may be used + prior to a non-alphabetic character regardless of whether that character is + part of an unescaped construct. + +

It is necessary to double backslashes in string literals that represent + regular expressions to protect them from interpretation by a compiler. The + string literal "\b", for example, matches a single backspace + character when interpreted as a regular expression, while + "\\b" matches a word boundary. The string litera + "\(hello\)" is illegal and leads to a compile-time error; + in order to match the string (hello) the string literal + "\\(hello\\)" must be used. + +

Character Classes

+ +

Character classes may appear within other character classes, and + may be composed by the union operator (implicit) and the intersection + operator (&&). + The union operator denotes a class that contains every character that is + in at least one of its operand classes. The intersection operator + denotes a class that contains every character that is in both of its + operand classes. + +

The precedence of character-class operators is as follows, from + highest to lowest: + +

+ + + + + + + + + + + + + + + + +
1    Literal escape    \x
2    Rangea-z
3    Grouping[...]
4    Intersection[a-z&&[aeiou]]
5    Union[a-e][i-u]
+ +

Note that a different set of metacharacters are in effect inside + a character class than outside a character class. For instance, the + regular expression . loses its special meaning inside a + character class, while the expression - becomes a range + forming metacharacter. + + + + +

Groups and capturing

+ +

Capturing groups are numbered by counting their opening parentheses from + left to right. In the expression ((A)(B(C))), for example, there + are four such groups:

+ +
+ + + + + + + + + + +
1    ((A)(B(C)))
2    (A)
3    (B(C))
4    (C)
+ +

Group zero always stands for the entire expression. + +

Capturing groups are so named because, during a match, each subsequence + of the input sequence that matches such a group is saved. The captured + subsequence may be used later in the expression, via a back reference, and + may also be retrieved from the matcher once the match operation is complete. + +

The captured input associated with a group is always the subsequence + that the group most recently matched. If a group is evaluated a second time + because of quantification then its previously-captured value, if any, will + be retained if the second evaluation fails. Matching the string + "aba" against the expression (a(b)?)+, for example, leaves + group two set to "b". All captured input is discarded at the + beginning of each match. + +

Groups beginning with (? are pure, non-capturing groups + that do not capture text and do not count towards the group total. + + +

Unicode support

+ +

Coming Soon. + +

Comparison to Perl 5

+ +

The Pattern engine performs traditional NFA-based matching + with ordered alternation as occurs in Perl 5. + +

Perl constructs not supported by this class:

+ +
    + +
  • The conditional constructs (?{X}) and + (?(condition)X|Y), +

  • + +
  • The embedded code constructs (?{code}) + and (??{code}),

  • + +
  • The embedded comment syntax (?#comment), and

  • + +
  • The preprocessing operations \l \u, + \L, and \U.

  • + +
  • Embedded flags

  • + +
+ +

Constructs supported by this class but not by Perl:

+ +
    + +
  • Possessive quantifiers, which greedily match as much as they can + and do not back off, even when doing so would allow the overall match to + succeed.

  • + +
  • Character-class union and intersection as described + above.

  • + +
+ +

Notable differences from Perl:

+ +
    + +
  • In Perl, \1 through \9 are always interpreted + as back references; a backslash-escaped number greater than 9 is + treated as a back reference if at least that many subexpressions exist, + otherwise it is interpreted, if possible, as an octal escape. In this + class octal escapes must always begin with a zero. In this class, + \1 through \9 are always interpreted as back + references, and a larger number is accepted as a back reference if at + least that many subexpressions exist at that point in the regular + expression, otherwise the parser will drop digits until the number is + smaller or equal to the existing number of groups or it is one digit. +

  • + +
  • Perl uses the g flag to request a match that resumes + where the last match left off. This functionality is provided implicitly + by the Matcher class: Repeated invocations of the + find method will resume where the last match left off, + unless the matcher is reset.

  • + +
  • Perl is forgiving about malformed matching constructs, as in the + expression *a, as well as dangling brackets, as in the + expression abc], and treats them as literals. This + class also strict and will not compile a pattern when dangling characters + are encountered.

  • + +
+ + +

For a more precise description of the behavior of regular expression + constructs, please see + Mastering Regular Expressions, 2nd Edition, Jeffrey E. F. Friedl, + O'Reilly and Associates, 2002. +

+

+ + End Text Extracted And Modified From java.util.regex.Pattern documentation + +


+ + @author Jeffery Stuart + @since March 2003, Stable Since November 2004 + @version 1.07.00 + @memo A class used to represent "PERL 5"-ish regular expressions + */ +class Pattern +{ + friend class Matcher; + friend class NFANode; + friend class NFAQuantifierNode; + private: + /** + This constructor should not be called directly. Those wishing to use the + Pattern class should instead use the {@link compile compile} method. + + @param rhs The pattern to compile + @memo Creates a new pattern from the regular expression in rhs. + */ + Pattern(const bkstring & rhs); + protected: + /** + This currently is not used, so don't try to do anything with it. + @memo Holds all the compiled patterns for quick access. + */ + static std::map compiledPatterns; + /** + Holds all of the registered patterns as strings. Due to certain problems + with compilation of patterns, especially with capturing groups, this seemed + to be the best way to do it. + */ + static std::map > registeredPatterns; + protected: + /** + Holds all the NFA nodes used. This makes deletion of a pattern, as well as + clean-up from an unsuccessful compile much easier and faster. + */ + std::map nodes; + /** + Used when methods like split are called. The matcher class uses a lot of + dynamic memeory, so having an instance increases speedup of certain + operations. + */ + Matcher * matcher; + /** + The front node of the NFA. + */ + NFANode * head; + /** + The actual regular expression we rerpesent + */ + bkstring pattern; + /** + Flag used during compilation. Once the pattern is successfully compiled, + error is no longer used. + */ + bool error; + /** + Used during compilation to keep track of the current index into + {@link pattern pattern}. Once the pattern is successfully + compiled, error is no longer used. + */ + int curInd; + /** + The number of capture groups this contains. + */ + int groupCount; + /** + The number of non-capture groups this contains. + */ + int nonCapGroupCount; + /** + The flags specified when this was compiled. + */ + unsigned long flags; + protected: + /** + Raises an error during compilation. Compilation will cease at that point + and compile will return NULL. + */ + void raiseError(); + /** + Convenience function for registering a node in nodes. + @param node The node to register + @return The registered node + */ + NFANode * registerNode(NFANode * node); + + /** + Calculates the union of two strings. This function will first sort the + strings and then use a simple selection algorithm to find the union. + @param s1 The first "class" to union + @param s2 The second "class" to union + @return A new string containing all unique characters. Each character + must have appeared in one or both of s1 and + s2. + */ + bkstring classUnion (bkstring s1, bkstring s2) const; + /** + Calculates the intersection of two strings. This function will first sort + the strings and then use a simple selection algorithm to find the + intersection. + @param s1 The first "class" to intersect + @param s2 The second "class" to intersect + @return A new string containing all unique characters. Each character + must have appeared both s1 and s2. + */ + bkstring classIntersect (bkstring s1, bkstring s2) const; + /** + Calculates the negation of a string. The negation is the set of all + characters between \x00 and \xFF not + contained in s1. + @param s1 The "class" to be negated. + @param s2 The second "class" to intersect + @return A new string containing all unique characters. Each character + must have appeared both s1 and s2. + */ + bkstring classNegate (bkstring s1) const; + /** + Creates a new "class" representing the range from low thru + hi. This function will wrap if low > + hi. This is a feature, not a buf. Sometimes it is useful + to be able to say [\x70-\x10] instead of [\x70-\x7F\x00-\x10]. + @param low The beginning character + @param hi The ending character + @return A new string containing all the characters from low thru hi. + */ + bkstring classCreateRange(char low, char hi) const; + + /** + Extracts a decimal number from the substring of member-variable + {@link pattern pattern} starting at start and + ending at end. + @param start The starting index in {@link pattern pattern} + @param end The last index in {@link pattern pattern} + @return The decimal number in {@link pattern pattern} + */ + int getInt(int start, int end); + /** + Parses a {n,m} string out of the member-variable + {@link pattern pattern} stores the result in sNum + and eNum. + @param sNum Output parameter. The minimum number of matches required + by the curly quantifier are stored here. + @param eNum Output parameter. The maximum number of matches allowed + by the curly quantifier are stored here. + @return Success/Failure. Fails when the curly does not have the proper + syntax + */ + bool quantifyCurly(int & sNum, int & eNum); + /** + Tries to quantify the currently parsed group. If the group being parsed + is indeed quantified in the member-variable + {@link pattern pattern}, then the NFA is modified accordingly. + @param start The starting node of the current group being parsed + @param stop The ending node of the current group being parsed + @param gn The group number of the current group being parsed + @return The node representing the starting node of the group. If the + group becomes quantified, then this node is not necessarily + a GroupHead node. + */ + NFANode * quantifyGroup(NFANode * start, NFANode * stop, const int gn); + + /** + Tries to quantify the last parsed expression. If the character was indeed + quantified, then the NFA is modified accordingly. + @param newNode The recently created expression node + @return The node representing the last parsed expression. If the + expression was quantified, return value != newNode + */ + NFANode * quantify(NFANode * newNode); + /** + Parses the current class being examined in + {@link pattern pattern}. + @return A string of unique characters contained in the current class being + parsed + */ + bkstring parseClass(); + /** + Parses the current POSIX class being examined in + {@link pattern pattern}. + @return A string of unique characters representing the POSIX class being + parsed + */ + bkstring parsePosix(); + /** + Returns a string containing the octal character being parsed + @return The string contained the octal value being parsed + */ + bkstring parseOctal(); + /** + Returns a string containing the hex character being parsed + @return The string contained the hex value being parsed + */ + bkstring parseHex(); + /** + Returns a new node representing the back reference being parsed + @return The new node representing the back reference being parsed + */ + NFANode * parseBackref(); + /** + Parses the escape sequence currently being examined. Determines if the + escape sequence is a class, a single character, or the beginning of a + quotation sequence. + @param inv Output parameter. Whether or not to invert the returned class + @param quo Output parameter. Whether or not this sequence starts a + quotation. + @return The characters represented by the class + */ + bkstring parseEscape(bool & inv, bool & quo); + /** + Parses a supposed registered pattern currently under compilation. If the + sequence of characters does point to a registered pattern, then the + registered pattern is appended to *end. The registered pattern + is parsed with the current compilation flags. + @param end The ending node of the thus-far compiled pattern + @return The new end node of the current pattern + */ + NFANode * parseRegisteredPattern(NFANode ** end); + /** + Parses a lookbehind expression. Appends the necessary nodes + *end. + @param pos Positive or negative look behind + @param end The ending node of the current pattern + @return The new end node of the current pattern + */ + NFANode * parseBehind(const bool pos, NFANode ** end); + /** + Parses the current expression and tacks on nodes until a \E is found. + @return The end of the current pattern + */ + NFANode * parseQuote(); + /** + Parses {@link pattern pattern}. This function is called + recursively when an or (|) or a group is encountered. + @param inParen Are we currently parsing inside a group + @param inOr Are we currently parsing one side of an or (|) + @param end The end of the current expression + @return The starting node of the NFA constructed from this parse + */ + NFANode * parse(const bool inParen = 0, const bool inOr = 0, NFANode ** end = NULL); + public: + /// We should match regardless of case + const static unsigned long CASE_INSENSITIVE; + /// We are implicitly quoted + const static unsigned long LITERAL; + /// @memo We should treat a . as [\x00-\x7F] + const static unsigned long DOT_MATCHES_ALL; + /** ^ and $ should anchor to the beginning and + ending of lines, not all input + */ + const static unsigned long MULTILINE_MATCHING; + /** When enabled, only instances of \n are recognized as + line terminators + */ + const static unsigned long UNIX_LINE_MODE; + /// The absolute minimum number of matches a quantifier can match (0) + const static int MIN_QMATCH; + /// The absolute maximum number of matches a quantifier can match (0x7FFFFFFF) + const static int MAX_QMATCH; + public: + /** + Call this function to compile a regular expression into a + Pattern object. Special values can be assigned to + mode when certain non-standard behaviors are expected from + the Pattern object. + @param pattern The regular expression to compile + @param mode A bitwise or of flags signalling what special behaviors are + wanted from this Pattern object + @return If successful, compile returns a Pattern + pointer. Upon failure, compile returns + NULL + */ + static Pattern * compile (const bkstring & pattern, + const unsigned long mode = 0); + /** + Dont use this function. This function will compile a pattern, and cache + the result. This will eventually be used as an optimization when people + just want to call static methods using the same pattern over and over + instead of first compiling the pattern and then using the compiled + instance for matching. + @param pattern The regular expression to compile + @param mode A bitwise or of flags signalling what special behaviors are + wanted from this Pattern object + @return If successful, compileAndKeep returns a + Pattern pointer. Upon failure, compile + returns NULL. + */ + static Pattern * compileAndKeep (const bkstring & pattern, + const unsigned long mode = 0); + + /** + Searches through replace and replaces all substrings matched + by pattern with str. str may + contain backreferences (e.g. \1) to capture groups. A typical + invocation looks like: +

+ + Pattern::replace("(a+)b(c+)", "abcccbbabcbabc", "\\2b\\1"); + +

+ which would replace abcccbbabcbabc with + cccbabbcbabcba. + @param pattern The regular expression + @param str The replacement text + @param replacementText The string in which to perform replacements + @param mode The special mode requested of the Pattern + during the replacement process + @return The text with the replacement string substituted where necessary + */ + static bkstring replace (const bkstring & pattern, + const bkstring & str, + const bkstring & replacementText, + const unsigned long mode = 0); + + /** + Splits the specified string over occurrences of the specified pattern. + Empty strings can be optionally ignored. The number of strings returned is + configurable. A typical invocation looks like: +

+ + bkstring str(strSize, '\0');
+ FILE * fp = fopen(fileName, "r");
+ fread((char*)str.data(), strSize, 1, fp);
+ fclose(fp);
+
+ std::vector<bkstring> lines = Pattern::split("[\r\n]+", str, true);
+
+
+ + @param pattern The regular expression + @param replace The string to split + @param keepEmptys Whether or not to keep empty strings + @param limit The maximum number of splits to make + @param mode The special mode requested of the Pattern + during the split process + @return All substrings of str split across pattern. + */ + static std::vector split (const bkstring & pattern, + const bkstring & str, + const bool keepEmptys = 0, + const unsigned long limit = 0, + const unsigned long mode = 0); + + /** + Finds all the instances of the specified pattern within the string. You + should be careful to only pass patterns with a minimum length of one. For + example, the pattern a* can be matched by an empty string, so + instead you should pass a+ since at least one character must + be matched. A typical invocation of findAll looks like: +

+ + std::vector<td::string> numbers = Pattern::findAll("\\d+", string); + +

+ + @param pattern The pattern for which to search + @param str The string to search + @param mode The special mode requested of the Pattern + during the find process + @return All instances of pattern in str + */ + static std::vector findAll (const bkstring & pattern, + const bkstring & str, + const unsigned long mode = 0); + + /** + Determines if an entire string matches the specified pattern + + @param pattern The pattern for to match + @param str The string to match + @param mode The special mode requested of the Pattern + during the replacement process + @return True if str is recognized by pattern + */ + static bool matches (const bkstring & pattern, + const bkstring & str, + const unsigned long mode = 0); + + /** + Registers a pattern under a specific name for use in later compilations. + A typical invocation and later use looks like: +

+ + Pattern::registerPattern("ip", "(?:\\d{1,3}\\.){3}\\d{1,3}");
+ Pattern * p1 = Pattern::compile("{ip}:\\d+");
+ Pattern * p2 = Pattern::compile("Connection from ({ip}) on port \\d+");
+
+

+ Multiple calls to registerPattern with the same + name will result in the pattern getting overwritten. + + @param name The name to give to the pattern + @param pattern The pattern to register + @param mode Any special flags to use when compiling pattern + @return Success/Failure. Fails only if pattern has invalid + syntax + */ + static bool registerPattern(const bkstring & name, + const bkstring & pattern, + const unsigned long mode = 0); + + /** + Clears the pattern registry + */ + static void unregisterPatterns(); + /** + Don't use + */ + static void clearPatternCache(); + + /** + Searches through a string for the nth match of the + given pattern in the string. Match indeces start at zero, not one. + A typical invocation looks like this: +

+ + std::pair<bkstring, int> match = Pattern::findNthMatch("\\d{1,3}", "192.168.1.101:22", 1);
+ printf("%s %i\n", match.first.c_str(), match.second);
+
+ Output: 168 4
+
+ + @param pattern The pattern for which to search + @param str The string to search + @param matchNum Which match to find + @param mode Any special flags to use during the matching process + @return A string and an integer. The string is the string matched. The + integer is the starting location of the matched string in + str. You can check for success/failure by making sure + that the integer returned is greater than or equal to zero. + */ + static std::pair findNthMatch (const bkstring & pattern, + const bkstring & str, + const int matchNum, + const unsigned long mode = 0); + public: + /** + Deletes all NFA nodes allocated during compilation + */ + ~Pattern(); + + bkstring replace (const bkstring & str, + const bkstring & replacementText); + std::vector split (const bkstring & str, const bool keepEmptys = 0, + const unsigned long limit = 0); + std::vector findAll (const bkstring & str); + bool matches (const bkstring & str); + /** + Returns the flags used during compilation of this pattern + @return The flags used during compilation of this pattern + */ + unsigned long getFlags () const; + /** + Returns the regular expression this pattern represents + @return The regular expression this pattern represents + */ + bkstring getPattern () const; + /** + Creates a matcher object using the specified string and this pattern. + @param str The string to match against + @return A new matcher using object using this pattern and the specified + string + */ + Matcher * createMatcher (const bkstring & str); +}; + +class NFANode +{ + friend class Matcher; + public: + NFANode * next; + NFANode(); + virtual ~NFANode(); + virtual void findAllNodes(std::map & soFar); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const = 0; + inline virtual bool isGroupHeadNode() const { return false; } + inline virtual bool isStartOfInputNode() const { return false; } +}; +class NFACharNode : public NFANode +{ + protected: + char ch; + public: + NFACharNode(const char c); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFACICharNode : public NFANode +{ + protected: + char ch; + public: + NFACICharNode(const char c); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAStartNode : public NFANode +{ + public: + NFAStartNode(); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAEndNode : public NFANode +{ + public: + NFAEndNode(); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAQuantifierNode : public NFANode +{ + public: + int min, max; + NFANode * inner; + virtual void findAllNodes(std::map & soFar); + NFAQuantifierNode(Pattern * pat, NFANode * internal, + const int minMatch = Pattern::MIN_QMATCH, + const int maxMatch = Pattern::MAX_QMATCH); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAGreedyQuantifierNode : public NFAQuantifierNode +{ + public: + NFAGreedyQuantifierNode(Pattern * pat, NFANode * internal, + const int minMatch = Pattern::MIN_QMATCH, + const int maxMatch = Pattern::MAX_QMATCH); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; + virtual int matchInternal(const bkstring & str, Matcher * matcher, const int curInd, const int soFar) const; +}; +class NFALazyQuantifierNode : public NFAQuantifierNode +{ + public: + NFALazyQuantifierNode(Pattern * pat, NFANode * internal, + const int minMatch = Pattern::MIN_QMATCH, + const int maxMatch = Pattern::MAX_QMATCH); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAPossessiveQuantifierNode : public NFAQuantifierNode +{ + public: + NFAPossessiveQuantifierNode(Pattern * pat, NFANode * internal, + const int minMatch = Pattern::MIN_QMATCH, + const int maxMatch = Pattern::MAX_QMATCH); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAAcceptNode : public NFANode +{ + public: + NFAAcceptNode(); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAClassNode : public NFANode +{ + public: + bool inv; + std::map vals; + NFAClassNode(const bool invert = 0); + NFAClassNode(const bkstring & clazz, const bool invert); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFACIClassNode : public NFANode +{ + public: + bool inv; + std::map vals; + NFACIClassNode(const bool invert = 0); + NFACIClassNode(const bkstring & clazz, const bool invert); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFASubStartNode : public NFANode +{ + public: + NFASubStartNode(); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAOrNode : public NFANode +{ + public: + NFANode * one; + NFANode * two; + NFAOrNode(NFANode * first, NFANode * second); + virtual void findAllNodes(std::map & soFar); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAQuoteNode : public NFANode +{ + public: + bkstring qStr; + NFAQuoteNode(const bkstring & quoted); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFACIQuoteNode : public NFANode +{ + public: + bkstring qStr; + NFACIQuoteNode(const bkstring & quoted); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFALookAheadNode : public NFANode +{ + public: + bool pos; + NFANode * inner; + NFALookAheadNode(NFANode * internal, const bool positive); + virtual void findAllNodes(std::map & soFar); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFALookBehindNode : public NFANode +{ + public: + bool pos; + bkstring mStr; + NFALookBehindNode(const bkstring & str, const bool positive); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAStartOfLineNode : public NFANode +{ + public: + NFAStartOfLineNode(); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAEndOfLineNode : public NFANode +{ + public: + NFAEndOfLineNode(); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAReferenceNode : public NFANode +{ + public: + int gi; + NFAReferenceNode(const int groupIndex); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAStartOfInputNode : public NFANode +{ + public: + NFAStartOfInputNode(); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; + inline virtual bool isStartOfInputNode() const { return true; } +}; +class NFAEndOfInputNode : public NFANode +{ + public: + bool term; + NFAEndOfInputNode(const bool lookForTerm); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAWordBoundaryNode : public NFANode +{ + public: + bool pos; + NFAWordBoundaryNode(const bool positive); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAEndOfMatchNode : public NFANode +{ + public: + NFAEndOfMatchNode(); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAGroupHeadNode : public NFANode +{ + public: + int gi; + NFAGroupHeadNode(const int groupIndex); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; + inline virtual bool isGroupHeadNode() const { return true; } +}; +class NFAGroupTailNode : public NFANode +{ + public: + int gi; + NFAGroupTailNode(const int groupIndex); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAGroupLoopPrologueNode : public NFANode +{ + public: + int gi; + NFAGroupLoopPrologueNode(const int groupIndex); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; +class NFAGroupLoopNode : public NFANode +{ + public: + int gi, min, max, type; + NFANode * inner; + NFAGroupLoopNode(NFANode * internal, const int minMatch, + const int maxMatch, const int groupIndex, const int matchType); + virtual void findAllNodes(std::map & soFar); + virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const; + int matchGreedy(const bkstring & str, Matcher * matcher, const int curInd = 0) const; + int matchLazy(const bkstring & str, Matcher * matcher, const int curInd = 0) const; + int matchPossessive(const bkstring & str, Matcher * matcher, const int curInd = 0) const; +}; + +#endif + diff --git a/plugins/SmileyAdd/src/regexp/WCMatcher.cpp b/plugins/SmileyAdd/src/regexp/WCMatcher.cpp new file mode 100644 index 0000000000..bed5a1944b --- /dev/null +++ b/plugins/SmileyAdd/src/regexp/WCMatcher.cpp @@ -0,0 +1,181 @@ +#include "WCMatcher.h" +#include "WCPattern.h" + +const int WCMatcher::MATCH_ENTIRE_STRING = 0x01; + +/* + Detailed documentation is provided in this class' header file + + @author Jeffery Stuart + @since November 2004 + @version 1.07.00 +*/ + +WCMatcher::WCMatcher(WCPattern * pattern, const bkstring & text) +{ + pat = pattern; + str = &text; + gc = pattern->groupCount; + ncgc = -pattern->nonCapGroupCount; + flags = 0; + matchedSomething = false; + starts = new int[gc + ncgc]; + ends = new int[gc + ncgc]; + groups = new int[gc + ncgc]; + groupPos = new int[gc + ncgc]; + groupIndeces = new int[gc + ncgc]; + starts = starts + ncgc; + ends = ends + ncgc; + groups = groups + ncgc; + groupPos = groupPos + ncgc; + groupIndeces = groupIndeces + ncgc; + for (int i = 0; i < gc; ++i) starts[i] = ends[i] = 0; +} +WCMatcher::~WCMatcher() +{ + delete [] (starts - ncgc); + delete [] (ends - ncgc); + delete [] (groups - ncgc); + delete [] (groupIndeces - ncgc); + delete [] (groupPos - ncgc); +} +void WCMatcher::clearGroups() +{ + int i; + lm = 0; + for (i = 0; i < gc; ++i) groups[i] = starts[i] = ends[i] = -1; + for (i = 1; i <= ncgc; ++i) groups[0 - i] = starts[0 - i] = ends[0 - i] = -1; +} +bkstring WCMatcher::replaceWithGroups(const bkstring & str) +{ + bkstring ret = L""; + + bkstring t = str; + while (t.size() > 0) + { + if (t[0] == (wchar_t)'\\') + { + t.erase(0, 1); + if (t.size() == 0) + { + ret += L"\\"; + } + else if (t[0] < (wchar_t)'0' || t[0] > (wchar_t)'9') + { + ret += t[0]; + t.erase(0, 1); + } + else + { + int gn = 0; + while (t.size() > 0 && t[0] >= (wchar_t)'0' && t[0] <= (wchar_t)'9') + { + gn = gn * 10 + (t[0] - (wchar_t)'0'); + t.erase(0, 1); + } + ret += getGroup(gn); + } + } + else + { + ret += t[0]; + t.erase(0, 1); + } + } + + return ret; +} +unsigned long WCMatcher::getFlags() const +{ + return flags; +} +const bkstring& WCMatcher::getText() const +{ + return *str; +} + +bool WCMatcher::matches() +{ + flags = MATCH_ENTIRE_STRING; + matchedSomething = false; + clearGroups(); + lm = 0; + return pat->head->match(*str, this, 0) == (int)str->size(); +} +bool WCMatcher::findFirstMatch() +{ + starts[0] = 0; + flags = 0; + clearGroups(); + start = 0; + lm = 0; + ends[0] = pat->head->match(*str, this, 0); + if (ends[0] >= 0) + { + matchedSomething = true; + return 1; + } + return 0; +} +bool WCMatcher::findNextMatch() +{ + int s = starts[0], e = ends[0]; + + if (!matchedSomething) return findFirstMatch(); + if (s == e) ++e; + flags = 0; + clearGroups(); + + starts[0] = e; + if (e >= (int)str->size()) return 0; + start = e; + lm = e; + ends[0] = pat->head->match(*str, this, e); + return ends[0] >= 0; +} +std::vector WCMatcher::findAll() +{ + std::vector ret; + reset(); + while (findNextMatch()) + { + ret.push_back(getGroup()); + } + return ret; +} + +void WCMatcher::reset() +{ + lm = 0; + clearGroups(); + matchedSomething = false; +} + +int WCMatcher::getStartingIndex(const int groupNum) const +{ + if (groupNum < 0 || groupNum >= gc) return -1; + return starts[groupNum]; +} +int WCMatcher::getEndingIndex(const int groupNum) const +{ + if (groupNum < 0 || groupNum >= gc) return -1; + return ends[groupNum]; +} +bkstring WCMatcher::getGroup(const int groupNum) const +{ + if (groupNum < 0 || groupNum >= gc) return L""; + if (starts[groupNum] < 0 || ends[groupNum] < 0) return L""; + return str->substr(starts[groupNum], ends[groupNum] - starts[groupNum]); +} +std::vector WCMatcher::getGroups(const bool includeGroupZero) const +{ + int i, start = (includeGroupZero ? 0 : 1); + std::vector ret; + + for (i = start; i < gc; ++i) + { + ret.push_back(getGroup(i)); + } + return ret; +} + diff --git a/plugins/SmileyAdd/src/regexp/WCMatcher.h b/plugins/SmileyAdd/src/regexp/WCMatcher.h new file mode 100644 index 0000000000..23cc49e41f --- /dev/null +++ b/plugins/SmileyAdd/src/regexp/WCMatcher.h @@ -0,0 +1,234 @@ +#ifndef __WCMATCHER_H__ +#define __WCMATCHER_H__ + +#include "bkstring.h" +#include +#include + +/** + A matcher is a non thread-safe object used to scan strings using a given + {@link WCPattern WCPattern} object. Using a WCMatcher is the preferred + method for scanning strings. WCMatchers are not thread-safe. WCMatchers require + very little dynamic memory, hence one is encouraged to create several + instances of a matcher when necessary as opposed to sharing a single instance + of a matcher. +

+ The most common methods needed by the matcher are matches, + findNextMatch, and getGroup. matches + and findNextMatch both return success or failure, and further + details can be gathered from their documentation. +

+ Unlike Java's WCMatcher, this class allows you to change the string + you are matching against. This provides a small optimization, since you no + longer need multiple matchers for a single pattern in a single thread. +

+ This class also provides an extremely handy method for replacing text with + captured data via the replaceWithGroups method. A typical + invocation looks like: +

+  wchar_t buf[10000];
+  bkstring str = "\\5 (user name \\1) uses \\7 for his/her shell and \\6 is their home directory";
+  FILE * fp = fopen("/etc/passwd", "r");
+  WCPattern::registerWCPattern("entry", "[^:]+");
+  WCPattern * p = WCPattern::compile("^({entry}):({entry}):({entry}):({entry}):({entry}):({entry}):({entry})$",
+                                 WCPattern::MULTILINE_MATCHING | WCPattern::UNIX_LINE_MODE);
+  WCMatcher * m = p->createWCMatcher("");
+  while (fgets(buf, 9999, fp))
+  {
+    m->setString(buf);
+    if (m->matches())
+    {
+      printf("%s\n", m->replaceWithGroups(str).c_str());
+    }
+  }
+  fclose(fp);
+
+  
+ Calling any of the following functions before first calling + matches, findFirstMatch, or + findNextMatch results in undefined behavior and may cause your + program to crash. + +
    +
  • replaceWithGroups +
  • getStartingIndex
  • +
  • getEndingIndex
  • +
  • getGroup
  • +
  • getGroups
  • +
+
+

+ The function findFirstMatch will attempt to find the first match + in the input string. The same results can be obtained by first calling + reset followed by findNextMatch. +

+ To eliminate the necessity of looping through a string to find all the + matching substrings, findAll was created. The function will find + all matching substrings and return them in a vector. If you need + to examine specific capture groups within the substrings, then this method + should not be used. + + @author Jeffery Stuart + @since March 2003, Stable Since November 2004 + @version 1.05.00 + @memo Mutable object used on instances of a WCPattern class + */ +class WCMatcher +{ + friend class NFAUNode; + friend class NFAStartUNode; + friend class NFAEndUNode; + friend class NFAGroupHeadUNode; + friend class NFAGroupLoopUNode; + friend class NFAGroupLoopPrologueUNode; + friend class NFAGroupTailUNode; + friend class NFALookBehindUNode; + friend class NFAStartOfLineUNode; + friend class NFAEndOfLineUNode; + friend class NFAEndOfMatchUNode; + friend class NFAReferenceUNode; + friend class WCPattern; + private: + /** + Creates a new matcher object against text using + pattern. + + @param pattern The pattern with which to search + @param text The text in which to search + */ + WCMatcher(WCPattern * pattern, const bkstring & text); + protected: + /// The pattern we use to match + WCPattern * pat; + /// The string in which we are matching + const bkstring * str; + /// The starting point of our match + int start; + /// An array of the starting positions for each group + int * starts; + /// An array of the ending positions for each group + int * ends; + /// An array of private data used by NFAUNodes during matching + int * groups; + /// An array of private data used by NFAUNodes during matching + int * groupIndeces; + /// An array of private data used by NFAUNodes during matching + int * groupPos; + /// The ending index of the last match + int lm; + /// The number of capturing groups we have + int gc; + /// The number of non-capturing groups we havew + int ncgc; + /// Whether or not we have matched something (used only by findFirstMatch and findNextMatch) + int matchedSomething; + /// The flags with which we were made + unsigned long flags; + /// Called by reset to clear the group arrays + void clearGroups(); + public: + /// Used internally by match to signify we want the entire string matched + const static int MATCH_ENTIRE_STRING; + public: + /// Cleans up the dynamic memory used by this matcher + ~WCMatcher(); + /** + Replaces the contents of str with the appropriate captured + text. str should have at least one back reference, otherwise + this function does nothing. + @param str The string in which to replace text + @return A string with all backreferences appropriately replaced + */ + bkstring replaceWithGroups(const bkstring & str); + /** + The flags currently being used by the matcher. + @return Zero + */ + unsigned long getFlags() const; + /** + The text being searched by the matcher. + @return the text being searched by the matcher. + */ + const bkstring& getText() const; + + /** + Scans the string from start to finish for a match. The entire string must + match for this function to return success. Group variables are + appropriately set and can be queried after this function returns. + + @return Success if and only if the entire string matches the pattern + */ + bool matches(); + /** + Scans the string for the first substring matching the pattern. The entire + string does not necessarily have to match for this function to return + success. Group variables are appropriately set and can be queried after + this function returns. + + @return Success if any substring matches the specified pattern + */ + bool findFirstMatch(); + /** + Scans the string for the next substring matching the pattern. If no calls + have been made to findFirstMatch of findNextMatch since the last call to + reset, matches, or setString, then this function's behavior results to + that of findFirstMatch. + + @return Success if another substring can be found that matches the pattern + */ + bool findNextMatch(); + /** + Returns a vector of every substring in order which matches the given + pattern. + + @return Every substring in order which matches the given pattern + */ + std::vector findAll(); + /** + Resets the internal state of the matcher + */ + void reset(); + /** + Same as getText. Left n for backwards compatibilty with old source code + @return Returns the string that is currently being used for matching + */ + inline const bkstring& getString() const { return *str; } + /** + Sets the string to scan + @param newStr The string to scan for subsequent matches + */ + inline void setString(const bkstring & newStr) { str = &newStr; reset(); } + + /** + Returns the starting index of the specified group. + @param groupNum The group to query + @return The starting index of the group if it was matched, -1 for an + invalid group or if the group was not matched + */ + int getStartingIndex(const int groupNum = 0) const; + /** + Returns the ending index of the specified group. + @param groupNum The group to query + @return The ending index of the group if it was matched, -1 for an + invalid group or if the group was not matched + */ + int getEndingIndex(const int groupNum = 0) const; + /** + Returns the specified group. An empty string ("") does not necessarily + mean the group was not matched. A group such as (a*b?) could be matched by + a zero length. If an empty string is returned, getStartingIndex can be + called to determine if the group was actually matched. + @param groupNum The group to query + @return The text of the group + */ + bkstring getGroup(const int groupNum = 0) const; + /** + Returns every capture group in a vector + + @param includeGroupZero Whether or not include capture group zero + @return Every capture group + */ + std::vector getGroups(const bool includeGroupZero = 0) const; +}; + +#endif diff --git a/plugins/SmileyAdd/src/regexp/WCPattern.cpp b/plugins/SmileyAdd/src/regexp/WCPattern.cpp new file mode 100644 index 0000000000..25f379f5e4 --- /dev/null +++ b/plugins/SmileyAdd/src/regexp/WCPattern.cpp @@ -0,0 +1,1747 @@ +/** + From the author (Jeff Stuart) + " + Let me start by saying this file is pretty big. If you feel up to it, you can + try making changes yourself, but you would be better off to just email me at + stuart@cs.ucdavis.edu if you think there is a bug, or have something useful you + would like added. This project is very "near and dear" to me, so I am fairly quick + to make bug fixes. The header files for WCPattern and WCMatcher are fairly well + documented and the function names are pretty self-explanatory, but if you are having + any trouble, feel free to email me at stuart@cs.ucdavis.edu. + + If you email me, make sure you put something like C++RE in the subject because + I tend to delete email if I don't recognize the name and the subject is + something like "I Need Your Help" or "Got A Second" or "I Found It". + " + */ + +/* + Detailed documentation is provided in this class' header file + + @author Jeffery Stuart + @since November 2004 + @version 1.07.00 +*/ + +#ifdef _WIN32 + #pragma warning(push) + #pragma warning(disable:4996) +#endif + +#include +#include +#include +#include +#ifndef _WIN32 + #include +#endif + +std::map WCPattern::compiledWCPatterns; +std::map > WCPattern::registeredWCPatterns; + +const int WCPattern::MIN_QMATCH = 0x00000000; +const int WCPattern::MAX_QMATCH = 0x7FFFFFFF; + +const unsigned long WCPattern::CASE_INSENSITIVE = 0x01; +const unsigned long WCPattern::LITERAL = 0x02; +const unsigned long WCPattern::DOT_MATCHES_ALL = 0x04; +const unsigned long WCPattern::MULTILINE_MATCHING = 0x08; +const unsigned long WCPattern::UNIX_LINE_MODE = 0x10; + +#define to_lower(a) (wchar_t)(UINT_PTR)CharLowerW((LPWSTR)(unsigned)a) +#define is_alpha IsCharAlphaW + +#if defined(_WIN32) + #define str_icmp lstrcmpiW +#elif defined(__CYGWIN__) || defined(__APPLE__) + #include + static inline int str_icmp(const wchar_t * a, const wchar_t * b) + { + while (*a && *b) + { + const int t = (int)towlower(*a) - (int)tolower(*b); + if (t) return t; + ++a; ++b; + } + if (*a) + { + if (*b) return (int)towlower(*a) - (int)tolower(*b); + return 1; + } + else if (*b) return 1; + return 0; + } +#else + #define str_icmp wcscasecmp +#endif + +WCPattern::WCPattern(const bkstring & rhs) +{ + matcher = NULL; + pattern = rhs; + curInd = 0; + groupCount = 0; + nonCapGroupCount = 0; + error = 0; + head = NULL; +} +// convenience function in case we want to add any extra debugging output +void WCPattern::raiseError() +{ +/* switch (pattern[curInd - 1]) + { + case '*': + case ')': + case '+': + case '?': + case ']': + case '}': + fwprintf(stderr, L"%s\n%*c^\n", pattern.c_str(), curInd - 1, ' '); + fwprintf(stderr, L"Syntax Error near here. Possible unescaped meta character.\n"); + break; + default: + fwprintf(stderr, L"%s\n%*c^\n", pattern.c_str(), curInd - 1, ' '); + fwprintf(stderr, L"Syntax Error near here. \n"); + break; + }*/ + error = 1; +} +NFAUNode * WCPattern::registerNode(NFAUNode * node) +{ + nodes[node] = 1; + return node; +} + +bkstring WCPattern::classUnion (bkstring s1, bkstring s2) const +{ + wchar_t * out = new wchar_t[66000]; + std::sort(s1.begin(), s1.end()); + std::sort(s2.begin(), s2.end()); + wchar_t* p = std::set_union(s1.begin(), s1.end(), s2.begin(), s2.end(), out); *p = 0; + bkstring ret = out; + delete [] out; + return ret; +} +bkstring WCPattern::classIntersect (bkstring s1, bkstring s2) const +{ + wchar_t * out = new wchar_t[66000]; + std::sort(s1.begin(), s1.end()); + std::sort(s2.begin(), s2.end()); + *std::set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0; + bkstring ret = out; + delete [] out; + return ret; +} +bkstring WCPattern::classNegate (bkstring s1) const +{ + wchar_t * out = new wchar_t[66000]; + int i, ind = 0; + std::map m; + + for (i = 0; i < (int)s1.size(); ++i) m[s1[i]] = 1; + for (i = 0xFF; i >= 0; --i) if (m.find((wchar_t)i) == m.end()) out[ind++] = (wchar_t)i; + out[ind] = 0; + bkstring ret(out, ind); + delete [] out; + return ret; +} +bkstring WCPattern::classCreateRange(wchar_t low, wchar_t hi) const +{ + wchar_t out[300]; + int ind = 0; + while (low != hi) out[ind++] = low++; + out[ind++] = low; + return bkstring(out, ind); +} + +int WCPattern::getInt(int start, int end) +{ + int ret = 0; + for (; start <= end; ++start) ret = ret * 10 + (pattern[start] - (wchar_t)'0'); + return ret; +} +bool WCPattern::quantifyCurly(int & sNum, int & eNum) +{ + bool good = 1; + int i, ci = curInd + 1; + int commaInd = ci, endInd = ci, len = pattern.size(); + sNum = eNum = 0; + + while (endInd < len && pattern[endInd ] != (wchar_t)'}') ++endInd; + while (commaInd < endInd && pattern[commaInd] != (wchar_t)',') ++commaInd; + if (endInd >= len) { raiseError(); return 0; } + for (i = ci; good && i < endInd; ++i) if (i != commaInd && !isdigit(pattern[i])) good = 0; + if (!good && commaInd < endInd) { raiseError(); return 0; } + if (!good) return 0; + /* so now everything in here is either a comma (and there is at most one comma) or a digit */ + if (commaInd == ci) // {,*} + { + if (endInd == commaInd + 1) { sNum = MIN_QMATCH; eNum = MAX_QMATCH; } // {,} = * + else { sNum = MIN_QMATCH; eNum = getInt(commaInd + 1, endInd - 1); } // {,+} + } + else if (commaInd == endInd - 1) { sNum = getInt(ci, commaInd - 1); eNum = MAX_QMATCH; } // {+,} + else if (commaInd == endInd) { sNum = getInt(ci, endInd - 1); eNum = sNum; } // {+} + else { sNum = getInt(ci, commaInd - 1); eNum = getInt(commaInd + 1, endInd - 1); } // {+,+} + curInd = endInd + 1; + return 1; +} +NFAUNode * WCPattern::quantifyGroup(NFAUNode * start, NFAUNode * stop, const int gn) +{ + NFAUNode * newNode = NULL; + int type = 0; + + if (curInd < (int)pattern.size()) + { + wchar_t ch = (curInd + 1 >= (int)pattern.size()) ? USHRT_MAX : pattern[curInd + 1]; + switch (pattern[curInd]) + { + case (wchar_t)'*': + ++curInd; + switch (ch) + { + case (wchar_t)'?': ++curInd; type = 1; break; + case (wchar_t)'+': ++curInd; type = 2; break; + } + newNode = registerNode(new NFAGroupLoopPrologueUNode(gn)); + newNode->next = registerNode(new NFAGroupLoopUNode(start, MIN_QMATCH, MAX_QMATCH, gn, type)); + stop->next = newNode->next; + return newNode; + case (wchar_t)'?': + ++curInd; + switch (ch) + { + case (wchar_t)'?': ++curInd; type = 1; break; + case (wchar_t)'+': ++curInd; type = 2; break; + } + newNode = registerNode(new NFAGroupLoopPrologueUNode(gn)); + newNode->next = registerNode(new NFAGroupLoopUNode(start, MIN_QMATCH, 1, gn, type)); + stop->next = newNode->next; + return newNode; + case (wchar_t)'+': + ++curInd; + switch (ch) + { + case (wchar_t)'?': ++curInd; type = 1; break; + case (wchar_t)'+': ++curInd; type = 2; break; + } + newNode = registerNode(new NFAGroupLoopPrologueUNode(gn)); + newNode->next = registerNode(new NFAGroupLoopUNode(start, 1, MAX_QMATCH, gn, type)); + stop->next = newNode->next; + return newNode; + case (wchar_t)'{': + { + int s, e; + if (quantifyCurly(s, e)) + { + ch = (curInd < (int)pattern.size()) ? pattern[curInd] : USHRT_MAX; + switch (ch) + { + case (wchar_t)'?': ++curInd; type = 1; break; + case (wchar_t)'+': ++curInd; type = 2; break; + } + newNode = registerNode(new NFAGroupLoopPrologueUNode(gn)); + newNode->next = registerNode(new NFAGroupLoopUNode(start, s, e, gn, type)); + stop->next = newNode->next; + return newNode; + } + } + default: + break; + } + } + return NULL; +} + +NFAUNode * WCPattern::quantify(NFAUNode * newNode) +{ + if (curInd < (int)pattern.size()) + { + wchar_t ch = (curInd + 1 >= (int)pattern.size()) ? USHRT_MAX : pattern[curInd + 1]; + switch (pattern[curInd]) + { + case (wchar_t)'*': + ++curInd; + switch (ch) + { + case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break; + case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, MIN_QMATCH, MAX_QMATCH)); break; + default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break; + } + break; + case (wchar_t)'?': + ++curInd; + switch (ch) + { + case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, MIN_QMATCH, 1)); break; + case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, MIN_QMATCH, 1)); break; + default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, MIN_QMATCH, 1)); break; + } + break; + case (wchar_t)'+': + ++curInd; + switch (ch) + { + case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, 1, MAX_QMATCH)); break; + case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, 1, MAX_QMATCH)); break; + default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, 1, MAX_QMATCH)); break; + } + break; + case (wchar_t)'{': + { + int s, e; + if (quantifyCurly(s, e)) + { + ch = (curInd < (int)pattern.size()) ? pattern[curInd] : USHRT_MAX; + switch (ch) + { + case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, s, e)); break; + case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, s, e)); break; + default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, s, e)); break; + } + } + } + break; + default: + break; + } + } + return newNode; +} +bkstring WCPattern::parseClass() +{ + bkstring t, ret = L""; + wchar_t ch, c1, c2; + bool inv = 0, neg = 0, quo = 0; + + if (curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'^') + { + ++curInd; + neg = 1; + } + while (curInd < (int)pattern.size() && pattern[curInd] != (wchar_t)']') + { + ch = pattern[curInd++]; + if (ch == (wchar_t)'[') + { + t = parseClass(); + ret = classUnion(ret, t); + } + /*else if (ch == (wchar_t)'-') + { + raiseError(); + curInd = pattern.size(); + }*/ + else if (ch == (wchar_t)'&' && curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'&') + { + if (pattern[++curInd] != (wchar_t)'[') + { + raiseError(); + curInd = pattern.size(); + } + else + { + ++curInd; + t = parseClass(); + ret = classIntersect(ret, t); + } + } + else if (ch == (wchar_t)'\\') + { + t = parseEscape(inv, quo); + if (quo) + { + raiseError(); + curInd = pattern.size(); + } + else if (inv || t.size() > 1) // cant be part of a range (a-z) + { + if (inv) t = classNegate(t); + ret = classUnion(ret, t); + } + else if (curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'-') // part of a range (a-z) + { + c1 = t[0]; + ++curInd; + if (curInd >= (int)pattern.size()) raiseError(); + else + { + c2 = pattern[curInd++]; + if (c2 == (wchar_t)'\\') + { + t = parseEscape(inv, quo); + if (quo) + { + raiseError(); + curInd = pattern.size(); + } + else if (inv || t.size() > 1) raiseError(); + else ret = classUnion(ret, classCreateRange(c1, c2)); + } + else if (c2 == (wchar_t)'[' || c2 == (wchar_t)']' || c2 == (wchar_t)'-' || c2 == (wchar_t)'&') + { + raiseError(); + curInd = pattern.size(); + } + else ret = classUnion(ret, classCreateRange(c1, c2)); + } + } + else + { + ret = classUnion(ret, t); + } + } + else if (curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'-') + { + c1 = ch; + ++curInd; + if (curInd >= (int)pattern.size()) raiseError(); + else + { + c2 = pattern[curInd++]; + if (c2 == (wchar_t)'\\') + { + t = parseEscape(inv, quo); + if (quo) + { + raiseError(); + curInd = pattern.size(); + } + else if (inv || t.size() > 1) raiseError(); + else ret = classUnion(ret, classCreateRange(c1, c2)); + } + else if (c2 == (wchar_t)'[' || c2 == (wchar_t)']' || c2 == (wchar_t)'-' || c2 == (wchar_t)'&') + { + raiseError(); + curInd = pattern.size(); + } + else + { + ret = classUnion(ret, classCreateRange(c1, c2)); + } + } + } + else + { + ret += L" "; + ret[ret.size() - 1] = ch; + } + } + if (curInd >= (int)pattern.size() || pattern[curInd] != (wchar_t)']') + { + raiseError(); + ret = L""; + } + else + { + ++curInd; + if (neg) ret = classNegate(ret); + } + return ret; +} +bkstring WCPattern::parsePosix() +{ + bkstring s7 = pattern.substr(curInd, 7); + if (s7 == L"{Lower}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyz"; } + if (s7 == L"{Upper}") { curInd += 7; return L"ABCDEFGHIJKLMNOPQRSTUVWXYZ"; } + if (s7 == L"{Alpha}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; } + if (s7 == L"{Digit}") { curInd += 7; return L"0123456789"; } + if (s7 == L"{Alnum}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; } + if (s7 == L"{Punct}") { curInd += 7; return L"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; } + if (s7 == L"{Graph}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; } + if (s7 == L"{Print}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; } + if (s7 == L"{Blank}") { curInd += 7; return L" \t"; } + if (s7 == L"{Space}") { curInd += 7; return L" \t\n\x0B\f\r"; } + if (s7 == L"{Cntrl}") + { + bkstring::value_type i; + bkstring s = L" "; + + for (i = 0; i < 5; ++i) s += s; + s += L" "; + for (i = 0; i <= 0x1F; ++i) s[i] = i; + s[0x20] = 0x7F; + curInd += 7; + return s; + } + if (s7 == L"{ASCII}") + { + bkstring s(0x80, (wchar_t)' '); + for (bkstring::value_type i = 0; i <= 0x7f; ++i) s[i] = i; + curInd += 7; + return s; + } + if (pattern.substr(curInd, 8) == L"{XDigit}") { curInd += 8; return L"abcdefABCDEF0123456789"; } + raiseError(); + return L""; +} +NFAUNode * WCPattern::parseBackref() +{ + #define is_dig(x) ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'9') + #define to_int(x) ((x) - (wchar_t)'0') + int ci = curInd; + int oldRef = 0, ref = 0; + + while (ci < (int)pattern.size() && is_dig(pattern[ci]) && (ref < 10 || ref < groupCount)) + { + oldRef = ref; + ref = ref * 10 + to_int(pattern[ci++]); + } + if (ci == (int)pattern.size()) + { + oldRef = ref; + ++ci; + } + if (oldRef < 0 || ci <= curInd) + { + raiseError(); + return registerNode(new NFAReferenceUNode(-1)); + } + curInd = ci; + return registerNode(new NFAReferenceUNode(ref)); + + #undef is_dig + #undef to_int +} +bkstring WCPattern::parseOctal() +{ + #define islowoc(x) ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'3') + #define isoc(x) ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'7') + #define fromoc(x) ((x) - (wchar_t)'0') + int ci = curInd; + wchar_t ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : USHRT_MAX; + wchar_t ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : USHRT_MAX; + wchar_t ch3 = (ci + 2 < (int)pattern.size()) ? pattern[ci + 2] : USHRT_MAX; + bkstring s = L" "; + + if (islowoc(ch1) && isoc(ch2)) + { + curInd += 2; + s[0] = fromoc(ch1) * 8 + fromoc(ch2); + if (isoc(ch3)) + { + ++curInd; + s[0] = s[0] * 8 + fromoc(ch3); + } + } + else if (isoc(ch1) && isoc(ch2)) + { + curInd += 2; + s[0] = fromoc(ch1) * 8 + fromoc(ch2); + } + else raiseError(); + + return s; + #undef islowoc + #undef isoc + #undef fromoc +} +bkstring WCPattern::parseHex() +{ + #define to_low(x) (((x) >= (wchar_t)'A' && (x) <= (wchar_t)'Z') ? ((x) - (wchar_t)'A' + (wchar_t)'a') : (x)) + #define is_dig(x) ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'9') + #define is_hex(x) (is_dig(x) || (to_low(x) >= (wchar_t)'a' && to_low(x) <= (wchar_t)'f')) + #define to_int(x) ((is_dig(x)) ? ((x) - (wchar_t)'0') : (to_low(x) - (wchar_t)'a' + 10)) + + int ci = curInd; + wchar_t ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : USHRT_MAX; + wchar_t ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : USHRT_MAX; + wchar_t ch3 = (ci + 2 < (int)pattern.size()) ? pattern[ci + 2] : USHRT_MAX; + wchar_t ch4 = (ci + 3 < (int)pattern.size()) ? pattern[ci + 3] : USHRT_MAX; + bkstring s = L" "; + + if (is_hex(ch1) && is_hex(ch2) && is_hex(ch3) && is_hex(ch4)) + { + curInd += 2; + s[0] = (to_int(ch1) << 12 & 0xF000) | (to_int(ch2) << 8 & 0x0F00) | + (to_int(ch3) << 4 & 0x0F00) | (to_int(ch4) & 0x000F); + } + else if (is_hex(ch1) && is_hex(ch2)) + { + curInd += 2; + s[0] = (to_int(ch1) << 4 & 0xF0) | (to_int(ch2) & 0x0F); + } + + return s; + #undef to_low + #undef is_dig + #undef is_hex + #undef to_int +} +bkstring WCPattern::parseEscape(bool & inv, bool & quo) +{ + wchar_t ch = pattern[curInd++]; + bkstring classes = L""; + + if (curInd > (int)pattern.size()) + { + raiseError(); + return NULL; + } + + quo = 0; + inv = 0; + switch (ch) + { + case (wchar_t)'p': classes = parsePosix(); break; + case (wchar_t)'P': classes = L"!!"; classes += parsePosix(); break; + case (wchar_t)'d': classes = L"0123456789"; break; + case (wchar_t)'D': classes = L"!!0123456789"; break; + case (wchar_t)'s': classes = L" \t\r\n\f"; break; + case (wchar_t)'S': classes = L"!! \t\r\n\f"; break; + case (wchar_t)'w': classes = L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; break; + case (wchar_t)'W': classes = L"!!abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; break; + case (wchar_t)'0': classes = parseOctal(); break; + case (wchar_t)'x': classes = parseHex(); break; + + case (wchar_t)'Q': quo = 1; break; + case (wchar_t)'t': classes = L"\t"; break; + case (wchar_t)'r': classes = L"\r"; break; + case (wchar_t)'n': classes = L"\n"; break; + case (wchar_t)'f': classes = L"\f"; break; + case (wchar_t)'a': classes = L"\a"; break; + case (wchar_t)'e': classes = L"\r"; break; + default: classes = L" "; classes[0] = ch; break; + } + if (classes.substr(0, 2) == L"!!") + { + classes = classes.substr(2); + inv = 1; + } + return classes; +} +NFAUNode * WCPattern::parseRegisteredWCPattern(NFAUNode ** end) +{ + int i, j; + bkstring s; + NFAUNode * ret = NULL; + for (i = curInd; i < (int)pattern.size() && pattern[i] != (wchar_t)'}'; ++i) { } + if (pattern[i] != (wchar_t)'}') { raiseError(); return NULL; } + if (i == curInd + 1) { raiseError(); return NULL; } // {} + if ( + !( + (pattern[curInd] >= (wchar_t)'a' && pattern[curInd] <= (wchar_t)'z') || + (pattern[curInd] >= (wchar_t)'A' && pattern[curInd] <= (wchar_t)'Z') || + (pattern[curInd] == (wchar_t)'_') + ) + ) + { + raiseError(); + return NULL; + } + for (j = curInd; !error && j < i; ++j) + { + if ( + !( + (pattern[j] >= (wchar_t)'a' && pattern[j] <= (wchar_t)'z') || + (pattern[j] >= (wchar_t)'A' && pattern[j] <= (wchar_t)'Z') || + (pattern[j] >= (wchar_t)'0' && pattern[j] <= (wchar_t)'9') || + (pattern[j] == (wchar_t)'_') + ) + ) + { + raiseError(); + return NULL; + } + } + s = pattern.substr(curInd, i - curInd); + if (registeredWCPatterns.find(s) == registeredWCPatterns.end()) raiseError(); + else + { + unsigned long oflags = flags; + bkstring op = pattern; + int ci = i + 1; + + pattern = registeredWCPatterns[s].first; + curInd = 0; + flags = registeredWCPatterns[s].second; + + --groupCount; + ret = parse(0, 0, end); + + pattern = op; + curInd = ci; + flags = oflags; + } + if (error) { *end = ret = NULL; } + return ret; +} + +// look behind should interpret everything as a literal (except \\) since the +// pattern must have a concrete length +NFAUNode * WCPattern::parseBehind(const bool pos, NFAUNode ** end) +{ + bkstring t = L""; + while (curInd < (int)pattern.size() && pattern[curInd] != (wchar_t)')') + { + wchar_t ch = pattern[curInd++]; + t += L" "; + if (ch == (wchar_t)'\\') + { + if (curInd + 1 >= (int)pattern.size()) + { + raiseError(); + return *end = registerNode(new NFACharUNode((wchar_t)' ')); + } + ch = pattern[curInd++]; + } + t[t.size() - 1] = ch; + } + if (curInd >= (int)pattern.size() || pattern[curInd] != (wchar_t)')') raiseError(); + else ++curInd; + return *end = registerNode(new NFALookBehindUNode(t, pos)); +} +NFAUNode * WCPattern::parseQuote() +{ + bool done = 0; + bkstring s = L""; + + while (!done) + { + if (curInd >= (int)pattern.size()) + { + raiseError(); + done = 1; + } + else if (pattern.substr(curInd, 2) == L"\\E") + { + curInd += 2; + done = 1; + } + else if (pattern[curInd] == (wchar_t)'\\') + { + s += L" "; + s[s.size() - 1] = pattern[++curInd]; + ++curInd; + } + else + { + s += L" "; + s[s.size() - 1] = pattern[curInd++]; + } + } + if ((flags & WCPattern::CASE_INSENSITIVE) != 0) return registerNode(new NFACIQuoteUNode(s)); + return registerNode(new NFAQuoteUNode(s)); +} +NFAUNode * WCPattern::parse(const bool inParen, const bool inOr, NFAUNode ** end) +{ + NFAUNode * start, * cur, * next = NULL; + bkstring t; + int grc = groupCount++; + bool inv, quo; + bool ahead = 0, pos = 0, noncap = 0, indep = 0; + unsigned long oldFlags = flags; + + if (inParen) + { + if (pattern[curInd] == (wchar_t)'?') + { + ++curInd; + --groupCount; + if (pattern[curInd] == (wchar_t)':') { noncap = 1; ++curInd; grc = --nonCapGroupCount; } + else if (pattern[curInd] == (wchar_t)'=') { ++curInd; ahead = 1; pos = 1; } + else if (pattern[curInd] == (wchar_t)'!') { ++curInd; ahead = 1; pos = 0; } + else if (pattern.substr(curInd, 2) == L"<=") { curInd += 2; return parseBehind(1, end); } + else if (pattern.substr(curInd, 2) == L"') { ++curInd; indep = 1; } + else + { + bool negate = false, done = false; + while (!done) + { + if (curInd >= (int)pattern.size()) + { + raiseError(); + return NULL; + } + else if (negate) + { + switch (pattern[curInd]) + { + case (wchar_t)'i': flags &= ~WCPattern::CASE_INSENSITIVE; break; + case (wchar_t)'d': flags &= ~WCPattern::UNIX_LINE_MODE; break; + case (wchar_t)'m': flags &= ~WCPattern::MULTILINE_MATCHING; break; + case (wchar_t)'s': flags &= ~WCPattern::DOT_MATCHES_ALL; break; + case (wchar_t)':': done = true; break; + case (wchar_t)')': + ++curInd; + *end = registerNode(new NFALookBehindUNode(L"", true)); + return *end; + case (wchar_t)'-': + default: raiseError(); return NULL; + } + } + else + { + switch (pattern[curInd]) + { + case (wchar_t)'i': flags |= WCPattern::CASE_INSENSITIVE; break; + case (wchar_t)'d': flags |= WCPattern::UNIX_LINE_MODE; break; + case (wchar_t)'m': flags |= WCPattern::MULTILINE_MATCHING; break; + case (wchar_t)'s': flags |= WCPattern::DOT_MATCHES_ALL; break; + case (wchar_t)':': done = true; break; + case (wchar_t)'-': negate = true; break; + case (wchar_t)')': + ++curInd; + *end = registerNode(new NFALookBehindUNode(L"", true)); + return *end; + default: raiseError(); return NULL; + } + } + ++curInd; + } + noncap = 1; + grc = --nonCapGroupCount; + } + + if (noncap) cur = start = registerNode(new NFAGroupHeadUNode(grc)); + else cur = start = registerNode(new NFASubStartUNode); + } + else cur = start = registerNode(new NFAGroupHeadUNode(grc)); + } + else cur = start = registerNode(new NFASubStartUNode); + while (curInd < (int)pattern.size()) + { + wchar_t ch = pattern[curInd++]; + + next = NULL; + if (error) return NULL; + switch (ch) + { + case (wchar_t)'^': + if ((flags & WCPattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAStartOfLineUNode); + else next = registerNode(new NFAStartOfInputUNode); + break; + case (wchar_t)'$': + if ((flags & WCPattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAEndOfLineUNode); + else next = registerNode(new NFAEndOfInputUNode(0)); + break; + case (wchar_t)'|': + --groupCount; + cur->next = registerNode(new NFAAcceptUNode); + cur = start = registerNode(new NFAOrUNode(start, parse(inParen, 1))); + break; + case (wchar_t)'\\': + if (curInd < (int)pattern.size()) + { + bool eoi = 0; + switch (pattern[curInd]) + { + case (wchar_t)'1': + case (wchar_t)'2': + case (wchar_t)'3': + case (wchar_t)'4': + case (wchar_t)'5': + case (wchar_t)'6': + case (wchar_t)'7': + case (wchar_t)'8': + case (wchar_t)'9': next = parseBackref(); break; + case (wchar_t)'A': ++curInd; next = registerNode(new NFAStartOfInputUNode); break; + case (wchar_t)'B': ++curInd; next = registerNode(new NFAWordBoundaryUNode(0)); break; + case (wchar_t)'b': ++curInd; next = registerNode(new NFAWordBoundaryUNode(1)); break; + case (wchar_t)'G': ++curInd; next = registerNode(new NFAEndOfMatchUNode); break; + case (wchar_t)'Z': eoi = 1; + case (wchar_t)'z': ++curInd; next = registerNode(new NFAEndOfInputUNode(eoi)); break; + default: + t = parseEscape(inv, quo); + //printf("inv quo classes { %c %c %s }\n", inv ? (wchar_t)'t' : (wchar_t)'f', quo ? (wchar_t)'t' : (wchar_t)'f', t.c_str()); + if (!quo) + { + if (t.size() > 1 || inv) + { + if ((flags & WCPattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACIClassUNode(t, inv)); + else next = registerNode(new NFAClassUNode(t, inv)); + } + else + { + next = registerNode(new NFACharUNode(t[0])); + } + } + else + { + next = parseQuote(); + } + } + } + else raiseError(); + break; + case (wchar_t)'[': + if ((flags & WCPattern::CASE_INSENSITIVE) == 0) + { + NFAClassUNode * clazz = new NFAClassUNode(); + bkstring s = parseClass(); + for (int i = 0; i < (int)s.size(); ++i) clazz->vals[s[i]] = 1; + next = registerNode(clazz); + } + else + { + NFACIClassUNode * clazz = new NFACIClassUNode(); + bkstring s = parseClass(); + for (int i = 0; i < (int)s.size(); ++i) clazz->vals[to_lower(s[i])] = 1; + next = registerNode(clazz); + } + break; + case (wchar_t)'.': + { + bool useN = 1, useR = 1; + NFAClassUNode * clazz = new NFAClassUNode(1); + if ((flags & WCPattern::UNIX_LINE_MODE) != 0) useR = 0; + if ((flags & WCPattern::DOT_MATCHES_ALL) != 0) useN = useR = 0; + if (useN) clazz->vals[(wchar_t)'\n'] = 1; + if (useR) clazz->vals[(wchar_t)'\r'] = 1; + next = registerNode(clazz); + } + break; + case (wchar_t)'(': + { + NFAUNode * end, * t1, * t2; + t1 = parse(1, 0, &end); + if (!t1) raiseError(); + else if (t1->isGroupHeadNode() && (t2 = quantifyGroup(t1, end, grc)) != NULL) + { + cur->next = t2; + cur = t2->next; + } + else + { + cur->next = t1; + cur = end; + } + } + break; + case (wchar_t)')': + if (!inParen) raiseError(); + else if (inOr) + { + --curInd; + cur = cur->next = registerNode(new NFAAcceptUNode); + flags = oldFlags; + return start; + } + else + { + if (ahead) + { + cur = cur->next = registerNode(new NFAAcceptUNode); + flags = oldFlags; + return *end = registerNode(new NFALookAheadUNode(start, pos)); + } + else if (indep) + { + cur = cur->next = registerNode(new NFAAcceptUNode); + flags = oldFlags; + return *end = registerNode(new NFAPossessiveQuantifierUNode(this, start, 1, 1)); + } + else // capping or noncapping, it doesnt matter + { + *end = cur = cur->next = registerNode(new NFAGroupTailUNode(grc)); + next = quantifyGroup(start, *end, grc); + if (next) + { + start = next; + *end = next->next; + } + flags = oldFlags; + return start; + } + } + break; + case (wchar_t)'{': // registered pattern + cur->next = parseRegisteredWCPattern(&next); + if (cur->next) cur = next; + break; + case (wchar_t)'*': + case (wchar_t)'+': + case (wchar_t)'?': +// case (wchar_t)'}': +// case (wchar_t)']': + raiseError(); + break; + default: + if ((flags & WCPattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACICharUNode(ch)); + else next = registerNode(new NFACharUNode(ch)); + break; + } + if (next) cur = cur->next = quantify(next); + } + if (inParen) raiseError(); + else + { + if (inOr) cur = cur->next = registerNode(new NFAAcceptUNode); + if (end) *end = cur; + } + + flags = oldFlags; + if (error) return NULL; + + return start; +} + +WCPattern * WCPattern::compile(const bkstring & pattern, const unsigned long mode) +{ + WCPattern * p = new WCPattern(pattern); + NFAUNode * end; + + p->flags = mode; + if ((mode & WCPattern::LITERAL) != 0) + { + p->head = p->registerNode(new NFAStartUNode); + if ((mode & WCPattern::CASE_INSENSITIVE) != 0) p->head->next = p->registerNode(new NFACIQuoteUNode(pattern)); + else p->head->next = p->registerNode(new NFAQuoteUNode(pattern)); + p->head->next->next = p->registerNode(new NFAEndUNode); + } + else + { + p->head = p->parse(0, 0, &end); + if (!p->head) + { + delete p; + p = NULL; + } + else + { + if (!(p->head && p->head->isStartOfInputNode())) + { + NFAUNode * n = p->registerNode(new NFAStartUNode); + n->next = p->head; + p->head = n; + } + end->next = p->registerNode(new NFAEndUNode); + } + } + if (p != NULL) + { + p->matcher = new WCMatcher(p, L""); + } + + return p; +} + +WCPattern * WCPattern::compileAndKeep(const bkstring & pattern, const unsigned long mode) +{ + WCPattern * ret = NULL; + std::map::iterator it = compiledWCPatterns.find(pattern); + + if (it != compiledWCPatterns.end()) + { + ret = it->second; + } + else + { + ret = compile(pattern, mode); + compiledWCPatterns[pattern] = ret; + } + + return ret; +} +bkstring WCPattern::replace(const bkstring & pattern, const bkstring & str, + const bkstring & replacementText, const unsigned long mode) +{ + bkstring ret; + WCPattern * p = WCPattern::compile(pattern, mode); + if (p) + { + ret = p->replace(str, replacementText); + delete p; + } + return ret; +} + +std::vector WCPattern::split(const bkstring & pattern, const bkstring & str, const bool keepEmptys, + const unsigned long limit, const unsigned long mode) +{ + std::vector ret; + WCPattern * p = WCPattern::compile(pattern, mode); + if (p) + { + ret = p->split(str, keepEmptys, limit); + delete p; + } + return ret; +} + +std::vector WCPattern::findAll(const bkstring & pattern, const bkstring & str, const unsigned long mode) +{ + std::vector ret; + WCPattern * p = WCPattern::compile(pattern, mode); + if (p) + { + ret = p->findAll(str); + delete p; + } + return ret; +} + +bool WCPattern::matches(const bkstring & pattern, const bkstring & str, const unsigned long mode) +{ + bool ret = 0; + WCPattern * p = compile(pattern, mode); + + if (p) + { + ret = p->matches(str); + delete p; + } + + return ret; +} + +bool WCPattern::registerWCPattern(const bkstring & name, const bkstring & pattern, const unsigned long mode) +{ + WCPattern * p = WCPattern::compile(pattern, mode); + if (!p) return 0; + WCPattern::registeredWCPatterns[name] = std::make_pair(pattern, mode); + delete p; + return 1; +} + +void WCPattern::unregisterWCPatterns() +{ + registeredWCPatterns.clear(); +} +void WCPattern::clearWCPatternCache() +{ + std::map::iterator it; + for (it = compiledWCPatterns.begin(); it != compiledWCPatterns.end(); ++it) + { + delete it->second; + } + compiledWCPatterns.clear(); +} + +std::pair WCPattern::findNthMatch(const bkstring & pattern, const bkstring & str, + const int matchNum, const unsigned long mode) +{ + std::pair ret; + WCPattern * p = WCPattern::compile(pattern, mode); + + ret.second = -1; + if (p) + { + int i = -1; + p->matcher->setString(str); + while (i < matchNum && p->matcher->findNextMatch()) { ++i; } + if (i == matchNum && p->matcher->getStartingIndex() >= 0) + { + ret.first = p->matcher->getGroup(0); + ret.second = p->matcher->getStartingIndex(); + } + delete p; + } + + return ret; +} + +WCPattern::~WCPattern() +{ + /* + nodes.clear(); + if (head) head->findAllNodes(nodes); + */ + if (matcher) delete matcher; + for (std::map::iterator it = nodes.begin(); it != nodes.end(); ++it) delete it->first; +} +bkstring WCPattern::replace(const bkstring & str, const bkstring & replacementText) +{ + int li = 0; + bkstring ret = L""; + + matcher->setString(str); + while (matcher->findNextMatch()) + { + ret += str.substr(li, matcher->getStartingIndex() - li); + ret += matcher->replaceWithGroups(replacementText); + li = matcher->getEndingIndex(); + } + ret += str.substr(li); + + return ret; +} +std::vector WCPattern::split(const bkstring & str, const bool keepEmptys, const unsigned long limit) +{ + unsigned long lim = (limit == 0 ? MAX_QMATCH : limit); + int li = 0; + std::vector ret; + + matcher->setString(str); + + while (matcher->findNextMatch() && ret.size() < lim) + { + if (matcher->getStartingIndex() == 0 && keepEmptys) ret.push_back(L""); + if ((matcher->getStartingIndex() != matcher->getEndingIndex()) || keepEmptys) + { + if (li != matcher->getStartingIndex() || keepEmptys) + { + ret.push_back(str.substr(li, matcher->getStartingIndex() - li)); + } + li = matcher->getEndingIndex(); + } + } + if (li < (int)str.size()) ret.push_back(str.substr(li)); + + return ret; +} +std::vector WCPattern::findAll(const bkstring & str) +{ + matcher->setString(str); + return matcher->findAll(); +} +bool WCPattern::matches(const bkstring & str) +{ + matcher->setString(str); + return matcher->matches(); +} +unsigned long WCPattern::getFlags() const +{ + return flags; +} +bkstring WCPattern::getWCPattern() const +{ + return pattern; +} +WCMatcher * WCPattern::createWCMatcher(const bkstring & str) +{ + return new WCMatcher(this, str); +} + +// NFAUNode + +NFAUNode::NFAUNode() { next = NULL; } +NFAUNode::~NFAUNode() { } +void NFAUNode::findAllNodes(std::map & soFar) +{ + if (soFar.find(this) == soFar.end()) return; + soFar[this] = 1; + if (next) next->findAllNodes(soFar); +} + +// NFACharUNode + +NFACharUNode::NFACharUNode(const wchar_t c) { ch = c; } +int NFACharUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (curInd < (int)str.size() && str[curInd] == ch) return next->match(str, matcher, curInd + 1); + return -1; +} + +// NFACICharUNode + +NFACICharUNode::NFACICharUNode(const wchar_t c) { ch = to_lower(c); } +int NFACICharUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (curInd < (int)str.size() && to_lower(str[curInd]) == ch) return next->match(str, matcher, curInd + 1); + return -1; +} + +// NFAStartUNode + +NFAStartUNode::NFAStartUNode() { } +int NFAStartUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int ret = -1, ci = curInd; + + matcher->starts[0] = curInd; + if ((matcher->getFlags() & WCMatcher::MATCH_ENTIRE_STRING) == (unsigned int)WCMatcher::MATCH_ENTIRE_STRING) + { + if (curInd != 0) + { + matcher->starts[0] = -1; + return -1; + } + return next->match(str, matcher, 0); + } + while ((ret = next->match(str, matcher, ci)) == -1 && ci < (int)str.size()) + { + matcher->clearGroups(); + matcher->starts[0] = ++ci; + } + if (ret < 0) matcher->starts[0] = -1; + return ret; +} + +// NFAEndUNode + +NFAEndUNode::NFAEndUNode() { } +int NFAEndUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + matcher->ends[0] = curInd; + if ((matcher->getFlags() & WCMatcher::MATCH_ENTIRE_STRING) != 0) + { + if (curInd == (int)str.size()) return curInd; + matcher->ends[0] = -1; + return -1; + } + return curInd; +} + +// NFAQuantifierUNode + +void NFAQuantifierUNode::findAllNodes(std::map & soFar) +{ + inner->findAllNodes(soFar); + NFAUNode::findAllNodes(soFar); +} +NFAQuantifierUNode::NFAQuantifierUNode(WCPattern * pat, NFAUNode * internal, const int minMatch, const int maxMatch) +{ + inner = internal; + inner->next = pat->registerNode(new NFAAcceptUNode); + min = (minMatch < WCPattern::MIN_QMATCH) ? WCPattern::MIN_QMATCH : minMatch; + max = (maxMatch > WCPattern::MAX_QMATCH) ? WCPattern::MAX_QMATCH : maxMatch; +} + +int NFAQuantifierUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int i0, i1, i2 = 0; + + i0 = i1 = curInd; + while (i2 < min) + { + + ++i2; + i1 = inner->match(str, matcher, i0); + if (i1 <= i0) return i1; // i1 < i0 means i1 is -1 + i0 = i1; + } + + return i1; +} +// NFAGreedyQuantifierUNode + +NFAGreedyQuantifierUNode::NFAGreedyQuantifierUNode(WCPattern * pat, NFAUNode * internal, const int minMatch, const int maxMatch) + : NFAQuantifierUNode(pat, internal, minMatch, maxMatch) { } +int NFAGreedyQuantifierUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int t = NFAQuantifierUNode::match(str, matcher, curInd); + if (t != -1) return matchInternal(str, matcher, t, min); + return t; +} +int NFAGreedyQuantifierUNode::matchInternal(const bkstring & str, WCMatcher * matcher, const int curInd, const int soFar) const +{ + if (soFar >= max) return next->match(str, matcher, curInd); + + int i, j; + + i = inner->match(str, matcher, curInd); + if (i != -1) + { + j = matchInternal(str, matcher, i, soFar + 1); + if (j != -1) return j; + } + return next->match(str, matcher, curInd); +} + +// NFALazyQuantifierUNode + +NFALazyQuantifierUNode::NFALazyQuantifierUNode(WCPattern * pat, NFAUNode * internal, const int minMatch, const int maxMatch) + : NFAQuantifierUNode(pat, internal, minMatch, maxMatch) { } +int NFALazyQuantifierUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int i, j, m = NFAQuantifierUNode::match(str, matcher, curInd); + + if (m == -1) return -1; + + for (i = min; i < max; ++i) + { + j = next->match(str, matcher, m); + if (j == -1) + { + j = inner->match(str, matcher, m); + // if j < m, then j is -1, so we bail. + // if j == m, then we would just go and call next->match on the same index, + // but it already failed trying to match right there, so we know we can + // just bail + if (j <= m) return -1; + m = j; + } + else return j; + } + return next->match(str, matcher, m); +} + +// NFAPossessiveQuantifierUNode + +NFAPossessiveQuantifierUNode::NFAPossessiveQuantifierUNode(WCPattern * pat, NFAUNode * internal, const int minMatch, const int maxMatch) + : NFAQuantifierUNode(pat, internal, minMatch, maxMatch) { } +int NFAPossessiveQuantifierUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int i, j, m = NFAQuantifierUNode::match(str, matcher, curInd); + + if (m == -1) return -1; + for (i = min; i < max; ++i) + { + j = inner->match(str, matcher, m); + if (j <= m) return next->match(str, matcher, m); + m = j; + } + return next->match(str, matcher, m); +} + +// NFAAcceptUNode + +NFAAcceptUNode::NFAAcceptUNode() { } +int NFAAcceptUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (!next) return curInd; + else return next->match(str, matcher, curInd); +} + +// NFAClassUNode + +NFAClassUNode::NFAClassUNode(const bool invert) +{ + inv = invert; +} +NFAClassUNode::NFAClassUNode(const bkstring & clazz, const bool invert) +{ + inv = invert; + for (int i = 0; i < (int)clazz.size(); ++i) vals[clazz[i]] = 1; +} +int NFAClassUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (curInd < (int)str.size() && ((vals.find(str[curInd]) != vals.end()) ^ inv)) + { + return next->match(str, matcher, curInd + 1); + } + return -1; +} + +// NFACIClassUNode + +NFACIClassUNode::NFACIClassUNode(const bool invert) +{ + inv = invert; +} +NFACIClassUNode::NFACIClassUNode(const bkstring & clazz, const bool invert) +{ + inv = invert; + for (int i = 0; i < (int)clazz.size(); ++i) vals[to_lower(clazz[i])] = 1; +} +int NFACIClassUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (curInd < (int)str.size() && ((vals.find(to_lower(str[curInd])) != vals.end()) ^ inv)) + { + return next->match(str, matcher, curInd + 1); + } + return -1; +} + +// NFASubStartUNode + +NFASubStartUNode::NFASubStartUNode() { } +int NFASubStartUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + return next->match(str, matcher, curInd); +} + +// NFAOrUNode + +NFAOrUNode::NFAOrUNode(NFAUNode * first, NFAUNode * second) : one(first), two(second) { } +void NFAOrUNode::findAllNodes(std::map & soFar) +{ + if (one) one->findAllNodes(soFar); + if (two) two->findAllNodes(soFar); + NFAUNode::findAllNodes(soFar); +} +int NFAOrUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int ci = one->match(str, matcher, curInd); + + if (ci != -1) ci = next->match(str, matcher, ci); + if (ci != -1) return ci; + if (ci == -1) ci = two->match(str, matcher, curInd); + if (ci != -1) ci = next->match(str, matcher, ci); + return ci; +} + +// NFAQuoteUNode + +NFAQuoteUNode::NFAQuoteUNode(const bkstring & quoted) : qStr(quoted) { } +int NFAQuoteUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (curInd + qStr.size() > str.size()) return -1; + if (str.substr(curInd, qStr.size()) != qStr) return -1; + return next->match(str, matcher, curInd + (int)qStr.size()); +} + +// NFACIQuoteUNode + +NFACIQuoteUNode::NFACIQuoteUNode(const bkstring & quoted) : qStr(quoted) { } +int NFACIQuoteUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (curInd + qStr.size() > str.size()) return -1; + if (str_icmp(str.substr(curInd, qStr.size()).c_str(), qStr.c_str())) return -1; + return next->match(str, matcher, (int)qStr.size()); +} + +// NFALookAheadUNode + +NFALookAheadUNode::NFALookAheadUNode(NFAUNode * internal, const bool positive) : NFAUNode(), pos(positive), inner(internal) { } +void NFALookAheadUNode::findAllNodes(std::map & soFar) +{ + if (inner) inner->findAllNodes(soFar); + NFAUNode::findAllNodes(soFar); +} +int NFALookAheadUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + return ((inner->match(str, matcher, curInd) == -1) ^ pos) ? next->match(str, matcher, curInd) : -1; +} + +// NFALookBehindUNode + +NFALookBehindUNode::NFALookBehindUNode(const bkstring & str, const bool positive) : pos(positive), mStr(str) { } +int NFALookBehindUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (pos) + { + if (curInd < (int)mStr.size()) return -1; + if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return next->match(str, matcher, curInd); + } + else + { + if (curInd < (int)mStr.size()) return next->match(str, matcher, curInd); + if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return -1; + return next->match(str, matcher, curInd); + } + return -1; +} + +// NFAStartOfLineUNode + +NFAStartOfLineUNode::NFAStartOfLineUNode() { } +int NFAStartOfLineUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (curInd == 0 || str[curInd - 1] == (wchar_t)'\n' || str[curInd - 1] == (wchar_t)'\r') + { + return next->match(str, matcher, curInd); + } + return -1; +} + +// NFAEndOfLineUNode + +NFAEndOfLineUNode::NFAEndOfLineUNode() { } +int NFAEndOfLineUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (curInd >= (int)str.size() || str[curInd] == (wchar_t)'\n' || str[curInd] == (wchar_t)'\r') + { + return next->match(str, matcher, curInd); + } + return -1; +} + +// NFAReferenceUNode + +NFAReferenceUNode::NFAReferenceUNode(const int groupIndex) : gi(groupIndex) { } +int NFAReferenceUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int len = matcher->ends[gi] - matcher->starts[gi]; + int ni = -1; + if (gi < 1 || matcher->ends[gi] < matcher->starts[gi] || len == 0) ni = curInd; + else if (curInd + len > (int)str.size()) return -1; + else if (str.substr(curInd, len) != str.substr(matcher->starts[gi], len)) return -1; + else ni = curInd + len; + + return next->match(str, matcher, ni); +} + +// NFAStartOfInputUNode + +NFAStartOfInputUNode::NFAStartOfInputUNode() { } +int NFAStartOfInputUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (curInd == 0) return next->match(str, matcher, curInd); + return -1; +} + +// NFAEndOfInputUNode + +NFAEndOfInputUNode::NFAEndOfInputUNode(const bool lookForTerm) : term(lookForTerm) { } +int NFAEndOfInputUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int len = (int)str.size(); + if (curInd == len) return next->match(str, matcher, curInd); + else if (term) + { + if (curInd == len - 1 && (str[curInd] == (wchar_t)'\r' || str[curInd] == (wchar_t)'\n')) + { + return next->match(str, matcher, curInd); + } + else if (curInd == len - 2 && str.substr(curInd, 2) == L"\r\n") + { + return next->match(str, matcher, curInd); + } + } + return -1; +} + +// NFAWordBoundaryUNode + +NFAWordBoundaryUNode::NFAWordBoundaryUNode(const bool positive) : pos(positive) { } +int NFAWordBoundaryUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int len = (int)str.size(); + + wchar_t c1 = (curInd - 1 < len && curInd > 0) ? str[curInd - 1] : '\n'; + wchar_t c2 = (curInd < len) ? str[curInd ] : '\n'; + + if (curInd == len) return next->match(str, matcher, curInd); + bool ok = is_alpha(c1) != is_alpha(c2); + if (ok && pos) return next->match(str, matcher, curInd); + return -1; +} + +// NFAEndOfMatchUNode + +NFAEndOfMatchUNode::NFAEndOfMatchUNode() { } +int NFAEndOfMatchUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + if (curInd == matcher->lm) return next->match(str, matcher, curInd); + return -1; +} + +// NFAGroupHeadUNode + +NFAGroupHeadUNode::NFAGroupHeadUNode(const int groupIndex) : gi(groupIndex) { } +int NFAGroupHeadUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int ret, o = matcher->starts[gi]; + + matcher->starts[gi] = curInd; + ret = next->match(str, matcher, curInd); + if (ret < 0) matcher->starts[gi] = o; + + return ret; +} + +// NFAGroupTailUNode + +NFAGroupTailUNode::NFAGroupTailUNode(const int groupIndex) : gi(groupIndex) { } +int NFAGroupTailUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int ret, o = matcher->ends[gi]; + + matcher->ends[gi] = curInd; + ret = next->match(str, matcher, curInd); + if (ret < 0) matcher->ends[gi] = o; + + return ret; +} + +// NFAGroupLoopPrologueUNode + +NFAGroupLoopPrologueUNode::NFAGroupLoopPrologueUNode(const int groupIndex) : gi(groupIndex) { } +int NFAGroupLoopPrologueUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int ret, o1 = matcher->groups[gi], o2 = matcher->groupPos[gi], o3 = matcher->groupIndeces[gi]; + + matcher->groups[gi] = 0; + matcher->groupPos[gi] = 0; + matcher->groupIndeces[gi] = -1; + ret = next->match(str, matcher, curInd); + if (ret < 0) + { + matcher->groups[gi] = o1; + matcher->groupPos[gi] = o2; + matcher->groupIndeces[gi] = o3; + } + + return ret; +} + +// NFAGroupLoopUNode + +NFAGroupLoopUNode::NFAGroupLoopUNode(NFAUNode * internal, const int minMatch, const int maxMatch, + const int groupIndex, const int matchType) +{ + inner = internal; + min = minMatch; + max = maxMatch; + gi = groupIndex; + type = matchType; +} +void NFAGroupLoopUNode::findAllNodes(std::map & soFar) +{ + if (inner) inner->findAllNodes(soFar); + NFAUNode::findAllNodes(soFar); +} +int NFAGroupLoopUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + bool b = (curInd > matcher->groupIndeces[gi]); + + if (b && matcher->groups[gi] < min) + { + ++matcher->groups[gi]; + int o = matcher->groupIndeces[gi]; + matcher->groupIndeces[gi] = curInd; + int ret = inner->match(str, matcher, curInd); + if (ret < 0) + { + matcher->groupIndeces[gi] = o; + --matcher->groups[gi]; + } + return ret; + } + else if (!b || matcher->groups[gi] >= max) + { + return next->match(str, matcher, curInd); + } + else + { + switch (type) + { + case 0: return matchGreedy(str, matcher, curInd); + case 1: return matchLazy(str, matcher, curInd); + case 2: return matchPossessive(str, matcher, curInd); + } + } + return -1; +} +int NFAGroupLoopUNode::matchGreedy(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int o = matcher->groupIndeces[gi]; // save our info for backtracking + matcher->groupIndeces[gi] = curInd; // move along + ++matcher->groups[gi]; + int ret = inner->match(str, matcher, curInd); // match internally + if (ret < 0) + { // if we failed, then restore info and match next + --matcher->groups[gi]; + matcher->groupIndeces[gi] = o; + ret = next->match(str, matcher, curInd); + } + return ret; +} +int NFAGroupLoopUNode::matchLazy(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int ret = next->match(str, matcher, curInd); // be lazy, just go on + if (ret < 0) + { + int o = matcher->groupIndeces[gi]; // save info for backtracking + matcher->groupIndeces[gi] = curInd; // advance our position + ++matcher->groups[gi]; + ret = inner->match(str, matcher, curInd); // match our internal stuff + if (ret < 0) // if we failed, then restore the info + { + --matcher->groups[gi]; + matcher->groupIndeces[gi] = o; + } + } + return ret; +} +int NFAGroupLoopUNode::matchPossessive(const bkstring & str, WCMatcher * matcher, const int curInd) const +{ + int o = matcher->groupIndeces[gi]; // save info for backtracking + matcher->groupPos[gi] = matcher->groups[gi]; // set a flag stating we have matcher at least this much + matcher->groupIndeces[gi] = curInd; // move along + ++matcher->groups[gi]; + int ret = inner->match(str, matcher, curInd); // try and match again + if (ret < 0) + { // if we fail, back off, but to an extent + --matcher->groups[gi]; + matcher->groupIndeces[gi] = o; + if (matcher->groups[gi] == matcher->groupPos[gi]) ret = next->match(str, matcher, curInd); + } + return ret; +} + +#ifdef _WIN32 + #pragma warning(pop) +#endif diff --git a/plugins/SmileyAdd/src/regexp/WCPattern.h b/plugins/SmileyAdd/src/regexp/WCPattern.h new file mode 100644 index 0000000000..3d52a7fd2e --- /dev/null +++ b/plugins/SmileyAdd/src/regexp/WCPattern.h @@ -0,0 +1,1663 @@ +#ifndef __WCPATTERN_H__ +#define __WCPATTERN_H__ + +#ifdef _WIN32 + #pragma warning(disable:4786) +#endif + +#include "bkstring.h" + +#include +#include + +class WCMatcher; +class NFAUNode; +class NFAQuantifierUNode; + +/** + This pattern class is very similar in functionality to Java's + java.util.regex.WCPattern class. The pattern class represents an immutable + regular expression object. Instead of having a single object contain both the + regular expression object and the matching object, instead the two objects are + split apart. The {@link WCMatcher WCMatcher} class represents the maching + object. + + The WCPattern class works primarily off of "compiled" patterns. A typical + instantiation of a regular expression looks like: + +

+  WCPattern * p = WCPattern::compile(L"a*b");
+  WCMatcher * m = p->createWCMatcher(L"aaaaaab");
+  if (m->matches()) ...
+  
+ + However, if you do not need to use a pattern more than once, it is often times + okay to use the WCPattern's static methods insteads. An example looks like this: + +
+  if (WCPattern::matches(L"a*b", L"aaaab")) { ... }
+  
+ + This class does not currently support unicode. The unicode update for this + class is coming soon. + + This class is partially immutable. It is completely safe to call createWCMatcher + concurrently in different threads, but the other functions (e.g. split) should + not be called concurrently on the same WCPattern. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Construct + + Matches + +
+   +
+ Characters +
+ x + + The character x +
+ \\ + + The character \ +
+ \0nn + + The character with octal ASCII value nn +
+ \0nnn + + The character with octal ASCII value nnn +
+ \xhh + + The character with hexadecimal ASCII value hh +
+ \t + + A tab character +
+ \r + + A carriage return character +
+ \n + + A new-line character +
+   +
+ Character Classes +
+ [abc] + + Either a, b, or c +
+ [^abc] + + Any character but a, b, or c +
+ [a-zA-Z] + + Any character ranging from a thru z, or + A thru Z +
+ [^a-zA-Z] + + Any character except those ranging from a thru + z, or A thru Z +
+ [a\-z] + + Either a, -, or z +
+ [a-z[A-Z]] + + Same as [a-zA-Z] +
+ [a-z&&[g-i]] + + Any character in the intersection of a-z and + g-i +
+ [a-z&&[^g-i]] + + Any character in a-z and not in g-i +
+   +
+ Prefefined character classes +
+ . + + Any character. Multiline matching must be compiled into the pattern for + . to match a \r or a \n. + Even if multiline matching is enabled, . will not + match a \r\n, only a \r or a \n. +
+ \d + + [0-9] +
+ \D + + [^\d] +
+ \s + + [ \t\r\n\x0B] +
+ \S + + [^\s] +
+ \w + + [a-zA-Z0-9_] +
+ \W + + [^\w] +
+   +
+ POSIX character classes +
+ \p{Lower} + + [a-z] +
+ \p{Upper} + + [A-Z] +
+ \p{ASCII} + + [\x00-\x7F] +
+ \p{Alpha} + + [a-zA-Z] +
+ \p{Digit} + + [0-9] +
+ \p{Alnum} + + [\w&&[^_]] +
+ \p{Punct} + + [!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~] +
+ \p{XDigit} + + [a-fA-F0-9] +
+   +
+ Boundary Matches +
+ ^ + + The beginning of a line. Also matches the beginning of input. +
+ $ + + The end of a line. Also matches the end of input. +
+ \b + + A word boundary +
+ \B + + A non word boundary +
+ \A + + The beginning of input +
+ \G + + The end of the previous match. Ensures that a "next" match will only + happen if it begins with the character immediately following the end of + the "current" match. +
+ \Z + + The end of input. Will also match if there is a single trailing + \r\n, a single trailing \r, or a single + trailing \n. +
+ \z + + The end of input +
+   +
+ Greedy Quantifiers +
+ x? + + x, either zero times or one time +
+ x* + + x, zero or more times +
+ x+ + + x, one or more times +
+ x{n} + + x, exactly n times +
+ x{n,} + + x, at least n times +
+ x{,m} + + x, at most m times +
+ x{n,m} + + x, at least n times and at most + m times +
+   +
+ Possessive Quantifiers +
+ x?+ + + x, either zero times or one time +
+ x*+ + + x, zero or more times +
+ x++ + + x, one or more times +
+ x{n}+ + + x, exactly n times +
+ x{n,}+ + + x, at least n times +
+ x{,m}+ + + x, at most m times +
+ x{n,m}+ + + x, at least n times and at most + m times +
+   +
+ Reluctant Quantifiers +
+ x?? + + x, either zero times or one time +
+ x*? + + x, zero or more times +
+ x+? + + x, one or more times +
+ x{n}? + + x, exactly n times +
+ x{n,}? + + x, at least n times +
+ x{,m}? + + x, at most m times +
+ x{n,m}? + + x, at least n times and at most + m times +
+   +
+ Operators +
+ xy + + x then y +
+ x|y + + x or y +
+ (x) + + x as a capturing group +
+   +
+ Quoting +
+ \Q + + Nothing, but treat every character (including \s) literally until a + matching \E +
+ \E + + Nothing, but ends its matching \Q +
+   +
+ Special Constructs +
+ (?:x) + + x, but not as a capturing group +
+ (?=x) + + x, via positive lookahead. This means that the + expression will match only if it is trailed by x. + It will not "eat" any of the characters matched by + x. +
+ (?!x) + + x, via negative lookahead. This means that the + expression will match only if it is not trailed by + x. It will not "eat" any of the characters + matched by x. +
+ (?<=x) + + x, via positive lookbehind. x + cannot contain any quantifiers. +
+ (?x) + + x, via negative lookbehind. x + cannot contain any quantifiers. +
+ (?>x) + + x{1}+ +
+   +
+ Registered Expression Matching +
+ {x} + + The registered pattern x +
+ +
+ + Begin Text Extracted And Modified From java.util.regex.WCPattern documentation + +

Backslashes, escapes, and quoting

+ +

The backslash character ((wchar_t)'\') serves to introduce escaped + constructs, as defined in the table above, as well as to quote characters + that otherwise would be interpreted as unescaped constructs. Thus the + expression \\ matches a single backslash and \{ matches a + left brace. + +

It is an error to use a backslash prior to any alphabetic character that + does not denote an escaped construct; these are reserved for future + extensions to the regular-expression language. A backslash may be used + prior to a non-alphabetic character regardless of whether that character is + part of an unescaped construct. + +

It is necessary to double backslashes in string literals that represent + regular expressions to protect them from interpretation by a compiler. The + string literal "\b", for example, matches a single backspace + character when interpreted as a regular expression, while + "\\b" matches a word boundary. The string litera + "\(hello\)" is illegal and leads to a compile-time error; + in order to match the string (hello) the string literal + "\\(hello\\)" must be used. + +

Character Classes

+ +

Character classes may appear within other character classes, and + may be composed by the union operator (implicit) and the intersection + operator (&&). + The union operator denotes a class that contains every character that is + in at least one of its operand classes. The intersection operator + denotes a class that contains every character that is in both of its + operand classes. + +

The precedence of character-class operators is as follows, from + highest to lowest: + +

+ + + + + + + + + + + + + + + + +
1    Literal escape    \x
2    Rangea-z
3    Grouping[...]
4    Intersection[a-z&&[aeiou]]
5    Union[a-e][i-u]
+ +

Note that a different set of metacharacters are in effect inside + a character class than outside a character class. For instance, the + regular expression . loses its special meaning inside a + character class, while the expression - becomes a range + forming metacharacter. + + + + +

Groups and capturing

+ +

Capturing groups are numbered by counting their opening parentheses from + left to right. In the expression ((A)(B(C))), for example, there + are four such groups:

+ +
+ + + + + + + + + + +
1    ((A)(B(C)))
2    (A)
3    (B(C))
4    (C)
+ +

Group zero always stands for the entire expression. + +

Capturing groups are so named because, during a match, each subsequence + of the input sequence that matches such a group is saved. The captured + subsequence may be used later in the expression, via a back reference, and + may also be retrieved from the matcher once the match operation is complete. + +

The captured input associated with a group is always the subsequence + that the group most recently matched. If a group is evaluated a second time + because of quantification then its previously-captured value, if any, will + be retained if the second evaluation fails. Matching the string + L"aba" against the expression (a(b)?)+, for example, leaves + group two set to L"b". All captured input is discarded at the + beginning of each match. + +

Groups beginning with (? are pure, non-capturing groups + that do not capture text and do not count towards the group total. + + +

WC support

+ +

Coming Soon. + +

Comparison to Perl 5

+ +

The WCPattern engine performs traditional NFA-based matching + with ordered alternation as occurs in Perl 5. + +

Perl constructs not supported by this class:

+ +
    + +
  • The conditional constructs (?{X}) and + (?(condition)X|Y), +

  • + +
  • The embedded code constructs (?{code}) + and (??{code}),

  • + +
  • The embedded comment syntax (?#comment), and

  • + +
  • The preprocessing operations \l \u, + \L, and \U.

  • + +
  • Embedded flags

  • + +
+ +

Constructs supported by this class but not by Perl:

+ +
    + +
  • Possessive quantifiers, which greedily match as much as they can + and do not back off, even when doing so would allow the overall match to + succeed.

  • + +
  • Character-class union and intersection as described + above.

  • + +
+ +

Notable differences from Perl:

+ +
    + +
  • In Perl, \1 through \9 are always interpreted + as back references; a backslash-escaped number greater than 9 is + treated as a back reference if at least that many subexpressions exist, + otherwise it is interpreted, if possible, as an octal escape. In this + class octal escapes must always begin with a zero. In this class, + \1 through \9 are always interpreted as back + references, and a larger number is accepted as a back reference if at + least that many subexpressions exist at that point in the regular + expression, otherwise the parser will drop digits until the number is + smaller or equal to the existing number of groups or it is one digit. +

  • + +
  • Perl uses the g flag to request a match that resumes + where the last match left off. This functionality is provided implicitly + by the WCMatcher class: Repeated invocations of the + find method will resume where the last match left off, + unless the matcher is reset.

  • + +
  • Perl is forgiving about malformed matching constructs, as in the + expression *a, as well as dangling brackets, as in the + expression abc], and treats them as literals. This + class also strict and will not compile a pattern when dangling characters + are encountered.

  • + +
+ + +

For a more precise description of the behavior of regular expression + constructs, please see + Mastering Regular Expressions, 2nd Edition, Jeffrey E. F. Friedl, + O'Reilly and Associates, 2002. +

+

+ + End Text Extracted And Modified From java.util.regex.WCPattern documentation + +


+ + @author Jeffery Stuart + @since March 2003, Stable Since November 2004 + @version 1.07.00 + @memo A class used to represent "PERL 5"-ish regular expressions + */ +class WCPattern +{ + friend class WCMatcher; + friend class NFAUNode; + friend class NFAQuantifierUNode; + private: + /** + This constructor should not be called directly. Those wishing to use the + WCPattern class should instead use the {@link compile compile} method. + + @param rhs The pattern to compile + @memo Creates a new pattern from the regular expression in rhs. + */ + WCPattern(const bkstring & rhs); + protected: + /** + This currently is not used, so don't try to do anything with it. + @memo Holds all the compiled patterns for quick access. + */ + static std::map compiledWCPatterns; + /** + Holds all of the registered patterns as strings. Due to certain problems + with compilation of patterns, especially with capturing groups, this seemed + to be the best way to do it. + */ + static std::map > registeredWCPatterns; + protected: + /** + Holds all the NFA nodes used. This makes deletion of a pattern, as well as + clean-up from an unsuccessful compile much easier and faster. + */ + std::map nodes; + /** + Used when methods like split are called. The matcher class uses a lot of + dynamic memeory, so having an instance increases speedup of certain + operations. + */ + WCMatcher * matcher; + /** + The front node of the NFA. + */ + NFAUNode * head; + /** + The actual regular expression we rerpesent + */ + bkstring pattern; + /** + Flag used during compilation. Once the pattern is successfully compiled, + error is no longer used. + */ + bool error; + /** + Used during compilation to keep track of the current index into + {@link pattern pattern}. Once the pattern is successfully + compiled, error is no longer used. + */ + int curInd; + /** + The number of capture groups this contains. + */ + int groupCount; + /** + The number of non-capture groups this contains. + */ + int nonCapGroupCount; + /** + The flags specified when this was compiled. + */ + unsigned long flags; + protected: + /** + Raises an error during compilation. Compilation will cease at that point + and compile will return NULL. + */ + void raiseError(); + /** + Convenience function for registering a node in nodes. + @param node The node to register + @return The registered node + */ + NFAUNode * registerNode(NFAUNode * node); + + /** + Calculates the union of two strings. This function will first sort the + strings and then use a simple selection algorithm to find the union. + @param s1 The first "class" to union + @param s2 The second "class" to union + @return A new string containing all unique characters. Each character + must have appeared in one or both of s1 and + s2. + */ + bkstring classUnion (bkstring s1, bkstring s2) const; + /** + Calculates the intersection of two strings. This function will first sort + the strings and then use a simple selection algorithm to find the + intersection. + @param s1 The first "class" to intersect + @param s2 The second "class" to intersect + @return A new string containing all unique characters. Each character + must have appeared both s1 and s2. + */ + bkstring classIntersect (bkstring s1, bkstring s2) const; + /** + Calculates the negation of a string. The negation is the set of all + characters between \x00 and \xFF not + contained in s1. + @param s1 The "class" to be negated. + @param s2 The second "class" to intersect + @return A new string containing all unique characters. Each character + must have appeared both s1 and s2. + */ + bkstring classNegate (bkstring s1) const; + /** + Creates a new "class" representing the range from low thru + hi. This function will wrap if low > + hi. This is a feature, not a buf. Sometimes it is useful + to be able to say [\x70-\x10] instead of [\x70-\x7F\x00-\x10]. + @param low The beginning character + @param hi The ending character + @return A new string containing all the characters from low thru hi. + */ + bkstring classCreateRange(wchar_t low, wchar_t hi) const; + + /** + Extracts a decimal number from the substring of member-variable + {@link pattern pattern} starting at start and + ending at end. + @param start The starting index in {@link pattern pattern} + @param end The last index in {@link pattern pattern} + @return The decimal number in {@link pattern pattern} + */ + int getInt(int start, int end); + /** + Parses a {n,m} string out of the member-variable + {@link pattern pattern} stores the result in sNum + and eNum. + @param sNum Output parameter. The minimum number of matches required + by the curly quantifier are stored here. + @param eNum Output parameter. The maximum number of matches allowed + by the curly quantifier are stored here. + @return Success/Failure. Fails when the curly does not have the proper + syntax + */ + bool quantifyCurly(int & sNum, int & eNum); + /** + Tries to quantify the currently parsed group. If the group being parsed + is indeed quantified in the member-variable + {@link pattern pattern}, then the NFA is modified accordingly. + @param start The starting node of the current group being parsed + @param stop The ending node of the current group being parsed + @param gn The group number of the current group being parsed + @return The node representing the starting node of the group. If the + group becomes quantified, then this node is not necessarily + a GroupHead node. + */ + NFAUNode * quantifyGroup(NFAUNode * start, NFAUNode * stop, const int gn); + + /** + Tries to quantify the last parsed expression. If the character was indeed + quantified, then the NFA is modified accordingly. + @param newNode The recently created expression node + @return The node representing the last parsed expression. If the + expression was quantified, return value != newNode + */ + NFAUNode * quantify(NFAUNode * newNode); + /** + Parses the current class being examined in + {@link pattern pattern}. + @return A string of unique characters contained in the current class being + parsed + */ + bkstring parseClass(); + /** + Parses the current POSIX class being examined in + {@link pattern pattern}. + @return A string of unique characters representing the POSIX class being + parsed + */ + bkstring parsePosix(); + /** + Returns a string containing the octal character being parsed + @return The string contained the octal value being parsed + */ + bkstring parseOctal(); + /** + Returns a string containing the hex character being parsed + @return The string contained the hex value being parsed + */ + bkstring parseHex(); + /** + Returns a new node representing the back reference being parsed + @return The new node representing the back reference being parsed + */ + NFAUNode * parseBackref(); + /** + Parses the escape sequence currently being examined. Determines if the + escape sequence is a class, a single character, or the beginning of a + quotation sequence. + @param inv Output parameter. Whether or not to invert the returned class + @param quo Output parameter. Whether or not this sequence starts a + quotation. + @return The characters represented by the class + */ + bkstring parseEscape(bool & inv, bool & quo); + /** + Parses a supposed registered pattern currently under compilation. If the + sequence of characters does point to a registered pattern, then the + registered pattern is appended to *end. The registered pattern + is parsed with the current compilation flags. + @param end The ending node of the thus-far compiled pattern + @return The new end node of the current pattern + */ + NFAUNode * parseRegisteredWCPattern(NFAUNode ** end); + /** + Parses a lookbehind expression. Appends the necessary nodes + *end. + @param pos Positive or negative look behind + @param end The ending node of the current pattern + @return The new end node of the current pattern + */ + NFAUNode * parseBehind(const bool pos, NFAUNode ** end); + /** + Parses the current expression and tacks on nodes until a \E is found. + @return The end of the current pattern + */ + NFAUNode * parseQuote(); + /** + Parses {@link pattern pattern}. This function is called + recursively when an or (|) or a group is encountered. + @param inParen Are we currently parsing inside a group + @param inOr Are we currently parsing one side of an or (|) + @param end The end of the current expression + @return The starting node of the NFA constructed from this parse + */ + NFAUNode * parse(const bool inParen = 0, const bool inOr = 0, NFAUNode ** end = NULL); + public: + /// We should match regardless of case + const static unsigned long CASE_INSENSITIVE; + /// We are implicitly quoted + const static unsigned long LITERAL; + /// @memo We should treat a . as [\x00-\x7F] + const static unsigned long DOT_MATCHES_ALL; + /** ^ and $ should anchor to the beginning and + ending of lines, not all input + */ + const static unsigned long MULTILINE_MATCHING; + /** When enabled, only instances of \n are recognized as + line terminators + */ + const static unsigned long UNIX_LINE_MODE; + /// The absolute minimum number of matches a quantifier can match (0) + const static int MIN_QMATCH; + /// The absolute maximum number of matches a quantifier can match (0x7FFFFFFF) + const static int MAX_QMATCH; + public: + /** + Call this function to compile a regular expression into a + WCPattern object. Special values can be assigned to + mode when certain non-standard behaviors are expected from + the WCPattern object. + @param pattern The regular expression to compile + @param mode A bitwise or of flags signalling what special behaviors are + wanted from this WCPattern object + @return If successful, compile returns a WCPattern + pointer. Upon failure, compile returns + NULL + */ + static WCPattern * compile (const bkstring & pattern, + const unsigned long mode = 0); + /** + Dont use this function. This function will compile a pattern, and cache + the result. This will eventually be used as an optimization when people + just want to call static methods using the same pattern over and over + instead of first compiling the pattern and then using the compiled + instance for matching. + @param pattern The regular expression to compile + @param mode A bitwise or of flags signalling what special behaviors are + wanted from this WCPattern object + @return If successful, compileAndKeep returns a + WCPattern pointer. Upon failure, compile + returns NULL. + */ + static WCPattern * compileAndKeep (const bkstring & pattern, + const unsigned long mode = 0); + + /** + Searches through replace and replaces all substrings matched + by pattern with str. str may + contain backreferences (e.g. \1) to capture groups. A typical + invocation looks like: +

+ + WCPattern::replace(L"(a+)b(c+)", L"abcccbbabcbabc", L"\\2b\\1"); + +

+ which would replace abcccbbabcbabc with + cccbabbcbabcba. + @param pattern The regular expression + @param str The replacement text + @param replacementText The string in which to perform replacements + @param mode The special mode requested of the WCPattern + during the replacement process + @return The text with the replacement string substituted where necessary + */ + static bkstring replace (const bkstring & pattern, + const bkstring & str, + const bkstring & replacementText, + const unsigned long mode = 0); + + /** + Splits the specified string over occurrences of the specified pattern. + Empty strings can be optionally ignored. The number of strings returned is + configurable. A typical invocation looks like: +

+ + bkstring str(strSize, 0);
+ FILE * fp = fopen(fileName, "r");
+ fread((char*)str.data(), strSize * 2, 1, fp);
+ fclose(fp);
+
+ std::vector<bkstring> lines = WCPattern::split(L"[\r\n]+", str, true);
+
+
+ + @param pattern The regular expression + @param replace The string to split + @param keepEmptys Whether or not to keep empty strings + @param limit The maximum number of splits to make + @param mode The special mode requested of the WCPattern + during the split process + @return All substrings of str split across pattern. + */ + static std::vector split (const bkstring & pattern, + const bkstring & str, + const bool keepEmptys = 0, + const unsigned long limit = 0, + const unsigned long mode = 0); + + /** + Finds all the instances of the specified pattern within the string. You + should be careful to only pass patterns with a minimum length of one. For + example, the pattern a* can be matched by an empty string, so + instead you should pass a+ since at least one character must + be matched. A typical invocation of findAll looks like: +

+ + std::vector<td::string> numbers = WCPattern::findAll(L"\\d+", string); + +

+ + @param pattern The pattern for which to search + @param str The string to search + @param mode The special mode requested of the WCPattern + during the find process + @return All instances of pattern in str + */ + static std::vector findAll (const bkstring & pattern, + const bkstring & str, + const unsigned long mode = 0); + + /** + Determines if an entire string matches the specified pattern + + @param pattern The pattern for to match + @param str The string to match + @param mode The special mode requested of the WCPattern + during the replacement process + @return True if str is recognized by pattern + */ + static bool matches (const bkstring & pattern, + const bkstring & str, + const unsigned long mode = 0); + + /** + Registers a pattern under a specific name for use in later compilations. + A typical invocation and later use looks like: +

+ + WCPattern::registerWCPattern(L"ip", L"(?:\\d{1,3}\\.){3}\\d{1,3}");
+ WCPattern * p1 = WCPattern::compile(L"{ip}:\\d+");
+ WCPattern * p2 = WCPattern::compile(L"Connection from ({ip}) on port \\d+");
+
+

+ Multiple calls to registerWCPattern with the same + name will result in the pattern getting overwritten. + + @param name The name to give to the pattern + @param pattern The pattern to register + @param mode Any special flags to use when compiling pattern + @return Success/Failure. Fails only if pattern has invalid + syntax + */ + static bool registerWCPattern(const bkstring & name, + const bkstring & pattern, + const unsigned long mode = 0); + + /** + Clears the pattern registry + */ + static void unregisterWCPatterns(); + /** + Don't use + */ + static void clearWCPatternCache(); + + /** + Searches through a string for the nth match of the + given pattern in the string. Match indeces start at zero, not one. + A typical invocation looks like this: +

+ + std::pair<bkstring, int> match = WCPattern::findNthMatch(L"\\d{1,3}", L"192.168.1.101:22", 1);
+ wprintf(L"%s %i\n", match.first.c_str(), match.second);
+
+ Output: 168 4
+
+ + @param pattern The pattern for which to search + @param str The string to search + @param matchNum Which match to find + @param mode Any special flags to use during the matching process + @return A string and an integer. The string is the string matched. The + integer is the starting location of the matched string in + str. You can check for success/failure by making sure + that the integer returned is greater than or equal to zero. + */ + static std::pair findNthMatch (const bkstring & pattern, + const bkstring & str, + const int matchNum, + const unsigned long mode = 0); + public: + /** + Deletes all NFA nodes allocated during compilation + */ + ~WCPattern(); + + bkstring replace (const bkstring & str, + const bkstring & replacementText); + std::vector split (const bkstring & str, const bool keepEmptys = 0, + const unsigned long limit = 0); + std::vector findAll (const bkstring & str); + bool matches (const bkstring & str); + /** + Returns the flags used during compilation of this pattern + @return The flags used during compilation of this pattern + */ + unsigned long getFlags () const; + /** + Returns the regular expression this pattern represents + @return The regular expression this pattern represents + */ + bkstring getWCPattern () const; + /** + Creates a matcher object using the specified string and this pattern. + @param str The string to match against + @return A new matcher using object using this pattern and the specified + string + */ + WCMatcher * createWCMatcher (const bkstring & str); +}; + +class NFAUNode +{ + friend class WCMatcher; + public: + NFAUNode * next; + NFAUNode(); + virtual ~NFAUNode(); + virtual void findAllNodes(std::map & soFar); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const = 0; + inline virtual bool isGroupHeadNode() const { return false; } + inline virtual bool isStartOfInputNode() const { return false; } +}; +class NFACharUNode : public NFAUNode +{ + protected: + wchar_t ch; + public: + NFACharUNode(const wchar_t c); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFACICharUNode : public NFAUNode +{ + protected: + wchar_t ch; + public: + NFACICharUNode(const wchar_t c); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAStartUNode : public NFAUNode +{ + public: + NFAStartUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAEndUNode : public NFAUNode +{ + public: + NFAEndUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAQuantifierUNode : public NFAUNode +{ + public: + int min, max; + NFAUNode * inner; + virtual void findAllNodes(std::map & soFar); + NFAQuantifierUNode(WCPattern * pat, NFAUNode * internal, + const int minMatch = WCPattern::MIN_QMATCH, + const int maxMatch = WCPattern::MAX_QMATCH); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAGreedyQuantifierUNode : public NFAQuantifierUNode +{ + public: + NFAGreedyQuantifierUNode(WCPattern * pat, NFAUNode * internal, + const int minMatch = WCPattern::MIN_QMATCH, + const int maxMatch = WCPattern::MAX_QMATCH); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; + virtual int matchInternal(const bkstring & str, WCMatcher * matcher, const int curInd, const int soFar) const; +}; +class NFALazyQuantifierUNode : public NFAQuantifierUNode +{ + public: + NFALazyQuantifierUNode(WCPattern * pat, NFAUNode * internal, + const int minMatch = WCPattern::MIN_QMATCH, + const int maxMatch = WCPattern::MAX_QMATCH); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAPossessiveQuantifierUNode : public NFAQuantifierUNode +{ + public: + NFAPossessiveQuantifierUNode(WCPattern * pat, NFAUNode * internal, + const int minMatch = WCPattern::MIN_QMATCH, + const int maxMatch = WCPattern::MAX_QMATCH); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAAcceptUNode : public NFAUNode +{ + public: + NFAAcceptUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAClassUNode : public NFAUNode +{ + public: + bool inv; + std::map vals; + NFAClassUNode(const bool invert = 0); + NFAClassUNode(const bkstring & clazz, const bool invert); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFACIClassUNode : public NFAUNode +{ + public: + bool inv; + std::map vals; + NFACIClassUNode(const bool invert = 0); + NFACIClassUNode(const bkstring & clazz, const bool invert); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFASubStartUNode : public NFAUNode +{ + public: + NFASubStartUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAOrUNode : public NFAUNode +{ + public: + NFAUNode * one; + NFAUNode * two; + NFAOrUNode(NFAUNode * first, NFAUNode * second); + virtual void findAllNodes(std::map & soFar); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAQuoteUNode : public NFAUNode +{ + public: + bkstring qStr; + NFAQuoteUNode(const bkstring & quoted); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFACIQuoteUNode : public NFAUNode +{ + public: + bkstring qStr; + NFACIQuoteUNode(const bkstring & quoted); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFALookAheadUNode : public NFAUNode +{ + public: + bool pos; + NFAUNode * inner; + NFALookAheadUNode(NFAUNode * internal, const bool positive); + virtual void findAllNodes(std::map & soFar); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFALookBehindUNode : public NFAUNode +{ + public: + bool pos; + bkstring mStr; + NFALookBehindUNode(const bkstring & str, const bool positive); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAStartOfLineUNode : public NFAUNode +{ + public: + NFAStartOfLineUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAEndOfLineUNode : public NFAUNode +{ + public: + NFAEndOfLineUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAReferenceUNode : public NFAUNode +{ + public: + int gi; + NFAReferenceUNode(const int groupIndex); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAStartOfInputUNode : public NFAUNode +{ + public: + NFAStartOfInputUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; + inline virtual bool isStartOfInputNode() const { return false; } +}; +class NFAEndOfInputUNode : public NFAUNode +{ + public: + bool term; + NFAEndOfInputUNode(const bool lookForTerm); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAWordBoundaryUNode : public NFAUNode +{ + public: + bool pos; + NFAWordBoundaryUNode(const bool positive); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAEndOfMatchUNode : public NFAUNode +{ + public: + NFAEndOfMatchUNode(); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAGroupHeadUNode : public NFAUNode +{ + public: + int gi; + NFAGroupHeadUNode(const int groupIndex); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; + inline virtual bool isGroupHeadNode() const { return false; } +}; +class NFAGroupTailUNode : public NFAUNode +{ + public: + int gi; + NFAGroupTailUNode(const int groupIndex); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAGroupLoopPrologueUNode : public NFAUNode +{ + public: + int gi; + NFAGroupLoopPrologueUNode(const int groupIndex); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; +class NFAGroupLoopUNode : public NFAUNode +{ + public: + int gi, min, max, type; + NFAUNode * inner; + NFAGroupLoopUNode(NFAUNode * internal, const int minMatch, + const int maxMatch, const int groupIndex, const int matchType); + virtual void findAllNodes(std::map & soFar); + virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; + int matchGreedy(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; + int matchLazy(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; + int matchPossessive(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const; +}; + +#endif + diff --git a/plugins/SmileyAdd/src/regexp/test.cpp b/plugins/SmileyAdd/src/regexp/test.cpp new file mode 100644 index 0000000000..bb41566f96 --- /dev/null +++ b/plugins/SmileyAdd/src/regexp/test.cpp @@ -0,0 +1,38 @@ +#include +#include +#include +#include + +int main() +{ + { + Pattern * p = Pattern::compile("^([^:]*)://([^/:]*)((?::[0-9]+)?)/?(([^?]*)((?:\\?.*)?))$"); + Matcher * m0 = p->createMatcher("http://www.example.com:80/test.php?a=1&a=1&a=1"); + + if (m0->matches()) + { + std::vector groups = m0->getGroups(true); + for (int i = 0; i < (int)groups.size(); ++i) + { + printf("m->group(%d): %s\n", i, groups[i].c_str()); + } + } + } + { + std::wstring pat = L"^([^:]*)://([^/:]*)((?::[0-9]+)?)/?(([^?]*)((?:\\?.*)?))$"; + std::wstring mat = L"http://www.example.com:80/test.php?a=1&a=1&a=1"; + UnicodePattern * p = UnicodePattern::compile(pat); + UnicodeMatcher * m0 = p->createUnicodeMatcher(mat); + + if (m0->matches()) + { + std::vector groups = m0->getGroups(true); + for (int i = 0; i < (int)groups.size(); ++i) + { + wprintf(L"m->group(%d): %s\n", i, groups[i].c_str()); + } + } + } + + return 0; +} -- cgit v1.2.3