diff options
Diffstat (limited to 'plugins/smileyadd/regexp')
-rw-r--r-- | plugins/smileyadd/regexp/Matcher.cpp | 178 | ||||
-rw-r--r-- | plugins/smileyadd/regexp/Matcher.h | 248 | ||||
-rw-r--r-- | plugins/smileyadd/regexp/Pattern.cpp | 1709 | ||||
-rw-r--r-- | plugins/smileyadd/regexp/Pattern.h | 1663 | ||||
-rw-r--r-- | plugins/smileyadd/regexp/WCMatcher.cpp | 181 | ||||
-rw-r--r-- | plugins/smileyadd/regexp/WCMatcher.h | 234 | ||||
-rw-r--r-- | plugins/smileyadd/regexp/WCPattern.cpp | 1747 | ||||
-rw-r--r-- | plugins/smileyadd/regexp/WCPattern.h | 1663 | ||||
-rw-r--r-- | plugins/smileyadd/regexp/test.cpp | 38 |
9 files changed, 7661 insertions, 0 deletions
diff --git a/plugins/smileyadd/regexp/Matcher.cpp b/plugins/smileyadd/regexp/Matcher.cpp new file mode 100644 index 0000000000..ebd1f57850 --- /dev/null +++ b/plugins/smileyadd/regexp/Matcher.cpp @@ -0,0 +1,178 @@ +#include <Matcher.h>
+#include <Pattern.h>
+
+const int Matcher::MATCH_ENTIRE_STRING = 0x01;
+
+/*
+ Detailed documentation is provided in this class' header file
+
+ @author Jeffery Stuart
+ @since November 2004
+ @version 1.07.00
+*/
+
+Matcher::Matcher(Pattern * pattern, const bkstring & text)
+{
+ pat = pattern;
+ str = &text;
+ gc = pattern->groupCount;
+ ncgc = -pattern->nonCapGroupCount;
+ flags = 0;
+ matchedSomething = false;
+ starts = new int[gc + ncgc];
+ ends = new int[gc + ncgc];
+ groups = new int[gc + ncgc];
+ groupPos = new int[gc + ncgc];
+ groupIndeces = new int[gc + ncgc];
+ starts = starts + ncgc;
+ ends = ends + ncgc;
+ groups = groups + ncgc;
+ groupPos = groupPos + ncgc;
+ groupIndeces = groupIndeces + ncgc;
+ for (int i = 0; i < gc; ++i) starts[i] = ends[i] = 0;
+}
+Matcher::~Matcher()
+{
+ delete [] (starts - ncgc);
+ delete [] (ends - ncgc);
+ delete [] (groups - ncgc);
+ delete [] (groupIndeces - ncgc);
+ delete [] (groupPos - ncgc);
+}
+void Matcher::clearGroups()
+{
+ int i;
+ lm = 0;
+ for (i = 0; i < gc; ++i) groups[i] = starts[i] = ends[i] = -1;
+ for (i = 1; i <= ncgc; ++i) groups[0 - i] = -1;
+}
+bkstring Matcher::replaceWithGroups(const bkstring & str)
+{
+ bkstring ret = "";
+
+ bkstring t = str;
+ while (t.size() > 0)
+ {
+ if (t[0] == '\\')
+ {
+ t.erase(0, 1);
+ if (t.size() == 0)
+ {
+ ret += "\\";
+ }
+ else if (t[0] < '0' || t[0] > '9')
+ {
+ ret += t[0];
+ t.erase(0, 1);
+ }
+ else
+ {
+ int gn = 0;
+ while (t.size() > 0 && t[0] >= '0' && t[0] <= '9')
+ {
+ gn = gn * 10 + (t[0] - '0');
+ t.erase(0, 1);
+ }
+ ret += getGroup(gn);
+ }
+ }
+ else
+ {
+ ret += t[0];
+ t.erase(0, 1);
+ }
+ }
+
+ return ret;
+}
+unsigned long Matcher::getFlags() const
+{
+ return flags;
+}
+const bkstring& Matcher::getText() const
+{
+ return *str;
+}
+
+bool Matcher::matches()
+{
+ flags = MATCH_ENTIRE_STRING;
+ matchedSomething = false;
+ clearGroups();
+ lm = 0;
+ return pat->head->match(*str, this, 0) == (int)str->size();
+}
+bool Matcher::findFirstMatch()
+{
+ starts[0] = 0;
+ flags = 0;
+ clearGroups();
+ start = 0;
+ lm = 0;
+ ends[0] = pat->head->match(*str, this, 0);
+ if (ends[0] >= 0)
+ {
+ matchedSomething = true;
+ return 1;
+ }
+ return 0;
+}
+bool Matcher::findNextMatch()
+{
+ int s = starts[0], e = ends[0];
+
+ if (!matchedSomething) return findFirstMatch();
+ if (s == e) ++e;
+ flags = 0;
+ clearGroups();
+
+ starts[0] = e;
+ if (e >= (int)str->size()) return 0;
+ start = e;
+ lm = e;
+ ends[0] = pat->head->match(*str, this, e);
+ return ends[0] >= 0;
+}
+std::vector<bkstring> Matcher::findAll()
+{
+ std::vector<bkstring> ret;
+ reset();
+ while (findNextMatch())
+ {
+ ret.push_back(getGroup());
+ }
+ return ret;
+}
+
+void Matcher::reset()
+{
+ lm = 0;
+ clearGroups();
+ matchedSomething = false;
+}
+
+int Matcher::getStartingIndex(const int groupNum) const
+{
+ if (groupNum < 0 || groupNum >= gc) return -1;
+ return starts[groupNum];
+}
+int Matcher::getEndingIndex(const int groupNum) const
+{
+ if (groupNum < 0 || groupNum >= gc) return -1;
+ return ends[groupNum];
+}
+bkstring Matcher::getGroup(const int groupNum) const
+{
+ if (groupNum < 0 || groupNum >= gc) return "";
+ if (starts[groupNum] < 0 || ends[groupNum] < 0) return "";
+ return str->substr(starts[groupNum], ends[groupNum] - starts[groupNum]);
+}
+std::vector<bkstring> Matcher::getGroups(const bool includeGroupZero) const
+{
+ int i, start = (includeGroupZero ? 0 : 1);
+ std::vector<bkstring> ret;
+
+ for (i = start; i < gc; ++i) ret.push_back(getGroup(i));
+ return ret;
+}
+
diff --git a/plugins/smileyadd/regexp/Matcher.h b/plugins/smileyadd/regexp/Matcher.h new file mode 100644 index 0000000000..621184d8c2 --- /dev/null +++ b/plugins/smileyadd/regexp/Matcher.h @@ -0,0 +1,248 @@ +#ifndef __MATCHER_H__
+#define __MATCHER_H__
+
+#include "bkstring.h"
+#include <vector>
+
+class Vector;
+class NFANode;
+class NFAStartNode;
+class NFAEndNode;
+class NFAGroupHeadNode;
+class NFAGroupLoopNode;
+class NFAGroupLoopPrologueNode;
+class NFAGroupTailNode;
+class NFALookBehindNode;
+class NFAStartOfLineNode;
+class NFAEndOfLineNode;
+class NFAEndOfMatchNode;
+class NFAReferenceNode;
+class Pattern;
+
+/**
+ A matcher is a non thread-safe object used to scan strings using a given
+ {@link Pattern Pattern} object. Using a <code>Matcher</code> is the preferred
+ method for scanning strings. Matchers are not thread-safe. Matchers require
+ very little dynamic memory, hence one is encouraged to create several
+ instances of a matcher when necessary as opposed to sharing a single instance
+ of a matcher.
+ <p>
+ The most common methods needed by the matcher are <code>matches</code>,
+ <code>findNextMatch</code>, and <code>getGroup</code>. <code>matches</code>
+ and <code>findNextMatch</code> both return success or failure, and further
+ details can be gathered from their documentation.
+ <p>
+ Unlike Java's <code>Matcher</code>, this class allows you to change the string
+ you are matching against. This provides a small optimization, since you no
+ longer need multiple matchers for a single pattern in a single thread.
+ <p>
+ This class also provides an extremely handy method for replacing text with
+ captured data via the <code>replaceWithGroups</code> method. A typical
+ invocation looks like:
+ <pre>
+ char buf[10000];
+ bkstring str = "\\5 (user name \\1) uses \\7 for his/her shell and \\6 is their home directory";
+ FILE * fp = fopen("/etc/passwd", "r");
+ Pattern::registerPattern("entry", "[^:]+");
+ Pattern * p = Pattern::compile("^({entry}):({entry}):({entry}):({entry}):({entry}):({entry}):({entry})$",
+ Pattern::MULTILINE_MATCHING | Pattern::UNIX_LINE_MODE);
+ Matcher * m = p->createMatcher("");
+ while (fgets(buf, 9999, fp))
+ {
+ m->setString(buf);
+ if (m->matches())
+ {
+ printf("%s\n", m->replaceWithGroups(str).c_str());
+ }
+ }
+ fclose(fp);
+
+ </pre>
+ Calling any of the following functions before first calling
+ <code>matches</code>, <code>findFirstMatch</code>, or
+ <code>findNextMatch</code> results in undefined behavior and may cause your
+ program to crash.
+ <code>
+ <ul>
+ <li>replaceWithGroups</code>
+ <li>getStartingIndex</li>
+ <li>getEndingIndex</li>
+ <li>getGroup</li>
+ <li>getGroups</li>
+ </ul>
+ </code>
+ <p>
+ The function <code>findFirstMatch</code> will attempt to find the first match
+ in the input string. The same results can be obtained by first calling
+ <code>reset</code> followed by <code>findNextMatch</code>.
+ <p>
+ To eliminate the necessity of looping through a string to find all the
+ matching substrings, <code>findAll</code> was created. The function will find
+ all matching substrings and return them in a <code>vector</code>. If you need
+ to examine specific capture groups within the substrings, then this method
+ should not be used.
+
+ @author Jeffery Stuart
+ @since March 2003, Stable Since November 2004
+ @version 1.07.00
+ @memo Mutable object used on instances of a Pattern class
+ */
+class Matcher
+{
+ friend class NFANode;
+ friend class NFAStartNode;
+ friend class NFAEndNode;
+ friend class NFAGroupHeadNode;
+ friend class NFAGroupLoopNode;
+ friend class NFAGroupLoopPrologueNode;
+ friend class NFAGroupTailNode;
+ friend class NFALookBehindNode;
+ friend class NFAStartOfLineNode;
+ friend class NFAEndOfLineNode;
+ friend class NFAEndOfMatchNode;
+ friend class NFAReferenceNode;
+ friend class Pattern;
+ private:
+ /**
+ Creates a new matcher object against <code>text</code> using
+ <code>pattern</code>.
+
+ @param pattern The pattern with which to search
+ @param text The text in which to search
+ */
+ Matcher(Pattern * pattern, const bkstring & text);
+ protected:
+ /// The pattern we use to match
+ Pattern * pat;
+ /// The string in which we are matching
+ const bkstring* str;
+ /// The starting point of our match
+ int start;
+ /// An array of the starting positions for each group
+ int * starts;
+ /// An array of the ending positions for each group
+ int * ends;
+ /// An array of private data used by NFANodes during matching
+ int * groups;
+ /// An array of private data used by NFANodes during matching
+ int * groupIndeces;
+ /// An array of private data used by NFANodes during matching
+ int * groupPos;
+ /// The ending index of the last match
+ int lm;
+ /// The number of capturing groups we have
+ int gc;
+ /// The number of non-capturing groups we havew
+ int ncgc;
+ /// Whether or not we have matched something (used only by findFirstMatch and findNextMatch)
+ int matchedSomething;
+ /// The flags with which we were made
+ unsigned long flags;
+ /// Called by reset to clear the group arrays
+ void clearGroups();
+ public:
+ /// Used internally by match to signify we want the entire string matched
+ const static int MATCH_ENTIRE_STRING;
+ public:
+ /// Cleans up the dynamic memory used by this matcher
+ ~Matcher();
+ /**
+ Replaces the contents of <code>str</code> with the appropriate captured
+ text. <code>str</code> should have at least one back reference, otherwise
+ this function does nothing.
+ @param str The string in which to replace text
+ @return A string with all backreferences appropriately replaced
+ */
+ bkstring replaceWithGroups(const bkstring & str);
+ /**
+ The flags currently being used by the matcher.
+ @return Zero
+ */
+ unsigned long getFlags() const;
+ /**
+ The text being searched by the matcher.
+ @return the text being searched by the matcher.
+ */
+ const bkstring& getText() const;
+
+ /**
+ Scans the string from start to finish for a match. The entire string must
+ match for this function to return success. Group variables are
+ appropriately set and can be queried after this function returns.
+
+ @return Success if and only if the entire string matches the pattern
+ */
+ bool matches();
+ /**
+ Scans the string for the first substring matching the pattern. The entire
+ string does not necessarily have to match for this function to return
+ success. Group variables are appropriately set and can be queried after
+ this function returns.
+
+ @return Success if any substring matches the specified pattern
+ */
+ bool findFirstMatch();
+ /**
+ Scans the string for the next substring matching the pattern. If no calls
+ have been made to findFirstMatch of findNextMatch since the last call to
+ reset, matches, or setString, then this function's behavior results to
+ that of findFirstMatch.
+
+ @return Success if another substring can be found that matches the pattern
+ */
+ bool findNextMatch();
+ /**
+ Returns a vector of every substring in order which matches the given
+ pattern.
+
+ @return Every substring in order which matches the given pattern
+ */
+ std::vector<bkstring> findAll();
+ /**
+ Resets the internal state of the matcher
+ */
+ void reset();
+ /**
+ Same as getText. Left n for backwards compatibilty with old source code
+ @return Returns the string that is currently being used for matching
+ */
+ inline const bkstring& getString() const { return *str; }
+ /**
+ Sets the string to scan
+ @param newStr The string to scan for subsequent matches
+ */
+ inline void setString(const bkstring & newStr) { str = &newStr; reset(); }
+
+ /**
+ Returns the starting index of the specified group.
+ @param groupNum The group to query
+ @return The starting index of the group if it was matched, -1 for an
+ invalid group or if the group was not matched
+ */
+ int getStartingIndex(const int groupNum = 0) const;
+ /**
+ Returns the ending index of the specified group.
+ @param groupNum The group to query
+ @return The ending index of the group if it was matched, -1 for an
+ invalid group or if the group was not matched
+ */
+ int getEndingIndex(const int groupNum = 0) const;
+ /**
+ Returns the specified group. An empty string ("") does not necessarily
+ mean the group was not matched. A group such as (a*b?) could be matched by
+ a zero length. If an empty string is returned, getStartingIndex can be
+ called to determine if the group was actually matched.
+ @param groupNum The group to query
+ @return The text of the group
+ */
+ bkstring getGroup(const int groupNum = 0) const;
+ /**
+ Returns every capture group in a vector
+
+ @param includeGroupZero Whether or not include capture group zero
+ @return Every capture group
+ */
+ std::vector<bkstring> getGroups(const bool includeGroupZero = 0) const;
+};
+
+#endif
diff --git a/plugins/smileyadd/regexp/Pattern.cpp b/plugins/smileyadd/regexp/Pattern.cpp new file mode 100644 index 0000000000..4487dddd4b --- /dev/null +++ b/plugins/smileyadd/regexp/Pattern.cpp @@ -0,0 +1,1709 @@ +/**
+ From the author (Jeff Stuart)
+ "
+ Let me start by saying this file is pretty big. If you feel up to it, you can
+ try making changes yourself, but you would be better off to just email me at
+ stuart@cs.ucdavis.edu if you think there is a bug, or have something useful you
+ would like added. This project is very "near and dear" to me, so I am fairly
+ quick to make bug fixes. The header files for Pattern and Matcher are fairly
+ well documented and the function names are pretty self-explanatory, but if you
+ are having any trouble, feel free to email me at stuart@cs.ucdavis.edu.
+
+ If you email me, make sure you put something like C++RE in the subject because
+ I tend to delete email if I don't recognize the name and the subject is
+ something like "I Need Your Help" or "Got A Second" or "I Found It".
+ "
+ */
+
+/*
+ Detailed documentation is provided in this class' header file
+
+ @author Jeffery Stuart
+ @since November 2004
+ @version 1.07.00
+*/
+
+#define to_lower(a) (char)(unsigned)CharLowerA((LPSTR)static_cast<unsigned char>(a))
+#define is_alpha IsCharAlphaA
+
+#ifdef _WIN32
+ #pragma warning(push)
+ #pragma warning(disable:4996)
+ #define str_icmp lstrcmpiA
+#else
+ #define str_icmp strcasecmp
+#endif
+
+#include <Pattern.h>
+#include <Matcher.h>
+#include <cstring>
+#include <algorithm>
+
+std::map<bkstring, Pattern *> Pattern::compiledPatterns;
+std::map<bkstring, std::pair<bkstring, unsigned long> > Pattern::registeredPatterns;
+
+const int Pattern::MIN_QMATCH = 0x00000000;
+const int Pattern::MAX_QMATCH = 0x7FFFFFFF;
+
+const unsigned long Pattern::CASE_INSENSITIVE = 0x01;
+const unsigned long Pattern::LITERAL = 0x02;
+const unsigned long Pattern::DOT_MATCHES_ALL = 0x04;
+const unsigned long Pattern::MULTILINE_MATCHING = 0x08;
+const unsigned long Pattern::UNIX_LINE_MODE = 0x10;
+
+Pattern::Pattern(const bkstring & rhs)
+{
+ matcher = NULL;
+ pattern = rhs;
+ curInd = 0;
+ groupCount = 0;
+ nonCapGroupCount = 0;
+ error = 0;
+ head = NULL;
+}
+// convenience function in case we want to add any extra debugging output
+void Pattern::raiseError()
+{
+/* switch (pattern[curInd - 1])
+ {
+ case '*':
+ case ')':
+ case '+':
+ case '?':
+ case ']':
+ case '}':
+ fprintf(stderr, "%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');
+ fprintf(stderr, "Syntax Error near here. Possible unescaped meta character.\n");
+ break;
+ default:
+ fprintf(stderr, "%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');
+ fprintf(stderr, "Syntax Error near here. \n");
+ break;
+ }*/
+ error = 1;
+}
+NFANode * Pattern::registerNode(NFANode * node)
+{
+ nodes[node] = 1;
+ return node;
+}
+
+bkstring Pattern::classUnion (bkstring s1, bkstring s2) const
+{
+ char out[300];
+ std::sort(s1.begin(), s1.end());
+ std::sort(s2.begin(), s2.end());
+ *std::set_union(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;
+ return out;
+}
+bkstring Pattern::classIntersect (bkstring s1, bkstring s2) const
+{
+ char out[300];
+ std::sort(s1.begin(), s1.end());
+ std::sort(s2.begin(), s2.end());
+ *std::set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;
+ return out;
+}
+bkstring Pattern::classNegate (bkstring s1) const
+{
+ char out[300];
+ int i, ind = 0;
+ std::map<char, bool> m;
+
+ for (i = 0; i < (int)s1.size(); ++i) m[s1[i]] = 1;
+ for (i = 0xFF; i >= 0; --i) if (m.find((char)i) == m.end()) out[ind++] = (char)i;
+ out[ind] = 0;
+ return bkstring(out, ind);
+}
+bkstring Pattern::classCreateRange(char low, char hi) const
+{
+ char out[300];
+ int ind = 0;
+ while (low != hi) out[ind++] = low++;
+ out[ind++] = low;
+ return bkstring(out, ind);
+}
+
+int Pattern::getInt(int start, int end)
+{
+ int ret = 0;
+ for (; start <= end; ++start) ret = ret * 10 + (pattern[start] - '0');
+ return ret;
+}
+bool Pattern::quantifyCurly(int & sNum, int & eNum)
+{
+ bool good = 1;
+ int i, ci = curInd + 1;
+ int commaInd = ci, endInd = ci, len = (int)pattern.size();
+ sNum = eNum = 0;
+
+ while (endInd < len && pattern[endInd ] != '}') ++endInd;
+ while (commaInd < endInd && pattern[commaInd] != ',') ++commaInd;
+ if (endInd >= len) { raiseError(); return 0; }
+ for (i = ci; good && i < endInd; ++i) if (i != commaInd && !isdigit(pattern[i])) good = 0;
+ if (!good && commaInd < endInd) { raiseError(); return 0; }
+ if (!good) return 0;
+ /* so now everything in here is either a comma (and there is at most one comma) or a digit */
+ if (commaInd == ci) // {,*}
+ {
+ if (endInd == commaInd + 1) { sNum = MIN_QMATCH; eNum = MAX_QMATCH; } // {,} = *
+ else { sNum = MIN_QMATCH; eNum = getInt(commaInd + 1, endInd - 1); } // {,+}
+ }
+ else if (commaInd == endInd - 1) { sNum = getInt(ci, commaInd - 1); eNum = MAX_QMATCH; } // {+,}
+ else if (commaInd == endInd) { sNum = getInt(ci, endInd - 1); eNum = sNum; } // {+}
+ else { sNum = getInt(ci, commaInd - 1); eNum = getInt(commaInd + 1, endInd - 1); } // {+,+}
+ curInd = endInd + 1;
+ return 1;
+}
+NFANode * Pattern::quantifyGroup(NFANode * start, NFANode * stop, const int gn)
+{
+ NFANode * newNode = NULL;
+ int type = 0;
+
+ if (curInd < (int)pattern.size())
+ {
+ char ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1];
+ switch (pattern[curInd])
+ {
+ case '*':
+ ++curInd;
+ switch (ch)
+ {
+ case '?': ++curInd; type = 1; break;
+ case '+': ++curInd; type = 2; break;
+ }
+ newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
+ newNode->next = registerNode(new NFAGroupLoopNode(start, MIN_QMATCH, MAX_QMATCH, gn, type));
+ stop->next = newNode->next;
+ return newNode;
+ case '?':
+ ++curInd;
+ switch (ch)
+ {
+ case '?': ++curInd; type = 1; break;
+ case '+': ++curInd; type = 2; break;
+ }
+ newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
+ newNode->next = registerNode(new NFAGroupLoopNode(start, MIN_QMATCH, 1, gn, type));
+ stop->next = newNode->next;
+ return newNode;
+ case '+':
+ ++curInd;
+ switch (ch)
+ {
+ case '?': ++curInd; type = 1; break;
+ case '+': ++curInd; type = 2; break;
+ }
+ newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
+ newNode->next = registerNode(new NFAGroupLoopNode(start, 1, MAX_QMATCH, gn, type));
+ stop->next = newNode->next;
+ return newNode;
+ case '{':
+ {
+ int s, e;
+ if (quantifyCurly(s, e))
+ {
+ ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1;
+ switch (ch)
+ {
+ case '?': ++curInd; type = 1; break;
+ case '+': ++curInd; type = 2; break;
+ }
+ newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
+ newNode->next = registerNode(new NFAGroupLoopNode(start, s, e, gn, type));
+ stop->next = newNode->next;
+ return newNode;
+ }
+ }
+ default:
+ break;
+ }
+ }
+ return NULL;
+}
+
+NFANode * Pattern::quantify(NFANode * newNode)
+{
+ if (curInd < (int)pattern.size())
+ {
+ char ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1];
+ switch (pattern[curInd])
+ {
+ case '*':
+ ++curInd;
+ switch (ch)
+ {
+ case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
+ case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
+ default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
+ }
+ break;
+ case '?':
+ ++curInd;
+ switch (ch)
+ {
+ case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, MIN_QMATCH, 1)); break;
+ case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, MIN_QMATCH, 1)); break;
+ default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, MIN_QMATCH, 1)); break;
+ }
+ break;
+ case '+':
+ ++curInd;
+ switch (ch)
+ {
+ case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, 1, MAX_QMATCH)); break;
+ case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, 1, MAX_QMATCH)); break;
+ default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, 1, MAX_QMATCH)); break;
+ }
+ break;
+ case '{':
+ {
+ int s, e;
+ if (quantifyCurly(s, e))
+ {
+ ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1;
+ switch (ch)
+ {
+ case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, s, e)); break;
+ case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, s, e)); break;
+ default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, s, e)); break;
+ }
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ return newNode;
+}
+bkstring Pattern::parseClass()
+{
+ bkstring t, ret = "";
+ char ch, c1, c2;
+ bool inv = 0, neg = 0, quo = 0;
+
+ if (curInd < (int)pattern.size() && pattern[curInd] == '^')
+ {
+ ++curInd;
+ neg = 1;
+ }
+ while (curInd < (int)pattern.size() && pattern[curInd] != ']')
+ {
+ ch = pattern[curInd++];
+ if (ch == '[')
+ {
+ t = parseClass();
+ ret = classUnion(ret, t);
+ }
+ /*else if (ch == '-')
+ {
+ raiseError();
+ curInd = pattern.size();
+ }*/
+ else if (ch == '&' && curInd < (int)pattern.size() && pattern[curInd] == '&')
+ {
+ if (pattern[++curInd] != '[')
+ {
+ raiseError();
+ curInd = (int)pattern.size();
+ }
+ else
+ {
+ ++curInd;
+ t = parseClass();
+ ret = classIntersect(ret, t);
+ }
+ }
+ else if (ch == '\\')
+ {
+ t = parseEscape(inv, quo);
+ if (quo)
+ {
+ raiseError();
+ curInd = (int)pattern.size();
+ }
+ else if (inv || t.size() > 1) // cant be part of a range (a-z)
+ {
+ if (inv) t = classNegate(t);
+ ret = classUnion(ret, t);
+ }
+ else if (curInd < (int)pattern.size() && pattern[curInd] == '-') // part of a range (a-z)
+ {
+ c1 = t[0];
+ ++curInd;
+ if (curInd >= (int)pattern.size()) raiseError();
+ else
+ {
+ c2 = pattern[curInd++];
+ if (c2 == '\\')
+ {
+ t = parseEscape(inv, quo);
+ if (quo)
+ {
+ raiseError();
+ curInd = (int)pattern.size();
+ }
+ else if (inv || t.size() > 1) raiseError();
+ else ret = classUnion(ret, classCreateRange(c1, c2));
+ }
+ else if (c2 == '[' || c2 == ']' || c2 == '-' || c2 == '&')
+ {
+ raiseError();
+ curInd = (int)pattern.size();
+ }
+ else ret = classUnion(ret, classCreateRange(c1, c2));
+ }
+ }
+ else
+ {
+ ret = classUnion(ret, t);
+ }
+ }
+ else if (curInd < (int)pattern.size() && pattern[curInd] == '-')
+ {
+ c1 = ch;
+ ++curInd;
+ if (curInd >= (int)pattern.size()) raiseError();
+ else
+ {
+ c2 = pattern[curInd++];
+ if (c2 == '\\')
+ {
+ t = parseEscape(inv, quo);
+ if (quo)
+ {
+ raiseError();
+ curInd = (int)pattern.size();
+ }
+ else if (inv || t.size() > 1) raiseError();
+ else ret = classUnion(ret, classCreateRange(c1, c2));
+ }
+ else if (c2 == '[' || c2 == ']' || c2 == '-' || c2 == '&')
+ {
+ raiseError();
+ curInd = (int)pattern.size();
+ }
+ else
+ {
+ ret = classUnion(ret, classCreateRange(c1, c2));
+ }
+ }
+ }
+ else
+ {
+ ret += " ";
+ ret[ret.size() - 1] = ch;
+ }
+ }
+ if (curInd >= (int)pattern.size() || pattern[curInd] != ']')
+ {
+ raiseError();
+ ret = "";
+ }
+ else
+ {
+ ++curInd;
+ if (neg) ret = classNegate(ret);
+ }
+ return ret;
+}
+bkstring Pattern::parsePosix()
+{
+ bkstring s7 = pattern.substr(curInd, 7);
+ if (s7 == "{Lower}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyz"; }
+ if (s7 == "{Upper}") { curInd += 7; return "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; }
+ if (s7 == "{Alpha}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; }
+ if (s7 == "{Digit}") { curInd += 7; return "0123456789"; }
+ if (s7 == "{Alnum}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; }
+ if (s7 == "{Punct}") { curInd += 7; return "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
+ if (s7 == "{Graph}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
+ if (s7 == "{Print}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
+ if (s7 == "{Blank}") { curInd += 7; return " \t"; }
+ if (s7 == "{Space}") { curInd += 7; return " \t\n\x0B\f\r"; }
+ if (s7 == "{Cntrl}")
+ {
+ bkstring::value_type i;
+ bkstring s = " ";
+
+ for (i = 0; i < 5; ++i) s += s;
+ s += " ";
+ for (i = 0; i <= 0x1F; ++i) s[i] = i;
+ s[0x20] = 0x7F;
+ curInd += 7;
+ return s;
+ }
+ if (s7 == "{ASCII}")
+ {
+ bkstring s(0x80, ' ');
+ for (int i = 0; i <= 0x7f; ++i) s[i] = (bkstring::value_type)i;
+ curInd += 7;
+ return s;
+ }
+ if (pattern.substr(curInd, 8) == "{XDigit}") { curInd += 8; return "abcdefABCDEF0123456789"; }
+ raiseError();
+ return "";
+}
+NFANode * Pattern::parseBackref()
+{
+ #define is_dig(x) ((x) >= '0' && (x) <= '9')
+ #define to_int(x) ((x) - '0')
+ int ci = curInd;
+ int oldRef = 0, ref = 0;
+
+ while (ci < (int)pattern.size() && is_dig(pattern[ci]) && (ref < 10 || ref < groupCount))
+ {
+ oldRef = ref;
+ ref = ref * 10 + to_int(pattern[ci++]);
+ }
+ if (ci == (int)pattern.size())
+ {
+ oldRef = ref;
+ ++ci;
+ }
+ if (oldRef < 0 || ci <= curInd)
+ {
+ raiseError();
+ return registerNode(new NFAReferenceNode(-1));
+ }
+ curInd = ci;
+ return registerNode(new NFAReferenceNode(ref));
+
+ #undef is_dig
+ #undef to_int
+}
+bkstring Pattern::parseOctal()
+{
+ #define islowoc(x) ((x) >= '0' && (x) <= '3')
+ #define isoc(x) ((x) >= '0' && (x) <= '7')
+ #define fromoc(x) ((x) - '0')
+ int ci = curInd;
+ char ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : -1;
+ char ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : -1;
+ char ch3 = (ci + 2 < (int)pattern.size()) ? pattern[ci + 2] : -1;
+ bkstring s = " ";
+
+ if (islowoc(ch1) && isoc(ch2))
+ {
+ curInd += 2;
+ s[0] = fromoc(ch1) * 8 + fromoc(ch2);
+ if (isoc(ch3))
+ {
+ ++curInd;
+ s[0] = s[0] * 8 + fromoc(ch3);
+ }
+ }
+ else if (isoc(ch1) && isoc(ch2))
+ {
+ curInd += 2;
+ s[0] = fromoc(ch1) * 8 + fromoc(ch2);
+ }
+ else raiseError();
+
+ return s;
+ #undef islowoc
+ #undef isoc
+ #undef fromoc
+}
+bkstring Pattern::parseHex()
+{
+ #define to_low(x) (((x) >= 'A' && (x) <= 'Z') ? ((x) - 'A' + 'a') : (x))
+ #define is_dig(x) ((x) >= '0' && (x) <= '9')
+ #define is_hex(x) (is_dig(x) || (to_low(x) >= 'a' && to_low(x) <= 'f'))
+ #define to_int(x) ((is_dig(x)) ? ((x) - '0') : (to_low(x) - 'a' + 10))
+
+ int ci = curInd;
+ char ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : -1;
+ char ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : -1;
+ bkstring s = " ";
+
+ if (is_hex(ch1) && is_hex(ch2))
+ {
+ curInd += 2;
+ s[0] = (to_int(ch1) << 4 & 0xF0) | (to_int(ch2) & 0x0F);
+ }
+
+ return s;
+ #undef to_low
+ #undef is_dig
+ #undef is_hex
+ #undef to_int
+}
+bkstring Pattern::parseEscape(bool & inv, bool & quo)
+{
+ char ch = pattern[curInd++];
+ bkstring classes = "";
+
+ if (curInd > (int)pattern.size())
+ {
+ raiseError();
+ return NULL;
+ }
+
+ quo = 0;
+ inv = 0;
+ switch (ch)
+ {
+ case 'p': classes = parsePosix(); break;
+ case 'P': classes = "!!"; classes += parsePosix(); break;
+ case 'd': classes = "0123456789"; break;
+ case 'D': classes = "!!0123456789"; break;
+ case 's': classes = " \t\r\n\f"; break;
+ case 'S': classes = "!! \t\r\n\f"; break;
+ case 'w': classes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; break;
+ case 'W': classes = "!!abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; break;
+ case '0': classes = parseOctal(); break;
+ case 'x': classes = parseHex(); break;
+
+ case 'Q': quo = 1; break;
+ case 't': classes = "\t"; break;
+ case 'r': classes = "\r"; break;
+ case 'n': classes = "\n"; break;
+ case 'f': classes = "\f"; break;
+ case 'a': classes = "\a"; break;
+ case 'e': classes = "\r"; break;
+ default: classes = " "; classes[0] = ch; break;
+ }
+ if (classes.substr(0, 2) == "!!")
+ {
+ classes = classes.substr(2);
+ inv = 1;
+ }
+ return classes;
+}
+NFANode * Pattern::parseRegisteredPattern(NFANode ** end)
+{
+ int i, j;
+ bkstring s;
+ NFANode * ret = NULL;
+ for (i = curInd; i < (int)pattern.size() && pattern[i] != '}'; ++i) { }
+ if (pattern[i] != '}') { raiseError(); return NULL; }
+ if (i == curInd + 1) { raiseError(); return NULL; } // {}
+ if (
+ !(
+ (pattern[curInd] >= 'a' && pattern[curInd] <= 'z') ||
+ (pattern[curInd] >= 'A' && pattern[curInd] <= 'Z') ||
+ (pattern[curInd] == '_')
+ )
+ )
+ {
+ raiseError();
+ return NULL;
+ }
+ for (j = curInd; !error && j < i; ++j)
+ {
+ if (
+ !(
+ (pattern[j] >= 'a' && pattern[j] <= 'z') ||
+ (pattern[j] >= 'A' && pattern[j] <= 'Z') ||
+ (pattern[j] >= '0' && pattern[j] <= '9') ||
+ (pattern[j] == '_')
+ )
+ )
+ {
+ raiseError();
+ return NULL;
+ }
+ }
+ s = pattern.substr(curInd, i - curInd);
+ if (registeredPatterns.find(s) == registeredPatterns.end()) raiseError();
+ else
+ {
+ unsigned long oflags = flags;
+ bkstring op = pattern;
+ int ci = i + 1;
+
+ pattern = registeredPatterns[s].first;
+ curInd = 0;
+ flags = registeredPatterns[s].second;
+
+ --groupCount;
+ ret = parse(0, 0, end);
+
+ pattern = op;
+ curInd = ci;
+ flags = oflags;
+ }
+ if (error) { *end = ret = NULL; }
+ return ret;
+}
+
+// look behind should interpret everything as a literal (except \\) since the
+// pattern must have a concrete length
+NFANode * Pattern::parseBehind(const bool pos, NFANode ** end)
+{
+ bkstring t = "";
+ while (curInd < (int)pattern.size() && pattern[curInd] != ')')
+ {
+ char ch = pattern[curInd++];
+ t += " ";
+ if (ch == '\\')
+ {
+ if (curInd + 1 >= (int)pattern.size())
+ {
+ raiseError();
+ return *end = registerNode(new NFACharNode(' '));
+ }
+ ch = pattern[curInd++];
+ }
+ t[t.size() - 1] = ch;
+ }
+ if (curInd >= (int)pattern.size() || pattern[curInd] != ')') raiseError();
+ else ++curInd;
+ return *end = registerNode(new NFALookBehindNode(t, pos));
+}
+NFANode * Pattern::parseQuote()
+{
+ bool done = 0;
+ bkstring s = "";
+
+ while (!done)
+ {
+ if (curInd >= (int)pattern.size())
+ {
+ raiseError();
+ done = 1;
+ }
+ else if (pattern.substr(curInd, 2) == "\\E")
+ {
+ curInd += 2;
+ done = 1;
+ }
+ else if (pattern[curInd] == '\\')
+ {
+ s += " ";
+ s[s.size() - 1] = pattern[++curInd];
+ ++curInd;
+ }
+ else
+ {
+ s += " ";
+ s[s.size() - 1] = pattern[curInd++];
+ }
+ }
+ if ((flags & Pattern::CASE_INSENSITIVE) != 0) return registerNode(new NFACIQuoteNode(s));
+ return registerNode(new NFAQuoteNode(s));
+}
+NFANode * Pattern::parse(const bool inParen, const bool inOr, NFANode ** end)
+{
+ NFANode * start, * cur, * next = NULL;
+ bkstring t;
+ int grc = groupCount++;
+ bool inv, quo;
+ bool ahead = 0, pos = 0, noncap = 0, indep = 0;
+ unsigned long oldFlags = flags;
+
+ if (inParen)
+ {
+ if (pattern[curInd] == '?')
+ {
+ ++curInd;
+ --groupCount;
+ if (pattern[curInd] == ':') { noncap = 1; ++curInd; grc = --nonCapGroupCount; }
+ else if (pattern[curInd] == '=') { ++curInd; ahead = 1; pos = 1; }
+ else if (pattern[curInd] == '!') { ++curInd; ahead = 1; pos = 0; }
+ else if (pattern.substr(curInd, 2) == "<=") { curInd += 2; return parseBehind(1, end); }
+ else if (pattern.substr(curInd, 2) == "<!") { curInd += 2; return parseBehind(0, end); }
+ else if (pattern[curInd] == '>') { ++curInd; indep = 1; }
+ else
+ {
+ bool negate = false, done = false;
+ while (!done)
+ {
+ if (curInd >= (int)pattern.size())
+ {
+ raiseError();
+ return NULL;
+ }
+ else if (negate)
+ {
+ switch (pattern[curInd])
+ {
+ case 'i': flags &= ~Pattern::CASE_INSENSITIVE; break;
+ case 'd': flags &= ~Pattern::UNIX_LINE_MODE; break;
+ case 'm': flags &= ~Pattern::MULTILINE_MATCHING; break;
+ case 's': flags &= ~Pattern::DOT_MATCHES_ALL; break;
+ case ':': done = true; break;
+ case ')':
+ ++curInd;
+ *end = registerNode(new NFALookBehindNode("", true));
+ return *end;
+ case '-':
+ default: raiseError(); return NULL;
+ }
+ }
+ else
+ {
+ switch (pattern[curInd])
+ {
+ case 'i': flags |= Pattern::CASE_INSENSITIVE; break;
+ case 'd': flags |= Pattern::UNIX_LINE_MODE; break;
+ case 'm': flags |= Pattern::MULTILINE_MATCHING; break;
+ case 's': flags |= Pattern::DOT_MATCHES_ALL; break;
+ case ':': done = true; break;
+ case '-': negate = true; break;
+ case ')':
+ ++curInd;
+ *end = registerNode(new NFALookBehindNode("", true));
+ return *end;
+ default: raiseError(); return NULL;
+ }
+ }
+ ++curInd;
+ }
+ noncap = 1;
+ grc = --nonCapGroupCount;
+ }
+ if (noncap) cur = start = registerNode(new NFAGroupHeadNode(grc));
+ else cur = start = registerNode(new NFASubStartNode);
+ }
+ else cur = start = registerNode(new NFAGroupHeadNode(grc));
+ }
+ else cur = start = registerNode(new NFASubStartNode);
+ while (curInd < (int)pattern.size())
+ {
+ char ch = pattern[curInd++];
+
+ next = NULL;
+ if (error) return NULL;
+ switch (ch)
+ {
+ case '^':
+ if ((flags & Pattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAStartOfLineNode);
+ else next = registerNode(new NFAStartOfInputNode);
+ break;
+ case '$':
+ if ((flags & Pattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAEndOfLineNode);
+ else next = registerNode(new NFAEndOfInputNode(0));
+ break;
+ case '|':
+ --groupCount;
+ cur->next = registerNode(new NFAAcceptNode);
+ cur = start = registerNode(new NFAOrNode(start, parse(inParen, 1)));
+ break;
+ case '\\':
+ if (curInd < (int)pattern.size())
+ {
+ bool eoi = 0;
+ switch (pattern[curInd])
+ {
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': next = parseBackref(); break;
+ case 'A': ++curInd; next = registerNode(new NFAStartOfInputNode); break;
+ case 'B': ++curInd; next = registerNode(new NFAWordBoundaryNode(0)); break;
+ case 'b': ++curInd; next = registerNode(new NFAWordBoundaryNode(1)); break;
+ case 'G': ++curInd; next = registerNode(new NFAEndOfMatchNode); break;
+ case 'Z': eoi = 1;
+ case 'z': ++curInd; next = registerNode(new NFAEndOfInputNode(eoi)); break;
+ default:
+ t = parseEscape(inv, quo);
+ if (!quo)
+ {
+ if (t.size() > 1 || inv)
+ {
+ if ((flags & Pattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACIClassNode(t, inv));
+ else next = registerNode(new NFAClassNode(t, inv));
+ }
+ else
+ {
+ next = registerNode(new NFACharNode(t[0]));
+ }
+ }
+ else
+ {
+ next = parseQuote();
+ }
+ }
+ }
+ else raiseError();
+ break;
+ case '[':
+ if ((flags & Pattern::CASE_INSENSITIVE) == 0)
+ {
+ NFAClassNode * clazz = new NFAClassNode();
+ bkstring s = parseClass();
+ for (int i = 0; i < (int)s.size(); ++i) clazz->vals[s[i]] = 1;
+ next = registerNode(clazz);
+ }
+ else
+ {
+ NFACIClassNode * clazz = new NFACIClassNode();
+ bkstring s = parseClass();
+ for (int i = 0; i < (int)s.size(); ++i) clazz->vals[to_lower(s[i])] = 1;
+ next = registerNode(clazz);
+ }
+ break;
+ case '.':
+ {
+ bool useN = 1, useR = 1;
+ NFAClassNode * clazz = new NFAClassNode(1);
+ if ((flags & Pattern::UNIX_LINE_MODE) != 0) useR = 0;
+ if ((flags & Pattern::DOT_MATCHES_ALL) != 0) useN = useR = 0;
+ if (useN) clazz->vals['\n'] = 1;
+ if (useR) clazz->vals['\r'] = 1;
+ next = registerNode(clazz);
+ }
+ break;
+ case '(':
+ {
+ NFANode * end, * t1, * t2;
+ t1 = parse(1, 0, &end);
+ if (!t1) raiseError();
+ else if (t1->isGroupHeadNode() && (t2 = quantifyGroup(t1, end, grc)) != NULL)
+ {
+ cur->next = t2;
+ cur = t2->next;
+ }
+ else
+ {
+ cur->next = t1;
+ cur = end;
+ }
+ }
+ break;
+ case ')':
+ if (!inParen) raiseError();
+ else if (inOr)
+ {
+ --curInd;
+ cur = cur->next = registerNode(new NFAAcceptNode);
+ flags = oldFlags;
+ return start;
+ }
+ else
+ {
+ if (ahead)
+ {
+ cur = cur->next = registerNode(new NFAAcceptNode);
+ flags = oldFlags;
+ return *end = registerNode(new NFALookAheadNode(start, pos));
+ }
+ else if (indep)
+ {
+ cur = cur->next = registerNode(new NFAAcceptNode);
+ flags = oldFlags;
+ return *end = registerNode(new NFAPossessiveQuantifierNode(this, start, 1, 1));
+ }
+ else // capping or noncapping, it doesnt matter
+ {
+ *end = cur = cur->next = registerNode(new NFAGroupTailNode(grc));
+ next = quantifyGroup(start, *end, grc);
+ if (next)
+ {
+ start = next;
+ *end = next->next;
+ }
+ flags = oldFlags;
+ return start;
+ }
+ }
+ break;
+ case '{': // registered pattern
+ cur->next = parseRegisteredPattern(&next);
+ if (cur->next) cur = next;
+ break;
+ case '*':
+ case '+':
+ case '?':
+// case '}':
+// case ']':
+ raiseError();
+ break;
+ default:
+ if ((flags & Pattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACICharNode(ch));
+ else next = registerNode(new NFACharNode(ch));
+ break;
+ }
+ if (next)
+ {
+ cur = cur->next = quantify(next);
+ }
+ }
+ if (inParen) raiseError();
+ else
+ {
+ if (inOr) cur = cur->next = registerNode(new NFAAcceptNode);
+ if (end) *end = cur;
+ }
+
+ flags = oldFlags;
+ if (error) return NULL;
+
+ return start;
+}
+
+Pattern * Pattern::compile(const bkstring & pattern, const unsigned long mode)
+{
+ Pattern * p = new Pattern(pattern);
+ NFANode * end;
+
+ p->flags = mode;
+ if ((mode & Pattern::LITERAL) != 0)
+ {
+ p->head = p->registerNode(new NFAStartNode);
+ if ((mode & Pattern::CASE_INSENSITIVE) != 0) p->head->next = p->registerNode(new NFACIQuoteNode(pattern));
+ else p->head->next = p->registerNode(new NFAQuoteNode(pattern));
+ p->head->next->next = p->registerNode(new NFAEndNode);
+ }
+ else
+ {
+ p->head = p->parse(0, 0, &end);
+ if (!p->head)
+ {
+ delete p;
+ p = NULL;
+ }
+ else
+ {
+ if (!(p->head && p->head->isStartOfInputNode()))
+ {
+ NFANode * n = p->registerNode(new NFAStartNode);
+ n->next = p->head;
+ p->head = n;
+ }
+ end->next = p->registerNode(new NFAEndNode);
+ }
+ }
+ if (p != NULL)
+ {
+ p->matcher = new Matcher(p, "");
+ }
+
+ return p;
+}
+
+Pattern * Pattern::compileAndKeep(const bkstring & pattern, const unsigned long mode)
+{
+ Pattern * ret = NULL;
+ std::map<bkstring, Pattern*>::iterator it = compiledPatterns.find(pattern);
+
+ if (it != compiledPatterns.end())
+ {
+ ret = it->second;
+ }
+ else
+ {
+ ret = compile(pattern, mode);
+ compiledPatterns[pattern] = ret;
+ }
+
+ return ret;
+}
+bkstring Pattern::replace(const bkstring & pattern, const bkstring & str,
+ const bkstring & replacementText, const unsigned long mode)
+{
+ bkstring ret;
+ Pattern * p = Pattern::compile(pattern, mode);
+ if (p)
+ {
+ ret = p->replace(str, replacementText);
+ delete p;
+ }
+ return ret;
+}
+
+std::vector<bkstring> Pattern::split(const bkstring & pattern, const bkstring & str, const bool keepEmptys,
+ const unsigned long limit, const unsigned long mode)
+{
+ std::vector<bkstring> ret;
+ Pattern * p = Pattern::compile(pattern, mode);
+ if (p)
+ {
+ ret = p->split(str, keepEmptys, limit);
+ delete p;
+ }
+ return ret;
+}
+
+std::vector<bkstring> Pattern::findAll(const bkstring & pattern, const bkstring & str, const unsigned long mode)
+{
+ std::vector<bkstring> ret;
+ Pattern * p = Pattern::compile(pattern, mode);
+ if (p)
+ {
+ ret = p->findAll(str);
+ delete p;
+ }
+ return ret;
+}
+
+bool Pattern::matches(const bkstring & pattern, const bkstring & str, const unsigned long mode)
+{
+ bool ret = 0;
+ Pattern * p = compile(pattern, mode);
+
+ if (p)
+ {
+ ret = p->matches(str);
+ delete p;
+ }
+
+ return ret;
+}
+
+bool Pattern::registerPattern(const bkstring & name, const bkstring & pattern, const unsigned long mode)
+{
+ Pattern * p = Pattern::compile(pattern, mode);
+ if (!p) return 0;
+ Pattern::registeredPatterns[name] = std::make_pair(pattern, mode);
+ delete p;
+ return 1;
+}
+
+void Pattern::unregisterPatterns()
+{
+ registeredPatterns.clear();
+}
+void Pattern::clearPatternCache()
+{
+ std::map<bkstring, Pattern*>::iterator it;
+ for (it = compiledPatterns.begin(); it != compiledPatterns.end(); ++it)
+ {
+ delete it->second;
+ }
+ compiledPatterns.clear();
+}
+
+std::pair<bkstring, int> Pattern::findNthMatch(const bkstring & pattern, const bkstring & str,
+ const int matchNum, const unsigned long mode)
+{
+ std::pair<bkstring, int> ret;
+ Pattern * p = Pattern::compile(pattern, mode);
+
+ ret.second = -1;
+ if (p)
+ {
+ int i = -1;
+ p->matcher->setString(str);
+ while (i < matchNum && p->matcher->findNextMatch()) { ++i; }
+ if (i == matchNum && p->matcher->getStartingIndex() >= 0)
+ {
+ ret.first = p->matcher->getGroup(0);
+ ret.second = p->matcher->getStartingIndex();
+ }
+ delete p;
+ }
+
+ return ret;
+}
+
+Pattern::~Pattern()
+{
+ if (matcher) delete matcher;
+ for (std::map<NFANode*, bool>::iterator it = nodes.begin(); it != nodes.end(); ++it)
+ {
+ delete it->first;
+ }
+}
+bkstring Pattern::replace(const bkstring & str, const bkstring & replacementText)
+{
+ int li = 0;
+ bkstring ret = "";
+
+ matcher->setString(str);
+ while (matcher->findNextMatch())
+ {
+ ret += str.substr(li, matcher->getStartingIndex() - li);
+ ret += matcher->replaceWithGroups(replacementText);
+ li = matcher->getEndingIndex();
+ }
+ ret += str.substr(li);
+
+ return ret;
+}
+std::vector<bkstring> Pattern::split(const bkstring & str, const bool keepEmptys, const unsigned long limit)
+{
+ unsigned long lim = (limit == 0 ? MAX_QMATCH : limit);
+ int li = 0;
+ std::vector<bkstring> ret;
+
+ matcher->setString(str);
+
+ while (matcher->findNextMatch() && ret.size() < lim)
+ {
+ if (matcher->getStartingIndex() == 0 && keepEmptys) ret.push_back("");
+ if ((matcher->getStartingIndex() != matcher->getEndingIndex()) || keepEmptys)
+ {
+ if (li != matcher->getStartingIndex() || keepEmptys)
+ {
+ ret.push_back(str.substr(li, matcher->getStartingIndex() - li));
+ }
+ li = matcher->getEndingIndex();
+ }
+ }
+ if (li < (int)str.size()) ret.push_back(str.substr(li));
+
+ return ret;
+}
+std::vector<bkstring> Pattern::findAll(const bkstring & str)
+{
+ matcher->setString(str);
+ return matcher->findAll();
+}
+bool Pattern::matches(const bkstring & str)
+{
+ matcher->setString(str);
+ return matcher->matches();
+}
+unsigned long Pattern::getFlags() const
+{
+ return flags;
+}
+bkstring Pattern::getPattern() const
+{
+ return pattern;
+}
+Matcher * Pattern::createMatcher(const bkstring & str)
+{
+ return new Matcher(this, str);
+}
+
+// NFANode
+
+NFANode::NFANode() { next = NULL; }
+NFANode::~NFANode() { }
+void NFANode::findAllNodes(std::map<NFANode*, bool> & soFar)
+{
+ if (soFar.find(this) == soFar.end()) return;
+ soFar[this] = 1;
+ if (next) next->findAllNodes(soFar);
+}
+
+// NFACharNode
+
+NFACharNode::NFACharNode(const char c) { ch = c; }
+int NFACharNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (curInd < (int)str.size() && str[curInd] == ch) return next->match(str, matcher, curInd + 1);
+ return -1;
+}
+
+// NFACICharNode
+
+NFACICharNode::NFACICharNode(const char c) { ch = to_lower(c); }
+int NFACICharNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (curInd < (int)str.size() && to_lower(str[curInd]) == ch) return next->match(str, matcher, curInd + 1);
+ return -1;
+}
+
+// NFAStartNode
+
+NFAStartNode::NFAStartNode() { }
+int NFAStartNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int ret = -1, ci = curInd;
+
+ matcher->starts[0] = curInd;
+ if ((matcher->getFlags() & Matcher::MATCH_ENTIRE_STRING) == (unsigned int)Matcher::MATCH_ENTIRE_STRING)
+ {
+ if (curInd != 0)
+ {
+ matcher->starts[0] = -1;
+ return -1;
+ }
+ return next->match(str, matcher, 0);
+ }
+ while ((ret = next->match(str, matcher, ci)) == -1 && ci < (int)str.size())
+ {
+ matcher->clearGroups();
+ matcher->starts[0] = ++ci;
+ }
+ if (ret < 0) matcher->starts[0] = -1;
+ return ret;
+}
+
+// NFAEndNode
+
+NFAEndNode::NFAEndNode() { }
+int NFAEndNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ matcher->ends[0] = curInd;
+ if ((matcher->getFlags() & Matcher::MATCH_ENTIRE_STRING) != 0)
+ {
+ if (curInd == (int)str.size()) return curInd;
+ matcher->ends[0] = -1;
+ return -1;
+ }
+ return curInd;
+}
+
+// NFAQuantifierNode
+
+void NFAQuantifierNode::findAllNodes(std::map<NFANode*, bool> & soFar)
+{
+ inner->findAllNodes(soFar);
+ NFANode::findAllNodes(soFar);
+}
+NFAQuantifierNode::NFAQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)
+{
+ inner = internal;
+ inner->next = pat->registerNode(new NFAAcceptNode);
+ min = (minMatch < Pattern::MIN_QMATCH) ? Pattern::MIN_QMATCH : minMatch;
+ max = (maxMatch > Pattern::MAX_QMATCH) ? Pattern::MAX_QMATCH : maxMatch;
+}
+
+int NFAQuantifierNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int i0, i1, i2 = 0;
+
+ i0 = i1 = curInd;
+ while (i2 < min)
+ {
+
+ ++i2;
+ i1 = inner->match(str, matcher, i0);
+ if (i1 <= i0) return i1; // i1 < i0 means i1 is -1
+ i0 = i1;
+ }
+
+ return i1;
+}
+// NFAGreedyQuantifierNode
+
+NFAGreedyQuantifierNode::NFAGreedyQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)
+ : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { }
+int NFAGreedyQuantifierNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int t = NFAQuantifierNode::match(str, matcher, curInd);
+ if (t != -1) return matchInternal(str, matcher, t, min);
+ return t;
+}
+int NFAGreedyQuantifierNode::matchInternal(const bkstring & str, Matcher * matcher, const int curInd, const int soFar) const
+{
+ if (soFar >= max) return next->match(str, matcher, curInd);
+
+ int i, j;
+
+ i = inner->match(str, matcher, curInd);
+ if (i != -1)
+ {
+ j = matchInternal(str, matcher, i, soFar + 1);
+ if (j != -1) return j;
+ }
+ return next->match(str, matcher, curInd);
+}
+
+// NFALazyQuantifierNode
+
+NFALazyQuantifierNode::NFALazyQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)
+ : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { }
+int NFALazyQuantifierNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int i, j, m = NFAQuantifierNode::match(str, matcher, curInd);
+
+ if (m == -1) return -1;
+
+ for (i = min; i < max; ++i)
+ {
+ j = next->match(str, matcher, m);
+ if (j == -1)
+ {
+ j = inner->match(str, matcher, m);
+ // if j < m, then j is -1, so we bail.
+ // if j == m, then we would just go and call next->match on the same index,
+ // but it already failed trying to match right there, so we know we can
+ // just bail
+ if (j <= m) return -1;
+ m = j;
+ }
+ else return j;
+ }
+ return next->match(str, matcher, m);
+}
+
+// NFAPossessiveQuantifierNode
+
+NFAPossessiveQuantifierNode::NFAPossessiveQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)
+ : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { }
+int NFAPossessiveQuantifierNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int i, j, m = NFAQuantifierNode::match(str, matcher, curInd);
+
+ if (m == -1) return -1;
+ for (i = min; i < max; ++i)
+ {
+ j = inner->match(str, matcher, m);
+ if (j <= m) return next->match(str, matcher, m);
+ m = j;
+ }
+ return next->match(str, matcher, m);
+}
+
+// NFAAcceptNode
+
+NFAAcceptNode::NFAAcceptNode() { }
+int NFAAcceptNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (!next) return curInd;
+ else return next->match(str, matcher, curInd);
+}
+
+// NFAClassNode
+
+NFAClassNode::NFAClassNode(const bool invert)
+{
+ inv = invert;
+}
+NFAClassNode::NFAClassNode(const bkstring & clazz, const bool invert)
+{
+ inv = invert;
+ for (int i = 0; i < (int)clazz.size(); ++i) vals[clazz[i]] = 1;
+}
+int NFAClassNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (curInd < (int)str.size() && ((vals.find(str[curInd]) != vals.end()) ^ inv))
+ {
+ return next->match(str, matcher, curInd + 1);
+ }
+ return -1;
+}
+
+// NFACIClassNode
+
+NFACIClassNode::NFACIClassNode(const bool invert)
+{
+ inv = invert;
+}
+NFACIClassNode::NFACIClassNode(const bkstring & clazz, const bool invert)
+{
+ inv = invert;
+ for (int i = 0; i < (int)clazz.size(); ++i) vals[to_lower(clazz[i])] = 1;
+}
+int NFACIClassNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (curInd < (int)str.size() && ((vals.find(to_lower(str[curInd])) != vals.end()) ^ inv))
+ {
+ return next->match(str, matcher, curInd + 1);
+ }
+ return -1;
+}
+
+// NFASubStartNode
+
+NFASubStartNode::NFASubStartNode() { }
+int NFASubStartNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ return next->match(str, matcher, curInd);
+}
+
+// NFAOrNode
+
+NFAOrNode::NFAOrNode(NFANode * first, NFANode * second) : one(first), two(second) { }
+void NFAOrNode::findAllNodes(std::map<NFANode*, bool> & soFar)
+{
+ if (one) one->findAllNodes(soFar);
+ if (two) two->findAllNodes(soFar);
+ NFANode::findAllNodes(soFar);
+}
+int NFAOrNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int ci = one->match(str, matcher, curInd);
+
+ if (ci != -1) ci = next->match(str, matcher, ci);
+ if (ci != -1) return ci;
+ if (ci == -1) ci = two->match(str, matcher, curInd);
+ if (ci != -1) ci = next->match(str, matcher, ci);
+ return ci;
+}
+
+// NFAQuoteNode
+
+NFAQuoteNode::NFAQuoteNode(const bkstring & quoted) : qStr(quoted) { }
+int NFAQuoteNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (curInd + qStr.size() > str.size()) return -1;
+ if (str.substr(curInd, qStr.size()) != qStr) return -1;
+ return next->match(str, matcher, curInd + qStr.size());
+}
+
+// NFACIQuoteNode
+
+NFACIQuoteNode::NFACIQuoteNode(const bkstring & quoted) : qStr(quoted) { }
+int NFACIQuoteNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (curInd + qStr.size() > str.size()) return -1;
+ if (str_icmp(str.substr(curInd, qStr.size()).c_str(), qStr.c_str())) return -1;
+ return next->match(str, matcher, qStr.size());
+}
+
+// NFALookAheadNode
+
+NFALookAheadNode::NFALookAheadNode(NFANode * internal, const bool positive) : NFANode(), pos(positive), inner(internal) { }
+void NFALookAheadNode::findAllNodes(std::map<NFANode*, bool> & soFar)
+{
+ if (inner) inner->findAllNodes(soFar);
+ NFANode::findAllNodes(soFar);
+}
+int NFALookAheadNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ return ((inner->match(str, matcher, curInd) == -1) ^ pos) ? next->match(str, matcher, curInd) : -1;
+}
+
+// NFALookBehindNode
+
+NFALookBehindNode::NFALookBehindNode(const bkstring & str, const bool positive) : pos(positive), mStr(str) { }
+int NFALookBehindNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (pos)
+ {
+ if (curInd < (int)mStr.size()) return -1;
+ if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return next->match(str, matcher, curInd);
+ }
+ else
+ {
+ if (curInd < (int)mStr.size()) return next->match(str, matcher, curInd);
+ if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return -1;
+ return next->match(str, matcher, curInd);
+ }
+ return -1;
+}
+
+// NFAStartOfLineNode
+
+NFAStartOfLineNode::NFAStartOfLineNode() { }
+int NFAStartOfLineNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (curInd == 0 || str[curInd - 1] == '\n' || str[curInd - 1] == '\r')
+ {
+ return next->match(str, matcher, curInd);
+ }
+ return -1;
+}
+
+// NFAEndOfLineNode
+
+NFAEndOfLineNode::NFAEndOfLineNode() { }
+int NFAEndOfLineNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (curInd >= (int)str.size() || str[curInd] == '\n' || str[curInd] == '\r')
+ {
+ return next->match(str, matcher, curInd);
+ }
+ return -1;
+}
+
+// NFAReferenceNode
+
+NFAReferenceNode::NFAReferenceNode(const int groupIndex) : gi(groupIndex) { }
+int NFAReferenceNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int len = matcher->ends[gi] - matcher->starts[gi];
+ int ni = -1;
+ if (gi < 1 || matcher->ends[gi] < matcher->starts[gi] || len == 0) ni = curInd;
+ else if (curInd + len > (int)str.size()) return -1;
+ else if (str.substr(curInd, len) != str.substr(matcher->starts[gi], len)) return -1;
+ else ni = curInd + len;
+
+ return next->match(str, matcher, ni);
+}
+
+// NFAStartOfInputNode
+
+NFAStartOfInputNode::NFAStartOfInputNode() { }
+int NFAStartOfInputNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (curInd == 0) return next->match(str, matcher, curInd);
+ return -1;
+}
+
+// NFAEndOfInputNode
+
+NFAEndOfInputNode::NFAEndOfInputNode(const bool lookForTerm) : term(lookForTerm) { }
+int NFAEndOfInputNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int len = (int)str.size();
+ if (curInd == len) return next->match(str, matcher, curInd);
+ else if (term)
+ {
+ if (curInd == len - 1 && (str[curInd] == '\r' || str[curInd] == '\n'))
+ {
+ return next->match(str, matcher, curInd);
+ }
+ else if (curInd == len - 2 && str.substr(curInd, 2) == "\r\n")
+ {
+ return next->match(str, matcher, curInd);
+ }
+ }
+ return -1;
+}
+
+// NFAWordBoundaryNode
+
+NFAWordBoundaryNode::NFAWordBoundaryNode(const bool positive) : pos(positive) { }
+int NFAWordBoundaryNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int len = (int)str.size();
+
+ char c1 = (curInd - 1 < len && curInd > 0) ? str[curInd - 1] : '\n';
+ char c2 = (curInd < len) ? str[curInd ] : '\n';
+
+ if (curInd == len) return next->match(str, matcher, curInd);
+ bool ok = is_alpha(c1) != is_alpha(c2);
+ if (ok && pos) return next->match(str, matcher, curInd);
+ return -1;
+}
+
+// NFAEndOfMatchNode
+
+NFAEndOfMatchNode::NFAEndOfMatchNode() { }
+int NFAEndOfMatchNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ if (curInd == matcher->lm) return next->match(str, matcher, curInd);
+ return -1;
+}
+
+// NFAGroupHeadNode
+
+NFAGroupHeadNode::NFAGroupHeadNode(const int groupIndex) : gi(groupIndex) { }
+int NFAGroupHeadNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int ret, o = matcher->starts[gi];
+
+ matcher->starts[gi] = curInd;
+ ret = next->match(str, matcher, curInd);
+ if (ret < 0) matcher->starts[gi] = o;
+
+ return ret;
+}
+
+// NFAGroupTailNode
+
+NFAGroupTailNode::NFAGroupTailNode(const int groupIndex) : gi(groupIndex) { }
+int NFAGroupTailNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int ret, o = matcher->ends[gi];
+
+ matcher->ends[gi] = curInd;
+ ret = next->match(str, matcher, curInd);
+ if (ret < 0) matcher->ends[gi] = o;
+
+ return ret;
+}
+
+// NFAGroupLoopPrologueNode
+
+NFAGroupLoopPrologueNode::NFAGroupLoopPrologueNode(const int groupIndex) : gi(groupIndex) { }
+int NFAGroupLoopPrologueNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int ret, o1 = matcher->groups[gi], o2 = matcher->groupPos[gi], o3 = matcher->groupIndeces[gi];
+
+ matcher->groups[gi] = 0;
+ matcher->groupPos[gi] = 0;
+ matcher->groupIndeces[gi] = -1;
+ ret = next->match(str, matcher, curInd);
+ if (ret < 0)
+ {
+ matcher->groups[gi] = o1;
+ matcher->groupPos[gi] = o2;
+ matcher->groupIndeces[gi] = o3;
+ }
+
+ return ret;
+}
+
+// NFAGroupLoopNode
+
+NFAGroupLoopNode::NFAGroupLoopNode(NFANode * internal, const int minMatch, const int maxMatch,
+ const int groupIndex, const int matchType)
+{
+ inner = internal;
+ min = minMatch;
+ max = maxMatch;
+ gi = groupIndex;
+ type = matchType;
+}
+void NFAGroupLoopNode::findAllNodes(std::map<NFANode*, bool> & soFar)
+{
+ if (inner) inner->findAllNodes(soFar);
+ NFANode::findAllNodes(soFar);
+}
+int NFAGroupLoopNode::match(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ bool b = (curInd > matcher->groupIndeces[gi]);
+
+ if (b && matcher->groups[gi] < min)
+ {
+ ++matcher->groups[gi];
+ int o = matcher->groupIndeces[gi];
+ matcher->groupIndeces[gi] = curInd;
+ int ret = inner->match(str, matcher, curInd);
+ if (ret < 0)
+ {
+ matcher->groupIndeces[gi] = o;
+ --matcher->groups[gi];
+ }
+ return ret;
+ }
+ else if (!b || matcher->groups[gi] >= max)
+ {
+ return next->match(str, matcher, curInd);
+ }
+ else
+ {
+ switch (type)
+ {
+ case 0: return matchGreedy(str, matcher, curInd);
+ case 1: return matchLazy(str, matcher, curInd);
+ case 2: return matchPossessive(str, matcher, curInd);
+ }
+ }
+ return -1;
+}
+int NFAGroupLoopNode::matchGreedy(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int o = matcher->groupIndeces[gi]; // save our info for backtracking
+ matcher->groupIndeces[gi] = curInd; // move along
+ ++matcher->groups[gi];
+ int ret = inner->match(str, matcher, curInd); // match internally
+ if (ret < 0)
+ { // if we failed, then restore info and match next
+ --matcher->groups[gi];
+ matcher->groupIndeces[gi] = o;
+ ret = next->match(str, matcher, curInd);
+ }
+ return ret;
+}
+int NFAGroupLoopNode::matchLazy(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int ret = next->match(str, matcher, curInd); // be lazy, just go on
+ if (ret < 0)
+ {
+ int o = matcher->groupIndeces[gi]; // save info for backtracking
+ matcher->groupIndeces[gi] = curInd; // advance our position
+ ++matcher->groups[gi];
+ ret = inner->match(str, matcher, curInd); // match our internal stuff
+ if (ret < 0) // if we failed, then restore the info
+ {
+ --matcher->groups[gi];
+ matcher->groupIndeces[gi] = o;
+ }
+ }
+ return ret;
+}
+int NFAGroupLoopNode::matchPossessive(const bkstring & str, Matcher * matcher, const int curInd) const
+{
+ int o = matcher->groupIndeces[gi]; // save info for backtracking
+ matcher->groupPos[gi] = matcher->groups[gi]; // set a flag stating we have matcher at least this much
+ matcher->groupIndeces[gi] = curInd; // move along
+ ++matcher->groups[gi];
+ int ret = inner->match(str, matcher, curInd); // try and match again
+ if (ret < 0)
+ { // if we fail, back off, but to an extent
+ --matcher->groups[gi];
+ matcher->groupIndeces[gi] = o;
+ if (matcher->groups[gi] == matcher->groupPos[gi]) ret = next->match(str, matcher, curInd);
+ }
+ return ret;
+}
+
+#ifdef _WIN32
+ #pragma warning(pop)
+#endif
diff --git a/plugins/smileyadd/regexp/Pattern.h b/plugins/smileyadd/regexp/Pattern.h new file mode 100644 index 0000000000..bb16ad90fa --- /dev/null +++ b/plugins/smileyadd/regexp/Pattern.h @@ -0,0 +1,1663 @@ +#ifndef __PATTERN_H__
+#define __PATTERN_H__
+
+#ifdef _WIN32
+ #pragma warning(disable:4786)
+#endif
+
+#include <vector>
+#include <map>
+
+#include "bkstring.h"
+
+class Matcher;
+class NFANode;
+class NFAQuantifierNode;
+
+/**
+ This pattern class is very similar in functionality to Java's
+ java.util.regex.Pattern class. The pattern class represents an immutable
+ regular expression object. Instead of having a single object contain both the
+ regular expression object and the matching object, instead the two objects are
+ split apart. The {@link Matcher Matcher} class represents the maching
+ object.
+
+ The Pattern class works primarily off of "compiled" patterns. A typical
+ instantiation of a regular expression looks like:
+
+ <pre>
+ Pattern * p = Pattern::compile("a*b");
+ Matcher * m = p->createMatcher("aaaaaab");
+ if (m->matches()) ...
+ </pre>
+
+ However, if you do not need to use a pattern more than once, it is often times
+ okay to use the Pattern's static methods insteads. An example looks like this:
+
+ <pre>
+ if (Pattern::matches("a*b", "aaaab")) { ... }
+ </pre>
+
+ This class does not currently support unicode. The unicode update for this
+ class is coming soon.
+
+ This class is partially immutable. It is completely safe to call createMatcher
+ concurrently in different threads, but the other functions (e.g. split) should
+ not be called concurrently on the same <code>Pattern</code>.
+
+ <table border="0" cellpadding="1" cellspacing="0">
+ <tr align="left" bgcolor="#CCCCFF">
+ <td>
+ <b>Construct</b>
+ </td>
+ <td>
+ <b>Matches</b>
+ </th>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Characters</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x</i></code>
+ </td>
+ <td>
+ The character <code><i>x</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\\</code>
+ </td>
+ <td>
+ The character <code>\</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\0<i>nn</i></code>
+ </td>
+ <td>
+ The character with octal ASCII value <code><i>nn</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\0<i>nnn</i></code>
+ </td>
+ <td>
+ The character with octal ASCII value <code><i>nnn</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\x<i>hh</i></code>
+ </td>
+ <td>
+ The character with hexadecimal ASCII value <code><i>hh</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\t</code>
+ </td>
+ <td>
+ A tab character
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\r</code>
+ </td>
+ <td>
+ A carriage return character
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\n</code>
+ </td>
+ <td>
+ A new-line character
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <b>Character Classes</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[abc]</code>
+ </td>
+ <td>
+ Either <code>a</code>, <code>b</code>, or <code>c</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[^abc]</code>
+ </td>
+ <td>
+ Any character but <code>a</code>, <code>b</code>, or <code>c</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-zA-Z]</code>
+ </td>
+ <td>
+ Any character ranging from <code>a</code> thru <code>z</code>, or
+ <code>A</code> thru <code>Z</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[^a-zA-Z]</code>
+ </td>
+ <td>
+ Any character except those ranging from <code>a</code> thru
+ <code>z</code>, or <code>A</code> thru <code>Z</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a\-z]</code>
+ </td>
+ <td>
+ Either <code>a</code>, <code>-</code>, or <code>z</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-z[A-Z]]</code>
+ </td>
+ <td>
+ Same as <code>[a-zA-Z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-z&&[g-i]]</code>
+ </td>
+ <td>
+ Any character in the intersection of <code>a-z</code> and
+ <code>g-i</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-z&&[^g-i]]</code>
+ </td>
+ <td>
+ Any character in <code>a-z</code> and not in <code>g-i</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Prefefined character classes</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><b>.</b></code>
+ </td>
+ <td>
+ Any character. Multiline matching must be compiled into the pattern for
+ <code><b>.</b></code> to match a <code>\r</code> or a <code>\n</code>.
+ Even if multiline matching is enabled, <code><b>.</b></code> will not
+ match a <code>\r\n</code>, only a <code>\r</code> or a <code>\n</code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\d</code>
+ </td>
+ <td>
+ <code>[0-9]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\D</code>
+ </td>
+ <td>
+ <code>[^\d]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\s</code>
+ </td>
+ <td>
+ <code>[ \t\r\n\x0B]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\S</code>
+ </td>
+ <td>
+ <code>[^\s]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\w</code>
+ </td>
+ <td>
+ <code>[a-zA-Z0-9_]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\W</code>
+ </td>
+ <td>
+ <code>[^\w]</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>POSIX character classes
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Lower}</code>
+ </td>
+ <td>
+ <code>[a-z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Upper}</code>
+ </td>
+ <td>
+ <code>[A-Z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{ASCII}</code>
+ </td>
+ <td>
+ <code>[\x00-\x7F]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Alpha}</code>
+ </td>
+ <td>
+ <code>[a-zA-Z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Digit}</code>
+ </td>
+ <td>
+ <code>[0-9]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Alnum}</code>
+ </td>
+ <td>
+ <code>[\w&&[^_]]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Punct}</code>
+ </td>
+ <td>
+ <code>[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{XDigit}</code>
+ </td>
+ <td>
+ <code>[a-fA-F0-9]</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Boundary Matches</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>^</code>
+ </td>
+ <td>
+ The beginning of a line. Also matches the beginning of input.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>$</code>
+ </td>
+ <td>
+ The end of a line. Also matches the end of input.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\b</code>
+ </td>
+ <td>
+ A word boundary
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\B</code>
+ </td>
+ <td>
+ A non word boundary
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\A</code>
+ </td>
+ <td>
+ The beginning of input
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\G</code>
+ </td>
+ <td>
+ The end of the previous match. Ensures that a "next" match will only
+ happen if it begins with the character immediately following the end of
+ the "current" match.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\Z</code>
+ </td>
+ <td>
+ The end of input. Will also match if there is a single trailing
+ <code>\r\n</code>, a single trailing <code>\r</code>, or a single
+ trailing <code>\n</code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\z</code>
+ </td>
+ <td>
+ The end of input
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Greedy Quantifiers</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x?</i></code>
+ </td>
+ <td>
+ <i>x</i>, either zero times or one time
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x*</i></code>
+ </td>
+ <td>
+ <i>x</i>, zero or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x+</i></code>
+ </td>
+ <td>
+ <i>x</i>, one or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n}</i></code>
+ </td>
+ <td>
+ <i>x</i>, exactly n times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,}</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{,m}</i></code>
+ </td>
+ <td>
+ <i>x</i>, at most <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,m}</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times and at most
+ <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Possessive Quantifiers</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x?+</i></code>
+ </td>
+ <td>
+ <i>x</i>, either zero times or one time
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x*+</i></code>
+ </td>
+ <td>
+ <i>x</i>, zero or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x++</i></code>
+ </td>
+ <td>
+ <i>x</i>, one or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, exactly n times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{,m}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, at most <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,m}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times and at most
+ <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Reluctant Quantifiers</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x??</i></code>
+ </td>
+ <td>
+ <i>x</i>, either zero times or one time
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x*?</i></code>
+ </td>
+ <td>
+ <i>x</i>, zero or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x+?</i></code>
+ </td>
+ <td>
+ <i>x</i>, one or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, exactly n times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{,m}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, at most <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,m}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times and at most
+ <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Operators</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>xy</i></code>
+ </td>
+ <td>
+ <code><i>x</i></code> then <code><i>y</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x</i></code>|<code><i>y</i></code>
+ </td>
+ <td>
+ <code><i>x</i></code> or <code><i>y</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code> as a capturing group
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Quoting</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\Q</code>
+ </td>
+ <td>
+ Nothing, but treat every character (including \s) literally until a
+ matching <code>\E</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\E</code>
+ </td>
+ <td>
+ Nothing, but ends its matching <code>\Q</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Special Constructs</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?:<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, but not as a capturing group
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?=<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via positive lookahead. This means that the
+ expression will match only if it is trailed by <code><i>x</i></code>.
+ It will not "eat" any of the characters matched by
+ <code><i>x</i></code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?!<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via negative lookahead. This means that the
+ expression will match only if it is not trailed by
+ <code><i>x</i></code>. It will not "eat" any of the characters
+ matched by <code><i>x</i></code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?<=<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via positive lookbehind. <code><i>x</i></code>
+ cannot contain any quantifiers.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?<!<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via negative lookbehind. <code><i>x</i></code>
+ cannot contain any quantifiers.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?><i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i>{1}+</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Registered Expression Matching</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>{<i>x</i>}</code>
+ </td>
+ <td>
+ The registered pattern <code><i>x</i></code>
+ </td>
+ </tr>
+ </table>
+
+ <hr>
+
+ <i>Begin Text Extracted And Modified From java.util.regex.Pattern documentation</i>
+
+ <h4> Backslashes, escapes, and quoting </h4>
+
+ <p> The backslash character (<tt>'\'</tt>) serves to introduce escaped
+ constructs, as defined in the table above, as well as to quote characters
+ that otherwise would be interpreted as unescaped constructs. Thus the
+ expression <tt>\\</tt> matches a single backslash and <tt>\{</tt> matches a
+ left brace.
+
+ <p> It is an error to use a backslash prior to any alphabetic character that
+ does not denote an escaped construct; these are reserved for future
+ extensions to the regular-expression language. A backslash may be used
+ prior to a non-alphabetic character regardless of whether that character is
+ part of an unescaped construct.
+
+ <p>It is necessary to double backslashes in string literals that represent
+ regular expressions to protect them from interpretation by a compiler. The
+ string literal <tt>"\b"</tt>, for example, matches a single backspace
+ character when interpreted as a regular expression, while
+ <tt>"\\b"</tt> matches a word boundary. The string litera
+ <tt>"\(hello\)"</tt> is illegal and leads to a compile-time error;
+ in order to match the string <tt>(hello)</tt> the string literal
+ <tt>"\\(hello\\)"</tt> must be used.
+
+ <h4> Character Classes </h4>
+
+ <p> Character classes may appear within other character classes, and
+ may be composed by the union operator (implicit) and the intersection
+ operator (<tt>&&</tt>).
+ The union operator denotes a class that contains every character that is
+ in at least one of its operand classes. The intersection operator
+ denotes a class that contains every character that is in both of its
+ operand classes.
+
+ <p> The precedence of character-class operators is as follows, from
+ highest to lowest:
+
+ <blockquote><table border="0" cellpadding="1" cellspacing="0"
+ summary="Precedence of character class operators.">
+
+ <tr><th>1 </th>
+ <td>Literal escape </td>
+ <td><tt>\x</tt></td></tr>
+ <tr><th>2 </th>
+ <td>Range</td>
+ <td><tt>a-z</tt></td></tr>
+ <tr><th>3 </th>
+ <td>Grouping</td>
+ <td><tt>[...]</tt></td></tr>
+ <tr><th>4 </th>
+ <td>Intersection</td>
+ <td><tt>[a-z&&[aeiou]]</tt></td></tr>
+ <tr><th>5 </th>
+ <td>Union</td>
+ <td><tt>[a-e][i-u]<tt></td></tr>
+ </table></blockquote>
+
+ <p> Note that a different set of metacharacters are in effect inside
+ a character class than outside a character class. For instance, the
+ regular expression <tt>.</tt> loses its special meaning inside a
+ character class, while the expression <tt>-</tt> becomes a range
+ forming metacharacter.
+
+ <a name="lt">
+
+ <a name="cg">
+ <h4> Groups and capturing </h4>
+
+ <p> Capturing groups are numbered by counting their opening parentheses from
+ left to right. In the expression <tt>((A)(B(C)))</tt>, for example, there
+ are four such groups: </p>
+
+ <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
+
+ <tr><th>1 </th>
+ <td><tt>((A)(B(C)))</tt></td></tr>
+ <tr><th>2 </th>
+ <td><tt>(A)</tt></td></tr>
+ <tr><th>3 </th>
+ <td><tt>(B(C))</tt></td></tr>
+
+ <tr><th>4 </th>
+ <td><tt>(C)</tt></td></tr>
+ </table></blockquote>
+
+ <p> Group zero always stands for the entire expression.
+
+ <p> Capturing groups are so named because, during a match, each subsequence
+ of the input sequence that matches such a group is saved. The captured
+ subsequence may be used later in the expression, via a back reference, and
+ may also be retrieved from the matcher once the match operation is complete.
+
+ <p> The captured input associated with a group is always the subsequence
+ that the group most recently matched. If a group is evaluated a second time
+ because of quantification then its previously-captured value, if any, will
+ be retained if the second evaluation fails. Matching the string
+ <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
+ group two set to <tt>"b"</tt>. All captured input is discarded at the
+ beginning of each match.
+
+ <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
+ that do not capture text and do not count towards the group total.
+
+
+ <h4> Unicode support </h4>
+
+ <p> Coming Soon.
+
+ <h4> Comparison to Perl 5 </h4>
+
+ <p>The <code>Pattern</code> engine performs traditional NFA-based matching
+ with ordered alternation as occurs in Perl 5.
+
+ <p> Perl constructs not supported by this class: </p>
+
+ <ul>
+
+ <li><p> The conditional constructs <tt>(?{</tt><i>X</i><tt>})</tt> and
+ <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,
+ </p></li>
+
+ <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>
+ and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>
+
+ <li><p> The embedded comment syntax <tt>(?#comment)</tt>, and </p></li>
+
+ <li><p> The preprocessing operations <tt>\l</tt> <tt>\u</tt>,
+ <tt>\L</tt>, and <tt>\U</tt>. </p></li>
+
+ <li><p> Embedded flags</p></li>
+
+ </ul>
+
+ <p> Constructs supported by this class but not by Perl: </p>
+
+ <ul>
+
+ <li><p> Possessive quantifiers, which greedily match as much as they can
+ and do not back off, even when doing so would allow the overall match to
+ succeed. </p></li>
+
+ <li><p> Character-class union and intersection as described
+ above.</p></li>
+
+ </ul>
+
+ <p> Notable differences from Perl: </p>
+
+ <ul>
+
+ <li><p> In Perl, <tt>\1</tt> through <tt>\9</tt> are always interpreted
+ as back references; a backslash-escaped number greater than <tt>9</tt> is
+ treated as a back reference if at least that many subexpressions exist,
+ otherwise it is interpreted, if possible, as an octal escape. In this
+ class octal escapes must always begin with a zero. In this class,
+ <tt>\1</tt> through <tt>\9</tt> are always interpreted as back
+ references, and a larger number is accepted as a back reference if at
+ least that many subexpressions exist at that point in the regular
+ expression, otherwise the parser will drop digits until the number is
+ smaller or equal to the existing number of groups or it is one digit.
+ </p></li>
+
+ <li><p> Perl uses the <tt>g</tt> flag to request a match that resumes
+ where the last match left off. This functionality is provided implicitly
+ by the <CODE>Matcher</CODE> class: Repeated invocations of the
+ <code>find</code> method will resume where the last match left off,
+ unless the matcher is reset. </p></li>
+
+ <li><p> Perl is forgiving about malformed matching constructs, as in the
+ expression <tt>*a</tt>, as well as dangling brackets, as in the
+ expression <tt>abc]</tt>, and treats them as literals. This
+ class also strict and will not compile a pattern when dangling characters
+ are encountered.</p></li>
+
+ </ul>
+
+
+ <p> For a more precise description of the behavior of regular expression
+ constructs, please see <a href="http://www.oreilly.com/catalog/regex2/">
+ <i>Mastering Regular Expressions, 2nd Edition</i>, Jeffrey E. F. Friedl,
+ O'Reilly and Associates, 2002.</a>
+ </p>
+ <P>
+
+ <i>End Text Extracted And Modified From java.util.regex.Pattern documentation</i>
+
+ <hr>
+
+ @author Jeffery Stuart
+ @since March 2003, Stable Since November 2004
+ @version 1.07.00
+ @memo A class used to represent "PERL 5"-ish regular expressions
+ */
+class Pattern
+{
+ friend class Matcher;
+ friend class NFANode;
+ friend class NFAQuantifierNode;
+ private:
+ /**
+ This constructor should not be called directly. Those wishing to use the
+ Pattern class should instead use the {@link compile compile} method.
+
+ @param rhs The pattern to compile
+ @memo Creates a new pattern from the regular expression in <code>rhs</code>.
+ */
+ Pattern(const bkstring & rhs);
+ protected:
+ /**
+ This currently is not used, so don't try to do anything with it.
+ @memo Holds all the compiled patterns for quick access.
+ */
+ static std::map<bkstring, Pattern *> compiledPatterns;
+ /**
+ Holds all of the registered patterns as strings. Due to certain problems
+ with compilation of patterns, especially with capturing groups, this seemed
+ to be the best way to do it.
+ */
+ static std::map<bkstring, std::pair<bkstring, unsigned long> > registeredPatterns;
+ protected:
+ /**
+ Holds all the NFA nodes used. This makes deletion of a pattern, as well as
+ clean-up from an unsuccessful compile much easier and faster.
+ */
+ std::map<NFANode*, bool> nodes;
+ /**
+ Used when methods like split are called. The matcher class uses a lot of
+ dynamic memeory, so having an instance increases speedup of certain
+ operations.
+ */
+ Matcher * matcher;
+ /**
+ The front node of the NFA.
+ */
+ NFANode * head;
+ /**
+ The actual regular expression we rerpesent
+ */
+ bkstring pattern;
+ /**
+ Flag used during compilation. Once the pattern is successfully compiled,
+ <code>error</code> is no longer used.
+ */
+ bool error;
+ /**
+ Used during compilation to keep track of the current index into
+ <code>{@link pattern pattern}<code>. Once the pattern is successfully
+ compiled, <code>error</code> is no longer used.
+ */
+ int curInd;
+ /**
+ The number of capture groups this contains.
+ */
+ int groupCount;
+ /**
+ The number of non-capture groups this contains.
+ */
+ int nonCapGroupCount;
+ /**
+ The flags specified when this was compiled.
+ */
+ unsigned long flags;
+ protected:
+ /**
+ Raises an error during compilation. Compilation will cease at that point
+ and compile will return <code>NULL</code>.
+ */
+ void raiseError();
+ /**
+ Convenience function for registering a node in <code>nodes</code>.
+ @param node The node to register
+ @return The registered node
+ */
+ NFANode * registerNode(NFANode * node);
+
+ /**
+ Calculates the union of two strings. This function will first sort the
+ strings and then use a simple selection algorithm to find the union.
+ @param s1 The first "class" to union
+ @param s2 The second "class" to union
+ @return A new string containing all unique characters. Each character
+ must have appeared in one or both of <code>s1</code> and
+ <code>s2</code>.
+ */
+ bkstring classUnion (bkstring s1, bkstring s2) const;
+ /**
+ Calculates the intersection of two strings. This function will first sort
+ the strings and then use a simple selection algorithm to find the
+ intersection.
+ @param s1 The first "class" to intersect
+ @param s2 The second "class" to intersect
+ @return A new string containing all unique characters. Each character
+ must have appeared both <code>s1</code> and <code>s2</code>.
+ */
+ bkstring classIntersect (bkstring s1, bkstring s2) const;
+ /**
+ Calculates the negation of a string. The negation is the set of all
+ characters between <code>\x00</code> and <code>\xFF</code> not
+ contained in <code>s1</code>.
+ @param s1 The "class" to be negated.
+ @param s2 The second "class" to intersect
+ @return A new string containing all unique characters. Each character
+ must have appeared both <code>s1</code> and <code>s2</code>.
+ */
+ bkstring classNegate (bkstring s1) const;
+ /**
+ Creates a new "class" representing the range from <code>low</code> thru
+ <code>hi</code>. This function will wrap if <code>low</code> >
+ <code>hi</code>. This is a feature, not a buf. Sometimes it is useful
+ to be able to say [\x70-\x10] instead of [\x70-\x7F\x00-\x10].
+ @param low The beginning character
+ @param hi The ending character
+ @return A new string containing all the characters from low thru hi.
+ */
+ bkstring classCreateRange(char low, char hi) const;
+
+ /**
+ Extracts a decimal number from the substring of member-variable
+ <code>{@link pattern pattern}<code> starting at <code>start</code> and
+ ending at <code>end</code>.
+ @param start The starting index in <code>{@link pattern pattern}<code>
+ @param end The last index in <code>{@link pattern pattern}<code>
+ @return The decimal number in <code>{@link pattern pattern}<code>
+ */
+ int getInt(int start, int end);
+ /**
+ Parses a <code>{n,m}</code> string out of the member-variable
+ <code>{@link pattern pattern}<code> stores the result in <code>sNum</code>
+ and <code>eNum</code>.
+ @param sNum Output parameter. The minimum number of matches required
+ by the curly quantifier are stored here.
+ @param eNum Output parameter. The maximum number of matches allowed
+ by the curly quantifier are stored here.
+ @return Success/Failure. Fails when the curly does not have the proper
+ syntax
+ */
+ bool quantifyCurly(int & sNum, int & eNum);
+ /**
+ Tries to quantify the currently parsed group. If the group being parsed
+ is indeed quantified in the member-variable
+ <code>{@link pattern pattern}<code>, then the NFA is modified accordingly.
+ @param start The starting node of the current group being parsed
+ @param stop The ending node of the current group being parsed
+ @param gn The group number of the current group being parsed
+ @return The node representing the starting node of the group. If the
+ group becomes quantified, then this node is not necessarily
+ a GroupHead node.
+ */
+ NFANode * quantifyGroup(NFANode * start, NFANode * stop, const int gn);
+
+ /**
+ Tries to quantify the last parsed expression. If the character was indeed
+ quantified, then the NFA is modified accordingly.
+ @param newNode The recently created expression node
+ @return The node representing the last parsed expression. If the
+ expression was quantified, <code>return value != newNode</code>
+ */
+ NFANode * quantify(NFANode * newNode);
+ /**
+ Parses the current class being examined in
+ <code>{@link pattern pattern}</code>.
+ @return A string of unique characters contained in the current class being
+ parsed
+ */
+ bkstring parseClass();
+ /**
+ Parses the current POSIX class being examined in
+ <code>{@link pattern pattern}</code>.
+ @return A string of unique characters representing the POSIX class being
+ parsed
+ */
+ bkstring parsePosix();
+ /**
+ Returns a string containing the octal character being parsed
+ @return The string contained the octal value being parsed
+ */
+ bkstring parseOctal();
+ /**
+ Returns a string containing the hex character being parsed
+ @return The string contained the hex value being parsed
+ */
+ bkstring parseHex();
+ /**
+ Returns a new node representing the back reference being parsed
+ @return The new node representing the back reference being parsed
+ */
+ NFANode * parseBackref();
+ /**
+ Parses the escape sequence currently being examined. Determines if the
+ escape sequence is a class, a single character, or the beginning of a
+ quotation sequence.
+ @param inv Output parameter. Whether or not to invert the returned class
+ @param quo Output parameter. Whether or not this sequence starts a
+ quotation.
+ @return The characters represented by the class
+ */
+ bkstring parseEscape(bool & inv, bool & quo);
+ /**
+ Parses a supposed registered pattern currently under compilation. If the
+ sequence of characters does point to a registered pattern, then the
+ registered pattern is appended to <code>*end<code>. The registered pattern
+ is parsed with the current compilation flags.
+ @param end The ending node of the thus-far compiled pattern
+ @return The new end node of the current pattern
+ */
+ NFANode * parseRegisteredPattern(NFANode ** end);
+ /**
+ Parses a lookbehind expression. Appends the necessary nodes
+ <code>*end</code>.
+ @param pos Positive or negative look behind
+ @param end The ending node of the current pattern
+ @return The new end node of the current pattern
+ */
+ NFANode * parseBehind(const bool pos, NFANode ** end);
+ /**
+ Parses the current expression and tacks on nodes until a \E is found.
+ @return The end of the current pattern
+ */
+ NFANode * parseQuote();
+ /**
+ Parses <code>{@link pattern pattern}</code>. This function is called
+ recursively when an or (<code>|</code>) or a group is encountered.
+ @param inParen Are we currently parsing inside a group
+ @param inOr Are we currently parsing one side of an or (<code>|</code>)
+ @param end The end of the current expression
+ @return The starting node of the NFA constructed from this parse
+ */
+ NFANode * parse(const bool inParen = 0, const bool inOr = 0, NFANode ** end = NULL);
+ public:
+ /// We should match regardless of case
+ const static unsigned long CASE_INSENSITIVE;
+ /// We are implicitly quoted
+ const static unsigned long LITERAL;
+ /// @memo We should treat a <code><b>.</b></code> as [\x00-\x7F]
+ const static unsigned long DOT_MATCHES_ALL;
+ /** <code>^</code> and <code>$</code> should anchor to the beginning and
+ ending of lines, not all input
+ */
+ const static unsigned long MULTILINE_MATCHING;
+ /** When enabled, only instances of <code>\n</codes> are recognized as
+ line terminators
+ */
+ const static unsigned long UNIX_LINE_MODE;
+ /// The absolute minimum number of matches a quantifier can match (0)
+ const static int MIN_QMATCH;
+ /// The absolute maximum number of matches a quantifier can match (0x7FFFFFFF)
+ const static int MAX_QMATCH;
+ public:
+ /**
+ Call this function to compile a regular expression into a
+ <code>Pattern</code> object. Special values can be assigned to
+ <code>mode</code> when certain non-standard behaviors are expected from
+ the <code>Pattern</code> object.
+ @param pattern The regular expression to compile
+ @param mode A bitwise or of flags signalling what special behaviors are
+ wanted from this <code>Pattern</code> object
+ @return If successful, <code>compile</code> returns a <code>Pattern</code>
+ pointer. Upon failure, <code>compile</code> returns
+ <code>NULL</code>
+ */
+ static Pattern * compile (const bkstring & pattern,
+ const unsigned long mode = 0);
+ /**
+ Dont use this function. This function will compile a pattern, and cache
+ the result. This will eventually be used as an optimization when people
+ just want to call static methods using the same pattern over and over
+ instead of first compiling the pattern and then using the compiled
+ instance for matching.
+ @param pattern The regular expression to compile
+ @param mode A bitwise or of flags signalling what special behaviors are
+ wanted from this <code>Pattern</code> object
+ @return If successful, <code>compileAndKeep</code> returns a
+ <code>Pattern</code> pointer. Upon failure, <code>compile</code>
+ returns <code>NULL</code>.
+ */
+ static Pattern * compileAndKeep (const bkstring & pattern,
+ const unsigned long mode = 0);
+
+ /**
+ Searches through <code>replace</code> and replaces all substrings matched
+ by <code>pattern</code> with <code>str</code>. <code>str</code> may
+ contain backreferences (e.g. <code>\1</code>) to capture groups. A typical
+ invocation looks like:
+ <p>
+ <code>
+ Pattern::replace("(a+)b(c+)", "abcccbbabcbabc", "\\2b\\1");
+ </code>
+ <p>
+ which would replace <code>abcccbbabcbabc</code> with
+ <code>cccbabbcbabcba</code>.
+ @param pattern The regular expression
+ @param str The replacement text
+ @param replacementText The string in which to perform replacements
+ @param mode The special mode requested of the <code>Pattern</code>
+ during the replacement process
+ @return The text with the replacement string substituted where necessary
+ */
+ static bkstring replace (const bkstring & pattern,
+ const bkstring & str,
+ const bkstring & replacementText,
+ const unsigned long mode = 0);
+
+ /**
+ Splits the specified string over occurrences of the specified pattern.
+ Empty strings can be optionally ignored. The number of strings returned is
+ configurable. A typical invocation looks like:
+ <p>
+ <code>
+ bkstring str(strSize, '\0');<br>
+ FILE * fp = fopen(fileName, "r");<br>
+ fread((char*)str.data(), strSize, 1, fp);<br>
+ fclose(fp);<br>
+ <br>
+ std::vector<bkstring> lines = Pattern::split("[\r\n]+", str, true);<br>
+ <br>
+ </code>
+
+ @param pattern The regular expression
+ @param replace The string to split
+ @param keepEmptys Whether or not to keep empty strings
+ @param limit The maximum number of splits to make
+ @param mode The special mode requested of the <code>Pattern</code>
+ during the split process
+ @return All substrings of <code>str</code> split across <code>pattern</code>.
+ */
+ static std::vector<bkstring> split (const bkstring & pattern,
+ const bkstring & str,
+ const bool keepEmptys = 0,
+ const unsigned long limit = 0,
+ const unsigned long mode = 0);
+
+ /**
+ Finds all the instances of the specified pattern within the string. You
+ should be careful to only pass patterns with a minimum length of one. For
+ example, the pattern <code>a*</code> can be matched by an empty string, so
+ instead you should pass <code>a+</code> since at least one character must
+ be matched. A typical invocation of <code>findAll</code> looks like:
+ <p>
+ <code>
+ std::vector<td::string> numbers = Pattern::findAll("\\d+", string);
+ </code>
+ <p>
+
+ @param pattern The pattern for which to search
+ @param str The string to search
+ @param mode The special mode requested of the <code>Pattern</code>
+ during the find process
+ @return All instances of <code>pattern</code> in <code>str</code>
+ */
+ static std::vector<bkstring> findAll (const bkstring & pattern,
+ const bkstring & str,
+ const unsigned long mode = 0);
+
+ /**
+ Determines if an entire string matches the specified pattern
+
+ @param pattern The pattern for to match
+ @param str The string to match
+ @param mode The special mode requested of the <code>Pattern</code>
+ during the replacement process
+ @return True if <code>str</code> is recognized by <code>pattern</code>
+ */
+ static bool matches (const bkstring & pattern,
+ const bkstring & str,
+ const unsigned long mode = 0);
+
+ /**
+ Registers a pattern under a specific name for use in later compilations.
+ A typical invocation and later use looks like:
+ <p>
+ <code>
+ Pattern::registerPattern("ip", "(?:\\d{1,3}\\.){3}\\d{1,3}");<br>
+ Pattern * p1 = Pattern::compile("{ip}:\\d+");<br>
+ Pattern * p2 = Pattern::compile("Connection from ({ip}) on port \\d+");<br>
+ </code>
+ <p>
+ Multiple calls to <code>registerPattern</code> with the same
+ <code>name</code> will result in the pattern getting overwritten.
+
+ @param name The name to give to the pattern
+ @param pattern The pattern to register
+ @param mode Any special flags to use when compiling pattern
+ @return Success/Failure. Fails only if <code>pattern</code> has invalid
+ syntax
+ */
+ static bool registerPattern(const bkstring & name,
+ const bkstring & pattern,
+ const unsigned long mode = 0);
+
+ /**
+ Clears the pattern registry
+ */
+ static void unregisterPatterns();
+ /**
+ Don't use
+ */
+ static void clearPatternCache();
+
+ /**
+ Searches through a string for the <code>n<sup>th</sup></code> match of the
+ given pattern in the string. Match indeces start at zero, not one.
+ A typical invocation looks like this:
+ <p>
+ <code>
+ std::pair<bkstring, int> match = Pattern::findNthMatch("\\d{1,3}", "192.168.1.101:22", 1);<br>
+ printf("%s %i\n", match.first.c_str(), match.second);<br>
+ <br>
+ Output: 168 4<br>
+ <br>
+
+ @param pattern The pattern for which to search
+ @param str The string to search
+ @param matchNum Which match to find
+ @param mode Any special flags to use during the matching process
+ @return A string and an integer. The string is the string matched. The
+ integer is the starting location of the matched string in
+ <code>str</code>. You can check for success/failure by making sure
+ that the integer returned is greater than or equal to zero.
+ */
+ static std::pair<bkstring, int> findNthMatch (const bkstring & pattern,
+ const bkstring & str,
+ const int matchNum,
+ const unsigned long mode = 0);
+ public:
+ /**
+ Deletes all NFA nodes allocated during compilation
+ */
+ ~Pattern();
+
+ bkstring replace (const bkstring & str,
+ const bkstring & replacementText);
+ std::vector<bkstring> split (const bkstring & str, const bool keepEmptys = 0,
+ const unsigned long limit = 0);
+ std::vector<bkstring> findAll (const bkstring & str);
+ bool matches (const bkstring & str);
+ /**
+ Returns the flags used during compilation of this pattern
+ @return The flags used during compilation of this pattern
+ */
+ unsigned long getFlags () const;
+ /**
+ Returns the regular expression this pattern represents
+ @return The regular expression this pattern represents
+ */
+ bkstring getPattern () const;
+ /**
+ Creates a matcher object using the specified string and this pattern.
+ @param str The string to match against
+ @return A new matcher using object using this pattern and the specified
+ string
+ */
+ Matcher * createMatcher (const bkstring & str);
+};
+
+class NFANode
+{
+ friend class Matcher;
+ public:
+ NFANode * next;
+ NFANode();
+ virtual ~NFANode();
+ virtual void findAllNodes(std::map<NFANode*, bool> & soFar);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const = 0;
+ inline virtual bool isGroupHeadNode() const { return false; }
+ inline virtual bool isStartOfInputNode() const { return false; }
+};
+class NFACharNode : public NFANode
+{
+ protected:
+ char ch;
+ public:
+ NFACharNode(const char c);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFACICharNode : public NFANode
+{
+ protected:
+ char ch;
+ public:
+ NFACICharNode(const char c);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAStartNode : public NFANode
+{
+ public:
+ NFAStartNode();
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAEndNode : public NFANode
+{
+ public:
+ NFAEndNode();
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAQuantifierNode : public NFANode
+{
+ public:
+ int min, max;
+ NFANode * inner;
+ virtual void findAllNodes(std::map<NFANode*, bool> & soFar);
+ NFAQuantifierNode(Pattern * pat, NFANode * internal,
+ const int minMatch = Pattern::MIN_QMATCH,
+ const int maxMatch = Pattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAGreedyQuantifierNode : public NFAQuantifierNode
+{
+ public:
+ NFAGreedyQuantifierNode(Pattern * pat, NFANode * internal,
+ const int minMatch = Pattern::MIN_QMATCH,
+ const int maxMatch = Pattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+ virtual int matchInternal(const bkstring & str, Matcher * matcher, const int curInd, const int soFar) const;
+};
+class NFALazyQuantifierNode : public NFAQuantifierNode
+{
+ public:
+ NFALazyQuantifierNode(Pattern * pat, NFANode * internal,
+ const int minMatch = Pattern::MIN_QMATCH,
+ const int maxMatch = Pattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAPossessiveQuantifierNode : public NFAQuantifierNode
+{
+ public:
+ NFAPossessiveQuantifierNode(Pattern * pat, NFANode * internal,
+ const int minMatch = Pattern::MIN_QMATCH,
+ const int maxMatch = Pattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAAcceptNode : public NFANode
+{
+ public:
+ NFAAcceptNode();
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAClassNode : public NFANode
+{
+ public:
+ bool inv;
+ std::map<char, bool> vals;
+ NFAClassNode(const bool invert = 0);
+ NFAClassNode(const bkstring & clazz, const bool invert);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFACIClassNode : public NFANode
+{
+ public:
+ bool inv;
+ std::map<char, bool> vals;
+ NFACIClassNode(const bool invert = 0);
+ NFACIClassNode(const bkstring & clazz, const bool invert);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFASubStartNode : public NFANode
+{
+ public:
+ NFASubStartNode();
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAOrNode : public NFANode
+{
+ public:
+ NFANode * one;
+ NFANode * two;
+ NFAOrNode(NFANode * first, NFANode * second);
+ virtual void findAllNodes(std::map<NFANode*, bool> & soFar);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAQuoteNode : public NFANode
+{
+ public:
+ bkstring qStr;
+ NFAQuoteNode(const bkstring & quoted);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFACIQuoteNode : public NFANode
+{
+ public:
+ bkstring qStr;
+ NFACIQuoteNode(const bkstring & quoted);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFALookAheadNode : public NFANode
+{
+ public:
+ bool pos;
+ NFANode * inner;
+ NFALookAheadNode(NFANode * internal, const bool positive);
+ virtual void findAllNodes(std::map<NFANode*, bool> & soFar);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFALookBehindNode : public NFANode
+{
+ public:
+ bool pos;
+ bkstring mStr;
+ NFALookBehindNode(const bkstring & str, const bool positive);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAStartOfLineNode : public NFANode
+{
+ public:
+ NFAStartOfLineNode();
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAEndOfLineNode : public NFANode
+{
+ public:
+ NFAEndOfLineNode();
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAReferenceNode : public NFANode
+{
+ public:
+ int gi;
+ NFAReferenceNode(const int groupIndex);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAStartOfInputNode : public NFANode
+{
+ public:
+ NFAStartOfInputNode();
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+ inline virtual bool isStartOfInputNode() const { return true; }
+};
+class NFAEndOfInputNode : public NFANode
+{
+ public:
+ bool term;
+ NFAEndOfInputNode(const bool lookForTerm);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAWordBoundaryNode : public NFANode
+{
+ public:
+ bool pos;
+ NFAWordBoundaryNode(const bool positive);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAEndOfMatchNode : public NFANode
+{
+ public:
+ NFAEndOfMatchNode();
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupHeadNode : public NFANode
+{
+ public:
+ int gi;
+ NFAGroupHeadNode(const int groupIndex);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+ inline virtual bool isGroupHeadNode() const { return true; }
+};
+class NFAGroupTailNode : public NFANode
+{
+ public:
+ int gi;
+ NFAGroupTailNode(const int groupIndex);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupLoopPrologueNode : public NFANode
+{
+ public:
+ int gi;
+ NFAGroupLoopPrologueNode(const int groupIndex);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupLoopNode : public NFANode
+{
+ public:
+ int gi, min, max, type;
+ NFANode * inner;
+ NFAGroupLoopNode(NFANode * internal, const int minMatch,
+ const int maxMatch, const int groupIndex, const int matchType);
+ virtual void findAllNodes(std::map<NFANode*, bool> & soFar);
+ virtual int match(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+ int matchGreedy(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+ int matchLazy(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+ int matchPossessive(const bkstring & str, Matcher * matcher, const int curInd = 0) const;
+};
+
+#endif
+
diff --git a/plugins/smileyadd/regexp/WCMatcher.cpp b/plugins/smileyadd/regexp/WCMatcher.cpp new file mode 100644 index 0000000000..bed5a1944b --- /dev/null +++ b/plugins/smileyadd/regexp/WCMatcher.cpp @@ -0,0 +1,181 @@ +#include "WCMatcher.h"
+#include "WCPattern.h"
+
+const int WCMatcher::MATCH_ENTIRE_STRING = 0x01;
+
+/*
+ Detailed documentation is provided in this class' header file
+
+ @author Jeffery Stuart
+ @since November 2004
+ @version 1.07.00
+*/
+
+WCMatcher::WCMatcher(WCPattern * pattern, const bkstring & text)
+{
+ pat = pattern;
+ str = &text;
+ gc = pattern->groupCount;
+ ncgc = -pattern->nonCapGroupCount;
+ flags = 0;
+ matchedSomething = false;
+ starts = new int[gc + ncgc];
+ ends = new int[gc + ncgc];
+ groups = new int[gc + ncgc];
+ groupPos = new int[gc + ncgc];
+ groupIndeces = new int[gc + ncgc];
+ starts = starts + ncgc;
+ ends = ends + ncgc;
+ groups = groups + ncgc;
+ groupPos = groupPos + ncgc;
+ groupIndeces = groupIndeces + ncgc;
+ for (int i = 0; i < gc; ++i) starts[i] = ends[i] = 0;
+}
+WCMatcher::~WCMatcher()
+{
+ delete [] (starts - ncgc);
+ delete [] (ends - ncgc);
+ delete [] (groups - ncgc);
+ delete [] (groupIndeces - ncgc);
+ delete [] (groupPos - ncgc);
+}
+void WCMatcher::clearGroups()
+{
+ int i;
+ lm = 0;
+ for (i = 0; i < gc; ++i) groups[i] = starts[i] = ends[i] = -1;
+ for (i = 1; i <= ncgc; ++i) groups[0 - i] = starts[0 - i] = ends[0 - i] = -1;
+}
+bkstring WCMatcher::replaceWithGroups(const bkstring & str)
+{
+ bkstring ret = L"";
+
+ bkstring t = str;
+ while (t.size() > 0)
+ {
+ if (t[0] == (wchar_t)'\\')
+ {
+ t.erase(0, 1);
+ if (t.size() == 0)
+ {
+ ret += L"\\";
+ }
+ else if (t[0] < (wchar_t)'0' || t[0] > (wchar_t)'9')
+ {
+ ret += t[0];
+ t.erase(0, 1);
+ }
+ else
+ {
+ int gn = 0;
+ while (t.size() > 0 && t[0] >= (wchar_t)'0' && t[0] <= (wchar_t)'9')
+ {
+ gn = gn * 10 + (t[0] - (wchar_t)'0');
+ t.erase(0, 1);
+ }
+ ret += getGroup(gn);
+ }
+ }
+ else
+ {
+ ret += t[0];
+ t.erase(0, 1);
+ }
+ }
+
+ return ret;
+}
+unsigned long WCMatcher::getFlags() const
+{
+ return flags;
+}
+const bkstring& WCMatcher::getText() const
+{
+ return *str;
+}
+
+bool WCMatcher::matches()
+{
+ flags = MATCH_ENTIRE_STRING;
+ matchedSomething = false;
+ clearGroups();
+ lm = 0;
+ return pat->head->match(*str, this, 0) == (int)str->size();
+}
+bool WCMatcher::findFirstMatch()
+{
+ starts[0] = 0;
+ flags = 0;
+ clearGroups();
+ start = 0;
+ lm = 0;
+ ends[0] = pat->head->match(*str, this, 0);
+ if (ends[0] >= 0)
+ {
+ matchedSomething = true;
+ return 1;
+ }
+ return 0;
+}
+bool WCMatcher::findNextMatch()
+{
+ int s = starts[0], e = ends[0];
+
+ if (!matchedSomething) return findFirstMatch();
+ if (s == e) ++e;
+ flags = 0;
+ clearGroups();
+
+ starts[0] = e;
+ if (e >= (int)str->size()) return 0;
+ start = e;
+ lm = e;
+ ends[0] = pat->head->match(*str, this, e);
+ return ends[0] >= 0;
+}
+std::vector<bkstring> WCMatcher::findAll()
+{
+ std::vector<bkstring> ret;
+ reset();
+ while (findNextMatch())
+ {
+ ret.push_back(getGroup());
+ }
+ return ret;
+}
+
+void WCMatcher::reset()
+{
+ lm = 0;
+ clearGroups();
+ matchedSomething = false;
+}
+
+int WCMatcher::getStartingIndex(const int groupNum) const
+{
+ if (groupNum < 0 || groupNum >= gc) return -1;
+ return starts[groupNum];
+}
+int WCMatcher::getEndingIndex(const int groupNum) const
+{
+ if (groupNum < 0 || groupNum >= gc) return -1;
+ return ends[groupNum];
+}
+bkstring WCMatcher::getGroup(const int groupNum) const
+{
+ if (groupNum < 0 || groupNum >= gc) return L"";
+ if (starts[groupNum] < 0 || ends[groupNum] < 0) return L"";
+ return str->substr(starts[groupNum], ends[groupNum] - starts[groupNum]);
+}
+std::vector<bkstring> WCMatcher::getGroups(const bool includeGroupZero) const
+{
+ int i, start = (includeGroupZero ? 0 : 1);
+ std::vector<bkstring> ret;
+
+ for (i = start; i < gc; ++i)
+ {
+ ret.push_back(getGroup(i));
+ }
+ return ret;
+}
+
diff --git a/plugins/smileyadd/regexp/WCMatcher.h b/plugins/smileyadd/regexp/WCMatcher.h new file mode 100644 index 0000000000..23cc49e41f --- /dev/null +++ b/plugins/smileyadd/regexp/WCMatcher.h @@ -0,0 +1,234 @@ +#ifndef __WCMATCHER_H__
+#define __WCMATCHER_H__
+
+#include "bkstring.h"
+#include <vector>
+#include <WCPattern.h>
+
+/**
+ A matcher is a non thread-safe object used to scan strings using a given
+ {@link WCPattern WCPattern} object. Using a <code>WCMatcher</code> is the preferred
+ method for scanning strings. WCMatchers are not thread-safe. WCMatchers require
+ very little dynamic memory, hence one is encouraged to create several
+ instances of a matcher when necessary as opposed to sharing a single instance
+ of a matcher.
+ <p>
+ The most common methods needed by the matcher are <code>matches</code>,
+ <code>findNextMatch</code>, and <code>getGroup</code>. <code>matches</code>
+ and <code>findNextMatch</code> both return success or failure, and further
+ details can be gathered from their documentation.
+ <p>
+ Unlike Java's <code>WCMatcher</code>, this class allows you to change the string
+ you are matching against. This provides a small optimization, since you no
+ longer need multiple matchers for a single pattern in a single thread.
+ <p>
+ This class also provides an extremely handy method for replacing text with
+ captured data via the <code>replaceWithGroups</code> method. A typical
+ invocation looks like:
+ <pre>
+ wchar_t buf[10000];
+ bkstring str = "\\5 (user name \\1) uses \\7 for his/her shell and \\6 is their home directory";
+ FILE * fp = fopen("/etc/passwd", "r");
+ WCPattern::registerWCPattern("entry", "[^:]+");
+ WCPattern * p = WCPattern::compile("^({entry}):({entry}):({entry}):({entry}):({entry}):({entry}):({entry})$",
+ WCPattern::MULTILINE_MATCHING | WCPattern::UNIX_LINE_MODE);
+ WCMatcher * m = p->createWCMatcher("");
+ while (fgets(buf, 9999, fp))
+ {
+ m->setString(buf);
+ if (m->matches())
+ {
+ printf("%s\n", m->replaceWithGroups(str).c_str());
+ }
+ }
+ fclose(fp);
+
+ </pre>
+ Calling any of the following functions before first calling
+ <code>matches</code>, <code>findFirstMatch</code>, or
+ <code>findNextMatch</code> results in undefined behavior and may cause your
+ program to crash.
+ <code>
+ <ul>
+ <li>replaceWithGroups</code>
+ <li>getStartingIndex</li>
+ <li>getEndingIndex</li>
+ <li>getGroup</li>
+ <li>getGroups</li>
+ </ul>
+ </code>
+ <p>
+ The function <code>findFirstMatch</code> will attempt to find the first match
+ in the input string. The same results can be obtained by first calling
+ <code>reset</code> followed by <code>findNextMatch</code>.
+ <p>
+ To eliminate the necessity of looping through a string to find all the
+ matching substrings, <code>findAll</code> was created. The function will find
+ all matching substrings and return them in a <code>vector</code>. If you need
+ to examine specific capture groups within the substrings, then this method
+ should not be used.
+
+ @author Jeffery Stuart
+ @since March 2003, Stable Since November 2004
+ @version 1.05.00
+ @memo Mutable object used on instances of a WCPattern class
+ */
+class WCMatcher
+{
+ friend class NFAUNode;
+ friend class NFAStartUNode;
+ friend class NFAEndUNode;
+ friend class NFAGroupHeadUNode;
+ friend class NFAGroupLoopUNode;
+ friend class NFAGroupLoopPrologueUNode;
+ friend class NFAGroupTailUNode;
+ friend class NFALookBehindUNode;
+ friend class NFAStartOfLineUNode;
+ friend class NFAEndOfLineUNode;
+ friend class NFAEndOfMatchUNode;
+ friend class NFAReferenceUNode;
+ friend class WCPattern;
+ private:
+ /**
+ Creates a new matcher object against <code>text</code> using
+ <code>pattern</code>.
+
+ @param pattern The pattern with which to search
+ @param text The text in which to search
+ */
+ WCMatcher(WCPattern * pattern, const bkstring & text);
+ protected:
+ /// The pattern we use to match
+ WCPattern * pat;
+ /// The string in which we are matching
+ const bkstring * str;
+ /// The starting point of our match
+ int start;
+ /// An array of the starting positions for each group
+ int * starts;
+ /// An array of the ending positions for each group
+ int * ends;
+ /// An array of private data used by NFAUNodes during matching
+ int * groups;
+ /// An array of private data used by NFAUNodes during matching
+ int * groupIndeces;
+ /// An array of private data used by NFAUNodes during matching
+ int * groupPos;
+ /// The ending index of the last match
+ int lm;
+ /// The number of capturing groups we have
+ int gc;
+ /// The number of non-capturing groups we havew
+ int ncgc;
+ /// Whether or not we have matched something (used only by findFirstMatch and findNextMatch)
+ int matchedSomething;
+ /// The flags with which we were made
+ unsigned long flags;
+ /// Called by reset to clear the group arrays
+ void clearGroups();
+ public:
+ /// Used internally by match to signify we want the entire string matched
+ const static int MATCH_ENTIRE_STRING;
+ public:
+ /// Cleans up the dynamic memory used by this matcher
+ ~WCMatcher();
+ /**
+ Replaces the contents of <code>str</code> with the appropriate captured
+ text. <code>str</code> should have at least one back reference, otherwise
+ this function does nothing.
+ @param str The string in which to replace text
+ @return A string with all backreferences appropriately replaced
+ */
+ bkstring replaceWithGroups(const bkstring & str);
+ /**
+ The flags currently being used by the matcher.
+ @return Zero
+ */
+ unsigned long getFlags() const;
+ /**
+ The text being searched by the matcher.
+ @return the text being searched by the matcher.
+ */
+ const bkstring& getText() const;
+
+ /**
+ Scans the string from start to finish for a match. The entire string must
+ match for this function to return success. Group variables are
+ appropriately set and can be queried after this function returns.
+
+ @return Success if and only if the entire string matches the pattern
+ */
+ bool matches();
+ /**
+ Scans the string for the first substring matching the pattern. The entire
+ string does not necessarily have to match for this function to return
+ success. Group variables are appropriately set and can be queried after
+ this function returns.
+
+ @return Success if any substring matches the specified pattern
+ */
+ bool findFirstMatch();
+ /**
+ Scans the string for the next substring matching the pattern. If no calls
+ have been made to findFirstMatch of findNextMatch since the last call to
+ reset, matches, or setString, then this function's behavior results to
+ that of findFirstMatch.
+
+ @return Success if another substring can be found that matches the pattern
+ */
+ bool findNextMatch();
+ /**
+ Returns a vector of every substring in order which matches the given
+ pattern.
+
+ @return Every substring in order which matches the given pattern
+ */
+ std::vector<bkstring> findAll();
+ /**
+ Resets the internal state of the matcher
+ */
+ void reset();
+ /**
+ Same as getText. Left n for backwards compatibilty with old source code
+ @return Returns the string that is currently being used for matching
+ */
+ inline const bkstring& getString() const { return *str; }
+ /**
+ Sets the string to scan
+ @param newStr The string to scan for subsequent matches
+ */
+ inline void setString(const bkstring & newStr) { str = &newStr; reset(); }
+
+ /**
+ Returns the starting index of the specified group.
+ @param groupNum The group to query
+ @return The starting index of the group if it was matched, -1 for an
+ invalid group or if the group was not matched
+ */
+ int getStartingIndex(const int groupNum = 0) const;
+ /**
+ Returns the ending index of the specified group.
+ @param groupNum The group to query
+ @return The ending index of the group if it was matched, -1 for an
+ invalid group or if the group was not matched
+ */
+ int getEndingIndex(const int groupNum = 0) const;
+ /**
+ Returns the specified group. An empty string ("") does not necessarily
+ mean the group was not matched. A group such as (a*b?) could be matched by
+ a zero length. If an empty string is returned, getStartingIndex can be
+ called to determine if the group was actually matched.
+ @param groupNum The group to query
+ @return The text of the group
+ */
+ bkstring getGroup(const int groupNum = 0) const;
+ /**
+ Returns every capture group in a vector
+
+ @param includeGroupZero Whether or not include capture group zero
+ @return Every capture group
+ */
+ std::vector<bkstring> getGroups(const bool includeGroupZero = 0) const;
+};
+
+#endif
diff --git a/plugins/smileyadd/regexp/WCPattern.cpp b/plugins/smileyadd/regexp/WCPattern.cpp new file mode 100644 index 0000000000..d396bd187d --- /dev/null +++ b/plugins/smileyadd/regexp/WCPattern.cpp @@ -0,0 +1,1747 @@ +/**
+ From the author (Jeff Stuart)
+ "
+ Let me start by saying this file is pretty big. If you feel up to it, you can
+ try making changes yourself, but you would be better off to just email me at
+ stuart@cs.ucdavis.edu if you think there is a bug, or have something useful you
+ would like added. This project is very "near and dear" to me, so I am fairly quick
+ to make bug fixes. The header files for WCPattern and WCMatcher are fairly well
+ documented and the function names are pretty self-explanatory, but if you are having
+ any trouble, feel free to email me at stuart@cs.ucdavis.edu.
+
+ If you email me, make sure you put something like C++RE in the subject because
+ I tend to delete email if I don't recognize the name and the subject is
+ something like "I Need Your Help" or "Got A Second" or "I Found It".
+ "
+ */
+
+/*
+ Detailed documentation is provided in this class' header file
+
+ @author Jeffery Stuart
+ @since November 2004
+ @version 1.07.00
+*/
+
+#ifdef _WIN32
+ #pragma warning(push)
+ #pragma warning(disable:4996)
+#endif
+
+#include <WCPattern.h>
+#include <WCMatcher.h>
+#include <wchar.h>
+#include <algorithm>
+#ifndef _WIN32
+ #include <wctype.h>
+#endif
+
+std::map<bkstring, WCPattern *> WCPattern::compiledWCPatterns;
+std::map<bkstring, std::pair<bkstring, unsigned long> > WCPattern::registeredWCPatterns;
+
+const int WCPattern::MIN_QMATCH = 0x00000000;
+const int WCPattern::MAX_QMATCH = 0x7FFFFFFF;
+
+const unsigned long WCPattern::CASE_INSENSITIVE = 0x01;
+const unsigned long WCPattern::LITERAL = 0x02;
+const unsigned long WCPattern::DOT_MATCHES_ALL = 0x04;
+const unsigned long WCPattern::MULTILINE_MATCHING = 0x08;
+const unsigned long WCPattern::UNIX_LINE_MODE = 0x10;
+
+#define to_lower(a) (wchar_t)(UINT_PTR)CharLowerW((LPWSTR)(unsigned)a)
+#define is_alpha IsCharAlphaW
+
+#if defined(_WIN32)
+ #define str_icmp lstrcmpiW
+#elif defined(__CYGWIN__) || defined(__APPLE__)
+ #include <wctype.h>
+ static inline int str_icmp(const wchar_t * a, const wchar_t * b)
+ {
+ while (*a && *b)
+ {
+ const int t = (int)towlower(*a) - (int)tolower(*b);
+ if (t) return t;
+ ++a; ++b;
+ }
+ if (*a)
+ {
+ if (*b) return (int)towlower(*a) - (int)tolower(*b);
+ return 1;
+ }
+ else if (*b) return 1;
+ return 0;
+ }
+#else
+ #define str_icmp wcscasecmp
+#endif
+
+WCPattern::WCPattern(const bkstring & rhs)
+{
+ matcher = NULL;
+ pattern = rhs;
+ curInd = 0;
+ groupCount = 0;
+ nonCapGroupCount = 0;
+ error = 0;
+ head = NULL;
+}
+// convenience function in case we want to add any extra debugging output
+void WCPattern::raiseError()
+{
+/* switch (pattern[curInd - 1])
+ {
+ case '*':
+ case ')':
+ case '+':
+ case '?':
+ case ']':
+ case '}':
+ fwprintf(stderr, L"%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');
+ fwprintf(stderr, L"Syntax Error near here. Possible unescaped meta character.\n");
+ break;
+ default:
+ fwprintf(stderr, L"%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');
+ fwprintf(stderr, L"Syntax Error near here. \n");
+ break;
+ }*/
+ error = 1;
+}
+NFAUNode * WCPattern::registerNode(NFAUNode * node)
+{
+ nodes[node] = 1;
+ return node;
+}
+
+bkstring WCPattern::classUnion (bkstring s1, bkstring s2) const
+{
+ wchar_t * out = new wchar_t[66000];
+ std::sort(s1.begin(), s1.end());
+ std::sort(s2.begin(), s2.end());
+ *std::set_union(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;
+ bkstring ret = out;
+ delete [] out;
+ return ret;
+}
+bkstring WCPattern::classIntersect (bkstring s1, bkstring s2) const
+{
+ wchar_t * out = new wchar_t[66000];
+ std::sort(s1.begin(), s1.end());
+ std::sort(s2.begin(), s2.end());
+ *std::set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;
+ bkstring ret = out;
+ delete [] out;
+ return ret;
+}
+bkstring WCPattern::classNegate (bkstring s1) const
+{
+ wchar_t * out = new wchar_t[66000];
+ int i, ind = 0;
+ std::map<wchar_t, bool> m;
+
+ for (i = 0; i < (int)s1.size(); ++i) m[s1[i]] = 1;
+ for (i = 0xFF; i >= 0; --i) if (m.find((wchar_t)i) == m.end()) out[ind++] = (wchar_t)i;
+ out[ind] = 0;
+ bkstring ret(out, ind);
+ delete [] out;
+ return ret;
+}
+bkstring WCPattern::classCreateRange(wchar_t low, wchar_t hi) const
+{
+ wchar_t out[300];
+ int ind = 0;
+ while (low != hi) out[ind++] = low++;
+ out[ind++] = low;
+ return bkstring(out, ind);
+}
+
+int WCPattern::getInt(int start, int end)
+{
+ int ret = 0;
+ for (; start <= end; ++start) ret = ret * 10 + (pattern[start] - (wchar_t)'0');
+ return ret;
+}
+bool WCPattern::quantifyCurly(int & sNum, int & eNum)
+{
+ bool good = 1;
+ int i, ci = curInd + 1;
+ int commaInd = ci, endInd = ci, len = pattern.size();
+ sNum = eNum = 0;
+
+ while (endInd < len && pattern[endInd ] != (wchar_t)'}') ++endInd;
+ while (commaInd < endInd && pattern[commaInd] != (wchar_t)',') ++commaInd;
+ if (endInd >= len) { raiseError(); return 0; }
+ for (i = ci; good && i < endInd; ++i) if (i != commaInd && !isdigit(pattern[i])) good = 0;
+ if (!good && commaInd < endInd) { raiseError(); return 0; }
+ if (!good) return 0;
+ /* so now everything in here is either a comma (and there is at most one comma) or a digit */
+ if (commaInd == ci) // {,*}
+ {
+ if (endInd == commaInd + 1) { sNum = MIN_QMATCH; eNum = MAX_QMATCH; } // {,} = *
+ else { sNum = MIN_QMATCH; eNum = getInt(commaInd + 1, endInd - 1); } // {,+}
+ }
+ else if (commaInd == endInd - 1) { sNum = getInt(ci, commaInd - 1); eNum = MAX_QMATCH; } // {+,}
+ else if (commaInd == endInd) { sNum = getInt(ci, endInd - 1); eNum = sNum; } // {+}
+ else { sNum = getInt(ci, commaInd - 1); eNum = getInt(commaInd + 1, endInd - 1); } // {+,+}
+ curInd = endInd + 1;
+ return 1;
+}
+NFAUNode * WCPattern::quantifyGroup(NFAUNode * start, NFAUNode * stop, const int gn)
+{
+ NFAUNode * newNode = NULL;
+ int type = 0;
+
+ if (curInd < (int)pattern.size())
+ {
+ wchar_t ch = (curInd + 1 >= (int)pattern.size()) ? USHRT_MAX : pattern[curInd + 1];
+ switch (pattern[curInd])
+ {
+ case (wchar_t)'*':
+ ++curInd;
+ switch (ch)
+ {
+ case (wchar_t)'?': ++curInd; type = 1; break;
+ case (wchar_t)'+': ++curInd; type = 2; break;
+ }
+ newNode = registerNode(new NFAGroupLoopPrologueUNode(gn));
+ newNode->next = registerNode(new NFAGroupLoopUNode(start, MIN_QMATCH, MAX_QMATCH, gn, type));
+ stop->next = newNode->next;
+ return newNode;
+ case (wchar_t)'?':
+ ++curInd;
+ switch (ch)
+ {
+ case (wchar_t)'?': ++curInd; type = 1; break;
+ case (wchar_t)'+': ++curInd; type = 2; break;
+ }
+ newNode = registerNode(new NFAGroupLoopPrologueUNode(gn));
+ newNode->next = registerNode(new NFAGroupLoopUNode(start, MIN_QMATCH, 1, gn, type));
+ stop->next = newNode->next;
+ return newNode;
+ case (wchar_t)'+':
+ ++curInd;
+ switch (ch)
+ {
+ case (wchar_t)'?': ++curInd; type = 1; break;
+ case (wchar_t)'+': ++curInd; type = 2; break;
+ }
+ newNode = registerNode(new NFAGroupLoopPrologueUNode(gn));
+ newNode->next = registerNode(new NFAGroupLoopUNode(start, 1, MAX_QMATCH, gn, type));
+ stop->next = newNode->next;
+ return newNode;
+ case (wchar_t)'{':
+ {
+ int s, e;
+ if (quantifyCurly(s, e))
+ {
+ ch = (curInd < (int)pattern.size()) ? pattern[curInd] : USHRT_MAX;
+ switch (ch)
+ {
+ case (wchar_t)'?': ++curInd; type = 1; break;
+ case (wchar_t)'+': ++curInd; type = 2; break;
+ }
+ newNode = registerNode(new NFAGroupLoopPrologueUNode(gn));
+ newNode->next = registerNode(new NFAGroupLoopUNode(start, s, e, gn, type));
+ stop->next = newNode->next;
+ return newNode;
+ }
+ }
+ default:
+ break;
+ }
+ }
+ return NULL;
+}
+
+NFAUNode * WCPattern::quantify(NFAUNode * newNode)
+{
+ if (curInd < (int)pattern.size())
+ {
+ wchar_t ch = (curInd + 1 >= (int)pattern.size()) ? USHRT_MAX : pattern[curInd + 1];
+ switch (pattern[curInd])
+ {
+ case (wchar_t)'*':
+ ++curInd;
+ switch (ch)
+ {
+ case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
+ case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
+ default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
+ }
+ break;
+ case (wchar_t)'?':
+ ++curInd;
+ switch (ch)
+ {
+ case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, MIN_QMATCH, 1)); break;
+ case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, MIN_QMATCH, 1)); break;
+ default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, MIN_QMATCH, 1)); break;
+ }
+ break;
+ case (wchar_t)'+':
+ ++curInd;
+ switch (ch)
+ {
+ case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, 1, MAX_QMATCH)); break;
+ case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, 1, MAX_QMATCH)); break;
+ default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, 1, MAX_QMATCH)); break;
+ }
+ break;
+ case (wchar_t)'{':
+ {
+ int s, e;
+ if (quantifyCurly(s, e))
+ {
+ ch = (curInd < (int)pattern.size()) ? pattern[curInd] : USHRT_MAX;
+ switch (ch)
+ {
+ case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, s, e)); break;
+ case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, s, e)); break;
+ default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, s, e)); break;
+ }
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ return newNode;
+}
+bkstring WCPattern::parseClass()
+{
+ bkstring t, ret = L"";
+ wchar_t ch, c1, c2;
+ bool inv = 0, neg = 0, quo = 0;
+
+ if (curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'^')
+ {
+ ++curInd;
+ neg = 1;
+ }
+ while (curInd < (int)pattern.size() && pattern[curInd] != (wchar_t)']')
+ {
+ ch = pattern[curInd++];
+ if (ch == (wchar_t)'[')
+ {
+ t = parseClass();
+ ret = classUnion(ret, t);
+ }
+ /*else if (ch == (wchar_t)'-')
+ {
+ raiseError();
+ curInd = pattern.size();
+ }*/
+ else if (ch == (wchar_t)'&' && curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'&')
+ {
+ if (pattern[++curInd] != (wchar_t)'[')
+ {
+ raiseError();
+ curInd = pattern.size();
+ }
+ else
+ {
+ ++curInd;
+ t = parseClass();
+ ret = classIntersect(ret, t);
+ }
+ }
+ else if (ch == (wchar_t)'\\')
+ {
+ t = parseEscape(inv, quo);
+ if (quo)
+ {
+ raiseError();
+ curInd = pattern.size();
+ }
+ else if (inv || t.size() > 1) // cant be part of a range (a-z)
+ {
+ if (inv) t = classNegate(t);
+ ret = classUnion(ret, t);
+ }
+ else if (curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'-') // part of a range (a-z)
+ {
+ c1 = t[0];
+ ++curInd;
+ if (curInd >= (int)pattern.size()) raiseError();
+ else
+ {
+ c2 = pattern[curInd++];
+ if (c2 == (wchar_t)'\\')
+ {
+ t = parseEscape(inv, quo);
+ if (quo)
+ {
+ raiseError();
+ curInd = pattern.size();
+ }
+ else if (inv || t.size() > 1) raiseError();
+ else ret = classUnion(ret, classCreateRange(c1, c2));
+ }
+ else if (c2 == (wchar_t)'[' || c2 == (wchar_t)']' || c2 == (wchar_t)'-' || c2 == (wchar_t)'&')
+ {
+ raiseError();
+ curInd = pattern.size();
+ }
+ else ret = classUnion(ret, classCreateRange(c1, c2));
+ }
+ }
+ else
+ {
+ ret = classUnion(ret, t);
+ }
+ }
+ else if (curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'-')
+ {
+ c1 = ch;
+ ++curInd;
+ if (curInd >= (int)pattern.size()) raiseError();
+ else
+ {
+ c2 = pattern[curInd++];
+ if (c2 == (wchar_t)'\\')
+ {
+ t = parseEscape(inv, quo);
+ if (quo)
+ {
+ raiseError();
+ curInd = pattern.size();
+ }
+ else if (inv || t.size() > 1) raiseError();
+ else ret = classUnion(ret, classCreateRange(c1, c2));
+ }
+ else if (c2 == (wchar_t)'[' || c2 == (wchar_t)']' || c2 == (wchar_t)'-' || c2 == (wchar_t)'&')
+ {
+ raiseError();
+ curInd = pattern.size();
+ }
+ else
+ {
+ ret = classUnion(ret, classCreateRange(c1, c2));
+ }
+ }
+ }
+ else
+ {
+ ret += L" ";
+ ret[ret.size() - 1] = ch;
+ }
+ }
+ if (curInd >= (int)pattern.size() || pattern[curInd] != (wchar_t)']')
+ {
+ raiseError();
+ ret = L"";
+ }
+ else
+ {
+ ++curInd;
+ if (neg) ret = classNegate(ret);
+ }
+ return ret;
+}
+bkstring WCPattern::parsePosix()
+{
+ bkstring s7 = pattern.substr(curInd, 7);
+ if (s7 == L"{Lower}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyz"; }
+ if (s7 == L"{Upper}") { curInd += 7; return L"ABCDEFGHIJKLMNOPQRSTUVWXYZ"; }
+ if (s7 == L"{Alpha}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; }
+ if (s7 == L"{Digit}") { curInd += 7; return L"0123456789"; }
+ if (s7 == L"{Alnum}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; }
+ if (s7 == L"{Punct}") { curInd += 7; return L"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
+ if (s7 == L"{Graph}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
+ if (s7 == L"{Print}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
+ if (s7 == L"{Blank}") { curInd += 7; return L" \t"; }
+ if (s7 == L"{Space}") { curInd += 7; return L" \t\n\x0B\f\r"; }
+ if (s7 == L"{Cntrl}")
+ {
+ bkstring::value_type i;
+ bkstring s = L" ";
+
+ for (i = 0; i < 5; ++i) s += s;
+ s += L" ";
+ for (i = 0; i <= 0x1F; ++i) s[i] = i;
+ s[0x20] = 0x7F;
+ curInd += 7;
+ return s;
+ }
+ if (s7 == L"{ASCII}")
+ {
+ bkstring s(0x80, (wchar_t)' ');
+ for (bkstring::value_type i = 0; i <= 0x7f; ++i) s[i] = i;
+ curInd += 7;
+ return s;
+ }
+ if (pattern.substr(curInd, 8) == L"{XDigit}") { curInd += 8; return L"abcdefABCDEF0123456789"; }
+ raiseError();
+ return L"";
+}
+NFAUNode * WCPattern::parseBackref()
+{
+ #define is_dig(x) ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'9')
+ #define to_int(x) ((x) - (wchar_t)'0')
+ int ci = curInd;
+ int oldRef = 0, ref = 0;
+
+ while (ci < (int)pattern.size() && is_dig(pattern[ci]) && (ref < 10 || ref < groupCount))
+ {
+ oldRef = ref;
+ ref = ref * 10 + to_int(pattern[ci++]);
+ }
+ if (ci == (int)pattern.size())
+ {
+ oldRef = ref;
+ ++ci;
+ }
+ if (oldRef < 0 || ci <= curInd)
+ {
+ raiseError();
+ return registerNode(new NFAReferenceUNode(-1));
+ }
+ curInd = ci;
+ return registerNode(new NFAReferenceUNode(ref));
+
+ #undef is_dig
+ #undef to_int
+}
+bkstring WCPattern::parseOctal()
+{
+ #define islowoc(x) ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'3')
+ #define isoc(x) ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'7')
+ #define fromoc(x) ((x) - (wchar_t)'0')
+ int ci = curInd;
+ wchar_t ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : USHRT_MAX;
+ wchar_t ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : USHRT_MAX;
+ wchar_t ch3 = (ci + 2 < (int)pattern.size()) ? pattern[ci + 2] : USHRT_MAX;
+ bkstring s = L" ";
+
+ if (islowoc(ch1) && isoc(ch2))
+ {
+ curInd += 2;
+ s[0] = fromoc(ch1) * 8 + fromoc(ch2);
+ if (isoc(ch3))
+ {
+ ++curInd;
+ s[0] = s[0] * 8 + fromoc(ch3);
+ }
+ }
+ else if (isoc(ch1) && isoc(ch2))
+ {
+ curInd += 2;
+ s[0] = fromoc(ch1) * 8 + fromoc(ch2);
+ }
+ else raiseError();
+
+ return s;
+ #undef islowoc
+ #undef isoc
+ #undef fromoc
+}
+bkstring WCPattern::parseHex()
+{
+ #define to_low(x) (((x) >= (wchar_t)'A' && (x) <= (wchar_t)'Z') ? ((x) - (wchar_t)'A' + (wchar_t)'a') : (x))
+ #define is_dig(x) ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'9')
+ #define is_hex(x) (is_dig(x) || (to_low(x) >= (wchar_t)'a' && to_low(x) <= (wchar_t)'f'))
+ #define to_int(x) ((is_dig(x)) ? ((x) - (wchar_t)'0') : (to_low(x) - (wchar_t)'a' + 10))
+
+ int ci = curInd;
+ wchar_t ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : USHRT_MAX;
+ wchar_t ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : USHRT_MAX;
+ wchar_t ch3 = (ci + 2 < (int)pattern.size()) ? pattern[ci + 2] : USHRT_MAX;
+ wchar_t ch4 = (ci + 3 < (int)pattern.size()) ? pattern[ci + 3] : USHRT_MAX;
+ bkstring s = L" ";
+
+ if (is_hex(ch1) && is_hex(ch2) && is_hex(ch3) && is_hex(ch4))
+ {
+ curInd += 2;
+ s[0] = (to_int(ch1) << 12 & 0xF000) | (to_int(ch2) << 8 & 0x0F00) |
+ (to_int(ch3) << 4 & 0x0F00) | (to_int(ch4) & 0x000F);
+ }
+ else if (is_hex(ch1) && is_hex(ch2))
+ {
+ curInd += 2;
+ s[0] = (to_int(ch1) << 4 & 0xF0) | (to_int(ch2) & 0x0F);
+ }
+
+ return s;
+ #undef to_low
+ #undef is_dig
+ #undef is_hex
+ #undef to_int
+}
+bkstring WCPattern::parseEscape(bool & inv, bool & quo)
+{
+ wchar_t ch = pattern[curInd++];
+ bkstring classes = L"";
+
+ if (curInd > (int)pattern.size())
+ {
+ raiseError();
+ return NULL;
+ }
+
+ quo = 0;
+ inv = 0;
+ switch (ch)
+ {
+ case (wchar_t)'p': classes = parsePosix(); break;
+ case (wchar_t)'P': classes = L"!!"; classes += parsePosix(); break;
+ case (wchar_t)'d': classes = L"0123456789"; break;
+ case (wchar_t)'D': classes = L"!!0123456789"; break;
+ case (wchar_t)'s': classes = L" \t\r\n\f"; break;
+ case (wchar_t)'S': classes = L"!! \t\r\n\f"; break;
+ case (wchar_t)'w': classes = L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; break;
+ case (wchar_t)'W': classes = L"!!abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; break;
+ case (wchar_t)'0': classes = parseOctal(); break;
+ case (wchar_t)'x': classes = parseHex(); break;
+
+ case (wchar_t)'Q': quo = 1; break;
+ case (wchar_t)'t': classes = L"\t"; break;
+ case (wchar_t)'r': classes = L"\r"; break;
+ case (wchar_t)'n': classes = L"\n"; break;
+ case (wchar_t)'f': classes = L"\f"; break;
+ case (wchar_t)'a': classes = L"\a"; break;
+ case (wchar_t)'e': classes = L"\r"; break;
+ default: classes = L" "; classes[0] = ch; break;
+ }
+ if (classes.substr(0, 2) == L"!!")
+ {
+ classes = classes.substr(2);
+ inv = 1;
+ }
+ return classes;
+}
+NFAUNode * WCPattern::parseRegisteredWCPattern(NFAUNode ** end)
+{
+ int i, j;
+ bkstring s;
+ NFAUNode * ret = NULL;
+ for (i = curInd; i < (int)pattern.size() && pattern[i] != (wchar_t)'}'; ++i) { }
+ if (pattern[i] != (wchar_t)'}') { raiseError(); return NULL; }
+ if (i == curInd + 1) { raiseError(); return NULL; } // {}
+ if (
+ !(
+ (pattern[curInd] >= (wchar_t)'a' && pattern[curInd] <= (wchar_t)'z') ||
+ (pattern[curInd] >= (wchar_t)'A' && pattern[curInd] <= (wchar_t)'Z') ||
+ (pattern[curInd] == (wchar_t)'_')
+ )
+ )
+ {
+ raiseError();
+ return NULL;
+ }
+ for (j = curInd; !error && j < i; ++j)
+ {
+ if (
+ !(
+ (pattern[j] >= (wchar_t)'a' && pattern[j] <= (wchar_t)'z') ||
+ (pattern[j] >= (wchar_t)'A' && pattern[j] <= (wchar_t)'Z') ||
+ (pattern[j] >= (wchar_t)'0' && pattern[j] <= (wchar_t)'9') ||
+ (pattern[j] == (wchar_t)'_')
+ )
+ )
+ {
+ raiseError();
+ return NULL;
+ }
+ }
+ s = pattern.substr(curInd, i - curInd);
+ if (registeredWCPatterns.find(s) == registeredWCPatterns.end()) raiseError();
+ else
+ {
+ unsigned long oflags = flags;
+ bkstring op = pattern;
+ int ci = i + 1;
+
+ pattern = registeredWCPatterns[s].first;
+ curInd = 0;
+ flags = registeredWCPatterns[s].second;
+
+ --groupCount;
+ ret = parse(0, 0, end);
+
+ pattern = op;
+ curInd = ci;
+ flags = oflags;
+ }
+ if (error) { *end = ret = NULL; }
+ return ret;
+}
+
+// look behind should interpret everything as a literal (except \\) since the
+// pattern must have a concrete length
+NFAUNode * WCPattern::parseBehind(const bool pos, NFAUNode ** end)
+{
+ bkstring t = L"";
+ while (curInd < (int)pattern.size() && pattern[curInd] != (wchar_t)')')
+ {
+ wchar_t ch = pattern[curInd++];
+ t += L" ";
+ if (ch == (wchar_t)'\\')
+ {
+ if (curInd + 1 >= (int)pattern.size())
+ {
+ raiseError();
+ return *end = registerNode(new NFACharUNode((wchar_t)' '));
+ }
+ ch = pattern[curInd++];
+ }
+ t[t.size() - 1] = ch;
+ }
+ if (curInd >= (int)pattern.size() || pattern[curInd] != (wchar_t)')') raiseError();
+ else ++curInd;
+ return *end = registerNode(new NFALookBehindUNode(t, pos));
+}
+NFAUNode * WCPattern::parseQuote()
+{
+ bool done = 0;
+ bkstring s = L"";
+
+ while (!done)
+ {
+ if (curInd >= (int)pattern.size())
+ {
+ raiseError();
+ done = 1;
+ }
+ else if (pattern.substr(curInd, 2) == L"\\E")
+ {
+ curInd += 2;
+ done = 1;
+ }
+ else if (pattern[curInd] == (wchar_t)'\\')
+ {
+ s += L" ";
+ s[s.size() - 1] = pattern[++curInd];
+ ++curInd;
+ }
+ else
+ {
+ s += L" ";
+ s[s.size() - 1] = pattern[curInd++];
+ }
+ }
+ if ((flags & WCPattern::CASE_INSENSITIVE) != 0) return registerNode(new NFACIQuoteUNode(s));
+ return registerNode(new NFAQuoteUNode(s));
+}
+NFAUNode * WCPattern::parse(const bool inParen, const bool inOr, NFAUNode ** end)
+{
+ NFAUNode * start, * cur, * next = NULL;
+ bkstring t;
+ int grc = groupCount++;
+ bool inv, quo;
+ bool ahead = 0, pos = 0, noncap = 0, indep = 0;
+ unsigned long oldFlags = flags;
+
+ if (inParen)
+ {
+ if (pattern[curInd] == (wchar_t)'?')
+ {
+ ++curInd;
+ --groupCount;
+ if (pattern[curInd] == (wchar_t)':') { noncap = 1; ++curInd; grc = --nonCapGroupCount; }
+ else if (pattern[curInd] == (wchar_t)'=') { ++curInd; ahead = 1; pos = 1; }
+ else if (pattern[curInd] == (wchar_t)'!') { ++curInd; ahead = 1; pos = 0; }
+ else if (pattern.substr(curInd, 2) == L"<=") { curInd += 2; return parseBehind(1, end); }
+ else if (pattern.substr(curInd, 2) == L"<!") { curInd += 2; return parseBehind(0, end); }
+ else if (pattern[curInd] == (wchar_t)'>') { ++curInd; indep = 1; }
+ else
+ {
+ bool negate = false, done = false;
+ while (!done)
+ {
+ if (curInd >= (int)pattern.size())
+ {
+ raiseError();
+ return NULL;
+ }
+ else if (negate)
+ {
+ switch (pattern[curInd])
+ {
+ case (wchar_t)'i': flags &= ~WCPattern::CASE_INSENSITIVE; break;
+ case (wchar_t)'d': flags &= ~WCPattern::UNIX_LINE_MODE; break;
+ case (wchar_t)'m': flags &= ~WCPattern::MULTILINE_MATCHING; break;
+ case (wchar_t)'s': flags &= ~WCPattern::DOT_MATCHES_ALL; break;
+ case (wchar_t)':': done = true; break;
+ case (wchar_t)')':
+ ++curInd;
+ *end = registerNode(new NFALookBehindUNode(L"", true));
+ return *end;
+ case (wchar_t)'-':
+ default: raiseError(); return NULL;
+ }
+ }
+ else
+ {
+ switch (pattern[curInd])
+ {
+ case (wchar_t)'i': flags |= WCPattern::CASE_INSENSITIVE; break;
+ case (wchar_t)'d': flags |= WCPattern::UNIX_LINE_MODE; break;
+ case (wchar_t)'m': flags |= WCPattern::MULTILINE_MATCHING; break;
+ case (wchar_t)'s': flags |= WCPattern::DOT_MATCHES_ALL; break;
+ case (wchar_t)':': done = true; break;
+ case (wchar_t)'-': negate = true; break;
+ case (wchar_t)')':
+ ++curInd;
+ *end = registerNode(new NFALookBehindUNode(L"", true));
+ return *end;
+ default: raiseError(); return NULL;
+ }
+ }
+ ++curInd;
+ }
+ noncap = 1;
+ grc = --nonCapGroupCount;
+ }
+
+ if (noncap) cur = start = registerNode(new NFAGroupHeadUNode(grc));
+ else cur = start = registerNode(new NFASubStartUNode);
+ }
+ else cur = start = registerNode(new NFAGroupHeadUNode(grc));
+ }
+ else cur = start = registerNode(new NFASubStartUNode);
+ while (curInd < (int)pattern.size())
+ {
+ wchar_t ch = pattern[curInd++];
+
+ next = NULL;
+ if (error) return NULL;
+ switch (ch)
+ {
+ case (wchar_t)'^':
+ if ((flags & WCPattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAStartOfLineUNode);
+ else next = registerNode(new NFAStartOfInputUNode);
+ break;
+ case (wchar_t)'$':
+ if ((flags & WCPattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAEndOfLineUNode);
+ else next = registerNode(new NFAEndOfInputUNode(0));
+ break;
+ case (wchar_t)'|':
+ --groupCount;
+ cur->next = registerNode(new NFAAcceptUNode);
+ cur = start = registerNode(new NFAOrUNode(start, parse(inParen, 1)));
+ break;
+ case (wchar_t)'\\':
+ if (curInd < (int)pattern.size())
+ {
+ bool eoi = 0;
+ switch (pattern[curInd])
+ {
+ case (wchar_t)'1':
+ case (wchar_t)'2':
+ case (wchar_t)'3':
+ case (wchar_t)'4':
+ case (wchar_t)'5':
+ case (wchar_t)'6':
+ case (wchar_t)'7':
+ case (wchar_t)'8':
+ case (wchar_t)'9': next = parseBackref(); break;
+ case (wchar_t)'A': ++curInd; next = registerNode(new NFAStartOfInputUNode); break;
+ case (wchar_t)'B': ++curInd; next = registerNode(new NFAWordBoundaryUNode(0)); break;
+ case (wchar_t)'b': ++curInd; next = registerNode(new NFAWordBoundaryUNode(1)); break;
+ case (wchar_t)'G': ++curInd; next = registerNode(new NFAEndOfMatchUNode); break;
+ case (wchar_t)'Z': eoi = 1;
+ case (wchar_t)'z': ++curInd; next = registerNode(new NFAEndOfInputUNode(eoi)); break;
+ default:
+ t = parseEscape(inv, quo);
+ //printf("inv quo classes { %c %c %s }\n", inv ? (wchar_t)'t' : (wchar_t)'f', quo ? (wchar_t)'t' : (wchar_t)'f', t.c_str());
+ if (!quo)
+ {
+ if (t.size() > 1 || inv)
+ {
+ if ((flags & WCPattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACIClassUNode(t, inv));
+ else next = registerNode(new NFAClassUNode(t, inv));
+ }
+ else
+ {
+ next = registerNode(new NFACharUNode(t[0]));
+ }
+ }
+ else
+ {
+ next = parseQuote();
+ }
+ }
+ }
+ else raiseError();
+ break;
+ case (wchar_t)'[':
+ if ((flags & WCPattern::CASE_INSENSITIVE) == 0)
+ {
+ NFAClassUNode * clazz = new NFAClassUNode();
+ bkstring s = parseClass();
+ for (int i = 0; i < (int)s.size(); ++i) clazz->vals[s[i]] = 1;
+ next = registerNode(clazz);
+ }
+ else
+ {
+ NFACIClassUNode * clazz = new NFACIClassUNode();
+ bkstring s = parseClass();
+ for (int i = 0; i < (int)s.size(); ++i) clazz->vals[to_lower(s[i])] = 1;
+ next = registerNode(clazz);
+ }
+ break;
+ case (wchar_t)'.':
+ {
+ bool useN = 1, useR = 1;
+ NFAClassUNode * clazz = new NFAClassUNode(1);
+ if ((flags & WCPattern::UNIX_LINE_MODE) != 0) useR = 0;
+ if ((flags & WCPattern::DOT_MATCHES_ALL) != 0) useN = useR = 0;
+ if (useN) clazz->vals[(wchar_t)'\n'] = 1;
+ if (useR) clazz->vals[(wchar_t)'\r'] = 1;
+ next = registerNode(clazz);
+ }
+ break;
+ case (wchar_t)'(':
+ {
+ NFAUNode * end, * t1, * t2;
+ t1 = parse(1, 0, &end);
+ if (!t1) raiseError();
+ else if (t1->isGroupHeadNode() && (t2 = quantifyGroup(t1, end, grc)) != NULL)
+ {
+ cur->next = t2;
+ cur = t2->next;
+ }
+ else
+ {
+ cur->next = t1;
+ cur = end;
+ }
+ }
+ break;
+ case (wchar_t)')':
+ if (!inParen) raiseError();
+ else if (inOr)
+ {
+ --curInd;
+ cur = cur->next = registerNode(new NFAAcceptUNode);
+ flags = oldFlags;
+ return start;
+ }
+ else
+ {
+ if (ahead)
+ {
+ cur = cur->next = registerNode(new NFAAcceptUNode);
+ flags = oldFlags;
+ return *end = registerNode(new NFALookAheadUNode(start, pos));
+ }
+ else if (indep)
+ {
+ cur = cur->next = registerNode(new NFAAcceptUNode);
+ flags = oldFlags;
+ return *end = registerNode(new NFAPossessiveQuantifierUNode(this, start, 1, 1));
+ }
+ else // capping or noncapping, it doesnt matter
+ {
+ *end = cur = cur->next = registerNode(new NFAGroupTailUNode(grc));
+ next = quantifyGroup(start, *end, grc);
+ if (next)
+ {
+ start = next;
+ *end = next->next;
+ }
+ flags = oldFlags;
+ return start;
+ }
+ }
+ break;
+ case (wchar_t)'{': // registered pattern
+ cur->next = parseRegisteredWCPattern(&next);
+ if (cur->next) cur = next;
+ break;
+ case (wchar_t)'*':
+ case (wchar_t)'+':
+ case (wchar_t)'?':
+// case (wchar_t)'}':
+// case (wchar_t)']':
+ raiseError();
+ break;
+ default:
+ if ((flags & WCPattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACICharUNode(ch));
+ else next = registerNode(new NFACharUNode(ch));
+ break;
+ }
+ if (next) cur = cur->next = quantify(next);
+ }
+ if (inParen) raiseError();
+ else
+ {
+ if (inOr) cur = cur->next = registerNode(new NFAAcceptUNode);
+ if (end) *end = cur;
+ }
+
+ flags = oldFlags;
+ if (error) return NULL;
+
+ return start;
+}
+
+WCPattern * WCPattern::compile(const bkstring & pattern, const unsigned long mode)
+{
+ WCPattern * p = new WCPattern(pattern);
+ NFAUNode * end;
+
+ p->flags = mode;
+ if ((mode & WCPattern::LITERAL) != 0)
+ {
+ p->head = p->registerNode(new NFAStartUNode);
+ if ((mode & WCPattern::CASE_INSENSITIVE) != 0) p->head->next = p->registerNode(new NFACIQuoteUNode(pattern));
+ else p->head->next = p->registerNode(new NFAQuoteUNode(pattern));
+ p->head->next->next = p->registerNode(new NFAEndUNode);
+ }
+ else
+ {
+ p->head = p->parse(0, 0, &end);
+ if (!p->head)
+ {
+ delete p;
+ p = NULL;
+ }
+ else
+ {
+ if (!(p->head && p->head->isStartOfInputNode()))
+ {
+ NFAUNode * n = p->registerNode(new NFAStartUNode);
+ n->next = p->head;
+ p->head = n;
+ }
+ end->next = p->registerNode(new NFAEndUNode);
+ }
+ }
+ if (p != NULL)
+ {
+ p->matcher = new WCMatcher(p, L"");
+ }
+
+ return p;
+}
+
+WCPattern * WCPattern::compileAndKeep(const bkstring & pattern, const unsigned long mode)
+{
+ WCPattern * ret = NULL;
+ std::map<bkstring, WCPattern*>::iterator it = compiledWCPatterns.find(pattern);
+
+ if (it != compiledWCPatterns.end())
+ {
+ ret = it->second;
+ }
+ else
+ {
+ ret = compile(pattern, mode);
+ compiledWCPatterns[pattern] = ret;
+ }
+
+ return ret;
+}
+bkstring WCPattern::replace(const bkstring & pattern, const bkstring & str,
+ const bkstring & replacementText, const unsigned long mode)
+{
+ bkstring ret;
+ WCPattern * p = WCPattern::compile(pattern, mode);
+ if (p)
+ {
+ ret = p->replace(str, replacementText);
+ delete p;
+ }
+ return ret;
+}
+
+std::vector<bkstring> WCPattern::split(const bkstring & pattern, const bkstring & str, const bool keepEmptys,
+ const unsigned long limit, const unsigned long mode)
+{
+ std::vector<bkstring> ret;
+ WCPattern * p = WCPattern::compile(pattern, mode);
+ if (p)
+ {
+ ret = p->split(str, keepEmptys, limit);
+ delete p;
+ }
+ return ret;
+}
+
+std::vector<bkstring> WCPattern::findAll(const bkstring & pattern, const bkstring & str, const unsigned long mode)
+{
+ std::vector<bkstring> ret;
+ WCPattern * p = WCPattern::compile(pattern, mode);
+ if (p)
+ {
+ ret = p->findAll(str);
+ delete p;
+ }
+ return ret;
+}
+
+bool WCPattern::matches(const bkstring & pattern, const bkstring & str, const unsigned long mode)
+{
+ bool ret = 0;
+ WCPattern * p = compile(pattern, mode);
+
+ if (p)
+ {
+ ret = p->matches(str);
+ delete p;
+ }
+
+ return ret;
+}
+
+bool WCPattern::registerWCPattern(const bkstring & name, const bkstring & pattern, const unsigned long mode)
+{
+ WCPattern * p = WCPattern::compile(pattern, mode);
+ if (!p) return 0;
+ WCPattern::registeredWCPatterns[name] = std::make_pair(pattern, mode);
+ delete p;
+ return 1;
+}
+
+void WCPattern::unregisterWCPatterns()
+{
+ registeredWCPatterns.clear();
+}
+void WCPattern::clearWCPatternCache()
+{
+ std::map<bkstring, WCPattern*>::iterator it;
+ for (it = compiledWCPatterns.begin(); it != compiledWCPatterns.end(); ++it)
+ {
+ delete it->second;
+ }
+ compiledWCPatterns.clear();
+}
+
+std::pair<bkstring, int> WCPattern::findNthMatch(const bkstring & pattern, const bkstring & str,
+ const int matchNum, const unsigned long mode)
+{
+ std::pair<bkstring, int> ret;
+ WCPattern * p = WCPattern::compile(pattern, mode);
+
+ ret.second = -1;
+ if (p)
+ {
+ int i = -1;
+ p->matcher->setString(str);
+ while (i < matchNum && p->matcher->findNextMatch()) { ++i; }
+ if (i == matchNum && p->matcher->getStartingIndex() >= 0)
+ {
+ ret.first = p->matcher->getGroup(0);
+ ret.second = p->matcher->getStartingIndex();
+ }
+ delete p;
+ }
+
+ return ret;
+}
+
+WCPattern::~WCPattern()
+{
+ /*
+ nodes.clear();
+ if (head) head->findAllNodes(nodes);
+ */
+ if (matcher) delete matcher;
+ for (std::map<NFAUNode*, bool>::iterator it = nodes.begin(); it != nodes.end(); ++it) delete it->first;
+}
+bkstring WCPattern::replace(const bkstring & str, const bkstring & replacementText)
+{
+ int li = 0;
+ bkstring ret = L"";
+
+ matcher->setString(str);
+ while (matcher->findNextMatch())
+ {
+ ret += str.substr(li, matcher->getStartingIndex() - li);
+ ret += matcher->replaceWithGroups(replacementText);
+ li = matcher->getEndingIndex();
+ }
+ ret += str.substr(li);
+
+ return ret;
+}
+std::vector<bkstring> WCPattern::split(const bkstring & str, const bool keepEmptys, const unsigned long limit)
+{
+ unsigned long lim = (limit == 0 ? MAX_QMATCH : limit);
+ int li = 0;
+ std::vector<bkstring> ret;
+
+ matcher->setString(str);
+
+ while (matcher->findNextMatch() && ret.size() < lim)
+ {
+ if (matcher->getStartingIndex() == 0 && keepEmptys) ret.push_back(L"");
+ if ((matcher->getStartingIndex() != matcher->getEndingIndex()) || keepEmptys)
+ {
+ if (li != matcher->getStartingIndex() || keepEmptys)
+ {
+ ret.push_back(str.substr(li, matcher->getStartingIndex() - li));
+ }
+ li = matcher->getEndingIndex();
+ }
+ }
+ if (li < (int)str.size()) ret.push_back(str.substr(li));
+
+ return ret;
+}
+std::vector<bkstring> WCPattern::findAll(const bkstring & str)
+{
+ matcher->setString(str);
+ return matcher->findAll();
+}
+bool WCPattern::matches(const bkstring & str)
+{
+ matcher->setString(str);
+ return matcher->matches();
+}
+unsigned long WCPattern::getFlags() const
+{
+ return flags;
+}
+bkstring WCPattern::getWCPattern() const
+{
+ return pattern;
+}
+WCMatcher * WCPattern::createWCMatcher(const bkstring & str)
+{
+ return new WCMatcher(this, str);
+}
+
+// NFAUNode
+
+NFAUNode::NFAUNode() { next = NULL; }
+NFAUNode::~NFAUNode() { }
+void NFAUNode::findAllNodes(std::map<NFAUNode*, bool> & soFar)
+{
+ if (soFar.find(this) == soFar.end()) return;
+ soFar[this] = 1;
+ if (next) next->findAllNodes(soFar);
+}
+
+// NFACharUNode
+
+NFACharUNode::NFACharUNode(const wchar_t c) { ch = c; }
+int NFACharUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (curInd < (int)str.size() && str[curInd] == ch) return next->match(str, matcher, curInd + 1);
+ return -1;
+}
+
+// NFACICharUNode
+
+NFACICharUNode::NFACICharUNode(const wchar_t c) { ch = to_lower(c); }
+int NFACICharUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (curInd < (int)str.size() && to_lower(str[curInd]) == ch) return next->match(str, matcher, curInd + 1);
+ return -1;
+}
+
+// NFAStartUNode
+
+NFAStartUNode::NFAStartUNode() { }
+int NFAStartUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int ret = -1, ci = curInd;
+
+ matcher->starts[0] = curInd;
+ if ((matcher->getFlags() & WCMatcher::MATCH_ENTIRE_STRING) == (unsigned int)WCMatcher::MATCH_ENTIRE_STRING)
+ {
+ if (curInd != 0)
+ {
+ matcher->starts[0] = -1;
+ return -1;
+ }
+ return next->match(str, matcher, 0);
+ }
+ while ((ret = next->match(str, matcher, ci)) == -1 && ci < (int)str.size())
+ {
+ matcher->clearGroups();
+ matcher->starts[0] = ++ci;
+ }
+ if (ret < 0) matcher->starts[0] = -1;
+ return ret;
+}
+
+// NFAEndUNode
+
+NFAEndUNode::NFAEndUNode() { }
+int NFAEndUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ matcher->ends[0] = curInd;
+ if ((matcher->getFlags() & WCMatcher::MATCH_ENTIRE_STRING) != 0)
+ {
+ if (curInd == (int)str.size()) return curInd;
+ matcher->ends[0] = -1;
+ return -1;
+ }
+ return curInd;
+}
+
+// NFAQuantifierUNode
+
+void NFAQuantifierUNode::findAllNodes(std::map<NFAUNode*, bool> & soFar)
+{
+ inner->findAllNodes(soFar);
+ NFAUNode::findAllNodes(soFar);
+}
+NFAQuantifierUNode::NFAQuantifierUNode(WCPattern * pat, NFAUNode * internal, const int minMatch, const int maxMatch)
+{
+ inner = internal;
+ inner->next = pat->registerNode(new NFAAcceptUNode);
+ min = (minMatch < WCPattern::MIN_QMATCH) ? WCPattern::MIN_QMATCH : minMatch;
+ max = (maxMatch > WCPattern::MAX_QMATCH) ? WCPattern::MAX_QMATCH : maxMatch;
+}
+
+int NFAQuantifierUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int i0, i1, i2 = 0;
+
+ i0 = i1 = curInd;
+ while (i2 < min)
+ {
+
+ ++i2;
+ i1 = inner->match(str, matcher, i0);
+ if (i1 <= i0) return i1; // i1 < i0 means i1 is -1
+ i0 = i1;
+ }
+
+ return i1;
+}
+// NFAGreedyQuantifierUNode
+
+NFAGreedyQuantifierUNode::NFAGreedyQuantifierUNode(WCPattern * pat, NFAUNode * internal, const int minMatch, const int maxMatch)
+ : NFAQuantifierUNode(pat, internal, minMatch, maxMatch) { }
+int NFAGreedyQuantifierUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int t = NFAQuantifierUNode::match(str, matcher, curInd);
+ if (t != -1) return matchInternal(str, matcher, t, min);
+ return t;
+}
+int NFAGreedyQuantifierUNode::matchInternal(const bkstring & str, WCMatcher * matcher, const int curInd, const int soFar) const
+{
+ if (soFar >= max) return next->match(str, matcher, curInd);
+
+ int i, j;
+
+ i = inner->match(str, matcher, curInd);
+ if (i != -1)
+ {
+ j = matchInternal(str, matcher, i, soFar + 1);
+ if (j != -1) return j;
+ }
+ return next->match(str, matcher, curInd);
+}
+
+// NFALazyQuantifierUNode
+
+NFALazyQuantifierUNode::NFALazyQuantifierUNode(WCPattern * pat, NFAUNode * internal, const int minMatch, const int maxMatch)
+ : NFAQuantifierUNode(pat, internal, minMatch, maxMatch) { }
+int NFALazyQuantifierUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int i, j, m = NFAQuantifierUNode::match(str, matcher, curInd);
+
+ if (m == -1) return -1;
+
+ for (i = min; i < max; ++i)
+ {
+ j = next->match(str, matcher, m);
+ if (j == -1)
+ {
+ j = inner->match(str, matcher, m);
+ // if j < m, then j is -1, so we bail.
+ // if j == m, then we would just go and call next->match on the same index,
+ // but it already failed trying to match right there, so we know we can
+ // just bail
+ if (j <= m) return -1;
+ m = j;
+ }
+ else return j;
+ }
+ return next->match(str, matcher, m);
+}
+
+// NFAPossessiveQuantifierUNode
+
+NFAPossessiveQuantifierUNode::NFAPossessiveQuantifierUNode(WCPattern * pat, NFAUNode * internal, const int minMatch, const int maxMatch)
+ : NFAQuantifierUNode(pat, internal, minMatch, maxMatch) { }
+int NFAPossessiveQuantifierUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int i, j, m = NFAQuantifierUNode::match(str, matcher, curInd);
+
+ if (m == -1) return -1;
+ for (i = min; i < max; ++i)
+ {
+ j = inner->match(str, matcher, m);
+ if (j <= m) return next->match(str, matcher, m);
+ m = j;
+ }
+ return next->match(str, matcher, m);
+}
+
+// NFAAcceptUNode
+
+NFAAcceptUNode::NFAAcceptUNode() { }
+int NFAAcceptUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (!next) return curInd;
+ else return next->match(str, matcher, curInd);
+}
+
+// NFAClassUNode
+
+NFAClassUNode::NFAClassUNode(const bool invert)
+{
+ inv = invert;
+}
+NFAClassUNode::NFAClassUNode(const bkstring & clazz, const bool invert)
+{
+ inv = invert;
+ for (int i = 0; i < (int)clazz.size(); ++i) vals[clazz[i]] = 1;
+}
+int NFAClassUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (curInd < (int)str.size() && ((vals.find(str[curInd]) != vals.end()) ^ inv))
+ {
+ return next->match(str, matcher, curInd + 1);
+ }
+ return -1;
+}
+
+// NFACIClassUNode
+
+NFACIClassUNode::NFACIClassUNode(const bool invert)
+{
+ inv = invert;
+}
+NFACIClassUNode::NFACIClassUNode(const bkstring & clazz, const bool invert)
+{
+ inv = invert;
+ for (int i = 0; i < (int)clazz.size(); ++i) vals[to_lower(clazz[i])] = 1;
+}
+int NFACIClassUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (curInd < (int)str.size() && ((vals.find(to_lower(str[curInd])) != vals.end()) ^ inv))
+ {
+ return next->match(str, matcher, curInd + 1);
+ }
+ return -1;
+}
+
+// NFASubStartUNode
+
+NFASubStartUNode::NFASubStartUNode() { }
+int NFASubStartUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ return next->match(str, matcher, curInd);
+}
+
+// NFAOrUNode
+
+NFAOrUNode::NFAOrUNode(NFAUNode * first, NFAUNode * second) : one(first), two(second) { }
+void NFAOrUNode::findAllNodes(std::map<NFAUNode*, bool> & soFar)
+{
+ if (one) one->findAllNodes(soFar);
+ if (two) two->findAllNodes(soFar);
+ NFAUNode::findAllNodes(soFar);
+}
+int NFAOrUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int ci = one->match(str, matcher, curInd);
+
+ if (ci != -1) ci = next->match(str, matcher, ci);
+ if (ci != -1) return ci;
+ if (ci == -1) ci = two->match(str, matcher, curInd);
+ if (ci != -1) ci = next->match(str, matcher, ci);
+ return ci;
+}
+
+// NFAQuoteUNode
+
+NFAQuoteUNode::NFAQuoteUNode(const bkstring & quoted) : qStr(quoted) { }
+int NFAQuoteUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (curInd + qStr.size() > str.size()) return -1;
+ if (str.substr(curInd, qStr.size()) != qStr) return -1;
+ return next->match(str, matcher, curInd + (int)qStr.size());
+}
+
+// NFACIQuoteUNode
+
+NFACIQuoteUNode::NFACIQuoteUNode(const bkstring & quoted) : qStr(quoted) { }
+int NFACIQuoteUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (curInd + qStr.size() > str.size()) return -1;
+ if (str_icmp(str.substr(curInd, qStr.size()).c_str(), qStr.c_str())) return -1;
+ return next->match(str, matcher, (int)qStr.size());
+}
+
+// NFALookAheadUNode
+
+NFALookAheadUNode::NFALookAheadUNode(NFAUNode * internal, const bool positive) : NFAUNode(), pos(positive), inner(internal) { }
+void NFALookAheadUNode::findAllNodes(std::map<NFAUNode*, bool> & soFar)
+{
+ if (inner) inner->findAllNodes(soFar);
+ NFAUNode::findAllNodes(soFar);
+}
+int NFALookAheadUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ return ((inner->match(str, matcher, curInd) == -1) ^ pos) ? next->match(str, matcher, curInd) : -1;
+}
+
+// NFALookBehindUNode
+
+NFALookBehindUNode::NFALookBehindUNode(const bkstring & str, const bool positive) : pos(positive), mStr(str) { }
+int NFALookBehindUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (pos)
+ {
+ if (curInd < (int)mStr.size()) return -1;
+ if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return next->match(str, matcher, curInd);
+ }
+ else
+ {
+ if (curInd < (int)mStr.size()) return next->match(str, matcher, curInd);
+ if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return -1;
+ return next->match(str, matcher, curInd);
+ }
+ return -1;
+}
+
+// NFAStartOfLineUNode
+
+NFAStartOfLineUNode::NFAStartOfLineUNode() { }
+int NFAStartOfLineUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (curInd == 0 || str[curInd - 1] == (wchar_t)'\n' || str[curInd - 1] == (wchar_t)'\r')
+ {
+ return next->match(str, matcher, curInd);
+ }
+ return -1;
+}
+
+// NFAEndOfLineUNode
+
+NFAEndOfLineUNode::NFAEndOfLineUNode() { }
+int NFAEndOfLineUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (curInd >= (int)str.size() || str[curInd] == (wchar_t)'\n' || str[curInd] == (wchar_t)'\r')
+ {
+ return next->match(str, matcher, curInd);
+ }
+ return -1;
+}
+
+// NFAReferenceUNode
+
+NFAReferenceUNode::NFAReferenceUNode(const int groupIndex) : gi(groupIndex) { }
+int NFAReferenceUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int len = matcher->ends[gi] - matcher->starts[gi];
+ int ni = -1;
+ if (gi < 1 || matcher->ends[gi] < matcher->starts[gi] || len == 0) ni = curInd;
+ else if (curInd + len > (int)str.size()) return -1;
+ else if (str.substr(curInd, len) != str.substr(matcher->starts[gi], len)) return -1;
+ else ni = curInd + len;
+
+ return next->match(str, matcher, ni);
+}
+
+// NFAStartOfInputUNode
+
+NFAStartOfInputUNode::NFAStartOfInputUNode() { }
+int NFAStartOfInputUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (curInd == 0) return next->match(str, matcher, curInd);
+ return -1;
+}
+
+// NFAEndOfInputUNode
+
+NFAEndOfInputUNode::NFAEndOfInputUNode(const bool lookForTerm) : term(lookForTerm) { }
+int NFAEndOfInputUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int len = (int)str.size();
+ if (curInd == len) return next->match(str, matcher, curInd);
+ else if (term)
+ {
+ if (curInd == len - 1 && (str[curInd] == (wchar_t)'\r' || str[curInd] == (wchar_t)'\n'))
+ {
+ return next->match(str, matcher, curInd);
+ }
+ else if (curInd == len - 2 && str.substr(curInd, 2) == L"\r\n")
+ {
+ return next->match(str, matcher, curInd);
+ }
+ }
+ return -1;
+}
+
+// NFAWordBoundaryUNode
+
+NFAWordBoundaryUNode::NFAWordBoundaryUNode(const bool positive) : pos(positive) { }
+int NFAWordBoundaryUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int len = (int)str.size();
+
+ wchar_t c1 = (curInd - 1 < len && curInd > 0) ? str[curInd - 1] : '\n';
+ wchar_t c2 = (curInd < len) ? str[curInd ] : '\n';
+
+ if (curInd == len) return next->match(str, matcher, curInd);
+ bool ok = is_alpha(c1) != is_alpha(c2);
+ if (ok && pos) return next->match(str, matcher, curInd);
+ return -1;
+}
+
+// NFAEndOfMatchUNode
+
+NFAEndOfMatchUNode::NFAEndOfMatchUNode() { }
+int NFAEndOfMatchUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ if (curInd == matcher->lm) return next->match(str, matcher, curInd);
+ return -1;
+}
+
+// NFAGroupHeadUNode
+
+NFAGroupHeadUNode::NFAGroupHeadUNode(const int groupIndex) : gi(groupIndex) { }
+int NFAGroupHeadUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int ret, o = matcher->starts[gi];
+
+ matcher->starts[gi] = curInd;
+ ret = next->match(str, matcher, curInd);
+ if (ret < 0) matcher->starts[gi] = o;
+
+ return ret;
+}
+
+// NFAGroupTailUNode
+
+NFAGroupTailUNode::NFAGroupTailUNode(const int groupIndex) : gi(groupIndex) { }
+int NFAGroupTailUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int ret, o = matcher->ends[gi];
+
+ matcher->ends[gi] = curInd;
+ ret = next->match(str, matcher, curInd);
+ if (ret < 0) matcher->ends[gi] = o;
+
+ return ret;
+}
+
+// NFAGroupLoopPrologueUNode
+
+NFAGroupLoopPrologueUNode::NFAGroupLoopPrologueUNode(const int groupIndex) : gi(groupIndex) { }
+int NFAGroupLoopPrologueUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int ret, o1 = matcher->groups[gi], o2 = matcher->groupPos[gi], o3 = matcher->groupIndeces[gi];
+
+ matcher->groups[gi] = 0;
+ matcher->groupPos[gi] = 0;
+ matcher->groupIndeces[gi] = -1;
+ ret = next->match(str, matcher, curInd);
+ if (ret < 0)
+ {
+ matcher->groups[gi] = o1;
+ matcher->groupPos[gi] = o2;
+ matcher->groupIndeces[gi] = o3;
+ }
+
+ return ret;
+}
+
+// NFAGroupLoopUNode
+
+NFAGroupLoopUNode::NFAGroupLoopUNode(NFAUNode * internal, const int minMatch, const int maxMatch,
+ const int groupIndex, const int matchType)
+{
+ inner = internal;
+ min = minMatch;
+ max = maxMatch;
+ gi = groupIndex;
+ type = matchType;
+}
+void NFAGroupLoopUNode::findAllNodes(std::map<NFAUNode*, bool> & soFar)
+{
+ if (inner) inner->findAllNodes(soFar);
+ NFAUNode::findAllNodes(soFar);
+}
+int NFAGroupLoopUNode::match(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ bool b = (curInd > matcher->groupIndeces[gi]);
+
+ if (b && matcher->groups[gi] < min)
+ {
+ ++matcher->groups[gi];
+ int o = matcher->groupIndeces[gi];
+ matcher->groupIndeces[gi] = curInd;
+ int ret = inner->match(str, matcher, curInd);
+ if (ret < 0)
+ {
+ matcher->groupIndeces[gi] = o;
+ --matcher->groups[gi];
+ }
+ return ret;
+ }
+ else if (!b || matcher->groups[gi] >= max)
+ {
+ return next->match(str, matcher, curInd);
+ }
+ else
+ {
+ switch (type)
+ {
+ case 0: return matchGreedy(str, matcher, curInd);
+ case 1: return matchLazy(str, matcher, curInd);
+ case 2: return matchPossessive(str, matcher, curInd);
+ }
+ }
+ return -1;
+}
+int NFAGroupLoopUNode::matchGreedy(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int o = matcher->groupIndeces[gi]; // save our info for backtracking
+ matcher->groupIndeces[gi] = curInd; // move along
+ ++matcher->groups[gi];
+ int ret = inner->match(str, matcher, curInd); // match internally
+ if (ret < 0)
+ { // if we failed, then restore info and match next
+ --matcher->groups[gi];
+ matcher->groupIndeces[gi] = o;
+ ret = next->match(str, matcher, curInd);
+ }
+ return ret;
+}
+int NFAGroupLoopUNode::matchLazy(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int ret = next->match(str, matcher, curInd); // be lazy, just go on
+ if (ret < 0)
+ {
+ int o = matcher->groupIndeces[gi]; // save info for backtracking
+ matcher->groupIndeces[gi] = curInd; // advance our position
+ ++matcher->groups[gi];
+ ret = inner->match(str, matcher, curInd); // match our internal stuff
+ if (ret < 0) // if we failed, then restore the info
+ {
+ --matcher->groups[gi];
+ matcher->groupIndeces[gi] = o;
+ }
+ }
+ return ret;
+}
+int NFAGroupLoopUNode::matchPossessive(const bkstring & str, WCMatcher * matcher, const int curInd) const
+{
+ int o = matcher->groupIndeces[gi]; // save info for backtracking
+ matcher->groupPos[gi] = matcher->groups[gi]; // set a flag stating we have matcher at least this much
+ matcher->groupIndeces[gi] = curInd; // move along
+ ++matcher->groups[gi];
+ int ret = inner->match(str, matcher, curInd); // try and match again
+ if (ret < 0)
+ { // if we fail, back off, but to an extent
+ --matcher->groups[gi];
+ matcher->groupIndeces[gi] = o;
+ if (matcher->groups[gi] == matcher->groupPos[gi]) ret = next->match(str, matcher, curInd);
+ }
+ return ret;
+}
+
+#ifdef _WIN32
+ #pragma warning(pop)
+#endif
diff --git a/plugins/smileyadd/regexp/WCPattern.h b/plugins/smileyadd/regexp/WCPattern.h new file mode 100644 index 0000000000..3d52a7fd2e --- /dev/null +++ b/plugins/smileyadd/regexp/WCPattern.h @@ -0,0 +1,1663 @@ +#ifndef __WCPATTERN_H__
+#define __WCPATTERN_H__
+
+#ifdef _WIN32
+ #pragma warning(disable:4786)
+#endif
+
+#include "bkstring.h"
+
+#include <vector>
+#include <map>
+
+class WCMatcher;
+class NFAUNode;
+class NFAQuantifierUNode;
+
+/**
+ This pattern class is very similar in functionality to Java's
+ java.util.regex.WCPattern class. The pattern class represents an immutable
+ regular expression object. Instead of having a single object contain both the
+ regular expression object and the matching object, instead the two objects are
+ split apart. The {@link WCMatcher WCMatcher} class represents the maching
+ object.
+
+ The WCPattern class works primarily off of "compiled" patterns. A typical
+ instantiation of a regular expression looks like:
+
+ <pre>
+ WCPattern * p = WCPattern::compile(L"a*b");
+ WCMatcher * m = p->createWCMatcher(L"aaaaaab");
+ if (m->matches()) ...
+ </pre>
+
+ However, if you do not need to use a pattern more than once, it is often times
+ okay to use the WCPattern's static methods insteads. An example looks like this:
+
+ <pre>
+ if (WCPattern::matches(L"a*b", L"aaaab")) { ... }
+ </pre>
+
+ This class does not currently support unicode. The unicode update for this
+ class is coming soon.
+
+ This class is partially immutable. It is completely safe to call createWCMatcher
+ concurrently in different threads, but the other functions (e.g. split) should
+ not be called concurrently on the same <code>WCPattern</code>.
+
+ <table border="0" cellpadding="1" cellspacing="0">
+ <tr align="left" bgcolor="#CCCCFF">
+ <td>
+ <b>Construct</b>
+ </td>
+ <td>
+ <b>Matches</b>
+ </th>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Characters</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x</i></code>
+ </td>
+ <td>
+ The character <code><i>x</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\\</code>
+ </td>
+ <td>
+ The character <code>\</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\0<i>nn</i></code>
+ </td>
+ <td>
+ The character with octal ASCII value <code><i>nn</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\0<i>nnn</i></code>
+ </td>
+ <td>
+ The character with octal ASCII value <code><i>nnn</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\x<i>hh</i></code>
+ </td>
+ <td>
+ The character with hexadecimal ASCII value <code><i>hh</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\t</code>
+ </td>
+ <td>
+ A tab character
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\r</code>
+ </td>
+ <td>
+ A carriage return character
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\n</code>
+ </td>
+ <td>
+ A new-line character
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <b>Character Classes</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[abc]</code>
+ </td>
+ <td>
+ Either <code>a</code>, <code>b</code>, or <code>c</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[^abc]</code>
+ </td>
+ <td>
+ Any character but <code>a</code>, <code>b</code>, or <code>c</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-zA-Z]</code>
+ </td>
+ <td>
+ Any character ranging from <code>a</code> thru <code>z</code>, or
+ <code>A</code> thru <code>Z</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[^a-zA-Z]</code>
+ </td>
+ <td>
+ Any character except those ranging from <code>a</code> thru
+ <code>z</code>, or <code>A</code> thru <code>Z</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a\-z]</code>
+ </td>
+ <td>
+ Either <code>a</code>, <code>-</code>, or <code>z</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-z[A-Z]]</code>
+ </td>
+ <td>
+ Same as <code>[a-zA-Z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-z&&[g-i]]</code>
+ </td>
+ <td>
+ Any character in the intersection of <code>a-z</code> and
+ <code>g-i</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>[a-z&&[^g-i]]</code>
+ </td>
+ <td>
+ Any character in <code>a-z</code> and not in <code>g-i</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Prefefined character classes</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><b>.</b></code>
+ </td>
+ <td>
+ Any character. Multiline matching must be compiled into the pattern for
+ <code><b>.</b></code> to match a <code>\r</code> or a <code>\n</code>.
+ Even if multiline matching is enabled, <code><b>.</b></code> will not
+ match a <code>\r\n</code>, only a <code>\r</code> or a <code>\n</code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\d</code>
+ </td>
+ <td>
+ <code>[0-9]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\D</code>
+ </td>
+ <td>
+ <code>[^\d]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\s</code>
+ </td>
+ <td>
+ <code>[ \t\r\n\x0B]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\S</code>
+ </td>
+ <td>
+ <code>[^\s]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\w</code>
+ </td>
+ <td>
+ <code>[a-zA-Z0-9_]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\W</code>
+ </td>
+ <td>
+ <code>[^\w]</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>POSIX character classes
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Lower}</code>
+ </td>
+ <td>
+ <code>[a-z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Upper}</code>
+ </td>
+ <td>
+ <code>[A-Z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{ASCII}</code>
+ </td>
+ <td>
+ <code>[\x00-\x7F]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Alpha}</code>
+ </td>
+ <td>
+ <code>[a-zA-Z]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Digit}</code>
+ </td>
+ <td>
+ <code>[0-9]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Alnum}</code>
+ </td>
+ <td>
+ <code>[\w&&[^_]]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{Punct}</code>
+ </td>
+ <td>
+ <code>[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\p{XDigit}</code>
+ </td>
+ <td>
+ <code>[a-fA-F0-9]</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Boundary Matches</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>^</code>
+ </td>
+ <td>
+ The beginning of a line. Also matches the beginning of input.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>$</code>
+ </td>
+ <td>
+ The end of a line. Also matches the end of input.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\b</code>
+ </td>
+ <td>
+ A word boundary
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\B</code>
+ </td>
+ <td>
+ A non word boundary
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\A</code>
+ </td>
+ <td>
+ The beginning of input
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\G</code>
+ </td>
+ <td>
+ The end of the previous match. Ensures that a "next" match will only
+ happen if it begins with the character immediately following the end of
+ the "current" match.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\Z</code>
+ </td>
+ <td>
+ The end of input. Will also match if there is a single trailing
+ <code>\r\n</code>, a single trailing <code>\r</code>, or a single
+ trailing <code>\n</code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\z</code>
+ </td>
+ <td>
+ The end of input
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Greedy Quantifiers</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x?</i></code>
+ </td>
+ <td>
+ <i>x</i>, either zero times or one time
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x*</i></code>
+ </td>
+ <td>
+ <i>x</i>, zero or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x+</i></code>
+ </td>
+ <td>
+ <i>x</i>, one or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n}</i></code>
+ </td>
+ <td>
+ <i>x</i>, exactly n times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,}</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{,m}</i></code>
+ </td>
+ <td>
+ <i>x</i>, at most <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,m}</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times and at most
+ <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Possessive Quantifiers</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x?+</i></code>
+ </td>
+ <td>
+ <i>x</i>, either zero times or one time
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x*+</i></code>
+ </td>
+ <td>
+ <i>x</i>, zero or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x++</i></code>
+ </td>
+ <td>
+ <i>x</i>, one or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, exactly n times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{,m}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, at most <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,m}+</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times and at most
+ <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Reluctant Quantifiers</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x??</i></code>
+ </td>
+ <td>
+ <i>x</i>, either zero times or one time
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x*?</i></code>
+ </td>
+ <td>
+ <i>x</i>, zero or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x+?</i></code>
+ </td>
+ <td>
+ <i>x</i>, one or more times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, exactly n times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{,m}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, at most <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x{n,m}?</i></code>
+ </td>
+ <td>
+ <i>x</i>, at least <code><i>n</i></code> times and at most
+ <code><i>m</i></code> times
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Operators</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>xy</i></code>
+ </td>
+ <td>
+ <code><i>x</i></code> then <code><i>y</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code><i>x</i></code>|<code><i>y</i></code>
+ </td>
+ <td>
+ <code><i>x</i></code> or <code><i>y</i></code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code> as a capturing group
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Quoting</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\Q</code>
+ </td>
+ <td>
+ Nothing, but treat every character (including \s) literally until a
+ matching <code>\E</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>\E</code>
+ </td>
+ <td>
+ Nothing, but ends its matching <code>\Q</code>
+ </td>
+ </tr>
+ <tr>
+ <td>
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Special Constructs</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?:<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, but not as a capturing group
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?=<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via positive lookahead. This means that the
+ expression will match only if it is trailed by <code><i>x</i></code>.
+ It will not "eat" any of the characters matched by
+ <code><i>x</i></code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?!<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via negative lookahead. This means that the
+ expression will match only if it is not trailed by
+ <code><i>x</i></code>. It will not "eat" any of the characters
+ matched by <code><i>x</i></code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?<=<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via positive lookbehind. <code><i>x</i></code>
+ cannot contain any quantifiers.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?<!<i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i></code>, via negative lookbehind. <code><i>x</i></code>
+ cannot contain any quantifiers.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>(?><i>x</i>)</code>
+ </td>
+ <td>
+ <code><i>x</i>{1}+</code>
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+
+ </td>
+ </tr>
+ <tr>
+ <td colspan="2">
+ <b>Registered Expression Matching</b>
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>{<i>x</i>}</code>
+ </td>
+ <td>
+ The registered pattern <code><i>x</i></code>
+ </td>
+ </tr>
+ </table>
+
+ <hr>
+
+ <i>Begin Text Extracted And Modified From java.util.regex.WCPattern documentation</i>
+
+ <h4> Backslashes, escapes, and quoting </h4>
+
+ <p> The backslash character (<tt>(wchar_t)'\'</tt>) serves to introduce escaped
+ constructs, as defined in the table above, as well as to quote characters
+ that otherwise would be interpreted as unescaped constructs. Thus the
+ expression <tt>\\</tt> matches a single backslash and <tt>\{</tt> matches a
+ left brace.
+
+ <p> It is an error to use a backslash prior to any alphabetic character that
+ does not denote an escaped construct; these are reserved for future
+ extensions to the regular-expression language. A backslash may be used
+ prior to a non-alphabetic character regardless of whether that character is
+ part of an unescaped construct.
+
+ <p>It is necessary to double backslashes in string literals that represent
+ regular expressions to protect them from interpretation by a compiler. The
+ string literal <tt>"\b"</tt>, for example, matches a single backspace
+ character when interpreted as a regular expression, while
+ <tt>"\\b"</tt> matches a word boundary. The string litera
+ <tt>"\(hello\)"</tt> is illegal and leads to a compile-time error;
+ in order to match the string <tt>(hello)</tt> the string literal
+ <tt>"\\(hello\\)"</tt> must be used.
+
+ <h4> Character Classes </h4>
+
+ <p> Character classes may appear within other character classes, and
+ may be composed by the union operator (implicit) and the intersection
+ operator (<tt>&&</tt>).
+ The union operator denotes a class that contains every character that is
+ in at least one of its operand classes. The intersection operator
+ denotes a class that contains every character that is in both of its
+ operand classes.
+
+ <p> The precedence of character-class operators is as follows, from
+ highest to lowest:
+
+ <blockquote><table border="0" cellpadding="1" cellspacing="0"
+ summary="Precedence of character class operators.">
+
+ <tr><th>1 </th>
+ <td>Literal escape </td>
+ <td><tt>\x</tt></td></tr>
+ <tr><th>2 </th>
+ <td>Range</td>
+ <td><tt>a-z</tt></td></tr>
+ <tr><th>3 </th>
+ <td>Grouping</td>
+ <td><tt>[...]</tt></td></tr>
+ <tr><th>4 </th>
+ <td>Intersection</td>
+ <td><tt>[a-z&&[aeiou]]</tt></td></tr>
+ <tr><th>5 </th>
+ <td>Union</td>
+ <td><tt>[a-e][i-u]<tt></td></tr>
+ </table></blockquote>
+
+ <p> Note that a different set of metacharacters are in effect inside
+ a character class than outside a character class. For instance, the
+ regular expression <tt>.</tt> loses its special meaning inside a
+ character class, while the expression <tt>-</tt> becomes a range
+ forming metacharacter.
+
+ <a name="lt">
+
+ <a name="cg">
+ <h4> Groups and capturing </h4>
+
+ <p> Capturing groups are numbered by counting their opening parentheses from
+ left to right. In the expression <tt>((A)(B(C)))</tt>, for example, there
+ are four such groups: </p>
+
+ <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
+
+ <tr><th>1 </th>
+ <td><tt>((A)(B(C)))</tt></td></tr>
+ <tr><th>2 </th>
+ <td><tt>(A)</tt></td></tr>
+ <tr><th>3 </th>
+ <td><tt>(B(C))</tt></td></tr>
+
+ <tr><th>4 </th>
+ <td><tt>(C)</tt></td></tr>
+ </table></blockquote>
+
+ <p> Group zero always stands for the entire expression.
+
+ <p> Capturing groups are so named because, during a match, each subsequence
+ of the input sequence that matches such a group is saved. The captured
+ subsequence may be used later in the expression, via a back reference, and
+ may also be retrieved from the matcher once the match operation is complete.
+
+ <p> The captured input associated with a group is always the subsequence
+ that the group most recently matched. If a group is evaluated a second time
+ because of quantification then its previously-captured value, if any, will
+ be retained if the second evaluation fails. Matching the string
+ <tt>L"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
+ group two set to <tt>L"b"</tt>. All captured input is discarded at the
+ beginning of each match.
+
+ <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
+ that do not capture text and do not count towards the group total.
+
+
+ <h4> WC support </h4>
+
+ <p> Coming Soon.
+
+ <h4> Comparison to Perl 5 </h4>
+
+ <p>The <code>WCPattern</code> engine performs traditional NFA-based matching
+ with ordered alternation as occurs in Perl 5.
+
+ <p> Perl constructs not supported by this class: </p>
+
+ <ul>
+
+ <li><p> The conditional constructs <tt>(?{</tt><i>X</i><tt>})</tt> and
+ <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,
+ </p></li>
+
+ <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>
+ and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>
+
+ <li><p> The embedded comment syntax <tt>(?#comment)</tt>, and </p></li>
+
+ <li><p> The preprocessing operations <tt>\l</tt> <tt>\u</tt>,
+ <tt>\L</tt>, and <tt>\U</tt>. </p></li>
+
+ <li><p> Embedded flags</p></li>
+
+ </ul>
+
+ <p> Constructs supported by this class but not by Perl: </p>
+
+ <ul>
+
+ <li><p> Possessive quantifiers, which greedily match as much as they can
+ and do not back off, even when doing so would allow the overall match to
+ succeed. </p></li>
+
+ <li><p> Character-class union and intersection as described
+ above.</p></li>
+
+ </ul>
+
+ <p> Notable differences from Perl: </p>
+
+ <ul>
+
+ <li><p> In Perl, <tt>\1</tt> through <tt>\9</tt> are always interpreted
+ as back references; a backslash-escaped number greater than <tt>9</tt> is
+ treated as a back reference if at least that many subexpressions exist,
+ otherwise it is interpreted, if possible, as an octal escape. In this
+ class octal escapes must always begin with a zero. In this class,
+ <tt>\1</tt> through <tt>\9</tt> are always interpreted as back
+ references, and a larger number is accepted as a back reference if at
+ least that many subexpressions exist at that point in the regular
+ expression, otherwise the parser will drop digits until the number is
+ smaller or equal to the existing number of groups or it is one digit.
+ </p></li>
+
+ <li><p> Perl uses the <tt>g</tt> flag to request a match that resumes
+ where the last match left off. This functionality is provided implicitly
+ by the <CODE>WCMatcher</CODE> class: Repeated invocations of the
+ <code>find</code> method will resume where the last match left off,
+ unless the matcher is reset. </p></li>
+
+ <li><p> Perl is forgiving about malformed matching constructs, as in the
+ expression <tt>*a</tt>, as well as dangling brackets, as in the
+ expression <tt>abc]</tt>, and treats them as literals. This
+ class also strict and will not compile a pattern when dangling characters
+ are encountered.</p></li>
+
+ </ul>
+
+
+ <p> For a more precise description of the behavior of regular expression
+ constructs, please see <a href="http://www.oreilly.com/catalog/regex2/">
+ <i>Mastering Regular Expressions, 2nd Edition</i>, Jeffrey E. F. Friedl,
+ O'Reilly and Associates, 2002.</a>
+ </p>
+ <P>
+
+ <i>End Text Extracted And Modified From java.util.regex.WCPattern documentation</i>
+
+ <hr>
+
+ @author Jeffery Stuart
+ @since March 2003, Stable Since November 2004
+ @version 1.07.00
+ @memo A class used to represent "PERL 5"-ish regular expressions
+ */
+class WCPattern
+{
+ friend class WCMatcher;
+ friend class NFAUNode;
+ friend class NFAQuantifierUNode;
+ private:
+ /**
+ This constructor should not be called directly. Those wishing to use the
+ WCPattern class should instead use the {@link compile compile} method.
+
+ @param rhs The pattern to compile
+ @memo Creates a new pattern from the regular expression in <code>rhs</code>.
+ */
+ WCPattern(const bkstring & rhs);
+ protected:
+ /**
+ This currently is not used, so don't try to do anything with it.
+ @memo Holds all the compiled patterns for quick access.
+ */
+ static std::map<bkstring, WCPattern *> compiledWCPatterns;
+ /**
+ Holds all of the registered patterns as strings. Due to certain problems
+ with compilation of patterns, especially with capturing groups, this seemed
+ to be the best way to do it.
+ */
+ static std::map<bkstring, std::pair<bkstring, unsigned long> > registeredWCPatterns;
+ protected:
+ /**
+ Holds all the NFA nodes used. This makes deletion of a pattern, as well as
+ clean-up from an unsuccessful compile much easier and faster.
+ */
+ std::map<NFAUNode*, bool> nodes;
+ /**
+ Used when methods like split are called. The matcher class uses a lot of
+ dynamic memeory, so having an instance increases speedup of certain
+ operations.
+ */
+ WCMatcher * matcher;
+ /**
+ The front node of the NFA.
+ */
+ NFAUNode * head;
+ /**
+ The actual regular expression we rerpesent
+ */
+ bkstring pattern;
+ /**
+ Flag used during compilation. Once the pattern is successfully compiled,
+ <code>error</code> is no longer used.
+ */
+ bool error;
+ /**
+ Used during compilation to keep track of the current index into
+ <code>{@link pattern pattern}<code>. Once the pattern is successfully
+ compiled, <code>error</code> is no longer used.
+ */
+ int curInd;
+ /**
+ The number of capture groups this contains.
+ */
+ int groupCount;
+ /**
+ The number of non-capture groups this contains.
+ */
+ int nonCapGroupCount;
+ /**
+ The flags specified when this was compiled.
+ */
+ unsigned long flags;
+ protected:
+ /**
+ Raises an error during compilation. Compilation will cease at that point
+ and compile will return <code>NULL</code>.
+ */
+ void raiseError();
+ /**
+ Convenience function for registering a node in <code>nodes</code>.
+ @param node The node to register
+ @return The registered node
+ */
+ NFAUNode * registerNode(NFAUNode * node);
+
+ /**
+ Calculates the union of two strings. This function will first sort the
+ strings and then use a simple selection algorithm to find the union.
+ @param s1 The first "class" to union
+ @param s2 The second "class" to union
+ @return A new string containing all unique characters. Each character
+ must have appeared in one or both of <code>s1</code> and
+ <code>s2</code>.
+ */
+ bkstring classUnion (bkstring s1, bkstring s2) const;
+ /**
+ Calculates the intersection of two strings. This function will first sort
+ the strings and then use a simple selection algorithm to find the
+ intersection.
+ @param s1 The first "class" to intersect
+ @param s2 The second "class" to intersect
+ @return A new string containing all unique characters. Each character
+ must have appeared both <code>s1</code> and <code>s2</code>.
+ */
+ bkstring classIntersect (bkstring s1, bkstring s2) const;
+ /**
+ Calculates the negation of a string. The negation is the set of all
+ characters between <code>\x00</code> and <code>\xFF</code> not
+ contained in <code>s1</code>.
+ @param s1 The "class" to be negated.
+ @param s2 The second "class" to intersect
+ @return A new string containing all unique characters. Each character
+ must have appeared both <code>s1</code> and <code>s2</code>.
+ */
+ bkstring classNegate (bkstring s1) const;
+ /**
+ Creates a new "class" representing the range from <code>low</code> thru
+ <code>hi</code>. This function will wrap if <code>low</code> >
+ <code>hi</code>. This is a feature, not a buf. Sometimes it is useful
+ to be able to say [\x70-\x10] instead of [\x70-\x7F\x00-\x10].
+ @param low The beginning character
+ @param hi The ending character
+ @return A new string containing all the characters from low thru hi.
+ */
+ bkstring classCreateRange(wchar_t low, wchar_t hi) const;
+
+ /**
+ Extracts a decimal number from the substring of member-variable
+ <code>{@link pattern pattern}<code> starting at <code>start</code> and
+ ending at <code>end</code>.
+ @param start The starting index in <code>{@link pattern pattern}<code>
+ @param end The last index in <code>{@link pattern pattern}<code>
+ @return The decimal number in <code>{@link pattern pattern}<code>
+ */
+ int getInt(int start, int end);
+ /**
+ Parses a <code>{n,m}</code> string out of the member-variable
+ <code>{@link pattern pattern}<code> stores the result in <code>sNum</code>
+ and <code>eNum</code>.
+ @param sNum Output parameter. The minimum number of matches required
+ by the curly quantifier are stored here.
+ @param eNum Output parameter. The maximum number of matches allowed
+ by the curly quantifier are stored here.
+ @return Success/Failure. Fails when the curly does not have the proper
+ syntax
+ */
+ bool quantifyCurly(int & sNum, int & eNum);
+ /**
+ Tries to quantify the currently parsed group. If the group being parsed
+ is indeed quantified in the member-variable
+ <code>{@link pattern pattern}<code>, then the NFA is modified accordingly.
+ @param start The starting node of the current group being parsed
+ @param stop The ending node of the current group being parsed
+ @param gn The group number of the current group being parsed
+ @return The node representing the starting node of the group. If the
+ group becomes quantified, then this node is not necessarily
+ a GroupHead node.
+ */
+ NFAUNode * quantifyGroup(NFAUNode * start, NFAUNode * stop, const int gn);
+
+ /**
+ Tries to quantify the last parsed expression. If the character was indeed
+ quantified, then the NFA is modified accordingly.
+ @param newNode The recently created expression node
+ @return The node representing the last parsed expression. If the
+ expression was quantified, <code>return value != newNode</code>
+ */
+ NFAUNode * quantify(NFAUNode * newNode);
+ /**
+ Parses the current class being examined in
+ <code>{@link pattern pattern}</code>.
+ @return A string of unique characters contained in the current class being
+ parsed
+ */
+ bkstring parseClass();
+ /**
+ Parses the current POSIX class being examined in
+ <code>{@link pattern pattern}</code>.
+ @return A string of unique characters representing the POSIX class being
+ parsed
+ */
+ bkstring parsePosix();
+ /**
+ Returns a string containing the octal character being parsed
+ @return The string contained the octal value being parsed
+ */
+ bkstring parseOctal();
+ /**
+ Returns a string containing the hex character being parsed
+ @return The string contained the hex value being parsed
+ */
+ bkstring parseHex();
+ /**
+ Returns a new node representing the back reference being parsed
+ @return The new node representing the back reference being parsed
+ */
+ NFAUNode * parseBackref();
+ /**
+ Parses the escape sequence currently being examined. Determines if the
+ escape sequence is a class, a single character, or the beginning of a
+ quotation sequence.
+ @param inv Output parameter. Whether or not to invert the returned class
+ @param quo Output parameter. Whether or not this sequence starts a
+ quotation.
+ @return The characters represented by the class
+ */
+ bkstring parseEscape(bool & inv, bool & quo);
+ /**
+ Parses a supposed registered pattern currently under compilation. If the
+ sequence of characters does point to a registered pattern, then the
+ registered pattern is appended to <code>*end<code>. The registered pattern
+ is parsed with the current compilation flags.
+ @param end The ending node of the thus-far compiled pattern
+ @return The new end node of the current pattern
+ */
+ NFAUNode * parseRegisteredWCPattern(NFAUNode ** end);
+ /**
+ Parses a lookbehind expression. Appends the necessary nodes
+ <code>*end</code>.
+ @param pos Positive or negative look behind
+ @param end The ending node of the current pattern
+ @return The new end node of the current pattern
+ */
+ NFAUNode * parseBehind(const bool pos, NFAUNode ** end);
+ /**
+ Parses the current expression and tacks on nodes until a \E is found.
+ @return The end of the current pattern
+ */
+ NFAUNode * parseQuote();
+ /**
+ Parses <code>{@link pattern pattern}</code>. This function is called
+ recursively when an or (<code>|</code>) or a group is encountered.
+ @param inParen Are we currently parsing inside a group
+ @param inOr Are we currently parsing one side of an or (<code>|</code>)
+ @param end The end of the current expression
+ @return The starting node of the NFA constructed from this parse
+ */
+ NFAUNode * parse(const bool inParen = 0, const bool inOr = 0, NFAUNode ** end = NULL);
+ public:
+ /// We should match regardless of case
+ const static unsigned long CASE_INSENSITIVE;
+ /// We are implicitly quoted
+ const static unsigned long LITERAL;
+ /// @memo We should treat a <code><b>.</b></code> as [\x00-\x7F]
+ const static unsigned long DOT_MATCHES_ALL;
+ /** <code>^</code> and <code>$</code> should anchor to the beginning and
+ ending of lines, not all input
+ */
+ const static unsigned long MULTILINE_MATCHING;
+ /** When enabled, only instances of <code>\n</codes> are recognized as
+ line terminators
+ */
+ const static unsigned long UNIX_LINE_MODE;
+ /// The absolute minimum number of matches a quantifier can match (0)
+ const static int MIN_QMATCH;
+ /// The absolute maximum number of matches a quantifier can match (0x7FFFFFFF)
+ const static int MAX_QMATCH;
+ public:
+ /**
+ Call this function to compile a regular expression into a
+ <code>WCPattern</code> object. Special values can be assigned to
+ <code>mode</code> when certain non-standard behaviors are expected from
+ the <code>WCPattern</code> object.
+ @param pattern The regular expression to compile
+ @param mode A bitwise or of flags signalling what special behaviors are
+ wanted from this <code>WCPattern</code> object
+ @return If successful, <code>compile</code> returns a <code>WCPattern</code>
+ pointer. Upon failure, <code>compile</code> returns
+ <code>NULL</code>
+ */
+ static WCPattern * compile (const bkstring & pattern,
+ const unsigned long mode = 0);
+ /**
+ Dont use this function. This function will compile a pattern, and cache
+ the result. This will eventually be used as an optimization when people
+ just want to call static methods using the same pattern over and over
+ instead of first compiling the pattern and then using the compiled
+ instance for matching.
+ @param pattern The regular expression to compile
+ @param mode A bitwise or of flags signalling what special behaviors are
+ wanted from this <code>WCPattern</code> object
+ @return If successful, <code>compileAndKeep</code> returns a
+ <code>WCPattern</code> pointer. Upon failure, <code>compile</code>
+ returns <code>NULL</code>.
+ */
+ static WCPattern * compileAndKeep (const bkstring & pattern,
+ const unsigned long mode = 0);
+
+ /**
+ Searches through <code>replace</code> and replaces all substrings matched
+ by <code>pattern</code> with <code>str</code>. <code>str</code> may
+ contain backreferences (e.g. <code>\1</code>) to capture groups. A typical
+ invocation looks like:
+ <p>
+ <code>
+ WCPattern::replace(L"(a+)b(c+)", L"abcccbbabcbabc", L"\\2b\\1");
+ </code>
+ <p>
+ which would replace <code>abcccbbabcbabc</code> with
+ <code>cccbabbcbabcba</code>.
+ @param pattern The regular expression
+ @param str The replacement text
+ @param replacementText The string in which to perform replacements
+ @param mode The special mode requested of the <code>WCPattern</code>
+ during the replacement process
+ @return The text with the replacement string substituted where necessary
+ */
+ static bkstring replace (const bkstring & pattern,
+ const bkstring & str,
+ const bkstring & replacementText,
+ const unsigned long mode = 0);
+
+ /**
+ Splits the specified string over occurrences of the specified pattern.
+ Empty strings can be optionally ignored. The number of strings returned is
+ configurable. A typical invocation looks like:
+ <p>
+ <code>
+ bkstring str(strSize, 0);<br>
+ FILE * fp = fopen(fileName, "r");<br>
+ fread((char*)str.data(), strSize * 2, 1, fp);<br>
+ fclose(fp);<br>
+ <br>
+ std::vector<bkstring> lines = WCPattern::split(L"[\r\n]+", str, true);<br>
+ <br>
+ </code>
+
+ @param pattern The regular expression
+ @param replace The string to split
+ @param keepEmptys Whether or not to keep empty strings
+ @param limit The maximum number of splits to make
+ @param mode The special mode requested of the <code>WCPattern</code>
+ during the split process
+ @return All substrings of <code>str</code> split across <code>pattern</code>.
+ */
+ static std::vector<bkstring> split (const bkstring & pattern,
+ const bkstring & str,
+ const bool keepEmptys = 0,
+ const unsigned long limit = 0,
+ const unsigned long mode = 0);
+
+ /**
+ Finds all the instances of the specified pattern within the string. You
+ should be careful to only pass patterns with a minimum length of one. For
+ example, the pattern <code>a*</code> can be matched by an empty string, so
+ instead you should pass <code>a+</code> since at least one character must
+ be matched. A typical invocation of <code>findAll</code> looks like:
+ <p>
+ <code>
+ std::vector<td::string> numbers = WCPattern::findAll(L"\\d+", string);
+ </code>
+ <p>
+
+ @param pattern The pattern for which to search
+ @param str The string to search
+ @param mode The special mode requested of the <code>WCPattern</code>
+ during the find process
+ @return All instances of <code>pattern</code> in <code>str</code>
+ */
+ static std::vector<bkstring> findAll (const bkstring & pattern,
+ const bkstring & str,
+ const unsigned long mode = 0);
+
+ /**
+ Determines if an entire string matches the specified pattern
+
+ @param pattern The pattern for to match
+ @param str The string to match
+ @param mode The special mode requested of the <code>WCPattern</code>
+ during the replacement process
+ @return True if <code>str</code> is recognized by <code>pattern</code>
+ */
+ static bool matches (const bkstring & pattern,
+ const bkstring & str,
+ const unsigned long mode = 0);
+
+ /**
+ Registers a pattern under a specific name for use in later compilations.
+ A typical invocation and later use looks like:
+ <p>
+ <code>
+ WCPattern::registerWCPattern(L"ip", L"(?:\\d{1,3}\\.){3}\\d{1,3}");<br>
+ WCPattern * p1 = WCPattern::compile(L"{ip}:\\d+");<br>
+ WCPattern * p2 = WCPattern::compile(L"Connection from ({ip}) on port \\d+");<br>
+ </code>
+ <p>
+ Multiple calls to <code>registerWCPattern</code> with the same
+ <code>name</code> will result in the pattern getting overwritten.
+
+ @param name The name to give to the pattern
+ @param pattern The pattern to register
+ @param mode Any special flags to use when compiling pattern
+ @return Success/Failure. Fails only if <code>pattern</code> has invalid
+ syntax
+ */
+ static bool registerWCPattern(const bkstring & name,
+ const bkstring & pattern,
+ const unsigned long mode = 0);
+
+ /**
+ Clears the pattern registry
+ */
+ static void unregisterWCPatterns();
+ /**
+ Don't use
+ */
+ static void clearWCPatternCache();
+
+ /**
+ Searches through a string for the <code>n<sup>th</sup></code> match of the
+ given pattern in the string. Match indeces start at zero, not one.
+ A typical invocation looks like this:
+ <p>
+ <code>
+ std::pair<bkstring, int> match = WCPattern::findNthMatch(L"\\d{1,3}", L"192.168.1.101:22", 1);<br>
+ wprintf(L"%s %i\n", match.first.c_str(), match.second);<br>
+ <br>
+ Output: 168 4<br>
+ <br>
+
+ @param pattern The pattern for which to search
+ @param str The string to search
+ @param matchNum Which match to find
+ @param mode Any special flags to use during the matching process
+ @return A string and an integer. The string is the string matched. The
+ integer is the starting location of the matched string in
+ <code>str</code>. You can check for success/failure by making sure
+ that the integer returned is greater than or equal to zero.
+ */
+ static std::pair<bkstring, int> findNthMatch (const bkstring & pattern,
+ const bkstring & str,
+ const int matchNum,
+ const unsigned long mode = 0);
+ public:
+ /**
+ Deletes all NFA nodes allocated during compilation
+ */
+ ~WCPattern();
+
+ bkstring replace (const bkstring & str,
+ const bkstring & replacementText);
+ std::vector<bkstring> split (const bkstring & str, const bool keepEmptys = 0,
+ const unsigned long limit = 0);
+ std::vector<bkstring> findAll (const bkstring & str);
+ bool matches (const bkstring & str);
+ /**
+ Returns the flags used during compilation of this pattern
+ @return The flags used during compilation of this pattern
+ */
+ unsigned long getFlags () const;
+ /**
+ Returns the regular expression this pattern represents
+ @return The regular expression this pattern represents
+ */
+ bkstring getWCPattern () const;
+ /**
+ Creates a matcher object using the specified string and this pattern.
+ @param str The string to match against
+ @return A new matcher using object using this pattern and the specified
+ string
+ */
+ WCMatcher * createWCMatcher (const bkstring & str);
+};
+
+class NFAUNode
+{
+ friend class WCMatcher;
+ public:
+ NFAUNode * next;
+ NFAUNode();
+ virtual ~NFAUNode();
+ virtual void findAllNodes(std::map<NFAUNode*, bool> & soFar);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const = 0;
+ inline virtual bool isGroupHeadNode() const { return false; }
+ inline virtual bool isStartOfInputNode() const { return false; }
+};
+class NFACharUNode : public NFAUNode
+{
+ protected:
+ wchar_t ch;
+ public:
+ NFACharUNode(const wchar_t c);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFACICharUNode : public NFAUNode
+{
+ protected:
+ wchar_t ch;
+ public:
+ NFACICharUNode(const wchar_t c);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAStartUNode : public NFAUNode
+{
+ public:
+ NFAStartUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAEndUNode : public NFAUNode
+{
+ public:
+ NFAEndUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAQuantifierUNode : public NFAUNode
+{
+ public:
+ int min, max;
+ NFAUNode * inner;
+ virtual void findAllNodes(std::map<NFAUNode*, bool> & soFar);
+ NFAQuantifierUNode(WCPattern * pat, NFAUNode * internal,
+ const int minMatch = WCPattern::MIN_QMATCH,
+ const int maxMatch = WCPattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGreedyQuantifierUNode : public NFAQuantifierUNode
+{
+ public:
+ NFAGreedyQuantifierUNode(WCPattern * pat, NFAUNode * internal,
+ const int minMatch = WCPattern::MIN_QMATCH,
+ const int maxMatch = WCPattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ virtual int matchInternal(const bkstring & str, WCMatcher * matcher, const int curInd, const int soFar) const;
+};
+class NFALazyQuantifierUNode : public NFAQuantifierUNode
+{
+ public:
+ NFALazyQuantifierUNode(WCPattern * pat, NFAUNode * internal,
+ const int minMatch = WCPattern::MIN_QMATCH,
+ const int maxMatch = WCPattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAPossessiveQuantifierUNode : public NFAQuantifierUNode
+{
+ public:
+ NFAPossessiveQuantifierUNode(WCPattern * pat, NFAUNode * internal,
+ const int minMatch = WCPattern::MIN_QMATCH,
+ const int maxMatch = WCPattern::MAX_QMATCH);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAAcceptUNode : public NFAUNode
+{
+ public:
+ NFAAcceptUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAClassUNode : public NFAUNode
+{
+ public:
+ bool inv;
+ std::map<wchar_t, bool> vals;
+ NFAClassUNode(const bool invert = 0);
+ NFAClassUNode(const bkstring & clazz, const bool invert);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFACIClassUNode : public NFAUNode
+{
+ public:
+ bool inv;
+ std::map<wchar_t, bool> vals;
+ NFACIClassUNode(const bool invert = 0);
+ NFACIClassUNode(const bkstring & clazz, const bool invert);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFASubStartUNode : public NFAUNode
+{
+ public:
+ NFASubStartUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAOrUNode : public NFAUNode
+{
+ public:
+ NFAUNode * one;
+ NFAUNode * two;
+ NFAOrUNode(NFAUNode * first, NFAUNode * second);
+ virtual void findAllNodes(std::map<NFAUNode*, bool> & soFar);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAQuoteUNode : public NFAUNode
+{
+ public:
+ bkstring qStr;
+ NFAQuoteUNode(const bkstring & quoted);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFACIQuoteUNode : public NFAUNode
+{
+ public:
+ bkstring qStr;
+ NFACIQuoteUNode(const bkstring & quoted);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFALookAheadUNode : public NFAUNode
+{
+ public:
+ bool pos;
+ NFAUNode * inner;
+ NFALookAheadUNode(NFAUNode * internal, const bool positive);
+ virtual void findAllNodes(std::map<NFAUNode*, bool> & soFar);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFALookBehindUNode : public NFAUNode
+{
+ public:
+ bool pos;
+ bkstring mStr;
+ NFALookBehindUNode(const bkstring & str, const bool positive);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAStartOfLineUNode : public NFAUNode
+{
+ public:
+ NFAStartOfLineUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAEndOfLineUNode : public NFAUNode
+{
+ public:
+ NFAEndOfLineUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAReferenceUNode : public NFAUNode
+{
+ public:
+ int gi;
+ NFAReferenceUNode(const int groupIndex);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAStartOfInputUNode : public NFAUNode
+{
+ public:
+ NFAStartOfInputUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ inline virtual bool isStartOfInputNode() const { return false; }
+};
+class NFAEndOfInputUNode : public NFAUNode
+{
+ public:
+ bool term;
+ NFAEndOfInputUNode(const bool lookForTerm);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAWordBoundaryUNode : public NFAUNode
+{
+ public:
+ bool pos;
+ NFAWordBoundaryUNode(const bool positive);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAEndOfMatchUNode : public NFAUNode
+{
+ public:
+ NFAEndOfMatchUNode();
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupHeadUNode : public NFAUNode
+{
+ public:
+ int gi;
+ NFAGroupHeadUNode(const int groupIndex);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ inline virtual bool isGroupHeadNode() const { return false; }
+};
+class NFAGroupTailUNode : public NFAUNode
+{
+ public:
+ int gi;
+ NFAGroupTailUNode(const int groupIndex);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupLoopPrologueUNode : public NFAUNode
+{
+ public:
+ int gi;
+ NFAGroupLoopPrologueUNode(const int groupIndex);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+class NFAGroupLoopUNode : public NFAUNode
+{
+ public:
+ int gi, min, max, type;
+ NFAUNode * inner;
+ NFAGroupLoopUNode(NFAUNode * internal, const int minMatch,
+ const int maxMatch, const int groupIndex, const int matchType);
+ virtual void findAllNodes(std::map<NFAUNode*, bool> & soFar);
+ virtual int match(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ int matchGreedy(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ int matchLazy(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+ int matchPossessive(const bkstring & str, WCMatcher * matcher, const int curInd = 0) const;
+};
+
+#endif
+
diff --git a/plugins/smileyadd/regexp/test.cpp b/plugins/smileyadd/regexp/test.cpp new file mode 100644 index 0000000000..bb41566f96 --- /dev/null +++ b/plugins/smileyadd/regexp/test.cpp @@ -0,0 +1,38 @@ +#include <regexp/Matcher.h> +#include <regexp/Pattern.h> +#include <regexp/UnicodeMatcher.h> +#include <regexp/UnicodePattern.h> + +int main() +{ + { + Pattern * p = Pattern::compile("^([^:]*)://([^/:]*)((?::[0-9]+)?)/?(([^?]*)((?:\\?.*)?))$"); + Matcher * m0 = p->createMatcher("http://www.example.com:80/test.php?a=1&a=1&a=1"); + + if (m0->matches()) + { + std::vector<std::string> groups = m0->getGroups(true); + for (int i = 0; i < (int)groups.size(); ++i) + { + printf("m->group(%d): %s\n", i, groups[i].c_str()); + } + } + } + { + std::wstring pat = L"^([^:]*)://([^/:]*)((?::[0-9]+)?)/?(([^?]*)((?:\\?.*)?))$"; + std::wstring mat = L"http://www.example.com:80/test.php?a=1&a=1&a=1"; + UnicodePattern * p = UnicodePattern::compile(pat); + UnicodeMatcher * m0 = p->createUnicodeMatcher(mat); + + if (m0->matches()) + { + std::vector<std::wstring> groups = m0->getGroups(true); + for (int i = 0; i < (int)groups.size(); ++i) + { + wprintf(L"m->group(%d): %s\n", i, groups[i].c_str()); + } + } + } + + return 0; +} |