summaryrefslogtreecommitdiff
path: root/plugins/ImportTXT/PerlRegEx.pas
diff options
context:
space:
mode:
Diffstat (limited to 'plugins/ImportTXT/PerlRegEx.pas')
-rw-r--r--plugins/ImportTXT/PerlRegEx.pas447
1 files changed, 447 insertions, 0 deletions
diff --git a/plugins/ImportTXT/PerlRegEx.pas b/plugins/ImportTXT/PerlRegEx.pas
new file mode 100644
index 0000000000..617479fa91
--- /dev/null
+++ b/plugins/ImportTXT/PerlRegEx.pas
@@ -0,0 +1,447 @@
+(**********************************************************
+* *
+* Perl Regular Expressions *
+* *
+* Delphi wrapper around PCRE - http://www.pcre.org *
+* *
+* Copyright (C) 1999-2006 Jan Goyvaerts *
+* *
+* Design & implementation Jan Goyvaerts 1999-2006 *
+* *
+* *
+* Shorted and added partial ability for PCRE 7.0 *
+* by Abyss *
+**********************************************************)
+
+unit PerlRegEx;
+
+interface
+
+
+
+uses SysUtils;
+
+type
+ TPerlRegExOptions = set of (
+ preCaseLess, // /i -> Case insensitive
+ preMultiLine, // /m -> ^ and $ also match before/after a newline, not just at the beginning and the end of the string
+ preSingleLine, // /s -> Dot matches any character, including \n (newline). Otherwise, it matches anything except \n
+ preExtended, // /x -> Allow regex to contain extra whitespace, newlines and Perl-style comments, all of which will be filtered out
+ preAnchored, // /A -> Successful match can only occur at the start of the subject or right after the previous match
+ preDollarEndOnly, // /E
+ preExtra, // /X
+ preUnGreedy, // Repeat operators (+, *, ?) are not greedy by default
+ // (i.e. they try to match the minimum number of characters instead of the maximum)
+ preUTF8 // UTF8
+ );
+type
+ TPerlRegExState = set of (
+ preNotBOL, // Not Beginning Of Line: ^ does not match at the start of Subject
+ preNotEOL, // Not End Of Line: $ does not match at the end of Subject
+ preNotEmpty // Empty matches not allowed
+ );
+
+const
+ // Maximum number of subexpressions (backreferences)
+ // Subexpressions are created by placing round brackets in the regex, and are referenced by \1, \2, ...
+ // In Perl, they are available as $1, $2, ... after the regex matched; with TPerlRegEx, use the Subexpressions property
+ // You can also insert \1, \2, ... in the Replacement string; \0 is the complete matched expression
+ MAX_SUBEXPRESSIONS = 99;
+
+
+type
+ TPerlRegEx = class
+ private // *** Property storage, getters and setters
+ FCompiled, FStudied: Boolean;
+ FOptions: TPerlRegExOptions;
+ FState: TPerlRegExState;
+ FRegEx, FSubject: string;
+ FStart, FStop: Integer;
+ function GetMatchedExpression: string;
+ function GetMatchedExpressionLength: Integer;
+ function GetMatchedExpressionOffset: Integer;
+ procedure SetOptions(Value: TPerlRegExOptions);
+ procedure SetRegEx(const Value: string);
+ function GetSubExpressionCount: Integer;
+ function GetSubExpressions(Index: Integer): string;
+ function GetSubExpressionLengths(Index: Integer): Integer;
+ function GetSubExpressionOffsets(Index: Integer): Integer;
+ procedure SetSubject(const Value: string);
+ procedure SetStart(const Value: Integer);
+ procedure SetStop(const Value: Integer);
+ function GetFoundMatch: Boolean;
+ private // *** Variables used by pcrelib.dll
+ Offsets: array[0..(MAX_SUBEXPRESSIONS+1)*3] of Integer;
+ OffsetCount: Integer;
+ pcreOptions: Integer;
+ pattern, hints, chartable: Pointer;
+ FSubjectPChar: PChar;
+ protected
+ procedure CleanUp;
+ // Dispose off whatever we created, so we can start over. Called automatically when needed, so it is not made public
+ public
+ constructor Create;
+ // Come to life
+ destructor Destroy; override;
+ // Clean up after ourselves
+ class function EscapeRegExChars(const S: string): string;
+ // Escapes regex characters in S so that the regex engine can be used to match S as plain text
+ procedure Compile;
+ // Compile the regex. Called automatically by Match
+ procedure Study;
+ // Study the regex. Studying takes time, but will make the execution of the regex a lot faster.
+ // Call study if you will be using the same regex many times
+ function Match: Boolean;
+ // Attempt to match the regex
+ function MatchAgain: Boolean;
+ // Attempt to match the regex to the remainder of the string after the previous match
+ // To avoid problems (when using ^ in the regex), call MatchAgain only after a succesful Match()
+ function NamedSubExpression(const SEName: string): Integer;
+ // Returns the index of the named group SEName
+ function Config (What: integer): Integer;
+ //This function makes it possible for a client program
+ // to find out which optional features are available in the
+ //version of the PCRE library it is using.
+ property Compiled: Boolean read FCompiled;
+ // True if the RegEx has already been compiled.
+ property FoundMatch: Boolean read GetFoundMatch;
+ // Returns True when MatchedExpression* and SubExpression* indicate a match
+ property Studied: Boolean read FStudied;
+ // True if the RegEx has already been studied
+ property MatchedExpression: string read GetMatchedExpression;
+ // The matched string
+ property MatchedExpressionLength: Integer read GetMatchedExpressionLength;
+ // Length of the matched string
+ property MatchedExpressionOffset: Integer read GetMatchedExpressionOffset;
+ // Character offset in the Subject string at which the matched substring starts
+ property Start: Integer read FStart write SetStart;
+ // Starting position in Subject from which MatchAgain begins
+ property Stop: Integer read FStop write SetStop;
+ // Last character in Subject that Match and MatchAgain search through
+ property State: TPerlRegExState read FState write FState;
+ // State of Subject
+ property SubExpressionCount: Integer read GetSubExpressionCount;
+ // Number of matched subexpressions
+ property SubExpressions[Index: Integer]: string read GetSubExpressions;
+ // Matched subexpressions after a regex has been matched
+ property SubExpressionLengths[Index: Integer]: Integer read GetSubExpressionLengths;
+ // Lengths of the subexpressions
+ property SubExpressionOffsets[Index: Integer]: Integer read GetSubExpressionOffsets;
+ // Character offsets in the Subject string of the subexpressions
+ property Subject: string read FSubject write SetSubject;
+ // The string on which Match() will try to match RegEx
+ published
+ property Options: TPerlRegExOptions read FOptions write SetOptions;
+ // Options
+ property RegEx: string read FRegEx write SetRegEx;
+ // The regular expression to be matched
+ end;
+
+
+
+implementation
+
+ { ********* pcrelib.dll imports ********* }
+
+const
+ PCRE_CASELESS = $00000001;
+ PCRE_MULTILINE = $00000002;
+ PCRE_SINGLELINE = $00000004;
+ PCRE_EXTENDED = $00000008;
+ PCRE_ANCHORED = $00000010;
+ PCRE_DOLLAR_ENDONLY = $00000020;
+ PCRE_EXTRA = $00000040;
+ PCRE_NOTBOL = $00000080;
+ PCRE_NOTEOL = $00000100;
+ PCRE_UNGREEDY = $00000200;
+ PCRE_NOTEMPTY = $00000400;
+ PCRE_UTF8 = $00000800;
+ PCRE_NO_AUTO_CAPTURE = $00001000;
+ PCRE_NO_UTF8_CHECK = $00002000;
+ PCRE_AUTO_CALLOUT = $00004000;
+ PCRE_PARTIAL = $00008000;
+ PCRE_DFA_SHORTEST = $00010000;
+ PCRE_DFA_RESTART = $00020000;
+ PCRE_FIRSTLINE = $00040000;
+ PCRE_DUPNAMES = $00080000;
+ PCRE_NEWLINE_CR = $00100000;
+ PCRE_NEWLINE_LF = $00200000;
+ PCRE_NEWLINE_CRLF = $00300000;
+ PCRE_NEWLINE_ANY = $00400000;
+
+ // Exec error codes
+ PCRE_ERROR_NOMATCH = -1;
+ PCRE_ERROR_NULL = -2;
+ PCRE_ERROR_BADOPTION = -3;
+ PCRE_ERROR_BADMAGIC = -4;
+ PCRE_ERROR_UNKNOWN_OPCODE = -5;
+ PCRE_ERROR_UNKNOWN_NODE = -5; // For backward compatibility
+ PCRE_ERROR_NOMEMORY = -6;
+ PCRE_ERROR_NOSUBSTRING = -7;
+ PCRE_ERROR_MATCHLIMIT = -8;
+ PCRE_ERROR_CALLOUT = -9; // Never used by PCRE itself
+ PCRE_ERROR_BADUTF8 =-10;
+ PCRE_ERROR_BADUTF8_OFFSET =-11;
+ PCRE_ERROR_PARTIAL =-12;
+ PCRE_ERROR_BADPARTIAL =-13;
+ PCRE_ERROR_INTERNAL =-14;
+ PCRE_ERROR_BADCOUNT =-15;
+ PCRE_ERROR_DFA_UITEM =-16;
+ PCRE_ERROR_DFA_UCOND =-17;
+ PCRE_ERROR_DFA_UMLIMIT =-18;
+ PCRE_ERROR_DFA_WSSIZE =-19;
+ PCRE_ERROR_DFA_RECURSE =-20;
+ PCRE_ERROR_RECURSIONLIMIT =-21;
+ PCRE_ERROR_NULLWSLIMIT =-22;
+ PCRE_ERROR_BADNEWLINE =-23;
+
+(* Request types for pcre_config(). Do not re-arrange, in order to remain compatible. *)
+
+ PCRE_CONFIG_UTF8 = 0;
+ PCRE_CONFIG_NEWLINE = 1;
+ PCRE_CONFIG_LINK_SIZE = 2;
+ PCRE_CONFIG_POSIX_MALLOC_THRESHOLD = 3;
+ PCRE_CONFIG_MATCH_LIMIT = 4;
+ PCRE_CONFIG_STACKRECURSE = 5;
+ PCRE_CONFIG_UNICODE_PROPERTIES = 6;
+ PCRE_CONFIG_MATCH_LIMIT_RECURSION = 7;
+type
+ PPChar = ^PChar;
+ PInt = ^Integer;
+
+
+
+// Functions we import from the PCRE library DLL
+// Leading underscores gratuitously added by Borland C++Builder 6.0
+function pcre_maketables: PAnsiChar; cdecl; external 'pcre3.dll';
+function pcre_compile(const pattern: PChar; options: Integer; errorptr: PPChar; erroroffset: PInt;
+ const tables: PChar): Pointer; cdecl; external 'pcre3.dll';
+function pcre_exec(const pattern: Pointer; const hints: Pointer; const subject: PChar; length, startoffset: Integer;
+ options: Integer; offsets: PInt; offsetcount: Integer): Integer; cdecl; external 'pcre3.dll';
+function pcre_get_stringnumber(const pattern: Pointer; const Name: PChar): Integer; cdecl; external 'pcre3.dll';
+function pcre_study(const pattern: Pointer; options: Integer; errorptr: PPChar): Pointer; cdecl; external 'pcre3.dll';
+function pcre_fullinfo(const pattern: Pointer; const hints: Pointer; what: Integer; where: Pointer): Integer; cdecl; external 'pcre3.dll';
+function pcre_version: pchar; cdecl; external 'pcre3.dll';
+function pcre_config (what:integer; where:pointer):integer; cdecl; external 'pcre3.dll';
+//procedure pcre_free(ptr: Pointer); cdecl; external 'pcre3.dll';
+
+
+
+
+ { ********* TPerlRegEx component ********* }
+
+procedure TPerlRegEx.CleanUp;
+begin
+ FCompiled := False; FStudied := False;
+ pattern := nil; hints := nil;
+ OffsetCount := 0;
+end;
+
+procedure TPerlRegEx.Compile;
+var
+ Error: PChar;
+ ErrorOffset: Integer;
+begin
+ if FRegEx = '' then raise Exception.Create('TPerlRegEx.Compile() - Please specify a regular expression in RegEx first');
+ CleanUp;
+ Pattern := pcre_compile(PChar(FRegEx), pcreOptions, @Error, @ErrorOffset, chartable);
+ if Pattern = nil then
+ raise Exception.Create(Format('TPerlRegEx.Compile() - Error in regex at offset %d: %s', [ErrorOffset, AnsiString(Error)]));
+ FCompiled := True
+end;
+
+
+constructor TPerlRegEx.Create;
+begin
+ inherited Create;
+ FState := [preNotEmpty];
+ chartable := pcre_maketables;
+end;
+
+destructor TPerlRegEx.Destroy;
+begin
+ CleanUp;
+ inherited Destroy;
+end;
+
+class function TPerlRegEx.EscapeRegExChars(const S: string): string;
+var
+ I: Integer;
+begin
+ Result := S;
+ I := Length(Result);
+ while I > 0 do begin
+ if Result[I] in ['.', '[', ']', '(', ')', '?', '*', '+', '{', '}', '^', '$', '|', '\'] then
+ Insert('\', Result, I)
+ else if Result[I] = #0 then begin
+ Result[I] := '0';
+ Insert('\', Result, I);
+ end;
+ Dec(I);
+ end;
+end;
+
+function TPerlRegEx.GetFoundMatch: Boolean;
+begin
+ Result := OffsetCount > 0;
+end;
+
+function TPerlRegEx.GetMatchedExpression: string;
+begin
+ Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
+ Result := GetSubExpressions(0);
+end;
+
+function TPerlRegEx.GetMatchedExpressionLength: Integer;
+begin
+ Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
+ Result := GetSubExpressionLengths(0)
+end;
+
+function TPerlRegEx.GetMatchedExpressionOffset: Integer;
+begin
+ Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
+ Result := GetSubExpressionOffsets(0)
+end;
+
+function TPerlRegEx.GetSubExpressionCount: Integer;
+begin
+ Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
+ Result := OffsetCount-1
+end;
+
+function TPerlRegEx.GetSubExpressionLengths(Index: Integer): Integer;
+begin
+ Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
+ Assert((Index >= 0) and (Index <= SubExpressionCount), 'REQUIRE: Index <= SubExpressionCount');
+ Result := Offsets[Index*2+1]-Offsets[Index*2]
+end;
+
+function TPerlRegEx.GetSubExpressionOffsets(Index: Integer): Integer;
+begin
+ Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
+ Assert((Index >= 0) and (Index <= SubExpressionCount), 'REQUIRE: Index <= SubExpressionCount');
+ Result := Offsets[Index*2]
+end;
+
+function TPerlRegEx.GetSubExpressions(Index: Integer): string;
+begin
+ Assert(FoundMatch, 'REQUIRE: There must be a successful match first');
+ if Index > SubExpressionCount then Result := ''
+ else Result := Copy(FSubject, Offsets[Index*2], Offsets[Index*2+1]-Offsets[Index*2]);
+end;
+
+function TPerlRegEx.Match: Boolean;
+var
+ I, Opts: Integer;
+begin
+ if not Compiled then Compile;
+ if preNotBOL in State then Opts := PCRE_NOTBOL else Opts := 0;
+ if preNotEOL in State then Opts := Opts or PCRE_NOTEOL;
+ if preNotEmpty in State then Opts := Opts or PCRE_NOTEMPTY;
+ if FStart > FStop then OffsetCount := -1
+ else OffsetCount := pcre_exec(Pattern, Hints, FSubjectPChar, FStop, 0, Opts, @Offsets[0], High(Offsets));
+ Result := OffsetCount > 0;
+ // Convert offsets into string indices
+ if Result then begin
+ for I := 0 to OffsetCount*2-1 do
+ Inc(Offsets[I]);
+ FStart := Offsets[1];
+ if Offsets[0] = Offsets[1] then Inc(FStart); // Make sure we don't get stuck at the same position
+ end;
+end;
+
+function TPerlRegEx.MatchAgain: Boolean;
+var
+ I, Opts: Integer;
+begin
+ if not Compiled then Compile;
+ if preNotBOL in State then Opts := PCRE_NOTBOL else Opts := 0;
+ if preNotEOL in State then Opts := Opts or PCRE_NOTEOL;
+ if preNotEmpty in State then Opts := Opts or PCRE_NOTEMPTY;
+ if FStart > FStop then OffsetCount := -1
+ else OffsetCount := pcre_exec(Pattern, Hints, FSubjectPChar, FStop, FStart-1, Opts, @Offsets[0], High(Offsets));
+ Result := OffsetCount > 0;
+ // Convert offsets into string indices
+ if Result then begin
+ for I := 0 to OffsetCount*2-1 do
+ Inc(Offsets[I]);
+ FStart := Offsets[1];
+ if Offsets[0] = Offsets[1] then Inc(FStart); // Make sure we don't get stuck at the same position
+ end;
+end;
+
+function TPerlRegEx.NamedSubExpression(const SEName: string): Integer;
+begin
+ Result := pcre_get_stringnumber(Pattern, PChar(SEName));
+end;
+
+function TPerlRegEx.Config(What: integer):integer;
+begin
+ result:=-1;
+ pcre_config(what, @result);
+end;
+
+
+procedure TPerlRegEx.SetOptions(Value: TPerlRegExOptions);
+begin
+ if (FOptions <> Value) then begin
+ FOptions := Value;
+ pcreOptions := 0;
+ if (preCaseLess in Value) then pcreOptions := pcreOptions or PCRE_CASELESS;
+ if (preMultiLine in Value) then pcreOptions := pcreOptions or PCRE_MULTILINE;
+ if (preSingleLine in Value) then pcreOptions := pcreOptions or PCRE_SINGLELINE;
+ if (preExtended in Value) then pcreOptions := pcreOptions or PCRE_EXTENDED;
+ if (preAnchored in Value) then pcreOptions := pcreOptions or PCRE_ANCHORED;
+ if (preDollarEndOnly in Value) then pcreOptions := pcreOptions or PCRE_DOLLAR_ENDONLY;
+ if (preExtra in Value) then pcreOptions := pcreOptions or PCRE_EXTRA;
+ if (preUnGreedy in Value) then pcreOptions := pcreOptions or PCRE_UNGREEDY;
+ if (preUTF8 in Value) then pcreOptions := pcreOptions or PCRE_UTF8;
+ CleanUp
+ end
+end;
+
+procedure TPerlRegEx.SetRegEx(const Value: string);
+begin
+ if FRegEx <> Value then begin
+ FRegEx := Value;
+ CleanUp
+ end
+end;
+
+procedure TPerlRegEx.SetStart(const Value: Integer);
+begin
+ if Value < 1 then FStart := 1
+ else FStart := Value;
+ // If FStart > Length(Subject), MatchAgain() will simply return False
+end;
+
+procedure TPerlRegEx.SetStop(const Value: Integer);
+begin
+ if Value > Length(Subject) then FStop := Length(Subject)
+ else FStop := Value;
+end;
+
+procedure TPerlRegEx.SetSubject(const Value: string);
+begin
+ FSubject := Value;
+ FSubjectPChar := PChar(Value);
+ FStart := 1;
+ FStop := Length(Subject);
+ OffsetCount := 0;
+end;
+
+
+
+procedure TPerlRegEx.Study;
+var
+ Error: PChar;
+begin
+ if not FCompiled then Compile;
+ Hints := pcre_study(Pattern, 0, @Error);
+ if Error <> nil then raise Exception.Create('TPerlRegEx.Study() - Error studying the regex: ' + AnsiString(Error));
+ FStudied := True
+end;
+
+
+end. \ No newline at end of file