(********************************************************** * * * Perl Regular Expressions * * * * Delphi wrapper around PCRE - http://www.pcre.org * * * * Copyright (C) 1999-2006 Jan Goyvaerts * * * * Design & implementation Jan Goyvaerts 1999-2006 * * * * * * Shorted and added partial ability for PCRE 7.0 * * by Abyss * **********************************************************) unit PerlRegEx; interface uses SysUtils; type TPerlRegExOptions = set of ( preCaseLess, // /i -> Case insensitive preMultiLine, // /m -> ^ and $ also match before/after a newline, not just at the beginning and the end of the string preSingleLine, // /s -> Dot matches any character, including \n (newline). Otherwise, it matches anything except \n preExtended, // /x -> Allow regex to contain extra whitespace, newlines and Perl-style comments, all of which will be filtered out preAnchored, // /A -> Successful match can only occur at the start of the subject or right after the previous match preDollarEndOnly, // /E preExtra, // /X preUnGreedy, // Repeat operators (+, *, ?) are not greedy by default // (i.e. they try to match the minimum number of characters instead of the maximum) preUTF8 // UTF8 ); type TPerlRegExState = set of ( preNotBOL, // Not Beginning Of Line: ^ does not match at the start of Subject preNotEOL, // Not End Of Line: $ does not match at the end of Subject preNotEmpty // Empty matches not allowed ); const // Maximum number of subexpressions (backreferences) // Subexpressions are created by placing round brackets in the regex, and are referenced by \1, \2, ... // In Perl, they are available as $1, $2, ... after the regex matched; with TPerlRegEx, use the Subexpressions property // You can also insert \1, \2, ... in the Replacement string; \0 is the complete matched expression MAX_SUBEXPRESSIONS = 99; type TPerlRegEx = class private // *** Property storage, getters and setters FCompiled, FStudied: Boolean; FOptions: TPerlRegExOptions; FState: TPerlRegExState; FRegEx, FSubject: string; FStart, FStop: Integer; function GetMatchedExpression: string; function GetMatchedExpressionLength: Integer; function GetMatchedExpressionOffset: Integer; procedure SetOptions(Value: TPerlRegExOptions); procedure SetRegEx(const Value: string); function GetSubExpressionCount: Integer; function GetSubExpressions(Index: Integer): string; function GetSubExpressionLengths(Index: Integer): Integer; function GetSubExpressionOffsets(Index: Integer): Integer; procedure SetSubject(const Value: string); procedure SetStart(const Value: Integer); procedure SetStop(const Value: Integer); function GetFoundMatch: Boolean; private // *** Variables used by pcrelib.dll Offsets: array[0..(MAX_SUBEXPRESSIONS+1)*3] of Integer; OffsetCount: Integer; pcreOptions: Integer; pattern, hints, chartable: Pointer; FSubjectPChar: PChar; protected procedure CleanUp; // Dispose off whatever we created, so we can start over. Called automatically when needed, so it is not made public public constructor Create; // Come to life destructor Destroy; override; // Clean up after ourselves class function EscapeRegExChars(const S: string): string; // Escapes regex characters in S so that the regex engine can be used to match S as plain text procedure Compile; // Compile the regex. Called automatically by Match procedure Study; // Study the regex. Studying takes time, but will make the execution of the regex a lot faster. // Call study if you will be using the same regex many times function Match: Boolean; // Attempt to match the regex function MatchAgain: Boolean; // Attempt to match the regex to the remainder of the string after the previous match // To avoid problems (when using ^ in the regex), call MatchAgain only after a succesful Match() function NamedSubExpression(const SEName: string): Integer; // Returns the index of the named group SEName function Config (What: integer): Integer; //This function makes it possible for a client program // to find out which optional features are available in the //version of the PCRE library it is using. property Compiled: Boolean read FCompiled; // True if the RegEx has already been compiled. property FoundMatch: Boolean read GetFoundMatch; // Returns True when MatchedExpression* and SubExpression* indicate a match property Studied: Boolean read FStudied; // True if the RegEx has already been studied property MatchedExpression: string read GetMatchedExpression; // The matched string property MatchedExpressionLength: Integer read GetMatchedExpressionLength; // Length of the matched string property MatchedExpressionOffset: Integer read GetMatchedExpressionOffset; // Character offset in the Subject string at which the matched substring starts property Start: Integer read FStart write SetStart; // Starting position in Subject from which MatchAgain begins property Stop: Integer read FStop write SetStop; // Last character in Subject that Match and MatchAgain search through property State: TPerlRegExState read FState write FState; // State of Subject property SubExpressionCount: Integer read GetSubExpressionCount; // Number of matched subexpressions property SubExpressions[Index: Integer]: string read GetSubExpressions; // Matched subexpressions after a regex has been matched property SubExpressionLengths[Index: Integer]: Integer read GetSubExpressionLengths; // Lengths of the subexpressions property SubExpressionOffsets[Index: Integer]: Integer read GetSubExpressionOffsets; // Character offsets in the Subject string of the subexpressions property Subject: string read FSubject write SetSubject; // The string on which Match() will try to match RegEx published property Options: TPerlRegExOptions read FOptions write SetOptions; // Options property RegEx: string read FRegEx write SetRegEx; // The regular expression to be matched end; implementation { ********* pcrelib.dll imports ********* } const PCRE_CASELESS = $00000001; PCRE_MULTILINE = $00000002; PCRE_SINGLELINE = $00000004; PCRE_EXTENDED = $00000008; PCRE_ANCHORED = $00000010; PCRE_DOLLAR_ENDONLY = $00000020; PCRE_EXTRA = $00000040; PCRE_NOTBOL = $00000080; PCRE_NOTEOL = $00000100; PCRE_UNGREEDY = $00000200; PCRE_NOTEMPTY = $00000400; PCRE_UTF8 = $00000800; PCRE_NO_AUTO_CAPTURE = $00001000; PCRE_NO_UTF8_CHECK = $00002000; PCRE_AUTO_CALLOUT = $00004000; PCRE_PARTIAL = $00008000; PCRE_DFA_SHORTEST = $00010000; PCRE_DFA_RESTART = $00020000; PCRE_FIRSTLINE = $00040000; PCRE_DUPNAMES = $00080000; PCRE_NEWLINE_CR = $00100000; PCRE_NEWLINE_LF = $00200000; PCRE_NEWLINE_CRLF = $00300000; PCRE_NEWLINE_ANY = $00400000; // Exec error codes PCRE_ERROR_NOMATCH = -1; PCRE_ERROR_NULL = -2; PCRE_ERROR_BADOPTION = -3; PCRE_ERROR_BADMAGIC = -4; PCRE_ERROR_UNKNOWN_OPCODE = -5; PCRE_ERROR_UNKNOWN_NODE = -5; // For backward compatibility PCRE_ERROR_NOMEMORY = -6; PCRE_ERROR_NOSUBSTRING = -7; PCRE_ERROR_MATCHLIMIT = -8; PCRE_ERROR_CALLOUT = -9; // Never used by PCRE itself PCRE_ERROR_BADUTF8 =-10; PCRE_ERROR_BADUTF8_OFFSET =-11; PCRE_ERROR_PARTIAL =-12; PCRE_ERROR_BADPARTIAL =-13; PCRE_ERROR_INTERNAL =-14; PCRE_ERROR_BADCOUNT =-15; PCRE_ERROR_DFA_UITEM =-16; PCRE_ERROR_DFA_UCOND =-17; PCRE_ERROR_DFA_UMLIMIT =-18; PCRE_ERROR_DFA_WSSIZE =-19; PCRE_ERROR_DFA_RECURSE =-20; PCRE_ERROR_RECURSIONLIMIT =-21; PCRE_ERROR_NULLWSLIMIT =-22; PCRE_ERROR_BADNEWLINE =-23; (* Request types for pcre_config(). Do not re-arrange, in order to remain compatible. *) PCRE_CONFIG_UTF8 = 0; PCRE_CONFIG_NEWLINE = 1; PCRE_CONFIG_LINK_SIZE = 2; PCRE_CONFIG_POSIX_MALLOC_THRESHOLD = 3; PCRE_CONFIG_MATCH_LIMIT = 4; PCRE_CONFIG_STACKRECURSE = 5; PCRE_CONFIG_UNICODE_PROPERTIES = 6; PCRE_CONFIG_MATCH_LIMIT_RECURSION = 7; type PPChar = ^PChar; PInt = ^Integer; // Functions we import from the PCRE library DLL // Leading underscores gratuitously added by Borland C++Builder 6.0 function pcre_maketables: PAnsiChar; cdecl; external 'pcre3.dll'; function pcre_compile(const pattern: PChar; options: Integer; errorptr: PPChar; erroroffset: PInt; const tables: PChar): Pointer; cdecl; external 'pcre3.dll'; function pcre_exec(const pattern: Pointer; const hints: Pointer; const subject: PChar; length, startoffset: Integer; options: Integer; offsets: PInt; offsetcount: Integer): Integer; cdecl; external 'pcre3.dll'; function pcre_get_stringnumber(const pattern: Pointer; const Name: PChar): Integer; cdecl; external 'pcre3.dll'; function pcre_study(const pattern: Pointer; options: Integer; errorptr: PPChar): Pointer; cdecl; external 'pcre3.dll'; function pcre_fullinfo(const pattern: Pointer; const hints: Pointer; what: Integer; where: Pointer): Integer; cdecl; external 'pcre3.dll'; function pcre_version: pchar; cdecl; external 'pcre3.dll'; function pcre_config (what:integer; where:pointer):integer; cdecl; external 'pcre3.dll'; //procedure pcre_free(ptr: Pointer); cdecl; external 'pcre3.dll'; { ********* TPerlRegEx component ********* } procedure TPerlRegEx.CleanUp; begin FCompiled := False; FStudied := False; pattern := nil; hints := nil; OffsetCount := 0; end; procedure TPerlRegEx.Compile; var Error: PChar; ErrorOffset: Integer; begin if FRegEx = '' then raise Exception.Create('TPerlRegEx.Compile() - Please specify a regular expression in RegEx first'); CleanUp; Pattern := pcre_compile(PChar(FRegEx), pcreOptions, @Error, @ErrorOffset, chartable); if Pattern = nil then raise Exception.Create(Format('TPerlRegEx.Compile() - Error in regex at offset %d: %s', [ErrorOffset, AnsiString(Error)])); FCompiled := True end; constructor TPerlRegEx.Create; begin inherited Create; FState := [preNotEmpty]; chartable := pcre_maketables; end; destructor TPerlRegEx.Destroy; begin CleanUp; inherited Destroy; end; class function TPerlRegEx.EscapeRegExChars(const S: string): string; var I: Integer; begin Result := S; I := Length(Result); while I > 0 do begin if Result[I] in ['.', '[', ']', '(', ')', '?', '*', '+', '{', '}', '^', '$', '|', '\'] then Insert('\', Result, I) else if Result[I] = #0 then begin Result[I] := '0'; Insert('\', Result, I); end; Dec(I); end; end; function TPerlRegEx.GetFoundMatch: Boolean; begin Result := OffsetCount > 0; end; function TPerlRegEx.GetMatchedExpression: string; begin Assert(FoundMatch, 'REQUIRE: There must be a successful match first'); Result := GetSubExpressions(0); end; function TPerlRegEx.GetMatchedExpressionLength: Integer; begin Assert(FoundMatch, 'REQUIRE: There must be a successful match first'); Result := GetSubExpressionLengths(0) end; function TPerlRegEx.GetMatchedExpressionOffset: Integer; begin Assert(FoundMatch, 'REQUIRE: There must be a successful match first'); Result := GetSubExpressionOffsets(0) end; function TPerlRegEx.GetSubExpressionCount: Integer; begin Assert(FoundMatch, 'REQUIRE: There must be a successful match first'); Result := OffsetCount-1 end; function TPerlRegEx.GetSubExpressionLengths(Index: Integer): Integer; begin Assert(FoundMatch, 'REQUIRE: There must be a successful match first'); Assert((Index >= 0) and (Index <= SubExpressionCount), 'REQUIRE: Index <= SubExpressionCount'); Result := Offsets[Index*2+1]-Offsets[Index*2] end; function TPerlRegEx.GetSubExpressionOffsets(Index: Integer): Integer; begin Assert(FoundMatch, 'REQUIRE: There must be a successful match first'); Assert((Index >= 0) and (Index <= SubExpressionCount), 'REQUIRE: Index <= SubExpressionCount'); Result := Offsets[Index*2] end; function TPerlRegEx.GetSubExpressions(Index: Integer): string; begin Assert(FoundMatch, 'REQUIRE: There must be a successful match first'); if Index > SubExpressionCount then Result := '' else Result := Copy(FSubject, Offsets[Index*2], Offsets[Index*2+1]-Offsets[Index*2]); end; function TPerlRegEx.Match: Boolean; var I, Opts: Integer; begin if not Compiled then Compile; if preNotBOL in State then Opts := PCRE_NOTBOL else Opts := 0; if preNotEOL in State then Opts := Opts or PCRE_NOTEOL; if preNotEmpty in State then Opts := Opts or PCRE_NOTEMPTY; if FStart > FStop then OffsetCount := -1 else OffsetCount := pcre_exec(Pattern, Hints, FSubjectPChar, FStop, 0, Opts, @Offsets[0], High(Offsets)); Result := OffsetCount > 0; // Convert offsets into string indices if Result then begin for I := 0 to OffsetCount*2-1 do Inc(Offsets[I]); FStart := Offsets[1]; if Offsets[0] = Offsets[1] then Inc(FStart); // Make sure we don't get stuck at the same position end; end; function TPerlRegEx.MatchAgain: Boolean; var I, Opts: Integer; begin if not Compiled then Compile; if preNotBOL in State then Opts := PCRE_NOTBOL else Opts := 0; if preNotEOL in State then Opts := Opts or PCRE_NOTEOL; if preNotEmpty in State then Opts := Opts or PCRE_NOTEMPTY; if FStart > FStop then OffsetCount := -1 else OffsetCount := pcre_exec(Pattern, Hints, FSubjectPChar, FStop, FStart-1, Opts, @Offsets[0], High(Offsets)); Result := OffsetCount > 0; // Convert offsets into string indices if Result then begin for I := 0 to OffsetCount*2-1 do Inc(Offsets[I]); FStart := Offsets[1]; if Offsets[0] = Offsets[1] then Inc(FStart); // Make sure we don't get stuck at the same position end; end; function TPerlRegEx.NamedSubExpression(const SEName: string): Integer; begin Result := pcre_get_stringnumber(Pattern, PChar(SEName)); end; function TPerlRegEx.Config(What: integer):integer; begin result:=-1; pcre_config(what, @result); end; procedure TPerlRegEx.SetOptions(Value: TPerlRegExOptions); begin if (FOptions <> Value) then begin FOptions := Value; pcreOptions := 0; if (preCaseLess in Value) then pcreOptions := pcreOptions or PCRE_CASELESS; if (preMultiLine in Value) then pcreOptions := pcreOptions or PCRE_MULTILINE; if (preSingleLine in Value) then pcreOptions := pcreOptions or PCRE_SINGLELINE; if (preExtended in Value) then pcreOptions := pcreOptions or PCRE_EXTENDED; if (preAnchored in Value) then pcreOptions := pcreOptions or PCRE_ANCHORED; if (preDollarEndOnly in Value) then pcreOptions := pcreOptions or PCRE_DOLLAR_ENDONLY; if (preExtra in Value) then pcreOptions := pcreOptions or PCRE_EXTRA; if (preUnGreedy in Value) then pcreOptions := pcreOptions or PCRE_UNGREEDY; if (preUTF8 in Value) then pcreOptions := pcreOptions or PCRE_UTF8; CleanUp end end; procedure TPerlRegEx.SetRegEx(const Value: string); begin if FRegEx <> Value then begin FRegEx := Value; CleanUp end end; procedure TPerlRegEx.SetStart(const Value: Integer); begin if Value < 1 then FStart := 1 else FStart := Value; // If FStart > Length(Subject), MatchAgain() will simply return False end; procedure TPerlRegEx.SetStop(const Value: Integer); begin if Value > Length(Subject) then FStop := Length(Subject) else FStop := Value; end; procedure TPerlRegEx.SetSubject(const Value: string); begin FSubject := Value; FSubjectPChar := PChar(Value); FStart := 1; FStop := Length(Subject); OffsetCount := 0; end; procedure TPerlRegEx.Study; var Error: PChar; begin if not FCompiled then Compile; Hints := pcre_study(Pattern, 0, @Error); if Error <> nil then raise Exception.Create('TPerlRegEx.Study() - Error studying the regex: ' + AnsiString(Error)); FStudied := True end; end.