Return to BSD News archive
Path: sserve!newshost.anu.edu.au!munnari.oz.au!news.Hawaii.Edu!ames!saimiri.primate.wisc.edu!zaphod.mps.ohio-state.edu!cs.utexas.edu!uunet!mcsun!dkuug!dde!kim From: kim@dde.dk (Kim Andersen) Newsgroups: comp.os.386bsd.bugs Subject: re_exec, re_comp etc. code Message-ID: <1993Mar18.174631.28789@dde.dk> Date: 18 Mar 93 17:46:31 GMT Organization: Dansk Data Elektronik A/S Lines: 1481 Here are patches to add re_exec and companion functions. The code is written by Ozan Yigit, and kindly donated to the public domain. To apply: patch -d/usr/src/lib/libc/gen -p0 -N < "what ever you saved this file as" *** Makefile.inc~ Thu Mar 18 18:31:13 1993 --- Makefile.inc Thu Mar 18 18:31:13 1993 *************** *** 1,7 **** # @(#)Makefile.inc 5.21 (Berkeley) 5/24/91 # gen sources ! .PATH: ${.CURDIR}/${MACHINE}/gen ${.CURDIR}/gen ${.CURDIR}/gen/regexp SRCS+= alarm.c clock.c closedir.c crypt_dummy.c ctermid.c ctime.c ctype_.c \ difftime.c disklabel.c errlst.c exec.c fnmatch.c frexp.c fstab.c \ --- 1,7 ---- # @(#)Makefile.inc 5.21 (Berkeley) 5/24/91 # gen sources ! .PATH: ${.CURDIR}/${MACHINE}/gen ${.CURDIR}/gen ${.CURDIR}/gen/regexp ${.CURDIR}/gen/regex SRCS+= alarm.c clock.c closedir.c crypt_dummy.c ctermid.c ctime.c ctype_.c \ difftime.c disklabel.c errlst.c exec.c fnmatch.c frexp.c fstab.c \ *************** *** 21,26 **** --- 21,29 ---- # gen/regexp sources SRCS+= regerror.c regexp.c regsub.c + # gen/regex sources + SRCS+= re_fail.c regex.c + .if (${MACHINE} == "hp300") SRCS+= _setjmp.s alloca.s fabs.s ldexp.s modf.s setjmp.s SRCS+= adddf3.s addsf3.s ashlsi3.s ashrsi3.s cmpdf2.s cmpsf2.s divdf3.s \ *************** *** 47,53 **** isalpha.0 isascii.0 iscntrl.0 isdigit.0 isgraph.0 isinf.0 \ islower.0 isprint.0 ispunct.0 isspace.0 isupper.0 isxdigit.0 \ ldexp.0 modf.0 nice.0 nlist.0 pause.0 popen.0 psignal.0 \ ! raise.0 regexp.0 scandir.0 setjmp.0 setmode.0 setuid.0 \ siginterrupt.0 signal.0 sigsetops.0 sleep.0 syslog.0 time.0 \ times.0 timezone.0 tolower.0 toupper.0 ttyname.0 tzset.0 \ ualarm.0 unvis.0 usleep.0 utime.0 valloc.0 vis.0 --- 50,56 ---- isalpha.0 isascii.0 iscntrl.0 isdigit.0 isgraph.0 isinf.0 \ islower.0 isprint.0 ispunct.0 isspace.0 isupper.0 isxdigit.0 \ ldexp.0 modf.0 nice.0 nlist.0 pause.0 popen.0 psignal.0 \ ! raise.0 regex.0 regexp.0 scandir.0 setjmp.0 setmode.0 setuid.0 \ siginterrupt.0 signal.0 sigsetops.0 sleep.0 syslog.0 time.0 \ times.0 timezone.0 tolower.0 toupper.0 ttyname.0 tzset.0 \ ualarm.0 unvis.0 usleep.0 utime.0 valloc.0 vis.0 *************** *** 73,78 **** --- 76,82 ---- MLINKS+=glob.3 globfree.3 MLINKS+=popen.3 pclose.3 MLINKS+=psignal.3 sys_siglist.3 + MLINKS+=regex.3 MLINKS+=regexp.3 regcomp.3 regexp.3 regexec.3 regexp.3 regsub.3 \ regexp.3 regerror.3 MLINKS+=scandir.3 alphasort.3 *** /dev/null Thu Mar 18 15:24:08 1993 --- regex/grep.c Thu Mar 18 18:31:14 1993 *************** *** 0 **** --- 1,67 ---- + #ifdef vms + #include stdio + #else + #include <stdio.h> + #endif + + /* + * Rudimentary grep to test regex routines. + * + * DEBUG is only applicable + * to oZ version of regex. Make sure to + * compile regex.c with -DDEBUG as well. + * + */ + extern char *re_comp(); + extern re_exec(); + + main(argc,argv) + char *argv[]; + { + char str[512]; + FILE *f; + register int n; + register char *p; + + if (argc < 2) + error("usage: grep pat [file]"); + + if ((p = re_comp(argv[1])) != 0) { + printf("\t%s: %s\n", p, argv[1]); + exit(1); + } + #ifdef DEBUG + symbolic(argv[1]); + #endif + if (p = argv[2]) { + if ((f = fopen(p, "r")) == NULL) { + printf("cannot open %s\n", argv[2]); + exit(1); + } + while ((n = load(str, f)) != EOF) + if (re_exec(str)) + printf("%s\n",str); + + } + exit(0); + } + load (s, f) + char *s; + FILE *f; + { + register int c; + static int lineno = 0; + + while ((c = getc(f)) != '\n' && c != EOF) + *s++ = c; + if (c == EOF) + return (EOF); + *s = (char) 0; + return (++lineno); + } + + error(s) char *s ; + { + fprintf(stderr,"%s\n",s); + exit(1); + } *** /dev/null Thu Mar 18 15:24:08 1993 --- regex/makefile Thu Mar 18 18:31:15 1993 *************** *** 0 **** --- 1,94 ---- + # + # regex routines (PUBLIC DOMAIN) + # + # by: Ozan S. Yigit (oz) + # Dept. of Computer Science + # York University + # + # Applicable to BSD: + # + # If you have the generic(?) regex routines + # than you can compare the timings of these + # routines to the generic ones by: + # + # make times + # + # which will create two rudimentary greps + # lgrep and ogrep. lgrep will use the generic + # regex routines, and ogrep will use oz version + # of regex. Several patterns will be searched + # in /usr/dict/words, and the output of the greps + # will be compared. [for reasons of sanity] + # + # Surely, you will note, the time tests are somewhat + # biased, since /usr/dict/words contain *short* lines, + # thereby the real-life case of searching a complex + # expression within a long line is not covered. You + # will find, however, that the PD regex routines will + # search *as fast* as the generic ones in most + # cases, and about 10% slower in some cases, when + # tested with files containing *long* lines. + # + CFLAGS = -O + # + # test patterns + # + PAT1 = '[d-f]zz*.*m' + PAT2 = 'fo[ornt]*.*b[a-d]*' + PAT3 = '.th.' + PAT4 = '\(ab\)[a-d]*\1' + PAT5 = 'burp' + + FILE = /usr/dict/words + OUTD = /tmp/ + + RSRC = regex.o re_fail.o + + regex: $(RSRC) + # + # insert code to put these into a library + # + rlint: + lint -phc regex.c + debug: + cc -O -DDEBUG -o ogrep grep.c regex.c re_fail.c + + lgrep: grep.o + cc -o lgrep grep.o + + ogrep: grep.o $(RSRC) + cc -o ogrep grep.o $(RSRC) + + times: lgrep ogrep + @echo generic regex vs oz regex + # @echo pattern: $(PAT1) + time ogrep $(PAT1) $(FILE) >$(OUTD)ogrep.out + time lgrep $(PAT1) $(FILE) >$(OUTD)lgrep.out + @echo output differences: + -diff $(OUTD)ogrep.out $(OUTD)lgrep.out + @echo "---" + # @echo pattern: $(PAT2) + time ogrep $(PAT2) $(FILE) >$(OUTD)ogrep.out + time lgrep $(PAT2) $(FILE) >$(OUTD)lgrep.out + @echo output differences: + -diff $(OUTD)ogrep.out $(OUTD)lgrep.out + @echo "---" + # echo pattern: $(PAT3) + time ogrep $(PAT3) $(FILE) >$(OUTD)ogrep.out + time lgrep $(PAT3) $(FILE) >$(OUTD)lgrep.out + @echo output differences: + -diff $(OUTD)ogrep.out $(OUTD)lgrep.out + @echo "---" + # echo pattern: $(PAT4) + time ogrep $(PAT4) $(FILE) >$(OUTD)ogrep.out + time lgrep $(PAT4) $(FILE) >$(OUTD)lgrep.out + @echo output differences: + -diff $(OUTD)ogrep.out $(OUTD)lgrep.out + @echo "---" + # echo pattern: $(PAT5) + time ogrep $(PAT5) $(FILE) >$(OUTD)ogrep.out + time lgrep $(PAT5) $(FILE) >$(OUTD)lgrep.out + @echo output differences: + -diff $(OUTD)ogrep.out $(OUTD)lgrep.out + @echo "---" + @rm $(OUTD)ogrep.out $(OUTD)lgrep.out *** /dev/null Thu Mar 18 15:24:08 1993 --- regex/re_fail.c Thu Mar 18 18:31:16 1993 *************** *** 0 **** --- 1,22 ---- + + #ifdef vms + #include stdio + #else + #include <stdio.h> + #endif + + /* + * re_fail: + * internal error handler for re_exec. + * + * should probably do something like a + * longjump to recover gracefully. + */ + void + re_fail(s, c) + char *s; + char c; + { + fprintf(stderr, "%s [opcode %o]\n", s, c); + exit(1); + } *** /dev/null Thu Mar 18 15:24:08 1993 --- regex/regex.3 Thu Mar 18 18:31:16 1993 *************** *** 0 **** --- 1,326 ---- + .TH regex 3 local + .DA Jun 19 1986 + .SH NAME + re_comp, re_exec, re_subs, re_modw, re_fail \- regular expression handling + .SH ORIGIN + Dept. of Computer Science + .br + York University + .SH SYNOPSIS + .B char *re_comp(pat) + .br + .B char *pat; + .PP + .B re_exec(str) + .br + .B char *str; + .PP + .B re_subs(src, dst) + .br + .B char *src; + .br + .B char *dst; + .PP + .B void re_fail(msg, op) + .br + .B char *msg; + .br + .B char op; + .PP + .B void re_modw(str) + .br + .B char *str; + + .SH DESCRIPTION + .PP + These functions implement + .IR ed (1)-style + partial regular expressions and supporting facilities. + .PP + .I Re_comp + compiles a pattern string into an internal form (a deterministic finite-state + automaton) to be executed by + .I re_exec + for pattern matching. + .I Re_comp + returns 0 if the pattern is compiled successfully, otherwise it returns an + error message string. If + .I re_comp + is called with a 0 or a \fInull\fR string, it returns without changing the + currently compiled regular expression. + .sp + .I Re_comp + supports the same limited set of + .I regular expressions + found in + .I ed + and Berkeley + .IR regex (3) + routines: + .sp + .if n .in +1.6i + .if t .in +1i + .de Ti + .if n .ti -1.6i + .if t .ti -1i + .. + .if n .ta 0.8i +0.8i +0.8i + .if t .ta 0.5i +0.5i +0.5i + .Ti + [1] \fIchar\fR Matches itself, unless it is a special + character (meta-character): \fB. \\ [ ] * + ^ $\fR + + .Ti + [2] \fB.\fR Matches \fIany\fR character. + + .Ti + [3] \fB\\\fR Matches the character following it, except + when followed by a digit 1 to 9, \fB(\fR, fB)\fR, \fB<\fR or \fB>\fR. + (see [7], [8] and [9]) It is used as an escape character for all + other meta-characters, and itself. When used + in a set ([4]), it is treated as an ordinary + character. + + .Ti + [4] \fB[\fIset\fB]\fR Matches one of the characters in the set. + If the first character in the set is \fB^\fR, + it matches a character NOT in the set. A + shorthand + .IR S - E + is used to specify a set of + characters + .I S + up to + .IR E , + inclusive. The special + characters \fB]\fR and \fB-\fR have no special + meaning if they appear as the first chars + in the set. + .nf + examples: match: + [a-z] any lowercase alpha + [^]-] any char except ] and - + [^A-Z] any char except + uppercase alpha + [a-zA-Z0-9] any alphanumeric + .fi + + .Ti + [5] \fB*\fR Any regular expression form [1] to [4], followed by + closure char (*) matches zero or more matches of + that form. + + .Ti + [6] \fB+\fR Same as [5], except it matches one or more. + + .Ti + [7] A regular expression in the form [1] to [10], enclosed + as \\(\fIform\fR\\) matches what form matches. The enclosure + creates a set of tags, used for [8] and for + pattern substitution in + .I re_subs. + The tagged forms are numbered + starting from 1. + + .Ti + [8] A \\ followed by a digit 1 to 9 matches whatever a + previously tagged regular expression ([7]) matched. + + .Ti + [9] \fB\\<\fR Matches the beginning of a \fIword\fR, + that is, an empty string followed by a + letter, digit, or _ and not preceded by + a letter, digit, or _ . + .Ti + \fB\\>\fR Matches the end of a \fIword\fR, + that is, an empty string preceded + by a letter, digit, or _ , and not + followed by a letter, digit, or _ . + + .Ti + [10] A composite regular expression + \fIxy\fR where \fIx\fR and \fIy\fR + are in the form of [1] to [10] matches the longest + match of \fIx\fR followed by a match for \fIy\fR. + + .Ti + [11] \fB^ $\fR a regular expression starting with a \fB^\fR character + and/or ending with a \fB$\fR character, restricts the + pattern matching to the beginning of the line, + and/or the end of line [anchors]. Elsewhere in the + pattern, \fB^\fR and \fB$\fR are treated as ordinary characters. + .if n .in -1.6i + .if t .in -1i + + .PP + .I Re_exec + executes the internal form produced by + .I re_comp + and searches the argument string for the regular expression described + by the internal + form. + .I Re_exec + returns 1 if the last regular expression pattern is matched within the string, + 0 if no match is found. In case of an internal error (corrupted internal + form), + .I re_exec + calls the user-supplied + .I re_fail + and returns 0. + .PP + The strings passed to both + .I re_comp + and + .I re_exec + may have trailing or embedded newline characters. The strings + must be terminated by nulls. + .PP + .I Re_subs + does + .IR ed -style + pattern substitution, after a successful match is found by + .I re_exec. + The source string parameter to + .I re_subs + is copied to the destination string with the following interpretation; + .sp + .if n .in +1.6i + .if t .in +1i + .Ti + [1] & Substitute the entire matched string in the destination. + + .Ti + [2] \\\fIn\fR Substitute the substring matched by a tagged subpattern + numbered \fIn\fR, where \fIn\fR is between 1 to 9, inclusive. + + .Ti + [3] \\\fIchar\fR Treat the next character literally, + unless the character is a digit ([2]). + .if n .in -1.6i + .if t .in -1i + + .PP + If the copy operation with the substitutions is successful, + .I re_subs + returns 1. + If the source string is corrupted, or the last call to + .I re_exec + fails, it returns 0. + + .I Re_modw + is used to + add new characters into an internal table to + change the re_exec's understanding of what + a \fIword\fR should look like, when matching with \fB\\<\fR and \fB\\>\fR + constructs. If the string parameter is 0 or null string, + the table is reset back to the default, which contains \fBA-Z a-z 0-9 _\fR . + + .I Re_fail + is a user-supplied routine to handle internal errors. + .I re_exec + calls + .I re_fail + with an error message string, and the opcode character that caused the error. + The default + .I re_fail + routine simply prints the message and the opcode character to + .I stderr + and invokes + .IR exit (2). + .SH EXAMPLES + In the examples below, the + .I nfaform + describes the internal form after the pattern is compiled. For additional + details, refer to the sources. + .PP + .ta 0.5i +0.5i +0.5i + .nf + foo*.* + nfaform: CHR f CHR o CLO CHR o END CLO ANY END END + matches: \fIfo foo fooo foobar fobar foxx ...\fR + + fo[ob]a[rz] + nfaform: CHR f CHR o CCL 2 o b CHR a CCL 2 r z END + matches: \fIfobar fooar fobaz fooaz\fR + + foo\\\\+ + nfaform: CHR f CHR o CHR o CHR \\ CLO CHR \\ END END + matches: \fIfoo\\ foo\\\\ foo\\\\\\ ...\fR + + \\(foo\\)[1-3]\\1 (same as foo[1-3]foo, but takes less internal space) + nfaform: BOT 1 CHR f CHR o CHR o EOT 1 CCL 3 1 2 3 REF 1 END + matches: \fIfoo1foo foo2foo foo3foo\fR + + \\(fo.*\\)-\\1 + nfaform: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END + matches: \fIfoo-foo fo-fo fob-fob foobar-foobar ...\fR + .SH DIAGNOSTICS + .I Re_comp + returns one of the following strings if an error occurs: + .PP + .nf + .in +0.5i + \fINo previous regular expression, + Empty closure, + Illegal closure, + Cyclical reference, + Undetermined reference, + Unmatched \e(, + Missing ], + Null pattern inside \e(\e), + Null pattern inside \e<\e>, + Too many \e(\e) pairs, + Unmatched \e)\fP. + .in -0.5i + .fi + .SH REFERENCES + .if n .ta 3i + .if t .ta 1.8i + .nf + \fISoftware tools\fR Kernighan & Plauger + \fISoftware tools in Pascal\fR Kernighan & Plauger + \fIGrep sources\fR [rsx-11 C dist] David Conroy + \fIEd - text editor\fR Unix Programmer's Manual + \fIAdvanced editing on Unix\fR B. W. Kernighan + \fIRegExp sources\fR Henry Spencer + .fi + .SH "HISTORY AND NOTES" + These routines are derived from various implementations + found in + .I "Software Tools" + books, and David Conroy's + .I grep. + They are NOT derived from licensed/restricted software. + For more interesting/academic/complicated implementations, + see Henry Spencer's + .I regexp + routines (V8), or + .I "GNU Emacs" + pattern + matching module. + .PP + The + .I re_comp + and + .I re_exec + routines perform + .I almost + as well as their licensed counterparts, sometimes better. + In very few instances, they + are about 10% to 15% slower. + .SH AUTHOR + Ozan S. Yigit (oz) + .br + usenet: utzoo!yetti!oz + .br + bitnet: oz@yusol || oz@yuyetti + .SH "SEE ALSO" + ed(1), ex(1), egrep(1), fgrep(1), grep(1), regex(3) + .SH BUGS + These routines are \fIPublic Domain\fR. You can get them + in source. + .br + The internal storage for the \fInfa form\fR is not checked for + overflows. Currently, it is 1024 bytes. + .br + Others, no doubt. *** /dev/null Thu Mar 18 15:24:08 1993 --- regex/regex.c Thu Mar 18 18:31:17 1993 *************** *** 0 **** --- 1,877 ---- + /* + * regex - Regular expression pattern matching + * and replacement + * + * + * By: Ozan S. Yigit (oz) + * Dept. of Computer Science + * York University + * + * + * These routines are the PUBLIC DOMAIN equivalents + * of regex routines as found in 4.nBSD UN*X, with minor + * extensions. + * + * These routines are derived from various implementations + * found in software tools books, and Conroy's grep. They + * are NOT derived from licensed/restricted software. + * For more interesting/academic/complicated implementations, + * see Henry Spencer's regexp routines, or GNU Emacs pattern + * matching module. + * + * Modification history: + * + * $Log: regex.c,v $ + * Revision 1.3 89/04/01 14:18:09 oz + * Change all references to a dfa: this is actually an nfa. + * + * Revision 1.2 88/08/28 15:36:04 oz + * Use a complement bitmap to represent NCL. + * This removes the need to have seperate + * code in the pmatch case block - it is + * just CCL code now. + * + * Use the actual CCL code in the CLO + * section of pmatch. No need for a recursive + * pmatch call. + * + * Use a bitmap table to set char bits in an + * 8-bit chunk. + * + * + * Routines: + * re_comp: compile a regular expression into + * a NFA. + * + * char *re_comp(s) + * char *s; + * + * re_exec: execute the NFA to match a pattern. + * + * int re_exec(s) + * char *s; + * + * re_modw change re_exec's understanding of what + * a "word" looks like (for \< and \>) + * by adding into the hidden word-syntax + * table. + * + * void re_modw(s) + * char *s; + * + * re_subs: substitute the matched portions in + * a new string. + * + * int re_subs(src, dst) + * char *src; + * char *dst; + * + * re_fail: failure routine for re_exec. + * + * void re_fail(msg, op) + * char *msg; + * char op; + * + * Regular Expressions: + * + * [1] char matches itself, unless it is a special + * character (metachar): . \ [ ] * + ^ $ + * + * [2] . matches any character. + * + * [3] \ matches the character following it, except + * when followed by a left or right round bracket, + * a digit 1 to 9 or a left or right angle bracket. + * (see [7], [8] and [9]) + * It is used as an escape character for all + * other meta-characters, and itself. When used + * in a set ([4]), it is treated as an ordinary + * character. + * + * [4] [set] matches one of the characters in the set. + * If the first character in the set is "^", + * it matches a character NOT in the set, i.e. + * complements the set. A shorthand S-E is + * used to specify a set of characters S upto + * E, inclusive. The special characters "]" and + * "-" have no special meaning if they appear + * as the first chars in the set. + * examples: match: + * + * [a-z] any lowercase alpha + * + * [^]-] any char except ] and - + * + * [^A-Z] any char except uppercase + * alpha + * + * [a-zA-Z] any alpha + * + * [5] * any regular expression form [1] to [4], followed by + * closure char (*) matches zero or more matches of + * that form. + * + * [6] + same as [5], except it matches one or more. + * + * [7] a regular expression in the form [1] to [10], enclosed + * as \(form\) matches what form matches. The enclosure + * creates a set of tags, used for [8] and for + * pattern substution. The tagged forms are numbered + * starting from 1. + * + * [8] a \ followed by a digit 1 to 9 matches whatever a + * previously tagged regular expression ([7]) matched. + * + * [9] \< a regular expression starting with a \< construct + * \> and/or ending with a \> construct, restricts the + * pattern matching to the beginning of a word, and/or + * the end of a word. A word is defined to be a character + * string beginning and/or ending with the characters + * A-Z a-z 0-9 and _. It must also be preceded and/or + * followed by any character outside those mentioned. + * + * [10] a composite regular expression xy where x and y + * are in the form [1] to [10] matches the longest + * match of x followed by a match for y. + * + * [11] ^ a regular expression starting with a ^ character + * $ and/or ending with a $ character, restricts the + * pattern matching to the beginning of the line, + * or the end of line. [anchors] Elsewhere in the + * pattern, ^ and $ are treated as ordinary characters. + * + * + * Acknowledgements: + * + * HCR's Hugh Redelmeier has been most helpful in various + * stages of development. He convinced me to include BOW + * and EOW constructs, originally invented by Rob Pike at + * the University of Toronto. + * + * References: + * Software tools Kernighan & Plauger + * Software tools in Pascal Kernighan & Plauger + * Grep [rsx-11 C dist] David Conroy + * ed - text editor Un*x Programmer's Manual + * Advanced editing on Un*x B. W. Kernighan + * RegExp routines Henry Spencer + * + * Notes: + * + * This implementation uses a bit-set representation for character + * classes for speed and compactness. Each character is represented + * by one bit in a 128-bit block. Thus, CCL always takes a + * constant 16 bytes in the internal nfa, and re_exec does a single + * bit comparison to locate the character in the set. + * + * Examples: + * + * pattern: foo*.* + * compile: CHR f CHR o CLO CHR o END CLO ANY END END + * matches: fo foo fooo foobar fobar foxx ... + * + * pattern: fo[ob]a[rz] + * compile: CHR f CHR o CCL bitset CHR a CCL bitset END + * matches: fobar fooar fobaz fooaz + * + * pattern: foo\\+ + * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END + * matches: foo\ foo\\ foo\\\ ... + * + * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo) + * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END + * matches: foo1foo foo2foo foo3foo + * + * pattern: \(fo.*\)-\1 + * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END + * matches: foo-foo fo-fo fob-fob foobar-foobar ... + * + */ + + #define MAXNFA 1024 + #define MAXTAG 10 + + #define OKP 1 + #define NOP 0 + + #define CHR 1 + #define ANY 2 + #define CCL 3 + #define BOL 4 + #define EOL 5 + #define BOT 6 + #define EOT 7 + #define BOW 8 + #define EOW 9 + #define REF 10 + #define CLO 11 + + #define END 0 + + /* + * The following defines are not meant + * to be changeable. They are for readability + * only. + * + */ + #define MAXCHR 128 + #define CHRBIT 8 + #define BITBLK MAXCHR/CHRBIT + #define BLKIND 0170 + #define BITIND 07 + + #define ASCIIB 0177 + + typedef /*unsigned*/ char CHAR; + + static int tagstk[MAXTAG]; /* subpat tag stack..*/ + static CHAR nfa[MAXNFA]; /* automaton.. */ + static int sta = NOP; /* status of lastpat */ + + static CHAR bittab[BITBLK]; /* bit table for CCL */ + /* pre-set bits... */ + static CHAR bitarr[] = {1,2,4,8,16,32,64,128}; + + static void + chset(c) register CHAR c; { bittab[((c)&BLKIND)>>3] |= bitarr[(c)&BITIND]; } + + #define badpat(x) return(*nfa = END, x) + #define store(x) *mp++ = x + + char * + re_comp(pat) + char *pat; + { + register char *p; /* pattern pointer */ + register CHAR *mp=nfa; /* nfa pointer */ + register CHAR *lp; /* saved pointer.. */ + register CHAR *sp=nfa; /* another one.. */ + + register int tagi = 0; /* tag stack index */ + register int tagc = 1; /* actual tag count */ + + register int n; + register CHAR mask; /* xor mask -CCL/NCL */ + int c1, c2; + + if (!pat || !*pat) + if (sta) + return(0); + else + badpat("No previous regular expression"); + sta = NOP; + + for (p = pat; *p; p++) { + lp = mp; + switch(*p) { + + case '.': /* match any char.. */ + store(ANY); + break; + + case '^': /* match beginning.. */ + if (p == pat) + store(BOL); + else { + store(CHR); + store(*p); + } + break; + + case '$': /* match endofline.. */ + if (!*(p+1)) + store(EOL); + else { + store(CHR); + store(*p); + } + break; + + case '[': /* match char class..*/ + store(CCL); + + if (*++p == '^') { + mask = 0377; + p++; + } + else + mask = 0; + + if (*p == '-') /* real dash */ + chset(*p++); + if (*p == ']') /* real brac */ + chset(*p++); + while (*p && *p != ']') { + if (*p == '-' && *(p+1) && *(p+1) != ']') { + p++; + c1 = *(p-2) + 1; + c2 = *p++; + while (c1 <= c2) + chset(c1++); + } + #ifdef EXTEND + else if (*p == '\\' && *(p+1)) { + p++; + chset(*p++); + } + #endif + else + chset(*p++); + } + if (!*p) + badpat("Missing ]"); + + for (n = 0; n < BITBLK; bittab[n++] = (char) 0) + store(mask ^ bittab[n]); + + break; + + case '*': /* match 0 or more.. */ + case '+': /* match 1 or more.. */ + if (p == pat) + badpat("Empty closure"); + lp = sp; /* previous opcode */ + if (*lp == CLO) /* equivalence.. */ + break; + switch(*lp) { + + case BOL: + case BOT: + case EOT: + case BOW: + case EOW: + case REF: + badpat("Illegal closure"); + default: + break; + } + + if (*p == '+') + for (sp = mp; lp < sp; lp++) + store(*lp); + + store(END); + store(END); + sp = mp; + while (--mp > lp) + *mp = mp[-1]; + store(CLO); + mp = sp; + break; + + case '\\': /* tags, backrefs .. */ + switch(*++p) { + + case '(': + if (tagc < MAXTAG) { + tagstk[++tagi] = tagc; + store(BOT); + store(tagc++); + } + else + badpat("Too many \\(\\) pairs"); + break; + case ')': + if (*sp == BOT) + badpat("Null pattern inside \\(\\)"); + if (tagi > 0) { + store(EOT); + store(tagstk[tagi--]); + } + else + badpat("Unmatched \\)"); + break; + case '<': + store(BOW); + break; + case '>': + if (*sp == BOW) + badpat("Null pattern inside \\<\\>"); + store(EOW); + break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + n = *p-'0'; + if (tagi > 0 && tagstk[tagi] == n) + badpat("Cyclical reference"); + if (tagc > n) { + store(REF); + store(n); + } + else + badpat("Undetermined reference"); + break; + #ifdef EXTEND + case 'b': + store(CHR); + store('\b'); + break; + case 'n': + store(CHR); + store('\n'); + break; + case 'f': + store(CHR); + store('\f'); + break; + case 'r': + store(CHR); + store('\r'); + break; + case 't': + store(CHR); + store('\t'); + break; + #endif + default: + store(CHR); + store(*p); + } + break; + + default : /* an ordinary char */ + store(CHR); + store(*p); + break; + } + sp = lp; + } + if (tagi > 0) + badpat("Unmatched \\("); + store(END); + sta = OKP; + return(0); + } + + + static char *bol; + char *bopat[MAXTAG]; + char *eopat[MAXTAG]; + char *pmatch(); + + /* + * re_exec: + * execute nfa to find a match. + * + * special cases: (nfa[0]) + * BOL + * Match only once, starting from the + * beginning. + * CHR + * First locate the character without + * calling pmatch, and if found, call + * pmatch for the remaining string. + * END + * re_comp failed, poor luser did not + * check for it. Fail fast. + * + * If a match is found, bopat[0] and eopat[0] are set + * to the beginning and the end of the matched fragment, + * respectively. + * + */ + + int + re_exec(lp) + register char *lp; + { + register char c; + register char *ep = 0; + register CHAR *ap = nfa; + + bol = lp; + + bopat[0] = 0; + bopat[1] = 0; + bopat[2] = 0; + bopat[3] = 0; + bopat[4] = 0; + bopat[5] = 0; + bopat[6] = 0; + bopat[7] = 0; + bopat[8] = 0; + bopat[9] = 0; + + switch(*ap) { + + case BOL: /* anchored: match from BOL only */ + ep = pmatch(lp,ap); + break; + case CHR: /* ordinary char: locate it fast */ + c = *(ap+1); + while (*lp && *lp != c) + lp++; + if (!*lp) /* if EOS, fail, else fall thru. */ + return(0); + default: /* regular matching all the way. */ + while (*lp) { + if ((ep = pmatch(lp,ap))) + break; + lp++; + } + break; + case END: /* munged automaton. fail always */ + return(0); + } + if (!ep) + return(0); + + bopat[0] = lp; + eopat[0] = ep; + return(1); + } + + /* + * pmatch: + * internal routine for the hard part + * + * This code is mostly snarfed from an early + * grep written by David Conroy. The backref and + * tag stuff, and various other mods are by oZ. + * + * special cases: (nfa[n], nfa[n+1]) + * CLO ANY + * We KNOW ".*" will match ANYTHING + * upto the end of line. Thus, go to + * the end of line straight, without + * calling pmatch recursively. As in + * the other closure cases, the remaining + * pattern must be matched by moving + * backwards on the string recursively, + * to find a match for xy (x is ".*" and + * y is the remaining pattern) where + * the match satisfies the LONGEST match + * for x followed by a match for y. + * CLO CHR + * We can again scan the string forward + * for the single char without recursion, + * and at the point of failure, we execute + * the remaining nfa recursively, as + * described above. + * + * At the end of a successful match, bopat[n] and eopat[n] + * are set to the beginning and end of subpatterns matched + * by tagged expressions (n = 1 to 9). + * + */ + + extern void re_fail(); + + /* + * character classification table for word boundary + * operators BOW and EOW. the reason for not using + * ctype macros is that we can let the user add into + * our own table. see re_modw. This table is not in + * the bitset form, since we may wish to extend it + * in the future for other character classifications. + * + * TRUE for 0-9 A-Z a-z _ + */ + static char chrtyp[MAXCHR] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 0, 0, 0, 0 + }; + + #define inascii(x) (0177&(x)) + #define iswordc(x) chrtyp[inascii(x)] + #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND]) + + /* + * skip values for CLO XXX to skip past the closure + * + */ + + #define ANYSKIP 2 /* [CLO] ANY END ... */ + #define CHRSKIP 3 /* [CLO] CHR chr END ... */ + #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */ + + static char * + pmatch(lp, ap) + register char *lp; + register CHAR *ap; + { + register int op, c, n; + register char *e; /* extra pointer for CLO */ + register char *bp; /* beginning of subpat.. */ + register char *ep; /* ending of subpat.. */ + char *are; /* to save the line ptr. */ + + while ((op = *ap++) != END) + switch(op) { + + case CHR: + if (*lp++ != *ap++) + return(0); + break; + case ANY: + if (!*lp++) + return(0); + break; + case CCL: + c = *lp++; + if (!isinset(ap,c)) + return(0); + ap += BITBLK; + break; + case BOL: + if (lp != bol) + return(0); + break; + case EOL: + if (*lp) + return(0); + break; + case BOT: + bopat[*ap++] = lp; + break; + case EOT: + eopat[*ap++] = lp; + break; + case BOW: + if (lp!=bol && iswordc(lp[-1]) || !iswordc(*lp)) + return(0); + break; + case EOW: + if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp)) + return(0); + break; + case REF: + n = *ap++; + bp = bopat[n]; + ep = eopat[n]; + while (bp < ep) + if (*bp++ != *lp++) + return(0); + break; + case CLO: + are = lp; + switch(*ap) { + + case ANY: + while (*lp) + lp++; + n = ANYSKIP; + break; + case CHR: + c = *(ap+1); + while (*lp && c == *lp) + lp++; + n = CHRSKIP; + break; + case CCL: + while ((c = *lp) && isinset(ap+1,c)) + lp++; + n = CCLSKIP; + break; + default: + re_fail("closure: bad nfa.", *ap); + return(0); + } + + ap += n; + + while (lp >= are) { + if (e = pmatch(lp, ap)) + return(e); + --lp; + } + return(0); + default: + re_fail("re_exec: bad nfa.", op); + return(0); + } + return(lp); + } + + /* + * re_modw: + * add new characters into the word table to + * change the re_exec's understanding of what + * a word should look like. Note that we only + * accept additions into the word definition. + * + * If the string parameter is 0 or null string, + * the table is reset back to the default, which + * contains A-Z a-z 0-9 _. [We use the compact + * bitset representation for the default table] + * + */ + + static char deftab[16] = { + 0, 0, 0, 0, 0, 0, 377, 003, 376, 377, 377, 207, + 376, 377, 377, 007 + }; + + void + re_modw(s) + register char *s; + { + register int i; + + if (!s || !*s) { + for (i = 0; i < MAXCHR; i++) + if (!isinset(deftab,i)) + iswordc(i) = 0; + } + else + while(*s) + iswordc(*s++) = 1; + } + + /* + * re_subs: + * substitute the matched portions of the src in + * dst. + * + * & substitute the entire matched pattern. + * + * \digit substitute a subpattern, with the given + * tag number. Tags are numbered from 1 to + * 9. If the particular tagged subpattern + * does not exist, null is substituted. + * + */ + int + re_subs(src, dst) + register char *src; + register char *dst; + { + register char c; + register int pin; + register char *bp; + register char *ep; + + if (!*src || !bopat[0]) + return(0); + + while (c = *src++) { + switch(c) { + + case '&': + pin = 0; + break; + + case '\\': + c = *src++; + if (c >= '0' && c <= '9') { + pin = c - '0'; + break; + } + + default: + *dst++ = c; + continue; + } + + if ((bp = bopat[pin]) && (ep = eopat[pin])) { + while (*bp && bp < ep) + *dst++ = *bp++; + if (bp < ep) + return(0); + } + } + *dst = (char) 0; + return(1); + } + + #ifdef DEBUG + /* + * symbolic - produce a symbolic dump of the + * nfa + */ + symbolic(s) + char *s; + { + printf("pattern: %s\n", s); + printf("nfacode:\n"); + nfadump(nfa); + } + + static + nfadump(ap) + CHAR *ap; + { + register int n; + + while (*ap != END) + switch(*ap++) { + case CLO: + printf("CLOSURE"); + nfadump(ap); + switch(*ap) { + case CHR: + n = CHRSKIP; + break; + case ANY: + n = ANYSKIP; + break; + case CCL: + n = CCLSKIP; + break; + } + ap += n; + break; + case CHR: + printf("\tCHR %c\n",*ap++); + break; + case ANY: + printf("\tANY .\n"); + break; + case BOL: + printf("\tBOL -\n"); + break; + case EOL: + printf("\tEOL -\n"); + break; + case BOT: + printf("BOT: %d\n",*ap++); + break; + case EOT: + printf("EOT: %d\n",*ap++); + break; + case BOW: + printf("BOW\n"); + break; + case EOW: + printf("EOW\n"); + break; + case REF: + printf("REF: %d\n",*ap++); + break; + case CCL: + printf("\tCCL ["); + for (n = 0; n < MAXCHR; n++) + if (isinset(ap,(CHAR)n)) { + if (n < ' ') + printf("^%c", n ^ 0x040); + else + printf("%c", n); + } + printf("]\n"); + ap += BITBLK; + break; + default: + printf("bad nfa. opcode %o\n", ap[-1]); + exit(1); + break; + } + } + #endif -- -- Kim Andersen @ Dansk Data Elektronik A/S, Herlev, Denmark. E-mail: kim@dde.dk or ...!uunet!mcsun!dkuug!dde!kim