/* grep from scratch (without powerset construction!) * $Id: grep.c,v 1.21 2024/02/03 18:28:11 oj14ozun Exp $ * https://wwwcip.cs.fau.de/~oj14ozun/src+etc/grep.c */ #include #include #include #include #include #include #include struct regexp { enum re_type { RE_CONCAT, RE_DISJUNCT, RE_REPEAT, RE_EMPTY_SET, RE_EMPTY_WORD, RE_LITERAL, } type; union { struct {struct regexp *s, *t;} concat; struct {struct regexp *s, *t;} disjunct; struct {struct regexp *s;} repeat; struct {unsigned char l, u;} literal; } content; }; #define CON(s_, t_) &((struct regexp){.type = RE_CONCAT, .content.concat = { .s = s_, .t = t_ }}) #define DIS(s_, t_) &((struct regexp){.type = RE_DISJUNCT, .content.disjunct = { .s = s_, .t = t_ }}) #define REP(s_) &((struct regexp){.type = RE_REPEAT, .content.repeat = { .s = s_ }}) #define EMP &((struct regexp){.type = RE_EMPTY_SET}) #define EPS &((struct regexp){.type = RE_EMPTY_WORD}) #define LIT(c_) &((struct regexp){.type = RE_LITERAL, .content.literal = { .l = c_, .u = c_ }}) #define RNG(l_, u_) &((struct regexp){.type = RE_LITERAL, .content.literal = { .l = l_, .u = u_ }}) struct re_stack { struct regexp *regexp; struct re_stack *rest; }; #define CONS(n, s) &(struct re_stack) { .regexp = n, .rest = s } #define POP(stack, fail, input, rep) do { \ if (stack) { \ re_match0((stack)->regexp, \ (fail), \ (stack)->rest, \ input, rep); \ } else if (**input != '\0') { \ longjmp(*fail, 1); \ } \ } while (0) /* Idea: when matching RE with INPUT, we can either FAIL (backtrack to * a previous secure point), or continue on with the NEXT element on * the stack. If the function returns normally, we have processed the * entire regular expression, but that doesn't mean that RE matched * INPUT. The final check is made in `re_match', by checking if INPUT * has been used up. */ static void re_match0(struct regexp *re, /* the expression we are trying to match */ jmp_buf *fail, /* a fallback if the match fails */ struct re_stack *stack, /* next exps. to try if this succeeds */ char **input, /* the remaining input to match */ unsigned rep) /* remaining allowed repetitions */ { jmp_buf checkpoint; char *reset = *input; assert(re); switch (re->type) { case RE_CONCAT: { re_match0(re->content.concat.s, fail, CONS(re->content.concat.t, stack), input, rep); break; } case RE_DISJUNCT: if (setjmp(checkpoint)) { *input = reset; re_match0(re->content.disjunct.t, fail, stack, input, rep); } else { re_match0(re->content.disjunct.s, &checkpoint, stack, input, rep); } break; case RE_REPEAT: { if (rep == 0) { POP(stack, fail, input, rep); } else if (setjmp(checkpoint)) { *input = reset; re_match0(re->content.repeat.s, fail, CONS(re, stack), input, rep - 1); } else { POP(stack, &checkpoint, input, rep); } break; } case RE_EMPTY_SET: longjmp(*fail, 1); case RE_EMPTY_WORD: POP(stack, fail, input, rep); break; case RE_LITERAL: if (!(re->content.literal.l <= **input && **input <= re->content.literal.u)) { longjmp(*fail, 1); } else { (*input)++; POP(stack, fail, input, rep); } } } static bool re_match(struct regexp *re, char *input) { jmp_buf checkpoint; if (setjmp(checkpoint)) { return false; } re_match0(re, &checkpoint, NULL, &input, strlen(input)); return *input == '\0'; } static struct regexp * re_dup(struct regexp *re) { struct regexp *dup = malloc(sizeof(*re)); if (dup) { return memcpy(dup, re, sizeof(*dup)); } else { perror("malloc"); exit(EXIT_FAILURE); } } static struct regexp * re_anything() { return re_dup(RNG(1, UCHAR_MAX)); } struct regexp * re_parse0(char **pat, struct regexp *ctx) { struct regexp *re, **last = ctx->type == RE_CONCAT ? &(ctx->content.concat.t) : &ctx; switch (**pat) { case '\0': return ctx; case '*': *last = re_dup(REP(*last)); (*pat)++; return re_parse0(pat, ctx); case '|': (*pat)++; return re_dup(DIS(ctx, re_parse0(pat, re_dup(EPS)))); case '?': *last = re_dup(DIS(*last, re_dup(EPS))); (*pat)++; return re_parse0(pat, ctx); case '+': re = *last; *last = re_dup(CON(re, re_dup(REP(re_dup(re))))); (*pat)++; return re_parse0(pat, ctx); case ')': return ctx; case '(': (*pat)++; re = re_parse0(pat, re_dup(EPS)); break; case '.': re = re_anything(); break; case '\\': (*pat)++; assert(**pat); default: /* fall-through */ re = re_dup(LIT(**pat)); break; } (*pat)++; return re_parse0(pat, re_dup(CON(ctx, re))); } static struct regexp * re_parse(char *pat) { bool hook_beg, hook_end; struct regexp *re; if ((hook_beg = (pat[0] == '^'))) { pat++; } if ((hook_end = (pat[strlen(pat)-1] == '$'))) { pat[strlen(pat)-1] = '\0'; } re = re_parse0(&pat, re_dup(EPS)); if (!hook_beg) { re = re_dup(CON(re_dup(REP(re_anything())), re)); } if (!hook_end) { re = re_dup(CON(re, re_dup(REP(re_anything())))); } return re; } static void re_free(struct regexp *re) { if (re == NULL) { return; } switch (re->type) { case RE_CONCAT: re_free(re->content.concat.s); re_free(re->content.concat.t); break; case RE_DISJUNCT: re_free(re->content.disjunct.s); re_free(re->content.disjunct.t); break; case RE_REPEAT: re_free(re->content.repeat.s); break; default:; /* the others are atomic */ } free(re); } static int grep(struct regexp *re, FILE *file) { char *line = NULL, *c; size_t len = 0; while (-1 != getline(&line, &len, file)) { if ((c = strchr(line, '\n'))) *c = '\0'; if (re_match(re, line)) { if (EOF == puts(line)) { perror("fputs"); return EXIT_FAILURE; } } } if (ferror(file)) { perror("getline"); return EXIT_FAILURE; } free(line); } int main(int argc, char *argv[]) { struct regexp *re; if (argc <= 1 || !argv[1]) { fprintf(stderr, "usage: %s [regexp] [file...]\n", argv[0]); return EXIT_FAILURE; } re = re_parse(argv[1]); if (!re) { perror("re_parse"); return EXIT_FAILURE; } if (argc == 2) { grep(re, stdin); } else { for (int i = 2, r; i < argc; i++) { FILE *file; if (strcmp(argv[i], "-") == 0) { file = stdin; } else { file = fopen(argv[i], "r"); if (file == NULL) { perror("fopen"); return EXIT_FAILURE; } } if ((r = grep(re, file)) != EXIT_SUCCESS) { return r; } fclose(file); } } re_free(re); return EXIT_SUCCESS; }