-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy paththompson.c
152 lines (140 loc) · 3.13 KB
/
thompson.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re1.5.h"
typedef struct Thread Thread;
struct Thread
{
char *pc;
};
typedef struct ThreadList ThreadList;
struct ThreadList
{
int n;
Thread t[1];
};
static Thread
thread(char *pc)
{
Thread t = {pc};
return t;
}
static ThreadList*
threadlist(int n)
{
return mal(sizeof(ThreadList)+n*sizeof(Thread));
}
static void
addthread(ThreadList *l, Thread t, Subject *input, const char *sp)
{
int off;
if(*t.pc & 0x80)
return; // already on list
*t.pc |= 0x80;
l->t[l->n] = t;
l->n++;
switch(*t.pc & 0x7f) {
case Jmp:
off = (signed char)t.pc[1];
t.pc += 2;
addthread(l, thread(t.pc + off), input, sp);
break;
case Split:
off = (signed char)t.pc[1];
t.pc += 2;
addthread(l, thread(t.pc), input, sp);
addthread(l, thread(t.pc + off), input, sp);
break;
case RSplit:
off = (signed char)t.pc[1];
t.pc += 2;
addthread(l, thread(t.pc + off), input, sp);
addthread(l, thread(t.pc), input, sp);
break;
case Save:
off = (unsigned char)t.pc[1];
t.pc += 2;
addthread(l, thread(t.pc), input, sp);
break;
case Bol:
if(sp == input->begin)
addthread(l, thread(t.pc + 1), input, sp);
break;
case Eol:
if(sp == input->end - 1)
addthread(l, thread(t.pc + 1), input, sp);
break;
}
}
int
re1_5_thompsonvm(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
{
int i, len, matched;
ThreadList *clist, *nlist, *tmp;
char *pc;
const char *sp;
for(i=0; i<nsubp; i++)
subp[i] = nil;
len = prog->len;
clist = threadlist(len);
nlist = threadlist(len);
if(nsubp >= 1)
subp[0] = input->begin;
cleanmarks(prog);
addthread(clist, thread(HANDLE_ANCHORED(prog->insts, is_anchored)), input, input->begin);
matched = 0;
for(sp=input->begin;; sp++) {
if(clist->n == 0)
break;
// printf("%d(%02x).", (int)(sp - input->begin), *sp & 0xFF);
cleanmarks(prog);
for(i=0; i<clist->n; i++) {
pc = clist->t[i].pc;
// printf(" %d", (int)(pc - prog->insts));
if (inst_is_consumer(*pc & 0x7f)) {
// If we need to match a character, but there's none left,
// it's fail (we don't schedule current thread for continuation)
if(sp >= input->end)
continue;
}
switch(*pc++ & 0x7f) {
case Char:
if(*sp != *pc++)
break;
case Any:
addthread:
addthread(nlist, thread(pc), input, sp);
break;
case Class:
case ClassNot:
if (!_re1_5_classmatch(pc, sp))
break;
pc += *(unsigned char*)pc * 2 + 1;
goto addthread;
case NamedClass:
if (!_re1_5_namedclassmatch(pc, sp))
break;
pc++;
goto addthread;
case Match:
if(nsubp >= 2)
subp[1] = sp;
matched = 1;
goto BreakFor;
// Jmp, Split, Save handled in addthread, so that
// machine execution matches what a backtracker would do.
// This is discussed (but not shown as code) in
// Regular Expression Matching: the Virtual Machine Approach.
}
}
BreakFor:
// printf("\n");
tmp = clist;
clist = nlist;
nlist = tmp;
nlist->n = 0;
//if(sp >= input->end)
// break;
}
return matched;
}