1implement Regex; 2 3include "regex.m"; 4 5# syntax 6 7# RE ALT regular expression 8# NUL 9# ALT CAT alternation 10# CAT | ALT 11# 12# CAT DUP catenation 13# DUP CAT 14# 15# DUP PRIM possibly duplicated primary 16# PCLO 17# CLO 18# OPT 19# 20# PCLO PRIM + 1 or more 21# CLO PRIM * 0 or more 22# OPT PRIM ? 0 or 1 23# 24# PRIM ( RE ) 25# () 26# DOT any character 27# CHAR a single character 28# ESC escape sequence 29# [ SET ] character set 30# NUL null string 31# HAT beginning of string 32# DOL end of string 33# 34 35NIL : con -1; # a refRex constant 36NONE: con -2; # ditto, for an un-set value 37BAD: con 1<<16; # a non-character 38HUGE: con (1<<31) - 1; 39 40# the data structures of re.m would like to be ref-linked, but are 41# circular (see fn walk), thus instead of pointers we use indexes 42# into an array (arena) of nodes of the syntax tree of a regular expression. 43# from a storage-allocation standpoint, this replaces many small 44# allocations of one size with one big one of variable size. 45 46ReStr: adt { 47 s : string; 48 i : int; # cursor postion 49 n : int; # number of chars left; -1 on error 50 peek : fn(s: self ref ReStr): int; 51 next : fn(s: self ref ReStr): int; 52}; 53 54ReStr.peek(s: self ref ReStr): int 55{ 56 if(s.n <= 0) 57 return BAD; 58 return s.s[s.i]; 59} 60 61ReStr.next(s: self ref ReStr): int 62{ 63 if(s.n <= 0) 64 return BAD; 65 s.n--; 66 return s.s[s.i++]; 67} 68 69newRe(kind: int, left, right: refRex, set: ref Set, ar: ref Arena, pno: int): refRex 70{ 71 ar.rex[ar.ptr] = Rex(kind, left, right, set, pno); 72 return ar.ptr++; 73} 74 75# parse a regex by recursive descent to get a syntax tree 76 77re(s: ref ReStr, ar: ref Arena): refRex 78{ 79 left := cat(s, ar); 80 if(left==NIL || s.peek()!='|') 81 return left; 82 s.next(); 83 right := re(s, ar); 84 if(right == NIL) 85 return NIL; 86 return newRe(ALT, left, right, nil, ar, 0); 87} 88 89cat(s: ref ReStr, ar: ref Arena): refRex 90{ 91 left := dup(s, ar); 92 if(left == NIL) 93 return left; 94 right := cat(s, ar); 95 if(right == NIL) 96 return left; 97 return newRe(CAT, left, right, nil, ar, 0); 98} 99 100dup(s: ref ReStr, ar: ref Arena): refRex 101{ 102 case s.peek() { 103 BAD or ')' or ']' or '|' or '?' or '*' or '+' => 104 return NIL; 105 } 106 prim: refRex; 107 case kind:=s.next() { 108 '(' => if(ar.pno < 0) { 109 if(s.peek() == ')') { 110 s.next(); 111 prim = newRe(NUL, NONE, NONE, nil, ar, 0); 112 } else { 113 prim = re(s, ar); 114 if(prim==NIL || s.next()!=')') 115 s.n = -1; 116 } 117 } else { 118 pno := ++ar.pno; 119 lp := newRe(LPN, NONE, NONE, nil, ar, pno); 120 rp := newRe(RPN, NONE, NONE, nil, ar, pno); 121 if(s.peek() == ')') { 122 s.next(); 123 prim = newRe(CAT, lp, rp, nil, ar, 0); 124 125 } else { 126 prim = re(s, ar); 127 if(prim==NIL || s.next()!=')') 128 s.n = -1; 129 else { 130 prim = newRe(CAT, prim, rp, nil, ar, 0); 131 prim = newRe(CAT, lp, prim, nil, ar, 0); 132 } 133 } 134 } 135 '[' => prim = newRe(SET, NONE, NONE, newSet(s), ar, 0); 136 * => case kind { 137 '.' => kind = DOT; 138 '^' => kind = HAT; 139 '$' => kind = DOL; 140 } 141 prim = newRe(esc(s, kind), NONE, NONE, nil, ar, 0); 142 } 143 case s.peek() { 144 '*' => kind = CLO; 145 '+' => kind = PCLO; 146 '?' => kind = OPT; 147 * => return prim; 148 } 149 s.next(); 150 return newRe(kind, prim, NONE, nil, ar, 0); 151} 152 153esc(s: ref ReStr, char: int): int 154{ 155 if(char == '\\') { 156 char = s.next(); 157 case char { 158 BAD => s.n = -1; 159 'n' => char = '\n'; 160 } 161 } 162 return char; 163} 164 165# walk the tree adjusting pointers to refer to 166# next state of the finite state machine 167 168walk(r: refRex, succ: refRex, ar: ref Arena) 169{ 170 if(r==NONE) 171 return; 172 rex := ar.rex[r]; 173 case rex.kind { 174 ALT => walk(rex.left, succ, ar); 175 walk(rex.right, succ, ar); 176 return; 177 CAT => walk(rex.left, rex.right, ar); 178 walk(rex.right, succ, ar); 179 ar.rex[r] = ar.rex[rex.left]; # optimization 180 return; 181 CLO or PCLO => 182 end := newRe(OPT, r, succ, nil, ar, 0); # here's the circularity 183 walk(rex.left, end, ar); 184 OPT => walk(rex.left, succ, ar); 185 } 186 ar.rex[r].right = succ; 187} 188 189compile(e: string, flag: int): (Re, string) 190{ 191 if(e == nil) 192 return (nil, "missing expression"); 193 s := ref ReStr(e, 0, len e); 194 ar := ref Arena(array[2*s.n] of Rex, 0, 0, (flag&1)-1); 195 start := ar.start = re(s, ar); 196 if(start==NIL || s.n!=0) 197 return (nil, "invalid regular expression"); 198 walk(start, NIL, ar); 199 if(ar.pno < 0) 200 ar.pno = 0; 201 return (ar, nil); 202} 203 204# todo1, todo2: queues for epsilon and advancing transitions 205Gaz: adt { 206 pno: int; 207 beg: int; 208 end: int; 209}; 210Trace: adt { 211 cre: refRex; # cursor in Re 212 beg: int; # where this trace began; 213 gaz: list of Gaz; 214}; 215Queue: adt { 216 ptr: int; 217 q: array of Trace; 218}; 219 220execute(re: Re, s: string): array of (int, int) 221{ 222 return executese(re, s, (-1,-1), 1, 1); 223} 224 225executese(re: Re, s: string, range: (int, int), bol: int, eol: int): array of (int,int) 226{ 227 if(re==nil) 228 return nil; 229 (s0, s1) := range; 230 if(s0 < 0) 231 s0 = 0; 232 if(s1 < 0) 233 s1 = len s; 234 gaz : list of Gaz; 235 (beg, end) := (-1, -1); 236 todo1 := ref Queue(0, array[re.ptr] of Trace); 237 todo2 := ref Queue(0, array[re.ptr] of Trace); 238 for(i:=s0; i<=s1; i++) { 239 small2 := HUGE; # earliest possible match if advance 240 if(beg == -1) # no leftmost match yet 241 todo1.q[todo1.ptr++] = Trace(re.start, i, nil); 242 for(k:=0; k<todo1.ptr; k++) { 243 q := todo1.q[k]; 244 rex := re.rex[q.cre]; 245 next1 := next2 := NONE; 246 case rex.kind { 247 NUL => 248 next1 = rex.right; 249 DOT => 250 if(i<len s && s[i]!='\n') 251 next2 = rex.right; 252 HAT => 253 if(i == s0 && bol) 254 next1 = rex.right; 255 DOL => 256 if(i == s1 && eol) 257 next1 = rex.right; 258 SET => 259 if(i<len s && member(s[i], rex.set)) 260 next2 = rex.right; 261 CAT or 262 PCLO => 263 next1 = rex.left; 264 ALT or 265 CLO or 266 OPT => 267 next1 = rex.right; 268 k = insert(rex.left, q.beg, q.gaz, todo1, k); 269 LPN => 270 next1 = rex.right; 271 q.gaz = Gaz(rex.pno,i,-1)::q.gaz; 272 RPN => 273 next1 = rex.right; 274 for(r:=q.gaz; ; r=tl r) { 275 (pno,beg1,end1) := hd r; 276 if(rex.pno==pno && end1==-1) { 277 q.gaz = Gaz(pno,beg1,i)::q.gaz; 278 break; 279 } 280 } 281 * => 282 if(i<len s && rex.kind==s[i]) 283 next2 = rex.right; 284 } 285 if(next1 != NONE) { 286 if(next1 != NIL) 287 k =insert(next1, q.beg, q.gaz, todo1, k); 288 else if(better(q.beg, i, beg, end)) 289 (gaz, beg, end) = (q.gaz, q.beg, i); 290 } 291 if(next2 != NONE) { 292 if(next2 != NIL) { 293 if(q.beg < small2) 294 small2 = q.beg; 295 insert(next2, q.beg, q.gaz, todo2, 0); 296 } else if(better(q.beg, i+1, beg, end)) 297 (gaz, beg, end) = (q.gaz, q.beg, i+1); 298 } 299 300 } 301 if(beg!=-1 && beg<small2) # nothing better possible 302 break; 303 (todo1,todo2) = (todo2, todo1); 304 todo2.ptr = 0; 305 } 306 if(beg == -1) 307 return nil; 308 result := array[re.pno+1] of { 0 => (beg,end), * => (-1,-1) }; 309 for( ; gaz!=nil; gaz=tl gaz) { 310 (pno, beg1, end1) := hd gaz; 311 (rbeg, nil) := result[pno]; 312 if(rbeg==-1 && (beg1|end1)!=-1) 313 result[pno] = (beg1,end1); 314 } 315 return result; 316} 317 318better(newbeg, newend, oldbeg, oldend: int): int 319{ 320 return oldbeg==-1 || newbeg<oldbeg || 321 newbeg==oldbeg && newend>oldend; 322} 323 324insert(next: refRex, tbeg: int, tgaz: list of Gaz, todo: ref Queue, k: int): int 325{ 326 for(j:=0; j<todo.ptr; j++) 327 if(todo.q[j].cre == next) 328 if(todo.q[j].beg <= tbeg) 329 return k; 330 else 331 break; 332 if(j < k) 333 k--; 334 if(j < todo.ptr) 335 todo.ptr--; 336 for( ; j<todo.ptr; j++) 337 todo.q[j] = todo.q[j+1]; 338 todo.q[todo.ptr++] = Trace(next, tbeg, tgaz); 339 return k; 340} 341 342ASCII : con 128; 343WORD : con 32; 344 345member(char: int, set: ref Set): int 346{ 347 if(char < 128) 348 return ((set.ascii[char/WORD]>>char%WORD)&1)^set.neg; 349 for(l:=set.unicode; l!=nil; l=tl l) { 350 (beg, end) := hd l; 351 if(char>=beg && char<=end) 352 return !set.neg; 353 } 354 return set.neg; 355} 356 357newSet(s: ref ReStr): ref Set 358{ 359 set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil); 360 if(s.peek() == '^') { 361 set.neg = 1; 362 s.next(); 363 } 364 while(s.n > 0) { 365 char1 := s.next(); 366 if(char1 == ']') 367 return set; 368 char1 = esc(s, char1); 369 char2 := char1; 370 if(s.peek() == '-') { 371 s.next(); 372 char2 = s.next(); 373 if(char2 == ']') 374 break; 375 char2 = esc(s, char2); 376 if(char2 < char1) 377 break; 378 } 379 for( ; char1<=char2; char1++) 380 if(char1 < ASCII) 381 set.ascii[char1/WORD] |= 1<<char1%WORD; 382 else { 383 set.unicode = (char1,char2)::set.unicode; 384 break; 385 } 386 } 387 s.n = -1; 388 return nil; 389} 390