xref: /netbsd-src/external/historical/nawk/dist/run.c (revision 3b29b3e809927840440a4e214eaab8cecc06ed5e)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #if HAVE_NBTOOL_CONFIG_H
26 #include "nbtool_config.h"
27 #endif
28 
29 #define DEBUG
30 #include <stdio.h>
31 #include <ctype.h>
32 #include <errno.h>
33 #include <wctype.h>
34 #include <fcntl.h>
35 #include <setjmp.h>
36 #include <limits.h>
37 #include <math.h>
38 #include <string.h>
39 #include <stdlib.h>
40 #include <time.h>
41 #include <sys/types.h>
42 #include <sys/wait.h>
43 #include "awk.h"
44 #include "awkgram.h"
45 
46 
47 static void stdinit(void);
48 static void flush_all(void);
49 static char *wide_char_to_byte_str(int rune, size_t *outlen);
50 
/*
 * tempfree(x): pass cell x to tfree() when istemp(x) holds, otherwise do
 * nothing.  The macro form is the one compiled in; note that it evaluates
 * its argument more than once.  The function form in the #else branch does
 * the same thing but first warns about an OCELL whose csub lies outside
 * the range CUNK..CFREE.
 */
#if 1
#define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
#else
void tempfree(Cell *p) {
	if (p->ctype == OCELL && (p->csub < CUNK || p->csub > CFREE)) {
		WARNING("bad csub %d in Cell %d %s",
			p->csub, p->ctype, p->sval);
	}
	if (istemp(p))
		tfree(p);
}
#endif
63 
64 /* do we really need these? */
65 /* #ifdef _NFILE */
66 /* #ifndef FOPEN_MAX */
67 /* #define FOPEN_MAX _NFILE */
68 /* #endif */
69 /* #endif */
70 /*  */
71 /* #ifndef	FOPEN_MAX */
72 /* #define	FOPEN_MAX	40 */	/* max number of open files */
73 /* #endif */
74 /*  */
75 /* #ifndef RAND_MAX */
76 /* #define RAND_MAX	32767 */	/* all that ansi guarantees */
77 /* #endif */
78 
jmp_buf env;	/* armed by setjmp() in program(); jump() longjmps here on EXIT */
extern	int	pairstack[];
extern	Awkfloat	srand_seed;

Node	*winner = NULL;	/* root of parse tree */
Cell	*tmps;		/* free temporary cells for execution */

/*
 * Statically allocated result cells.  The exported pointers are what the
 * routines in this file return: True/False for boolean results and the
 * j* cells as the results of the corresponding jump statements.
 */
static Cell	truecell	={ OBOOL, BTRUE, 0, 0, 1.0, NUM, NULL, NULL };
Cell	*True	= &truecell;
static Cell	falsecell	={ OBOOL, BFALSE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*False	= &falsecell;
static Cell	breakcell	={ OJUMP, JBREAK, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jbreak	= &breakcell;
static Cell	contcell	={ OJUMP, JCONT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jcont	= &contcell;
static Cell	nextcell	={ OJUMP, JNEXT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnext	= &nextcell;
static Cell	nextfilecell	={ OJUMP, JNEXTFILE, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jnextfile	= &nextfilecell;
static Cell	exitcell	={ OJUMP, JEXIT, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jexit	= &exitcell;
static Cell	retcell		={ OJUMP, JRET, 0, 0, 0.0, NUM, NULL, NULL };
Cell	*jret	= &retcell;
/* template that gettemp() copies into every temporary cell it hands out */
static Cell	tempcell	={ OCELL, CTEMP, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };

Node	*curnode = NULL;	/* the node being executed, for debugging */
105 
/*
 * adjbuf: buffer memory management.  Make sure *pbuf holds at least
 * minlen bytes, growing it with realloc() when it does not.
 *
 * pbuf:    address of pointer to buffer being managed
 * psiz:    address of buffer size variable
 * minlen:  minimum length of buffer needed
 * quantum: buffer size quantum (0 means no rounding)
 * pbptr:   address of movable pointer into buffer, or 0 if none;
 *          it is re-based onto the new buffer after a successful grow
 * whatrtn: name of the calling routine if failure should cause fatal error
 *
 * Returns 0 for realloc failure (only when whatrtn is null), !=0 for success.
 */
int adjbuf(char **pbuf, int *psiz, int minlen, int quantum, char **pbptr,
	const char *whatrtn)
{
	char *grown;
	int excess, offset;

	if (minlen <= *psiz)
		return 1;	/* already large enough */

	excess = (quantum != 0) ? minlen % quantum : 0;
	offset = (pbptr != NULL) ? *pbptr - *pbuf : 0;
	if (excess != 0)	/* round up to next multiple of quantum */
		minlen += quantum - excess;

	grown = (char *) realloc(*pbuf, minlen);
	DPRINTF("adjbuf %s: %d %d (pbuf=%p, tbuf=%p)\n", whatrtn, *psiz, minlen, (void*)*pbuf, (void*)grown);
	if (grown == NULL) {
		if (whatrtn != NULL)
			FATAL("out of memory in %s", whatrtn);
		return 0;
	}

	*pbuf = grown;
	*psiz = minlen;
	if (pbptr != NULL)
		*pbptr = grown + offset;
	return 1;
}
140 
141 void run(Node *a)	/* execution of parse tree starts here */
142 {
143 
144 	stdinit();
145 	execute(a);
146 	closeall();
147 }
148 
149 Cell *execute(Node *u)	/* execute a node of the parse tree */
150 {
151 	Cell *(*proc)(Node **, int);
152 	Cell *x;
153 	Node *a;
154 
155 	if (u == NULL)
156 		return(True);
157 	for (a = u; ; a = a->nnext) {
158 		curnode = a;
159 		if (isvalue(a)) {
160 			x = (Cell *) (a->narg[0]);
161 			if (isfld(x) && !donefld)
162 				fldbld();
163 			else if (isrec(x) && !donerec)
164 				recbld();
165 			return(x);
166 		}
167 		if (notlegal(a->nobj))	/* probably a Cell* but too risky to print */
168 			FATAL("illegal statement");
169 		proc = proctab[a->nobj-FIRSTTOKEN];
170 		x = (*proc)(a->narg, a->nobj);
171 		if (isfld(x) && !donefld)
172 			fldbld();
173 		else if (isrec(x) && !donerec)
174 			recbld();
175 		if (isexpr(a))
176 			return(x);
177 		if (isjump(x))
178 			return(x);
179 		if (a->nnext == NULL)
180 			return(x);
181 		tempfree(x);
182 	}
183 }
184 
185 
186 Cell *program(Node **a, int n)	/* execute an awk program */
187 {				/* a[0] = BEGIN, a[1] = body, a[2] = END */
188 	Cell *x;
189 
190 	if (setjmp(env) != 0)
191 		goto ex;
192 	if (a[0]) {		/* BEGIN */
193 		x = execute(a[0]);
194 		if (isexit(x))
195 			return(True);
196 		if (isjump(x))
197 			FATAL("illegal break, continue, next or nextfile from BEGIN");
198 		tempfree(x);
199 	}
200 	if (a[1] || a[2])
201 		while (getrec(&record, &recsize, true) > 0) {
202 			x = execute(a[1]);
203 			if (isexit(x))
204 				break;
205 			tempfree(x);
206 		}
207   ex:
208 	if (setjmp(env) != 0)	/* handles exit within END */
209 		goto ex1;
210 	if (a[2]) {		/* END */
211 		x = execute(a[2]);
212 		if (isbreak(x) || isnext(x) || iscont(x))
213 			FATAL("illegal break, continue, next or nextfile from END");
214 		tempfree(x);
215 	}
216   ex1:
217 	return(True);
218 }
219 
/*
 * One Frame describes one active awk function call.  call() fills it in
 * after advancing frp; arg() and the RETURN case of jump() read it back
 * through frp.
 */
struct Frame {	/* stack frame for awk function calls */
	int nargs;	/* number of arguments in this call */
	Cell *fcncell;	/* pointer to Cell for function */
	Cell **args;	/* pointer to array of arguments after execute */
	Cell *retval;	/* return value */
};

#define	NARGS	50	/* max args in a call */

/* The frame array is allocated lazily by call() and grown 100 entries at a time. */
struct Frame *frame = NULL;	/* base of stack frames; dynamically allocated */
int	nframe = 0;		/* number of frames allocated */
struct Frame *frp = NULL;	/* frame pointer. bottom level unused */
232 
/*
 * call: invoke a user-defined awk function.
 *   a[0] evaluates to the function's Cell; a[1] is the list of actual
 *   argument expressions (linked through nnext).  n is unused.
 *
 * The function Cell carries the number of declared parameters in fval and
 * the body (a Node *) in sval.  Actual arguments are evaluated left to
 * right: arrays are passed by reference, everything else is copied with
 * copycell().  Declared parameters that were not supplied get fresh
 * temporaries initialised from newcopycell.  A new Frame is then pushed,
 * the body is executed, the argument cells are released, and the frame's
 * retval cell is returned.  If the body ended with exit or next, that jump
 * cell is returned instead and the frame is not popped here.
 */
Cell *call(Node **a, int n)	/* function call.  very kludgy and fragile */
{
	static const Cell newcopycell = { OCELL, CCOPY, 0, EMPTY, 0.0, NUM|STR|DONTFREE, NULL, NULL };
	int i, ncall, ndef;
	int freed = 0; /* handles potential double freeing when fcn & param share a tempcell */
	Node *x;
	Cell *args[NARGS], *oargs[NARGS];	/* BUG: fixed size arrays */
	Cell *y, *z, *fcn;
	char *s;

	fcn = execute(a[0]);	/* the function itself */
	s = fcn->nval;
	if (!isfcn(fcn))
		FATAL("calling undefined function %s", s);
	if (frame == NULL) {
		/* first call ever: allocate the initial block of frames */
		frp = frame = (struct Frame *) calloc(nframe += 100, sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames calling %s", s);
	}
	for (ncall = 0, x = a[1]; x != NULL; x = x->nnext)	/* args in call */
		ncall++;
	ndef = (int) fcn->fval;			/* args in defn */
	DPRINTF("calling %s, %d args (%d in defn), frp=%d\n", s, ncall, ndef, (int) (frp-frame));
	if (ncall > ndef)
		WARNING("function %s called with %d args, uses only %d",
			s, ncall, ndef);
	/* checked before args[]/oargs[] are filled, so the loops below stay in bounds */
	if (ncall + ndef > NARGS)
		FATAL("function %s has %d arguments, limit %d", s, ncall+ndef, NARGS);
	for (i = 0, x = a[1]; x != NULL; i++, x = x->nnext) {	/* get call args */
		DPRINTF("evaluate args[%d], frp=%d:\n", i, (int) (frp-frame));
		y = execute(x);
		oargs[i] = y;	/* remember the caller's cell; used below if the callee makes it an array */
		DPRINTF("args[%d]: %s %f <%s>, t=%o\n",
			i, NN(y->nval), y->fval, isarr(y) ? "(array)" : NN(y->sval), y->tval);
		if (isfcn(y))
			FATAL("can't use function %s as argument in %s", y->nval, s);
		if (isarr(y))
			args[i] = y;	/* arrays by ref */
		else
			args[i] = copycell(y);
		tempfree(y);
	}
	for ( ; i < ndef; i++) {	/* add null args for ones not provided */
		args[i] = gettemp();
		*args[i] = newcopycell;
	}
	frp++;	/* now ok to up frame */
	if (frp >= frame + nframe) {
		/* frame stack is full: grow it and re-derive frp from its index */
		int dfp = frp - frame;	/* old index */
		frame = (struct Frame *) realloc(frame, (nframe += 100) * sizeof(*frame));
		if (frame == NULL)
			FATAL("out of space for stack frames in %s", s);
		frp = frame + dfp;
	}
	frp->fcncell = fcn;
	frp->args = args;
	frp->nargs = ndef;	/* number defined with (excess are locals) */
	frp->retval = gettemp();

	DPRINTF("start exec of %s, frp=%d\n", s, (int) (frp-frame));
	y = execute((Node *)(fcn->sval));	/* execute body */
	DPRINTF("finished exec of %s, frp=%d\n", s, (int) (frp-frame));

	/* release the argument cells now that the body has finished */
	for (i = 0; i < ndef; i++) {
		Cell *t = frp->args[i];
		if (isarr(t)) {
			if (t->csub == CCOPY) {
				/* a copied (non-array) argument that the callee turned into an array */
				if (i >= ncall) {
					/* it was a local: discard the table and the cell */
					freesymtab(t);
					t->csub = CTEMP;
					tempfree(t);
				} else {
					/* it was supplied by the caller: hand the table back to the caller's cell */
					oargs[i]->tval = t->tval;
					oargs[i]->tval &= ~(STR|NUM|DONTFREE);
					oargs[i]->sval = t->sval;
					tempfree(t);
				}
			}
		} else if (t != y) {	/* kludge to prevent freeing twice */
			t->csub = CTEMP;
			tempfree(t);
		} else if (t == y && t->csub == CCOPY) {
			t->csub = CTEMP;
			tempfree(t);
			freed = 1;
		}
	}
	tempfree(fcn);
	if (isexit(y) || isnext(y))
		return y;
	if (freed == 0) {
		tempfree(y);	/* don't free twice! */
	}
	z = frp->retval;			/* return value */
	DPRINTF("%s returns %g |%s| %o\n", s, getfval(z), getsval(z), z->tval);
	frp--;
	return(z);
}
331 
332 Cell *copycell(Cell *x)	/* make a copy of a cell in a temp */
333 {
334 	Cell *y;
335 
336 	/* copy is not constant or field */
337 
338 	y = gettemp();
339 	y->tval = x->tval & ~(CON|FLD|REC);
340 	y->csub = CCOPY;	/* prevents freeing until call is over */
341 	y->nval = x->nval;	/* BUG? */
342 	if (isstr(x) /* || x->ctype == OCELL */) {
343 		y->sval = tostring(x->sval);
344 		y->tval &= ~DONTFREE;
345 	} else
346 		y->tval |= DONTFREE;
347 	y->fval = x->fval;
348 	return y;
349 }
350 
351 Cell *arg(Node **a, int n)	/* nth argument of a function */
352 {
353 
354 	n = ptoi(a[0]);	/* argument number, counting from 0 */
355 	DPRINTF("arg(%d), frp->nargs=%d\n", n, frp->nargs);
356 	if (n+1 > frp->nargs)
357 		FATAL("argument #%d of function %s was not supplied",
358 			n+1, frp->fcncell->nval);
359 	return frp->args[n];
360 }
361 
362 Cell *jump(Node **a, int n)	/* break, continue, next, nextfile, return */
363 {
364 	Cell *y;
365 
366 	switch (n) {
367 	case EXIT:
368 		if (a[0] != NULL) {
369 			y = execute(a[0]);
370 			errorflag = (int) getfval(y);
371 			tempfree(y);
372 		}
373 		longjmp(env, 1);
374 	case RETURN:
375 		if (a[0] != NULL) {
376 			y = execute(a[0]);
377 			if ((y->tval & (STR|NUM)) == (STR|NUM)) {
378 				setsval(frp->retval, getsval(y));
379 				frp->retval->fval = getfval(y);
380 				frp->retval->tval |= NUM;
381 			}
382 			else if (y->tval & STR)
383 				setsval(frp->retval, getsval(y));
384 			else if (y->tval & NUM)
385 				setfval(frp->retval, getfval(y));
386 			else		/* can't happen */
387 				FATAL("bad type variable %d", y->tval);
388 			tempfree(y);
389 		}
390 		return(jret);
391 	case NEXT:
392 		return(jnext);
393 	case NEXTFILE:
394 		nextfile();
395 		return(jnextfile);
396 	case BREAK:
397 		return(jbreak);
398 	case CONTINUE:
399 		return(jcont);
400 	default:	/* can't happen */
401 		FATAL("illegal jump type %d", n);
402 	}
403 	return 0;	/* not reached */
404 }
405 
406 Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
407 {		/* a[0] is variable, a[1] is operator, a[2] is filename */
408 	Cell *r, *x;
409 	extern Cell **fldtab;
410 	FILE *fp;
411 	char *buf;
412 	int bufsize = recsize;
413 	int mode;
414 	bool newflag;
415 
416 	if ((buf = (char *) malloc(bufsize)) == NULL)
417 		FATAL("out of memory in getline");
418 
419 	fflush(stdout);	/* in case someone is waiting for a prompt */
420 	r = gettemp();
421 	if (a[1] != NULL) {		/* getline < file */
422 		x = execute(a[2]);		/* filename */
423 		mode = ptoi(a[1]);
424 		if (mode == '|')		/* input pipe */
425 			mode = LE;	/* arbitrary flag */
426 		fp = openfile(mode, getsval(x), &newflag);
427 		tempfree(x);
428 		if (fp == NULL)
429 			n = -1;
430 		else
431 			n = readrec(&buf, &bufsize, fp, newflag);
432 		if (n <= 0) {
433 			;
434 		} else if (a[0] != NULL) {	/* getline var <file */
435 			x = execute(a[0]);
436 			setsval(x, buf);
437 			check_number(x);
438 			tempfree(x);
439 		} else {			/* getline <file */
440 			setsval(fldtab[0], buf);
441 			check_number(fldtab[0]);
442 		}
443 	} else {			/* bare getline; use current input */
444 		if (a[0] == NULL)	/* getline */
445 			n = getrec(&record, &recsize, true);
446 		else {			/* getline var */
447 			n = getrec(&buf, &bufsize, false);
448 			if (n > 0) {
449 				x = execute(a[0]);
450 				setsval(x, buf);
451 				check_number(x);
452 				tempfree(x);
453 			}
454 		}
455 	}
456 	setfval(r, (Awkfloat) n);
457 	free(buf);
458 	return r;
459 }
460 
461 Cell *getnf(Node **a, int n)	/* get NF */
462 {
463 	if (!donefld)
464 		fldbld();
465 	return (Cell *) a[0];
466 }
467 
468 static char *
469 makearraystring(Node *p, const char *func)
470 {
471 	char *buf;
472 	int bufsz = recsize;
473 	size_t blen;
474 
475 	if ((buf = (char *) malloc(bufsz)) == NULL) {
476 		FATAL("%s: out of memory", func);
477 	}
478 
479 	blen = 0;
480 	buf[blen] = '\0';
481 
482 	for (; p; p = p->nnext) {
483 		Cell *x = execute(p);	/* expr */
484 		char *s = getsval(x);
485 		size_t seplen = strlen(getsval(subseploc));
486 		size_t nsub = p->nnext ? seplen : 0;
487 		size_t slen = strlen(s);
488 		size_t tlen = blen + slen + nsub;
489 
490 		if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
491 			FATAL("%s: out of memory %s[%s...]",
492 			    func, x->nval, buf);
493 		}
494 		memcpy(buf + blen, s, slen);
495 		if (nsub) {
496 			memcpy(buf + blen + slen, *SUBSEP, nsub);
497 		}
498 		buf[tlen] = '\0';
499 		blen = tlen;
500 		tempfree(x);
501 	}
502 	return buf;
503 }
504 
505 Cell *array(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
506 {
507 	Cell *x, *z;
508 	char *buf;
509 
510 	x = execute(a[0]);	/* Cell* for symbol table */
511 	buf = makearraystring(a[1], __func__);
512 	if (!isarr(x)) {
513 		DPRINTF("making %s into an array\n", NN(x->nval));
514 		if (freeable(x))
515 			xfree(x->sval);
516 		x->tval &= ~(STR|NUM|DONTFREE);
517 		x->tval |= ARR;
518 		x->sval = (char *) makesymtab(NSYMTAB);
519 	}
520 	z = setsymtab(buf, "", 0.0, STR|NUM, (Array *) x->sval);
521 	z->ctype = OCELL;
522 	z->csub = CVAR;
523 	tempfree(x);
524 	free(buf);
525 	return(z);
526 }
527 
528 Cell *awkdelete(Node **a, int n)	/* a[0] is symtab, a[1] is list of subscripts */
529 {
530 	Cell *x;
531 
532 	x = execute(a[0]);	/* Cell* for symbol table */
533 	if (x == symtabloc) {
534 		FATAL("cannot delete SYMTAB or its elements");
535 	}
536 	if (!isarr(x))
537 		return True;
538 	if (a[1] == NULL) {	/* delete the elements, not the table */
539 		freesymtab(x);
540 		x->tval &= ~STR;
541 		x->tval |= ARR;
542 		x->sval = (char *) makesymtab(NSYMTAB);
543 	} else {
544 		char *buf = makearraystring(a[1], __func__);
545 		freeelem(x, buf);
546 		free(buf);
547 	}
548 	tempfree(x);
549 	return True;
550 }
551 
552 Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
553 {
554 	Cell *ap, *k;
555 	char *buf;
556 
557 	ap = execute(a[1]);	/* array name */
558 	if (!isarr(ap)) {
559 		DPRINTF("making %s into an array\n", ap->nval);
560 		if (freeable(ap))
561 			xfree(ap->sval);
562 		ap->tval &= ~(STR|NUM|DONTFREE);
563 		ap->tval |= ARR;
564 		ap->sval = (char *) makesymtab(NSYMTAB);
565 	}
566 	buf = makearraystring(a[0], __func__);
567 	k = lookup(buf, (Array *) ap->sval);
568 	tempfree(ap);
569 	free(buf);
570 	if (k == NULL)
571 		return(False);
572 	else
573 		return(True);
574 }
575 
576 
577 /* ======== utf-8 code ========== */
578 
579 /*
580  * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
581  * or utf-8.  u8_isutf tests whether a string starts with a valid
582  * utf-8 sequence, and returns 0 if not (e.g., high bit set).
583  * u8_nextlen returns length of next valid sequence, which is
584  * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
585  * u8_strlen returns length of string in valid utf-8 sequences
586  * and/or high-bit bytes.  Conversion functions go between byte
587  * number and character number.
588  *
589  * In theory, this behaves the same as before for non-utf8 bytes.
590  *
591  * Limited checking! This is a potential security hole.
592  */
593 
594 /* is s the beginning of a valid utf-8 string? */
595 /* return length 1..4 if yes, 0 if no */
596 static int u8_isutf(const char *s)
597 {
598 	int n, ret;
599 	unsigned char c;
600 
601 	c = s[0];
602 	if (c < 128 || awk_mb_cur_max == 1)
603 		return 1; /* what if it's 0? */
604 
605 	n = strlen(s);
606 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
607 		ret = 2; /* 110xxxxx 10xxxxxx */
608 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
609 			 && (s[2] & 0xC0) == 0x80) {
610 		ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
611 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
612 			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
613 		ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
614 	} else {
615 		ret = 0;
616 	}
617 	return ret;
618 }
619 
620 /* Convert (prefix of) utf8 string to utf-32 rune. */
621 /* Sets *rune to the value, returns the length. */
622 /* No error checking: watch out. */
623 int u8_rune(int *rune, const char *s)
624 {
625 	int n, ret;
626 	unsigned char c;
627 
628 	c = s[0];
629 	if (c < 128 || awk_mb_cur_max == 1) {
630 		*rune = c;
631 		return 1;
632 	}
633 
634 	n = strlen(s);
635 	if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) {
636 		*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */
637 		ret = 2;
638 	} else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80
639 			  && (s[2] & 0xC0) == 0x80) {
640 		*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
641 			/* 1110xxxx 10xxxxxx 10xxxxxx */
642 		ret = 3;
643 	} else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80
644 			  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
645 		*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
646 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
647 		ret = 4;
648 	} else {
649 		*rune = c;
650 		ret = 1;
651 	}
652 	return ret; /* returns one byte if sequence doesn't look like utf */
653 }
654 
/*
 * return length of next sequence: 1 for ascii or random, 2..4 for valid utf8
 * (a 0 from u8_isutf, meaning "not valid utf-8", is reported as one byte)
 */
int u8_nextlen(const char *s)
{
	int len = u8_isutf(s);

	return (len == 0) ? 1 : len;
}
665 
/*
 * return number of utf characters or single non-utf bytes
 *
 * u8_nextlen() already yields 1 for ascii bytes and for single-byte
 * locales, so no separate fast path is needed.  The overrun check runs
 * once the walk has finished; inside the loop it could never trigger.
 */
static int u8_strlen(const char *s)
{
	int i, n, totlen;

	n = strlen(s);
	totlen = 0;
	for (i = 0; i < n; i += u8_nextlen(&s[i]))
		totlen++;
	if (i > n)	/* a sequence claimed bytes beyond the terminator */
		FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i);
	return totlen;
}
687 
/*
 * convert utf-8 char number in a string to its byte offset
 * (steps over charnum sequences from the start of s; the caller must make
 * sure s holds at least that many)
 */
static int u8_char2byte(const char *s, int charnum)
{
	const char *cur = s;

	for (; charnum > 0; charnum--)
		cur += u8_nextlen(cur);
	return (int) (cur - s);
}
702 
/*
 * convert byte offset in s to utf-8 char number that starts there
 *
 * The result is 1-origin: offset 0 maps to character 1.  An offset beyond
 * the end of the string yields -1.
 */
static int u8_byte2char(const char *s, int bytenum)
{
	int pos = 0;
	int count = 0; /* BUG: what origin? */
	/* should be 0 to match start==0 which means no match */
	int total = strlen(s);

	if (bytenum > total)
		return -1; /* ??? */

	while (pos <= bytenum) {
		pos += u8_nextlen(s + pos);
		count++;
	}
	return count;
}
720 
/* runetochar() adapted from rune.c in the Plan 9 distribution */

enum
{
	Runeerror = 128,	/* substituted for runes above Runemax */
	Runemax = 0x10FFFF,

	/* number of payload bits in each kind of byte */
	Bit1    = 7,
	Bitx    = 6,
	Bit2    = 5,
	Bit3    = 4,
	Bit4    = 3,
	Bit5    = 2,

	/* tag bits of each kind of byte */
	T1      = 0x00,		/* 0000 0000 */
	Tx      = 0x80,		/* 1000 0000 */
	T2      = 0xC0,		/* 1100 0000 */
	T3      = 0xE0,		/* 1110 0000 */
	T4      = 0xF0,		/* 1111 0000 */
	T5      = 0xF8,		/* 1111 1000 */

	/* largest rune that fits in 1, 2, 3 and 4 bytes */
	Rune1   = 0x7F,		/* 0000 0000 0000 0000 0111 1111 */
	Rune2   = 0x7FF,	/* 0000 0000 0000 0111 1111 1111 */
	Rune3   = 0xFFFF,	/* 0000 0000 1111 1111 1111 1111 */
	Rune4   = 0x1FFFFF,	/* 0001 1111 1111 1111 1111 1111 */

	Maskx   = 0x3F,		/* 0011 1111 */
	Testx   = 0xC0,		/* 1100 0000 */

};

/*
 * runetochar: store rune c in str as a multi-byte sequence and return the
 * number of bytes written (1..4).  No terminator is added.
 *
 *   00000-0007F   => 00-7F         (anything <= 0x7F is stored as one byte)
 *   00080-007FF   => T2 Tx
 *   00800-0FFFF   => T3 Tx Tx
 *   10000-10FFFF  => T4 Tx Tx Tx
 *
 * A rune above Runemax is replaced by Runeerror and written in the
 * three-byte form.
 */
int runetochar(char *str, int c)
{
	int nbytes, lead, i;

	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	if (c <= Rune2) {
		nbytes = 2;
		lead = T2;
	} else {
		if (c > Runemax)
			c = Runeerror;
		if (c <= Rune3) {
			nbytes = 3;
			lead = T3;
		} else {
			nbytes = 4;
			lead = T4;
		}
	}

	/* fill the continuation bytes from the last one backwards */
	for (i = nbytes - 1; i > 0; i--) {
		str[i] = Tx | (c & Maskx);
		c >>= Bitx;
	}
	str[0] = lead | c;
	return nbytes;
}
784 
785 
786 /* ========== end of utf8 code =========== */
787 
788 
789 
790 Cell *matchop(Node **a, int n)	/* ~ and match() */
791 {
792 	Cell *x, *y, *z;
793 	char *s, *t;
794 	int i;
795 	int cstart, cpatlen, len;
796 	fa *pfa;
797 	int (*mf)(fa *, const char *) = match, mode = 0;
798 
799 	if (n == MATCHFCN) {
800 		mf = pmatch;
801 		mode = 1;
802 	}
803 	x = execute(a[1]);	/* a[1] = target text */
804 	s = getsval(x);
805 	if (a[0] == NULL)	/* a[1] == 0: already-compiled reg expr */
806 		i = (*mf)((fa *) a[2], s);
807 	else {
808 		y = execute(a[2]);	/* a[2] = regular expr */
809 		t = getsval(y);
810 		pfa = makedfa(t, mode);
811 		i = (*mf)(pfa, s);
812 		tempfree(y);
813 	}
814 	z = x;
815 	if (n == MATCHFCN) {
816 		int start = patbeg - s + 1; /* origin 1 */
817 		if (patlen < 0) {
818 			start = 0; /* not found */
819 		} else {
820 			cstart = u8_byte2char(s, start-1);
821 			cpatlen = 0;
822 			for (i = 0; i < patlen; i += len) {
823 				len = u8_nextlen(patbeg+i);
824 				cpatlen++;
825 			}
826 
827 			start = cstart;
828 			patlen = cpatlen;
829 		}
830 
831 		setfval(rstartloc, (Awkfloat) start);
832 		setfval(rlengthloc, (Awkfloat) patlen);
833 		x = gettemp();
834 		x->tval = NUM;
835 		x->fval = start;
836 	} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
837 		x = True;
838 	else
839 		x = False;
840 
841 	tempfree(z);
842 	return x;
843 }
844 
845 
846 Cell *boolop(Node **a, int n)	/* a[0] || a[1], a[0] && a[1], !a[0] */
847 {
848 	Cell *x, *y;
849 	int i;
850 
851 	x = execute(a[0]);
852 	i = istrue(x);
853 	tempfree(x);
854 	switch (n) {
855 	case BOR:
856 		if (i) return(True);
857 		y = execute(a[1]);
858 		i = istrue(y);
859 		tempfree(y);
860 		if (i) return(True);
861 		else return(False);
862 	case AND:
863 		if ( !i ) return(False);
864 		y = execute(a[1]);
865 		i = istrue(y);
866 		tempfree(y);
867 		if (i) return(True);
868 		else return(False);
869 	case NOT:
870 		if (i) return(False);
871 		else return(True);
872 	default:	/* can't happen */
873 		FATAL("unknown boolean operator %d", n);
874 	}
875 	return 0;	/*NOTREACHED*/
876 }
877 
878 Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
879 {
880 	int i;
881 	Cell *x, *y;
882 	Awkfloat j;
883 	bool x_is_nan, y_is_nan;
884 
885 	x = execute(a[0]);
886 	y = execute(a[1]);
887 	x_is_nan = isnan(x->fval);
888 	y_is_nan = isnan(y->fval);
889 	if (x->tval&NUM && y->tval&NUM) {
890 		if ((x_is_nan || y_is_nan) && n != NE)
891 			return(False);
892 		j = x->fval - y->fval;
893 		i = j<0? -1: (j>0? 1: 0);
894 	} else {
895 		i = strcmp(getsval(x), getsval(y));
896 	}
897 	tempfree(x);
898 	tempfree(y);
899 	switch (n) {
900 	case LT:	if (i<0) return(True);
901 			else return(False);
902 	case LE:	if (i<=0) return(True);
903 			else return(False);
904 	case NE:	if (x_is_nan && y_is_nan) return(True);
905 			else if (i!=0) return(True);
906 			else return(False);
907 	case EQ:	if (i == 0) return(True);
908 			else return(False);
909 	case GE:	if (i>=0) return(True);
910 			else return(False);
911 	case GT:	if (i>0) return(True);
912 			else return(False);
913 	default:	/* can't happen */
914 		FATAL("unknown relational operator %d", n);
915 	}
916 	return 0;	/*NOTREACHED*/
917 }
918 
919 void tfree(Cell *a)	/* free a tempcell */
920 {
921 	if (freeable(a)) {
922 		DPRINTF("freeing %s %s %o\n", NN(a->nval), NN(a->sval), a->tval);
923 		xfree(a->sval);
924 	}
925 	if (a == tmps)
926 		FATAL("tempcell list is curdled");
927 	a->cnext = tmps;
928 	tmps = a;
929 }
930 
931 Cell *gettemp(void)	/* get a tempcell */
932 {	int i;
933 	Cell *x;
934 
935 	if (!tmps) {
936 		tmps = (Cell *) calloc(100, sizeof(*tmps));
937 		if (!tmps)
938 			FATAL("out of space for temporaries");
939 		for (i = 1; i < 100; i++)
940 			tmps[i-1].cnext = &tmps[i];
941 		tmps[i-1].cnext = NULL;
942 	}
943 	x = tmps;
944 	tmps = x->cnext;
945 	*x = tempcell;
946 	return(x);
947 }
948 
949 Cell *indirect(Node **a, int n)	/* $( a[0] ) */
950 {
951 	Awkfloat val;
952 	Cell *x;
953 	int m;
954 	char *s;
955 
956 	x = execute(a[0]);
957 	val = getfval(x);	/* freebsd: defend against super large field numbers */
958 	if ((Awkfloat)INT_MAX < val)
959 		FATAL("trying to access out of range field %s", x->nval);
960 	m = (int) val;
961 	if (m == 0 && !is_number(s = getsval(x), NULL))	/* suspicion! */
962 		FATAL("illegal field $(%s), name \"%s\"", s, x->nval);
963 		/* BUG: can x->nval ever be null??? */
964 	tempfree(x);
965 	x = fieldadr(m);
966 	x->ctype = OCELL;	/* BUG?  why are these needed? */
967 	x->csub = CFLD;
968 	return(x);
969 }
970 
971 Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
972 {
973 	int k, m, n;
974 	int mb, nb;
975 	char *s;
976 	int temp;
977 	Cell *x, *y, *z = NULL;
978 
979 	x = execute(a[0]);
980 	y = execute(a[1]);
981 	if (a[2] != NULL)
982 		z = execute(a[2]);
983 	s = getsval(x);
984 	k = u8_strlen(s) + 1;
985 	if (k <= 1) {
986 		tempfree(x);
987 		tempfree(y);
988 		if (a[2] != NULL) {
989 			tempfree(z);
990 		}
991 		x = gettemp();
992 		setsval(x, "");
993 		return(x);
994 	}
995 	m = (int) getfval(y);
996 	if (m <= 0)
997 		m = 1;
998 	else if (m > k)
999 		m = k;
1000 	tempfree(y);
1001 	if (a[2] != NULL) {
1002 		n = (int) getfval(z);
1003 		tempfree(z);
1004 	} else
1005 		n = k - 1;
1006 	if (n < 0)
1007 		n = 0;
1008 	else if (n > k - m)
1009 		n = k - m;
1010 	/* m is start, n is length from there */
1011 	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
1012 	y = gettemp();
1013 	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
1014 	nb = u8_char2byte(s, m-1+n);  /* byte offset of end+1 char in s */
1015 
1016 	temp = s[nb];	/* with thanks to John Linderman */
1017 	s[nb] = '\0';
1018 	setsval(y, s + mb);
1019 	s[nb] = temp;
1020 	tempfree(x);
1021 	return(y);
1022 }
1023 
1024 Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
1025 {
1026 	Cell *x, *y, *z;
1027 	char *s1, *s2, *p1, *p2, *q;
1028 	Awkfloat v = 0.0;
1029 
1030 	x = execute(a[0]);
1031 	s1 = getsval(x);
1032 	y = execute(a[1]);
1033 	s2 = getsval(y);
1034 
1035 	z = gettemp();
1036 	for (p1 = s1; *p1 != '\0'; p1++) {
1037 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
1038 			continue;
1039 		if (*p2 == '\0') {
1040 			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
1041 
1042 		   /* should be a function: used in match() as well */
1043 			int i, len;
1044 			v = 0;
1045 			for (i = 0; i < p1-s1+1; i += len) {
1046 				len = u8_nextlen(s1+i);
1047 				v++;
1048 			}
1049 			break;
1050 		}
1051 	}
1052 	tempfree(x);
1053 	tempfree(y);
1054 	setfval(z, v);
1055 	return(z);
1056 }
1057 
/* return 1 if s contains any utf-8 (2 bytes or more) character, else 0 */
static int has_utf8(char *s)
{
	while (*s != 0) {
		int step = u8_nextlen(s);

		if (step > 1)
			return 1;
		s += step;
	}
	return 0;
}
1069 
1070 #define	MAXNUMSIZE	50
1071 
1072 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
1073 {
1074 	char *fmt;
1075 	char *p, *t;
1076 	const char *os;
1077 	Cell *x;
1078 	int flag = 0, n;
1079 	int fmtwd; /* format width */
1080 	int fmtsz = recsize;
1081 	char *buf = *pbuf;
1082 	int bufsize = *pbufsize;
1083 #define FMTSZ(a)   (fmtsz - ((a) - fmt))
1084 #define BUFSZ(a)   (bufsize - ((a) - buf))
1085 
1086 	static bool first = true;
1087 	static bool have_a_format = false;
1088 
1089 	if (first) {
1090 		char xbuf[100];
1091 
1092 		snprintf(xbuf, sizeof(xbuf), "%a", 42.0);
1093 		have_a_format = (strcmp(xbuf, "0x1.5p+5") == 0);
1094 		first = false;
1095 	}
1096 
1097 	os = s;
1098 	p = buf;
1099 	if ((fmt = (char *) malloc(fmtsz)) == NULL)
1100 		FATAL("out of memory in format()");
1101 	while (*s) {
1102 		adjbuf(&buf, &bufsize, MAXNUMSIZE+1+p-buf, recsize, &p, "format1");
1103 		if (*s != '%') {
1104 			*p++ = *s++;
1105 			continue;
1106 		}
1107 		if (*(s+1) == '%') {
1108 			*p++ = '%';
1109 			s += 2;
1110 			continue;
1111 		}
1112 		fmtwd = atoi(s+1);
1113 		if (fmtwd < 0)
1114 			fmtwd = -fmtwd;
1115 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format2");
1116 		for (t = fmt; (*t++ = *s) != '\0'; s++) {
1117 			if (!adjbuf(&fmt, &fmtsz, MAXNUMSIZE+1+t-fmt, recsize, &t, "format3"))
1118 				FATAL("format item %.30s... ran format() out of memory", os);
1119 			/* Ignore size specifiers */
1120 			if (strchr("hjLlqtz", *s) != NULL) {	/* the ansi panoply */
1121 				t--;
1122 				continue;
1123 			}
1124 			if (isalpha((uschar)*s))
1125 				break;
1126 			if (*s == '$') {
1127 				FATAL("'$' not permitted in awk formats");
1128 			}
1129 			if (*s == '*') {
1130 				if (a == NULL) {
1131 					FATAL("not enough args in printf(%s)", os);
1132 				}
1133 				x = execute(a);
1134 				a = a->nnext;
1135 				snprintf(t - 1, FMTSZ(t - 1),
1136 				    "%d", fmtwd=(int) getfval(x));
1137 				if (fmtwd < 0)
1138 					fmtwd = -fmtwd;
1139 				adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format");
1140 				t = fmt + strlen(fmt);
1141 				tempfree(x);
1142 			}
1143 		}
1144 		*t = '\0';
1145 		if (fmtwd < 0)
1146 			fmtwd = -fmtwd;
1147 		adjbuf(&buf, &bufsize, fmtwd+1+p-buf, recsize, &p, "format4");
1148 		switch (*s) {
1149 		case 'a': case 'A':
1150 			if (have_a_format)
1151 				flag = *s;
1152 			else
1153 				flag = 'f';
1154 			break;
1155 		case 'f': case 'e': case 'g': case 'E': case 'G':
1156 			flag = 'f';
1157 			break;
1158 		case 'd': case 'i': case 'o': case 'x': case 'X': case 'u':
1159 			flag = (*s == 'd' || *s == 'i') ? 'd' : 'u';
1160 			*(t-1) = 'j';
1161 			*t = *s;
1162 			*++t = '\0';
1163 			break;
1164 		case 's':
1165 			flag = 's';
1166 			break;
1167 		case 'c':
1168 			flag = 'c';
1169 			break;
1170 		default:
1171 			WARNING("weird printf conversion %s", fmt);
1172 			flag = '?';
1173 			break;
1174 		}
1175 		if (a == NULL)
1176 			FATAL("not enough args in printf(%s)", os);
1177 		x = execute(a);
1178 		a = a->nnext;
1179 		n = MAXNUMSIZE;
1180 		if (fmtwd > n)
1181 			n = fmtwd;
1182 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
1183 		switch (flag) {
1184 		case '?':
1185 			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
1186 			t = getsval(x);
1187 			n = strlen(t);
1188 			if (fmtwd > n)
1189 				n = fmtwd;
1190 			adjbuf(&buf, &bufsize, 1+strlen(p)+n+p-buf, recsize, &p, "format6");
1191 			p += strlen(p);
1192 			snprintf(p, BUFSZ(p), "%s", t);
1193 			break;
1194 		case 'a':
1195 		case 'A':
1196 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
1197 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
1198 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
1199 
1200 		case 's': {
1201 			t = getsval(x);
1202 			n = strlen(t);
1203 			/* if simple format or no utf-8 in the string, sprintf works */
1204 			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
1205 				if (fmtwd > n)
1206 					n = fmtwd;
1207 				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
1208 					FATAL("huge string/format (%d chars) in printf %.30s..." \
1209 						" ran format() out of memory", n, t);
1210 				snprintf(p, BUFSZ(p), fmt, t);
1211 				break;
1212 			}
1213 
1214 			/* get here if string has utf-8 chars and fmt is not plain %s */
1215 			/* "%-w.ps", where -, w and .p are all optional */
1216 			/* '0' before the w is a flag character */
1217 			/* fmt points at % */
1218 			int ljust = 0, wid = 0, prec = n, pad = 0;
1219 			char *f = fmt+1;
1220 			if (f[0] == '-') {
1221 				ljust = 1;
1222 				f++;
1223 			}
1224 			// flags '0' and '+' are recognized but skipped
1225 			if (f[0] == '0') {
1226 				f++;
1227 				if (f[0] == '+')
1228 					f++;
1229 			}
1230 			if (f[0] == '+') {
1231 				f++;
1232 				if (f[0] == '0')
1233 					f++;
1234 			}
1235 			if (isdigit((unsigned char)f[0])) { /* there is a wid */
1236 				wid = strtol(f, &f, 10);
1237 			}
1238 			if (f[0] == '.') { /* there is a .prec */
1239 				prec = strtol(++f, &f, 10);
1240 			}
1241 			if (prec > u8_strlen(t))
1242 				prec = u8_strlen(t);
1243 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1244 			int i, k, l;
1245 
1246 			if (ljust) { // print prec chars from t, then pad blanks
1247 				l = u8_char2byte(t, prec);
1248 				for (k = 0; k < l; k++) {
1249 					//putchar(t[k]);
1250 					*p++ = t[k];
1251 				}
1252 				for (i = 0; i < pad; i++) {
1253 					//printf(" ");
1254 					*p++ = ' ';
1255 				}
1256 			} else { // print pad blanks, then prec chars from t
1257 				for (i = 0; i < pad; i++) {
1258 					//printf(" ");
1259 					*p++ = ' ';
1260 				}
1261 				l = u8_char2byte(t, prec);
1262 				for (k = 0; k < l; k++) {
1263 					//putchar(t[k]);
1264 					*p++ = t[k];
1265 				}
1266 			}
1267 			*p = 0;
1268 			break;
1269 		}
1270 
1271                case 'c': {
1272 			/*
1273 			 * If a numeric value is given, awk should just turn
1274 			 * it into a character and print it:
1275 			 *      BEGIN { printf("%c\n", 65) }
1276 			 * prints "A".
1277 			 *
1278 			 * But what if the numeric value is > 128 and
1279 			 * represents a valid Unicode code point?!? We do
1280 			 * our best to convert it back into UTF-8. If we
1281 			 * can't, we output the encoding of the Unicode
1282 			 * "invalid character", 0xFFFD.
1283 			 */
1284 			if (isnum(x)) {
1285 				int charval = (int) getfval(x);
1286 
1287 				if (charval != 0) {
1288 					if (charval < 128 || awk_mb_cur_max == 1)
1289 						snprintf(p, BUFSZ(p), fmt, charval);
1290 					else {
1291 						// possible unicode character
1292 						size_t count;
1293 						char *bs = wide_char_to_byte_str(charval, &count);
1294 
1295 						if (bs == NULL)	{ // invalid character
1296 							// use unicode invalid character, 0xFFFD
1297 							static char invalid_char[] = "\357\277\275";
1298 							bs = invalid_char;
1299 							count = 3;
1300 						}
1301 						t = bs;
1302 						n = count;
1303 						goto format_percent_c;
1304 					}
1305 				} else {
1306 					*p++ = '\0'; /* explicit null byte */
1307 					*p = '\0';   /* next output will start here */
1308 				}
1309 				break;
1310 			}
1311 			t = getsval(x);
1312 			n = u8_nextlen(t);
1313 		format_percent_c:
1314 			if (n < 2) { /* not utf8 */
1315 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
1316 				break;
1317 			}
1318 
1319 			// utf8 character, almost same song and dance as for %s
1320 			int ljust = 0, wid = 0, prec = n, pad = 0;
1321 			char *f = fmt+1;
1322 			if (f[0] == '-') {
1323 				ljust = 1;
1324 				f++;
1325 			}
1326 			// flags '0' and '+' are recognized but skipped
1327 			if (f[0] == '0') {
1328 				f++;
1329 				if (f[0] == '+')
1330 					f++;
1331 			}
1332 			if (f[0] == '+') {
1333 				f++;
1334 				if (f[0] == '0')
1335 					f++;
1336 			}
1337 			if (isdigit((unsigned char)f[0])) { /* there is a wid */
1338 				wid = strtol(f, &f, 10);
1339 			}
1340 			if (f[0] == '.') { /* there is a .prec */
1341 				prec = strtol(++f, &f, 10);
1342 			}
1343 			if (prec > 1)           // %c --> only one character
1344 				prec = 1;
1345 			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
1346 			int i;
1347 
1348 			if (ljust) { // print one char from t, then pad blanks
1349 				for (i = 0; i < n; i++)
1350 					*p++ = t[i];
1351 				for (i = 0; i < pad; i++) {
1352 					//printf(" ");
1353 					*p++ = ' ';
1354 				}
1355 			} else { // print pad blanks, then prec chars from t
1356 				for (i = 0; i < pad; i++) {
1357 					//printf(" ");
1358 					*p++ = ' ';
1359 				}
1360 				for (i = 0; i < n; i++)
1361 					*p++ = t[i];
1362 			}
1363 			*p = 0;
1364 			break;
1365 		}
1366 		default:
1367 			FATAL("can't happen: bad conversion %c in format()", flag);
1368 		}
1369 
1370 		tempfree(x);
1371 		p += strlen(p);
1372 		s++;
1373 	}
1374 	*p = '\0';
1375 	free(fmt);
1376 	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
1377 		x = execute(a);
1378 		tempfree(x);
1379 	}
1380 	*pbuf = buf;
1381 	*pbufsize = bufsize;
1382 	return p - buf;
1383 }
1384 
1385 Cell *awksprintf(Node **a, int n)		/* sprintf(a[0]) */
1386 {
1387 	Cell *x;
1388 	Node *y;
1389 	char *buf;
1390 	int bufsz=3*recsize;
1391 
1392 	if ((buf = (char *) malloc(bufsz)) == NULL)
1393 		FATAL("out of memory in awksprintf");
1394 	y = a[0]->nnext;
1395 	x = execute(a[0]);
1396 	if (format(&buf, &bufsz, getsval(x), y) == -1)
1397 		FATAL("sprintf string %.30s... too long.  can't happen.", buf);
1398 	tempfree(x);
1399 	x = gettemp();
1400 	x->sval = buf;
1401 	x->tval = STR;
1402 	return(x);
1403 }
1404 
1405 Cell *awkprintf(Node **a, int n)		/* printf */
1406 {	/* a[0] is list of args, starting with format string */
1407 	/* a[1] is redirection operator, a[2] is redirection file */
1408 	FILE *fp;
1409 	Cell *x;
1410 	Node *y;
1411 	char *buf;
1412 	int len;
1413 	int bufsz=3*recsize;
1414 
1415 	if ((buf = (char *) malloc(bufsz)) == NULL)
1416 		FATAL("out of memory in awkprintf");
1417 	y = a[0]->nnext;
1418 	x = execute(a[0]);
1419 	if ((len = format(&buf, &bufsz, getsval(x), y)) == -1)
1420 		FATAL("printf string %.30s... too long.  can't happen.", buf);
1421 	tempfree(x);
1422 	if (a[1] == NULL) {
1423 		/* fputs(buf, stdout); */
1424 		fwrite(buf, len, 1, stdout);
1425 		if (ferror(stdout))
1426 			FATAL("write error on stdout");
1427 	} else {
1428 		fp = redirect(ptoi(a[1]), a[2]);
1429 		/* fputs(buf, fp); */
1430 		fwrite(buf, len, 1, fp);
1431 		fflush(fp);
1432 		if (ferror(fp))
1433 			FATAL("write error on %s", filename(fp));
1434 	}
1435 	free(buf);
1436 	return(True);
1437 }
1438 
1439 Cell *arith(Node **a, int n)	/* a[0] + a[1], etc.  also -a[0] */
1440 {
1441 	Awkfloat i, j = 0;
1442 	double v;
1443 	Cell *x, *y, *z;
1444 
1445 	x = execute(a[0]);
1446 	i = getfval(x);
1447 	tempfree(x);
1448 	if (n != UMINUS && n != UPLUS) {
1449 		y = execute(a[1]);
1450 		j = getfval(y);
1451 		tempfree(y);
1452 	}
1453 	z = gettemp();
1454 	switch (n) {
1455 	case ADD:
1456 		i += j;
1457 		break;
1458 	case MINUS:
1459 		i -= j;
1460 		break;
1461 	case MULT:
1462 		i *= j;
1463 		break;
1464 	case DIVIDE:
1465 		if (j == 0)
1466 			FATAL("division by zero");
1467 		i /= j;
1468 		break;
1469 	case MOD:
1470 		if (j == 0)
1471 			FATAL("division by zero in mod");
1472 		modf(i/j, &v);
1473 		i = i - j * v;
1474 		break;
1475 	case UMINUS:
1476 		i = -i;
1477 		break;
1478 	case UPLUS: /* handled by getfval(), above */
1479 		break;
1480 	case POWER:
1481 		if (j >= 0 && modf(j, &v) == 0.0)	/* pos integer exponent */
1482 			i = ipow(i, (int) j);
1483                else {
1484 			errno = 0;
1485 			i = errcheck(pow(i, j), "pow");
1486                }
1487 		break;
1488 	default:	/* can't happen */
1489 		FATAL("illegal arithmetic operator %d", n);
1490 	}
1491 	setfval(z, i);
1492 	return(z);
1493 }
1494 
/*
 * ipow: x**n for an integer exponent (ought to be done by pow, but
 * isn't always).  Returns 1 for n <= 0.
 *
 * Walks the bits of n from the most significant one down, squaring the
 * running value at each step and folding in x where the bit is set.
 * The products are formed as (x * acc) * acc, i.e. in the same order as
 * the halving recursion x * v * v.
 */
double ipow(double x, int n)	/* x**n.  ought to be done by pow, but isn't always */
{
	double acc = 1;
	unsigned int bit;

	if (n <= 0)
		return 1;
	/* locate the highest set bit of n */
	for (bit = 1; bit <= (unsigned int) n / 2; bit <<= 1)
		continue;
	for (; bit != 0; bit >>= 1) {
		if ((unsigned int) n & bit)
			acc = x * acc * acc;
		else
			acc = acc * acc;
	}
	return acc;
}
1507 
1508 Cell *incrdecr(Node **a, int n)		/* a[0]++, etc. */
1509 {
1510 	Cell *x, *z;
1511 	int k;
1512 	Awkfloat xf;
1513 
1514 	x = execute(a[0]);
1515 	xf = getfval(x);
1516 	k = (n == PREINCR || n == POSTINCR) ? 1 : -1;
1517 	if (n == PREINCR || n == PREDECR) {
1518 		setfval(x, xf + k);
1519 		return(x);
1520 	}
1521 	z = gettemp();
1522 	setfval(z, xf);
1523 	setfval(x, xf + k);
1524 	tempfree(x);
1525 	return(z);
1526 }
1527 
1528 Cell *assign(Node **a, int n)	/* a[0] = a[1], a[0] += a[1], etc. */
1529 {		/* this is subtle; don't muck with it. */
1530 	Cell *x, *y;
1531 	Awkfloat xf, yf;
1532 	double v;
1533 
1534 	y = execute(a[1]);
1535 	x = execute(a[0]);
1536 	if (n == ASSIGN) {	/* ordinary assignment */
1537 		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
1538 			;	/* self-assignment: leave alone unless it's a field or NF */
1539 		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
1540 			yf = getfval(y);
1541 			setsval(x, getsval(y));
1542 			x->fval = yf;
1543 			x->tval |= NUM;
1544 		}
1545 		else if (isstr(y))
1546 			setsval(x, getsval(y));
1547 		else if (isnum(y))
1548 			setfval(x, getfval(y));
1549 		else
1550 			funnyvar(y, "read value of");
1551 		tempfree(y);
1552 		return(x);
1553 	}
1554 	xf = getfval(x);
1555 	yf = getfval(y);
1556 	switch (n) {
1557 	case ADDEQ:
1558 		xf += yf;
1559 		break;
1560 	case SUBEQ:
1561 		xf -= yf;
1562 		break;
1563 	case MULTEQ:
1564 		xf *= yf;
1565 		break;
1566 	case DIVEQ:
1567 		if (yf == 0)
1568 			FATAL("division by zero in /=");
1569 		xf /= yf;
1570 		break;
1571 	case MODEQ:
1572 		if (yf == 0)
1573 			FATAL("division by zero in %%=");
1574 		modf(xf/yf, &v);
1575 		xf = xf - yf * v;
1576 		break;
1577 	case POWEQ:
1578 		if (yf >= 0 && modf(yf, &v) == 0.0)	/* pos integer exponent */
1579 			xf = ipow(xf, (int) yf);
1580                else {
1581 			errno = 0;
1582 			xf = errcheck(pow(xf, yf), "pow");
1583                }
1584 		break;
1585 	default:
1586 		FATAL("illegal assignment operator %d", n);
1587 		break;
1588 	}
1589 	tempfree(y);
1590 	setfval(x, xf);
1591 	return(x);
1592 }
1593 
1594 Cell *cat(Node **a, int q)	/* a[0] cat a[1] */
1595 {
1596 	Cell *x, *y, *z;
1597 	int n1, n2;
1598 	char *s = NULL;
1599 	int ssz = 0;
1600 
1601 	x = execute(a[0]);
1602 	n1 = strlen(getsval(x));
1603 	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
1604 	memcpy(s, x->sval, n1);
1605 
1606 	tempfree(x);
1607 
1608 	y = execute(a[1]);
1609 	n2 = strlen(getsval(y));
1610 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
1611 	memcpy(s + n1, y->sval, n2);
1612 	s[n1 + n2] = '\0';
1613 
1614 	tempfree(y);
1615 
1616 	z = gettemp();
1617 	z->sval = s;
1618 	z->tval = STR;
1619 
1620 	return(z);
1621 }
1622 
1623 Cell *pastat(Node **a, int n)	/* a[0] { a[1] } */
1624 {
1625 	Cell *x;
1626 
1627 	if (a[0] == NULL)
1628 		x = execute(a[1]);
1629 	else {
1630 		x = execute(a[0]);
1631 		if (istrue(x)) {
1632 			tempfree(x);
1633 			x = execute(a[1]);
1634 		}
1635 	}
1636 	return x;
1637 }
1638 
1639 Cell *dopa2(Node **a, int n)	/* a[0], a[1] { a[2] } */
1640 {
1641 	Cell *x;
1642 	int pair;
1643 
1644 	pair = ptoi(a[3]);
1645 	if (pairstack[pair] == 0) {
1646 		x = execute(a[0]);
1647 		if (istrue(x))
1648 			pairstack[pair] = 1;
1649 		tempfree(x);
1650 	}
1651 	if (pairstack[pair] == 1) {
1652 		x = execute(a[1]);
1653 		if (istrue(x))
1654 			pairstack[pair] = 0;
1655 		tempfree(x);
1656 		x = execute(a[2]);
1657 		return(x);
1658 	}
1659 	return(False);
1660 }
1661 
1662 Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
1663 {
1664 	Cell *x = NULL, *y, *ap;
1665 	const char *s, *origs, *t;
1666 	const char *fs = NULL;
1667 	char *origfs = NULL;
1668 	int sep;
1669 	char temp, num[50];
1670 	int n, tempstat, arg3type;
1671 	int j;
1672 	double result;
1673 
1674 	y = execute(a[0]);	/* source string */
1675 	origs = s = strdup(getsval(y));
1676 	tempfree(y);
1677 	arg3type = ptoi(a[3]);
1678 	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
1679 		fs = getsval(fsloc);
1680 	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
1681 		x = execute(a[2]);
1682 		fs = origfs = strdup(getsval(x));
1683 		tempfree(x);
1684 	} else if (arg3type == REGEXPR) {
1685 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
1686 	} else {
1687 		FATAL("illegal type of split");
1688 	}
1689 	sep = *fs;
1690 	ap = execute(a[1]);	/* array name */
1691 /* BUG 7/26/22: this appears not to reset array: see C1/asplit */
1692 	freesymtab(ap);
1693 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
1694 	ap->tval &= ~STR;
1695 	ap->tval |= ARR;
1696 	ap->sval = (char *) makesymtab(NSYMTAB);
1697 
1698 	n = 0;
1699         if (arg3type == REGEXPR && strlen((char*)((fa*)a[2])->restr) == 0) {
1700 		/* split(s, a, //); have to arrange that it looks like empty sep */
1701 		arg3type = 0;
1702 		fs = "";
1703 		sep = 0;
1704 	}
1705 	if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) {	/* reg expr */
1706 		fa *pfa;
1707 		if (arg3type == REGEXPR) {	/* it's ready already */
1708 			pfa = (fa *) a[2];
1709 		} else {
1710 			pfa = makedfa(fs, 1);
1711 		}
1712 		if (nematch(pfa,s)) {
1713 			tempstat = pfa->initstat;
1714 			pfa->initstat = 2;
1715 			do {
1716 				n++;
1717 				snprintf(num, sizeof(num), "%d", n);
1718 				temp = *patbeg;
1719 				setptr(patbeg, '\0');
1720 				if (is_number(s, & result))
1721 					setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1722 				else
1723 					setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1724 				setptr(patbeg, temp);
1725 				s = patbeg + patlen;
1726 				if (*(patbeg+patlen-1) == '\0' || *s == '\0') {
1727 					n++;
1728 					snprintf(num, sizeof(num), "%d", n);
1729 					setsymtab(num, "", 0.0, STR, (Array *) ap->sval);
1730 					pfa->initstat = tempstat;
1731 					goto spdone;
1732 				}
1733 			} while (nematch(pfa,s));
1734 			pfa->initstat = tempstat; 	/* bwk: has to be here to reset */
1735 							/* cf gsub and refldbld */
1736 		}
1737 		n++;
1738 		snprintf(num, sizeof(num), "%d", n);
1739 		if (is_number(s, & result))
1740 			setsymtab(num, s, result, STR|NUM, (Array *) ap->sval);
1741 		else
1742 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
1743   spdone:
1744 		pfa = NULL;
1745 
1746 	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
1747 		char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */
1748 		for (;;) {
1749 			char *fr = newt;
1750 			n++;
1751 			if (*s == '"' ) { /* start of "..." */
1752 				for (s++ ; *s != '\0'; ) {
1753 					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
1754 						s += 2; /* doubled quote */
1755 						*fr++ = '"';
1756 					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
1757 						s++; /* skip over closing quote */
1758 						break;
1759 					} else {
1760 						*fr++ = *s++;
1761 					}
1762 				}
1763 				*fr++ = 0;
1764 			} else {	/* unquoted field */
1765 				while (*s != ',' && *s != '\0')
1766 					*fr++ = *s++;
1767 				*fr++ = 0;
1768 			}
1769 			snprintf(num, sizeof(num), "%d", n);
1770 			if (is_number(newt, &result))
1771 				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
1772 			else
1773 				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
1774 			if (*s++ == '\0')
1775 				break;
1776 		}
1777 		free(newt);
1778 
1779 	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
1780 		for (n = 0; ; ) {
1781 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
1782 			while (ISWS(*s))
1783 				s++;
1784 			if (*s == '\0')
1785 				break;
1786 			n++;
1787 			t = s;
1788 			do
1789 				s++;
1790 			while (*s != '\0' && !ISWS(*s));
1791 			temp = *s;
1792 			setptr(s, '\0');
1793 			snprintf(num, sizeof(num), "%d", n);
1794 			if (is_number(t, & result))
1795 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1796 			else
1797 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1798 			setptr(s, temp);
1799 			if (*s != '\0')
1800 				s++;
1801 		}
1802 
1803 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
1804 		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
1805 			char buf[10];
1806 			n++;
1807 			snprintf(num, sizeof(num), "%d", n);
1808 
1809 			for (j = 0; j < u8_nextlen(s); j++) {
1810 				buf[j] = s[j];
1811 			}
1812 			buf[j] = '\0';
1813 
1814 			if (isdigit((uschar)buf[0]))
1815 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
1816 			else
1817 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
1818 		}
1819 
1820 	} else if (*s != '\0') {  /* some random single character */
1821 		for (;;) {
1822 			n++;
1823 			t = s;
1824 			while (*s != sep && *s != '\0')
1825 				s++;
1826 			temp = *s;
1827 			setptr(s, '\0');
1828 			snprintf(num, sizeof(num), "%d", n);
1829 			if (is_number(t, & result))
1830 				setsymtab(num, t, result, STR|NUM, (Array *) ap->sval);
1831 			else
1832 				setsymtab(num, t, 0.0, STR, (Array *) ap->sval);
1833 			setptr(s, temp);
1834 			if (*s++ == '\0')
1835 				break;
1836 		}
1837 	}
1838 	tempfree(ap);
1839 	xfree(origs);
1840 	xfree(origfs);
1841 	x = gettemp();
1842 	x->tval = NUM;
1843 	x->fval = n;
1844 	return(x);
1845 }
1846 
1847 Cell *condexpr(Node **a, int n)	/* a[0] ? a[1] : a[2] */
1848 {
1849 	Cell *x;
1850 
1851 	x = execute(a[0]);
1852 	if (istrue(x)) {
1853 		tempfree(x);
1854 		x = execute(a[1]);
1855 	} else {
1856 		tempfree(x);
1857 		x = execute(a[2]);
1858 	}
1859 	return(x);
1860 }
1861 
1862 Cell *ifstat(Node **a, int n)	/* if (a[0]) a[1]; else a[2] */
1863 {
1864 	Cell *x;
1865 
1866 	x = execute(a[0]);
1867 	if (istrue(x)) {
1868 		tempfree(x);
1869 		x = execute(a[1]);
1870 	} else if (a[2] != NULL) {
1871 		tempfree(x);
1872 		x = execute(a[2]);
1873 	}
1874 	return(x);
1875 }
1876 
1877 Cell *whilestat(Node **a, int n)	/* while (a[0]) a[1] */
1878 {
1879 	Cell *x;
1880 
1881 	for (;;) {
1882 		x = execute(a[0]);
1883 		if (!istrue(x))
1884 			return(x);
1885 		tempfree(x);
1886 		x = execute(a[1]);
1887 		if (isbreak(x)) {
1888 			x = True;
1889 			return(x);
1890 		}
1891 		if (isnext(x) || isexit(x) || isret(x))
1892 			return(x);
1893 		tempfree(x);
1894 	}
1895 }
1896 
1897 Cell *dostat(Node **a, int n)	/* do a[0]; while(a[1]) */
1898 {
1899 	Cell *x;
1900 
1901 	for (;;) {
1902 		x = execute(a[0]);
1903 		if (isbreak(x))
1904 			return True;
1905 		if (isnext(x) || isexit(x) || isret(x))
1906 			return(x);
1907 		tempfree(x);
1908 		x = execute(a[1]);
1909 		if (!istrue(x))
1910 			return(x);
1911 		tempfree(x);
1912 	}
1913 }
1914 
1915 Cell *forstat(Node **a, int n)	/* for (a[0]; a[1]; a[2]) a[3] */
1916 {
1917 	Cell *x;
1918 
1919 	x = execute(a[0]);
1920 	tempfree(x);
1921 	for (;;) {
1922 		if (a[1]!=NULL) {
1923 			x = execute(a[1]);
1924 			if (!istrue(x)) return(x);
1925 			else tempfree(x);
1926 		}
1927 		x = execute(a[3]);
1928 		if (isbreak(x))		/* turn off break */
1929 			return True;
1930 		if (isnext(x) || isexit(x) || isret(x))
1931 			return(x);
1932 		tempfree(x);
1933 		x = execute(a[2]);
1934 		tempfree(x);
1935 	}
1936 }
1937 
1938 Cell *instat(Node **a, int n)	/* for (a[0] in a[1]) a[2] */
1939 {
1940 	Cell *x, *vp, *arrayp, *cp, *ncp;
1941 	Array *tp;
1942 	int i;
1943 
1944 	vp = execute(a[0]);
1945 	arrayp = execute(a[1]);
1946 	if (!isarr(arrayp)) {
1947 		return True;
1948 	}
1949 	tp = (Array *) arrayp->sval;
1950 	tempfree(arrayp);
1951 	for (i = 0; i < tp->size; i++) {	/* this routine knows too much */
1952 		for (cp = tp->tab[i]; cp != NULL; cp = ncp) {
1953 			setsval(vp, cp->nval);
1954 			ncp = cp->cnext;
1955 			x = execute(a[2]);
1956 			if (isbreak(x)) {
1957 				tempfree(vp);
1958 				return True;
1959 			}
1960 			if (isnext(x) || isexit(x) || isret(x)) {
1961 				tempfree(vp);
1962 				return(x);
1963 			}
1964 			tempfree(x);
1965 		}
1966 	}
1967 	return True;
1968 }
1969 
1970 static char *nawk_convert(const char *s, int (*fun_c)(int),
1971     wint_t (*fun_wc)(wint_t))
1972 {
1973 	char *buf      = NULL;
1974 	char *pbuf     = NULL;
1975 	const char *ps = NULL;
1976 	size_t n       = 0;
1977 	wchar_t wc;
1978 	const size_t sz = awk_mb_cur_max;
1979 	int unused;
1980 
1981 	if (sz == 1) {
1982 		buf = tostring(s);
1983 
1984 		for (pbuf = buf; *pbuf; pbuf++)
1985 			*pbuf = fun_c((uschar)*pbuf);
1986 
1987 		return buf;
1988 	} else {
1989 		/* upper/lower character may be shorter/longer */
1990 		buf = tostringN(s, strlen(s) * sz + 1);
1991 
1992 		(void) mbtowc(NULL, NULL, 0);	/* reset internal state */
1993 		/*
1994 		 * Reset internal state here too.
1995 		 * Assign result to avoid a compiler warning. (Casting to void
1996 		 * doesn't work.)
1997 		 * Increment said variable to avoid a different warning.
1998 		 */
1999 		unused = wctomb(NULL, L'\0');
2000 		unused++;
2001 
2002 		ps   = s;
2003 		pbuf = buf;
2004 		while (n = mbtowc(&wc, ps, sz),
2005 		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
2006 		{
2007 			ps += n;
2008 
2009 			n = wctomb(pbuf, fun_wc(wc));
2010 			if (n == (size_t)-1)
2011 				FATAL("illegal wide character %s", s);
2012 
2013 			pbuf += n;
2014 		}
2015 
2016 		*pbuf = '\0';
2017 
2018 		if (n)
2019 			FATAL("illegal byte sequence %s", s);
2020 
2021 		return buf;
2022 	}
2023 }
2024 
#ifdef __DJGPP__
/*
 * towupper: stand-in used only when building with DJGPP.  Values in
 * 0..255 are mapped with toupper(); anything else is returned unchanged.
 */
static wint_t towupper(wint_t wc)
{
	return (wc >= 0 && wc < 256) ? (wint_t) toupper(wc & 0xFF) : wc;
}

/*
 * towlower: stand-in used only when building with DJGPP.  Values in
 * 0..255 are mapped with tolower(); anything else is returned unchanged.
 */
static wint_t towlower(wint_t wc)
{
	return (wc >= 0 && wc < 256) ? (wint_t) tolower(wc & 0xFF) : wc;
}
#endif
2042 
/* nawk_toupper: newly allocated upper-case copy of s (see nawk_convert) */
static char *nawk_toupper(const char *s)
{
	char *upper = nawk_convert(s, toupper, towupper);

	return upper;
}

/* nawk_tolower: newly allocated lower-case copy of s (see nawk_convert) */
static char *nawk_tolower(const char *s)
{
	char *lower = nawk_convert(s, tolower, towlower);

	return lower;
}
2052 
2053 
2054 
2055 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
2056 {
2057 	Cell *x, *y;
2058 	Awkfloat u = 0;
2059 	int t, sz;
2060 	Awkfloat tmp;
2061 	char *buf, *fmt;
2062 	Node *nextarg;
2063 	FILE *fp;
2064 	int status = 0;
2065 	time_t tv;
2066 	struct tm *tm, tmbuf;
2067 	int estatus = 0;
2068 
2069 	t = ptoi(a[0]);
2070 	x = execute(a[1]);
2071 	nextarg = a[1]->nnext;
2072 	switch (t) {
2073 	case FLENGTH:
2074 		if (isarr(x))
2075 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
2076 		else
2077 			u = u8_strlen(getsval(x));
2078 		break;
2079 	case FLOG:
2080 		errno = 0;
2081 		u = errcheck(log(getfval(x)), "log");
2082 		break;
2083 	case FINT:
2084 		modf(getfval(x), &u); break;
2085 	case FEXP:
2086 		errno = 0;
2087 		u = errcheck(exp(getfval(x)), "exp");
2088 		break;
2089 	case FSQRT:
2090 		errno = 0;
2091 		u = errcheck(sqrt(getfval(x)), "sqrt");
2092 		break;
2093 	case FSIN:
2094 		u = sin(getfval(x)); break;
2095 	case FCOS:
2096 		u = cos(getfval(x)); break;
2097 	case FATAN:
2098 		if (nextarg == NULL) {
2099 			WARNING("atan2 requires two arguments; returning 1.0");
2100 			u = 1.0;
2101 		} else {
2102 			y = execute(a[1]->nnext);
2103 			u = atan2(getfval(x), getfval(y));
2104 			tempfree(y);
2105 			nextarg = nextarg->nnext;
2106 		}
2107 		break;
2108 	case FCOMPL:
2109 		u = ~((int)getfval(x));
2110 		break;
2111 	case FAND:
2112 		if (nextarg == 0) {
2113 			WARNING("and requires two arguments; returning 0");
2114 			u = 0;
2115 			break;
2116 		}
2117 		y = execute(a[1]->nnext);
2118 		u = ((int)getfval(x)) & ((int)getfval(y));
2119 		tempfree(y);
2120 		nextarg = nextarg->nnext;
2121 		break;
2122 	case FFOR:
2123 		if (nextarg == 0) {
2124 			WARNING("or requires two arguments; returning 0");
2125 			u = 0;
2126 			break;
2127 		}
2128 		y = execute(a[1]->nnext);
2129 		u = ((int)getfval(x)) | ((int)getfval(y));
2130 		tempfree(y);
2131 		nextarg = nextarg->nnext;
2132 		break;
2133 	case FXOR:
2134 		if (nextarg == 0) {
2135 			WARNING("xor requires two arguments; returning 0");
2136 			u = 0;
2137 			break;
2138 		}
2139 		y = execute(a[1]->nnext);
2140 		u = ((int)getfval(x)) ^ ((int)getfval(y));
2141 		tempfree(y);
2142 		nextarg = nextarg->nnext;
2143 		break;
2144 	case FLSHIFT:
2145 		if (nextarg == 0) {
2146 			WARNING("lshift requires two arguments; returning 0");
2147 			u = 0;
2148 			break;
2149 		}
2150 		y = execute(a[1]->nnext);
2151 		u = ((int)getfval(x)) << ((int)getfval(y));
2152 		tempfree(y);
2153 		nextarg = nextarg->nnext;
2154 		break;
2155 	case FRSHIFT:
2156 		if (nextarg == 0) {
2157 			WARNING("rshift requires two arguments; returning 0");
2158 			u = 0;
2159 			break;
2160 		}
2161 		y = execute(a[1]->nnext);
2162 		u = ((int)getfval(x)) >> ((int)getfval(y));
2163 		tempfree(y);
2164 		nextarg = nextarg->nnext;
2165 		break;
2166 	case FSYSTEM:
2167 		fflush(stdout);		/* in case something is buffered already */
2168 		estatus = status = system(getsval(x));
2169 		if (status != -1) {
2170 			if (WIFEXITED(status)) {
2171 				estatus = WEXITSTATUS(status);
2172 			} else if (WIFSIGNALED(status)) {
2173 				estatus = WTERMSIG(status) + 256;
2174 #ifdef WCOREDUMP
2175 				if (WCOREDUMP(status))
2176 					estatus += 256;
2177 #endif
2178 			} else	/* something else?!? */
2179 				estatus = 0;
2180 		}
2181 		/* else estatus was set to -1 */
2182 		u = estatus;
2183 		break;
2184 	case FRAND:
2185 		/* random() returns numbers in [0..2^31-1]
2186 		 * in order to get a number in [0, 1), divide it by 2^31
2187 		 */
2188 		u = (Awkfloat) random() / (0x7fffffffL + 0x1UL);
2189 		break;
2190 	case FSRAND:
2191 		if (isrec(x))	/* no argument provided */
2192 			u = time((time_t *)0);
2193 		else
2194 			u = getfval(x);
2195 		tmp = u;
2196 		srandom((unsigned long) u);
2197 		u = srand_seed;
2198 		srand_seed = tmp;
2199 		break;
2200 	case FTOUPPER:
2201 	case FTOLOWER:
2202 		if (t == FTOUPPER)
2203 			buf = nawk_toupper(getsval(x));
2204 		else
2205 			buf = nawk_tolower(getsval(x));
2206 		tempfree(x);
2207 		x = gettemp();
2208 		setsval(x, buf);
2209 		free(buf);
2210 		return x;
2211 	case FFLUSH:
2212 		if (isrec(x) || strlen(getsval(x)) == 0) {
2213 			flush_all();	/* fflush() or fflush("") -> all */
2214 			u = 0;
2215 		} else if ((fp = openfile(FFLUSH, getsval(x), NULL)) == NULL)
2216 			u = EOF;
2217 		else
2218 			u = fflush(fp);
2219 		break;
2220 	case FMKTIME:
2221 		memset(&tmbuf, 0, sizeof(tmbuf));
2222 		tm = &tmbuf;
2223 		t = sscanf(getsval(x), "%d %d %d %d %d %d %d",
2224 		    &tm->tm_year, &tm->tm_mon, &tm->tm_mday, &tm->tm_hour,
2225 		    &tm->tm_min, &tm->tm_sec, &tm->tm_isdst);
2226 		switch (t) {
2227 		case 6:
2228 			tm->tm_isdst = -1;	/* let mktime figure it out */
2229 			/* FALLTHROUGH */
2230 		case 7:
2231 			tm->tm_year -= 1900;
2232 			tm->tm_mon--;
2233 			u = mktime(tm);
2234 			break;
2235 		default:
2236 			u = -1;
2237 			break;
2238 		}
2239 		break;
2240 	case FSYSTIME:
2241 		u = time((time_t *) 0);
2242 		break;
2243 	case FSTRFTIME:
2244 		/* strftime([format [,timestamp]]) */
2245 		if (nextarg) {
2246 			y = execute(nextarg);
2247 			nextarg = nextarg->nnext;
2248 			tv = (time_t) getfval(y);
2249 			tempfree(y);
2250 		} else
2251 			tv = time((time_t *) 0);
2252 		tm = localtime(&tv);
2253 		if (tm == NULL)
2254 			FATAL("bad time %ld", (long)tv);
2255 
2256 		if (isrec(x)) {
2257 			/* format argument not provided, use default */
2258 			fmt = tostring("%a %b %d %H:%M:%S %Z %Y");
2259 		} else
2260 			fmt = tostring(getsval(x));
2261 
2262 		sz = 32;
2263 		buf = NULL;
2264 		do {
2265 			if ((buf = realloc(buf, (sz *= 2))) == NULL)
2266 				FATAL("out of memory in strftime");
2267 		} while (strftime(buf, sz, fmt, tm) == 0 && fmt[0] != '\0');
2268 
2269 		y = gettemp();
2270 		setsval(y, buf);
2271 		free(fmt);
2272 		free(buf);
2273 
2274 		return y;
2275 	default:	/* can't happen */
2276 		FATAL("illegal function type %d", t);
2277 		break;
2278 	}
2279 	tempfree(x);
2280 	x = gettemp();
2281 	setfval(x, u);
2282 	if (nextarg != NULL) {
2283 		WARNING("warning: function has too many arguments");
2284 		for ( ; nextarg; nextarg = nextarg->nnext) {
2285 			y = execute(nextarg);
2286 			tempfree(y);
2287 		}
2288 	}
2289 	return(x);
2290 }
2291 
2292 Cell *printstat(Node **a, int n)	/* print a[0] */
2293 {
2294 	Node *x;
2295 	Cell *y;
2296 	FILE *fp;
2297 
2298 	if (a[1] == NULL)	/* a[1] is redirection operator, a[2] is file */
2299 		fp = stdout;
2300 	else
2301 		fp = redirect(ptoi(a[1]), a[2]);
2302 	for (x = a[0]; x != NULL; x = x->nnext) {
2303 		y = execute(x);
2304 		fputs(getpssval(y), fp);
2305 		tempfree(y);
2306 		if (x->nnext == NULL)
2307 			fputs(getsval(orsloc), fp);
2308 		else
2309 			fputs(getsval(ofsloc), fp);
2310 	}
2311 	if (a[1] != NULL)
2312 		fflush(fp);
2313 	if (ferror(fp))
2314 		FATAL("write error on %s", filename(fp));
2315 	return(True);
2316 }
2317 
2318 Cell *nullproc(Node **a, int n)
2319 {
2320 	return 0;
2321 }
2322 
2323 
2324 FILE *redirect(int a, Node *b)	/* set up all i/o redirections */
2325 {
2326 	FILE *fp;
2327 	Cell *x;
2328 	char *fname;
2329 
2330 	x = execute(b);
2331 	fname = getsval(x);
2332 	fp = openfile(a, fname, NULL);
2333 	if (fp == NULL)
2334 		FATAL("can't open file %s", fname);
2335 	tempfree(x);
2336 	return fp;
2337 }
2338 
/* one entry per stream that awk has open */
struct files {
	FILE	*fp;		/* the stream; NULL marks a free slot */
	const char	*fname;	/* name the stream was opened under */
	int	mode;	/* '|', 'a', 'w' => LE/LT, GT */
};

struct files *files;	/* table of open streams */

size_t nfiles;		/* number of slots in files */
2346 
2347 static void stdinit(void)	/* in case stdin, etc., are not constants */
2348 {
2349 	nfiles = FOPEN_MAX;
2350 	files = (struct files *) calloc(nfiles, sizeof(*files));
2351 	if (files == NULL)
2352 		FATAL("can't allocate file memory for %zu files", nfiles);
2353         files[0].fp = stdin;
2354 	files[0].fname = tostring("/dev/stdin");
2355 	files[0].mode = LT;
2356         files[1].fp = stdout;
2357 	files[1].fname = tostring("/dev/stdout");
2358 	files[1].mode = GT;
2359         files[2].fp = stderr;
2360 	files[2].fname = tostring("/dev/stderr");
2361 	files[2].mode = GT;
2362 }
2363 
2364 FILE *openfile(int a, const char *us, bool *pnewflag)
2365 {
2366 	const char *s = us;
2367 	size_t i;
2368 	int m;
2369 	FILE *fp = NULL;
2370 
2371 	if (*s == '\0')
2372 		FATAL("null file name in print or getline");
2373 	for (i = 0; i < nfiles; i++)
2374 		if (files[i].fname && strcmp(s, files[i].fname) == 0 &&
2375 		    (a == files[i].mode || (a==APPEND && files[i].mode==GT) ||
2376 		     a == FFLUSH)) {
2377 			if (pnewflag)
2378 				*pnewflag = false;
2379 			return files[i].fp;
2380 		}
2381 	if (a == FFLUSH)	/* didn't find it, so don't create it! */
2382 		return NULL;
2383 
2384 	for (i = 0; i < nfiles; i++)
2385 		if (files[i].fp == NULL)
2386 			break;
2387 	if (i >= nfiles) {
2388 		struct files *nf;
2389 		size_t nnf = nfiles + FOPEN_MAX;
2390 		nf = (struct files *) realloc(files, nnf * sizeof(*nf));
2391 		if (nf == NULL)
2392 			FATAL("cannot grow files for %s and %zu files", s, nnf);
2393 		memset(&nf[nfiles], 0, FOPEN_MAX * sizeof(*nf));
2394 		nfiles = nnf;
2395 		files = nf;
2396 	}
2397 	fflush(stdout);	/* force a semblance of order */
2398 	m = a;
2399 	if (a == GT) {
2400 		fp = fopen(s, "w");
2401 	} else if (a == APPEND) {
2402 		fp = fopen(s, "a");
2403 		m = GT;	/* so can mix > and >> */
2404 	} else if (a == '|') {	/* output pipe */
2405 		fp = popen(s, "w");
2406 	} else if (a == LE) {	/* input pipe */
2407 		fp = popen(s, "r");
2408 	} else if (a == LT) {	/* getline <file */
2409 		fp = strcmp(s, "-") == 0 ? stdin : fopen(s, "r");	/* "-" is stdin */
2410 	} else	/* can't happen */
2411 		FATAL("illegal redirection %d", a);
2412 	if (fp != NULL) {
2413 		files[i].fname = tostring(s);
2414 		files[i].fp = fp;
2415 		files[i].mode = m;
2416 		if (pnewflag)
2417 			*pnewflag = true;
2418 		if (fp != stdin && fp != stdout && fp != stderr)
2419 			(void) fcntl(fileno(fp), F_SETFD, FD_CLOEXEC);
2420 	}
2421 	return fp;
2422 }
2423 
2424 const char *filename(FILE *fp)
2425 {
2426 	size_t i;
2427 
2428 	for (i = 0; i < nfiles; i++)
2429 		if (fp == files[i].fp)
2430 			return files[i].fname;
2431 	return "???";
2432 }
2433 
2434 Cell *closefile(Node **a, int n)
2435 {
2436  	Cell *x;
2437 	size_t i;
2438 	bool stat;
2439 
2440  	x = execute(a[0]);
2441  	getsval(x);
2442 	stat = true;
2443  	for (i = 0; i < nfiles; i++) {
2444 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
2445 			continue;
2446 		if (files[i].mode == GT || files[i].mode == '|')
2447 			fflush(files[i].fp);
2448 		if (ferror(files[i].fp)) {
2449 			if ((files[i].mode == GT && files[i].fp != stderr)
2450 			  || files[i].mode == '|')
2451 				FATAL("write error on %s", files[i].fname);
2452 			else
2453 				WARNING("i/o error occurred on %s", files[i].fname);
2454 		}
2455 		if (files[i].fp == stdin || files[i].fp == stdout ||
2456 		    files[i].fp == stderr)
2457 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
2458 		else if (files[i].mode == '|' || files[i].mode == LE)
2459 			stat = pclose(files[i].fp) == -1;
2460 		else
2461 			stat = fclose(files[i].fp) == EOF;
2462 		if (stat)
2463 			WARNING("i/o error occurred closing %s", files[i].fname);
2464 		xfree(files[i].fname);
2465 		files[i].fname = NULL;	/* watch out for ref thru this */
2466 		files[i].fp = NULL;
2467 		break;
2468  	}
2469  	tempfree(x);
2470  	x = gettemp();
2471 	setfval(x, (Awkfloat) (stat ? -1 : 0));
2472  	return(x);
2473 }
2474 
2475 void closeall(void)
2476 {
2477 	size_t i;
2478 	bool stat = false;
2479 
2480 	for (i = 0; i < nfiles; i++) {
2481 		if (! files[i].fp)
2482 			continue;
2483 		if (files[i].mode == GT || files[i].mode == '|')
2484 			fflush(files[i].fp);
2485 		if (ferror(files[i].fp)) {
2486 			if ((files[i].mode == GT && files[i].fp != stderr)
2487 			  || files[i].mode == '|')
2488 				FATAL("write error on %s", files[i].fname);
2489 			else
2490 				WARNING("i/o error occurred on %s", files[i].fname);
2491 		}
2492 		if (files[i].fp == stdin || files[i].fp == stdout ||
2493 		    files[i].fp == stderr)
2494 			continue;
2495 		if (files[i].mode == '|' || files[i].mode == LE)
2496 			stat = pclose(files[i].fp) == -1;
2497 		else
2498 			stat = fclose(files[i].fp) == EOF;
2499 		if (stat)
2500 			WARNING("i/o error occurred while closing %s", files[i].fname);
2501 	}
2502 }
2503 
2504 static void flush_all(void)
2505 {
2506 	size_t i;
2507 
2508 	for (i = 0; i < nfiles; i++)
2509 		if (files[i].fp)
2510 			fflush(files[i].fp);
2511 }
2512 
/*
 * Forward declaration: backsub() is defined after dosub() and gensub(),
 * which both call it to expand a backslash sequence in a replacement
 * string.
 */
void backsub(char **pb_ptr, const char **sptr_ptr);
2514 
/*
 * dosub: implement sub() and gsub().
 *
 * a[0], a[1] - the regular expression: when a[0] is NULL, a[1] is an
 *              already-compiled fa; otherwise a[1] is an expression whose
 *              string value is compiled with makedfa()
 * a[2]       - expression yielding the replacement string
 * a[3]       - expression yielding the cell that is searched and, if
 *              anything matched, overwritten with the result
 * subop      - SUB (replace the first match only) or GSUB (replace all);
 *              any other value is fatal
 *
 * Returns a temporary numeric cell holding m, the number of matches
 * counted.
 */
Cell *dosub(Node **a, int subop)        /* sub and gsub */
{
	fa *pfa;
	int tempstat = 0;
	char *repl;
	Cell *x;

	/* output buffer; allocated lazily on the first match */
	char *buf = NULL;
	char *pb = NULL;
	int bufsz = recsize;

	const char *r, *s;
	const char *start;
	const char *noempty = NULL;      /* empty match disallowed here */
	size_t m = 0;                    /* match count */
	size_t whichm = 0;               /* which match to select, 0 = global */
	int mtype;                       /* match type */

	if (a[0] == NULL) {	/* 0 => a[1] is already-compiled regexpr */
		pfa = (fa *) a[1];
	} else {
		x = execute(a[1]);
		pfa = makedfa(getsval(x), 1);
		tempfree(x);
	}

	/* keep a private copy of the replacement; it is freed after the loop */
	x = execute(a[2]);	/* replacement string */
	repl = tostring(getsval(x));
	tempfree(x);

	switch (subop) {
	case SUB:
		whichm = 1;
		x = execute(a[3]);    /* source string */
		break;
	case GSUB:
		whichm = 0;
		x = execute(a[3]);    /* source string */
		break;
	default:
		FATAL("dosub: unrecognized subop: %d", subop);
	}

	/*
	 * Each iteration handles one match.  patbeg/patlen describe the match
	 * found by the preceding pmatch() call (presumably set by pmatch --
	 * they are not assigned here before use).
	 */
	start = getsval(x);
	while (pmatch(pfa, start)) {
		if (buf == NULL) {
			if ((pb = buf = (char *) malloc(bufsz)) == NULL)
				FATAL("out of memory in dosub");
			/*
			 * NOTE(review): the initial state is saved and forced to 2
			 * while scanning, then restored after the loop; presumably
			 * this keeps anchors from matching at later start
			 * positions -- confirm against the regex code.
			 */
			tempstat = pfa->initstat;
			pfa->initstat = 2;
		}

		/* match types */
		#define	MT_IGNORE  0  /* unselected or invalid */
		#define MT_INSERT  1  /* selected, empty */
		#define MT_REPLACE 2  /* selected, not empty */

		/* an empty match just after replacement is invalid */

		if (patbeg == noempty && patlen == 0) {
			mtype = MT_IGNORE;    /* invalid, not counted */
		} else if (whichm == ++m || whichm == 0) {
			mtype = patlen ? MT_REPLACE : MT_INSERT;
		} else {
			mtype = MT_IGNORE;    /* unselected, but counted */
		}

		/* leading text: */
		if (patbeg > start) {
			adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
				recsize, &pb, "dosub");
			s = start;
			while (s < patbeg)
				*pb++ = *s++;
		}

		if (mtype == MT_IGNORE)
			goto matching_text;  /* skip replacement text */

		/*
		 * Emit the replacement: backslash sequences go through backsub(),
		 * an unescaped & expands to the matched text, everything else is
		 * copied literally.
		 */
		r = repl;
		while (*r != 0) {
			/* headroom for whatever a single step below writes */
			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
			if (*r == '\\') {
				backsub(&pb, &r);
			} else if (*r == '&') {
				r++;
				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
					&pb, "dosub");
				for (s = patbeg; s < patbeg+patlen; )
					*pb++ = *s++;
			} else {
				*pb++ = *r++;
			}
		}

matching_text:
		/* a replaced match is dropped; at end of input there is nothing to copy */
		if (mtype == MT_REPLACE || *patbeg == '\0')
			goto next_search;  /* skip matching text */

		/*
		 * Ignored or empty match: copy the matched text through.  For an
		 * empty match, copy the next character (u8_nextlen bytes) instead
		 * so that the following search starts further along.
		 */
		if (patlen == 0)
			patlen = u8_nextlen(patbeg);
		adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
		s = patbeg;
		while (s < patbeg + patlen)
			*pb++ = *s++;

next_search:
		start = patbeg + patlen;
		/* stop once the selected match was handled or the input is exhausted */
		if (m == whichm || *patbeg == '\0')
			break;
		/* remember where a replacement ended: an empty match there is invalid */
		if (mtype == MT_REPLACE)
			noempty = start;

		#undef MT_IGNORE
		#undef MT_INSERT
		#undef MT_REPLACE
	}

	xfree(repl);

	/* buf is only non-NULL when at least one match was found */
	if (buf != NULL) {
		pfa->initstat = tempstat;

		/* trailing text */
		adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
		while ((*pb++ = *start++) != '\0')
			;

		/* store the rebuilt string back into the target cell */
		setsval(x, buf);
		free(buf);
	}

	tempfree(x);
	x = gettemp();
	x->tval = NUM;
	x->fval = m;
	return x;
}
2653 
2654 Cell *gensub(Node **a, int nnn)	/* global selective substitute */
2655 	/* XXX incomplete - doesn't support backreferences \0 ... \9 */
2656 {
2657 	Cell *x, *y, *res, *h;
2658 	char *rptr;
2659 	const char *sptr;
2660 	char *buf, *pb;
2661 	const char *t, *q;
2662 	fa *pfa;
2663 	int mflag, tempstat, num, whichm;
2664 	int bufsz = recsize;
2665 
2666 	if ((buf = malloc(bufsz)) == NULL)
2667 		FATAL("out of memory in gensub");
2668 	mflag = 0;	/* if mflag == 0, can replace empty string */
2669 	num = 0;
2670 	x = execute(a[4]);	/* source string */
2671 	t = getsval(x);
2672 	res = copycell(x);	/* target string - initially copy of source */
2673 	res->csub = CTEMP;	/* result values are temporary */
2674 	if (a[0] == 0)		/* 0 => a[1] is already-compiled regexpr */
2675 		pfa = (fa *) a[1];	/* regular expression */
2676 	else {
2677 		y = execute(a[1]);
2678 		pfa = makedfa(getsval(y), 1);
2679 		tempfree(y);
2680 	}
2681 	y = execute(a[2]);	/* replacement string */
2682 	h = execute(a[3]);	/* which matches should be replaced */
2683 	sptr = getsval(h);
2684 	if (sptr[0] == 'g' || sptr[0] == 'G')
2685 		whichm = -1;
2686 	else {
2687 		/*
2688 		 * The specified number is index of replacement, starting
2689 		 * from 1. GNU awk treats index lower than 0 same as
2690 		 * 1, we do same for compatibility.
2691 		 */
2692 		whichm = (int) getfval(h) - 1;
2693 		if (whichm < 0)
2694 			whichm = 0;
2695 	}
2696 	tempfree(h);
2697 
2698 	if (pmatch(pfa, t)) {
2699 		char *sl;
2700 
2701 		tempstat = pfa->initstat;
2702 		pfa->initstat = 2;
2703 		pb = buf;
2704 		rptr = getsval(y);
2705 		/*
2706 		 * XXX if there are any backreferences in subst string,
2707 		 * complain now.
2708 		 */
2709 		for (sl = rptr; (sl = strchr(sl, '\\')) && sl[1]; sl++) {
2710 			if (strchr("0123456789", sl[1])) {
2711 				FATAL("gensub doesn't support backreferences (subst \"%s\")", rptr);
2712 			}
2713 		}
2714 
2715 		do {
2716 			if (whichm >= 0 && whichm != num) {
2717 				num++;
2718 				adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - t) + patlen, recsize, &pb, "gensub");
2719 
2720 				/* copy the part of string up to and including
2721 				 * match to output buffer */
2722 				while (t < patbeg + patlen)
2723 					*pb++ = *t++;
2724 				continue;
2725 			}
2726 
2727 			if (patlen == 0 && *patbeg != 0) {	/* matched empty string */
2728 				if (mflag == 0) {	/* can replace empty */
2729 					num++;
2730 					sptr = rptr;
2731 					while (*sptr != 0) {
2732 						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2733 						if (*sptr == '\\') {
2734 							backsub(&pb, &sptr);
2735 						} else if (*sptr == '&') {
2736 							sptr++;
2737 							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2738 							for (q = patbeg; q < patbeg+patlen; )
2739 								*pb++ = *q++;
2740 						} else
2741 							*pb++ = *sptr++;
2742 					}
2743 				}
2744 				if (*t == 0)	/* at end */
2745 					goto done;
2746 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gensub");
2747 				*pb++ = *t++;
2748 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
2749 					FATAL("gensub result0 %.30s too big; can't happen", buf);
2750 				mflag = 0;
2751 			}
2752 			else {	/* matched nonempty string */
2753 				num++;
2754 				sptr = t;
2755 				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gensub");
2756 				while (sptr < patbeg)
2757 					*pb++ = *sptr++;
2758 				sptr = rptr;
2759 				while (*sptr != 0) {
2760 					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
2761 					if (*sptr == '\\') {
2762 						backsub(&pb, &sptr);
2763 					} else if (*sptr == '&') {
2764 						sptr++;
2765 						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
2766 						for (q = patbeg; q < patbeg+patlen; )
2767 							*pb++ = *q++;
2768 					} else
2769 						*pb++ = *sptr++;
2770 				}
2771 				t = patbeg + patlen;
2772 				if (patlen == 0 || *t == 0 || *(t-1) == 0)
2773 					goto done;
2774 				if (pb > buf + bufsz)
2775 					FATAL("gensub result1 %.30s too big; can't happen", buf);
2776 				mflag = 1;
2777 			}
2778 		} while (pmatch(pfa,t));
2779 		sptr = t;
2780 		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gensub");
2781 		while ((*pb++ = *sptr++) != 0)
2782 			;
2783 	done:	if (pb > buf + bufsz)
2784 			FATAL("gensub result2 %.30s too big; can't happen", buf);
2785 		*pb = '\0';
2786 		setsval(res, buf);
2787 		pfa->initstat = tempstat;
2788 	}
2789 	tempfree(x);
2790 	tempfree(y);
2791 	free(buf);
2792 	return(res);
2793 }
2794 
/*
 * backsub: expand one backslash sequence of a sub/gsub/gensub replacement.
 *
 * On entry *sptr_ptr points at a backslash in the replacement string and
 * *pb_ptr at the next free byte of the output buffer.  Both pointers are
 * advanced past what was consumed and produced (at most two output bytes):
 *
 *	\\\&	-> \&		(literal backslash, literal ampersand)
 *	\\&	-> \		(the & is left for the caller to expand)
 *	\\x	-> \		when POSIXLY_CORRECT is set in the environment
 *	\\x	-> \\		otherwise (x is left for the caller)
 *	\&	-> &		(literal ampersand)
 *	\x	-> \		(x is left for the caller)
 *
 * The environment is consulted only on the first call.
 */
void backsub(char **pb_ptr, const char **sptr_ptr)	/* handle \\& variations */
{						/* sptr[0] == '\\' */
	static bool env_checked = false;
	static bool posix_mode = false;
	char *out = *pb_ptr;
	const char *in = *sptr_ptr;

	if (!env_checked) {
		posix_mode = (getenv("POSIXLY_CORRECT") != NULL);
		env_checked = true;
	}

	if (in[1] == '&') {			/* literal & */
		*out++ = in[1];
		in += 2;
	} else if (in[1] != '\\') {		/* literal \ */
		*out++ = *in++;
	} else if (in[2] == '\\' && in[3] == '&') {	/* \\\& -> \& */
		*out++ = '\\';
		*out++ = '&';
		in += 4;
	} else if (in[2] == '&') {		/* \\& -> \ + matched */
		*out++ = '\\';
		in += 2;
	} else if (posix_mode) {		/* \\x -> \x */
		*out++ = in[1];
		in += 2;
	} else {				/* \\x -> \\x */
		*out++ = in[0];
		*out++ = in[1];
		in += 2;
	}

	*sptr_ptr = in;
	*pb_ptr = out;
}
2831 
/*
 * wide_char_to_byte_str: encode code point `rune` as UTF-8.
 *
 * Returns a pointer to a NUL-terminated static buffer that is overwritten
 * by the next call, and stores the number of encoded bytes (1 to 4, not
 * counting the terminator) in *outlen.  Returns NULL, leaving *outlen
 * untouched, when rune is negative or greater than 0x10FFFF.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	static char buf[5];
	int nbytes, lead, i;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;

	/* pick the sequence length and the marker bits of the first byte */
	if (rune <= 0x0000007F) {
		nbytes = 1;
		lead = 0x00;	// 0xxxxxxx
	} else if (rune <= 0x000007FF) {
		nbytes = 2;
		lead = 0xC0;	// 110xxxxx 10xxxxxx
	} else if (rune <= 0x0000FFFF) {
		nbytes = 3;
		lead = 0xE0;	// 1110xxxx 10xxxxxx 10xxxxxx
	} else {
		nbytes = 4;
		lead = 0xF0;	// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	}

	memset(buf, 0, sizeof(buf));

	/* continuation bytes, last one first: six payload bits each */
	for (i = nbytes - 1; i > 0; i--) {
		buf[i] = 0x80 | (rune & 0x3F);
		rune >>= 6;
	}
	/* what is left of rune fits into the first byte */
	buf[0] = lead | rune;

	*outlen = nbytes;
	return buf;
}
2869