/* xref: /plan9-contrib/sys/src/9k/port/sysproc.c (revision 094d68186d4cdde21fdab9786d6c843a03693e4e) */
#include	"u.h"
#include	"tos.h"
#include	"../port/lib.h"
#include	"mem.h"
#include	"dat.h"
#include	"fns.h"
#include	"../port/error.h"

#include	"../port/edf.h"
#include	<a.out.h>
#include	<ptrace.h>

void
sysr1(Ar0* ar0, va_list list)
{
	USED(list);

	ar0->i = 0;
}

void
sysrfork(Ar0* ar0, va_list list)
{
	Proc *p;
	int flag, i, n, pid;
	Fgrp *ofg;
	Pgrp *opg;
	Rgrp *org;
	Egrp *oeg;
	Mach *wm;
	void (*pt)(Proc*, int, vlong, vlong);
	u64int ptarg;

	/*
	 * int rfork(int);
	 */
	flag = va_arg(list, int);

	/* Check flags before we commit */
	if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
		error(Ebadarg);
	if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
		error(Ebadarg);
	if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
		error(Ebadarg);

	if((flag&RFPROC) == 0) {
		if(flag & (RFMEM|RFNOWAIT))
			error(Ebadarg);
		if(flag & (RFFDG|RFCFDG)) {
			ofg = up->fgrp;
			if(flag & RFFDG)
				up->fgrp = dupfgrp(ofg);
			else
				up->fgrp = dupfgrp(nil);
			closefgrp(ofg);
		}
		if(flag & (RFNAMEG|RFCNAMEG)) {
			opg = up->pgrp;
			up->pgrp = newpgrp();
			if(flag & RFNAMEG)
				pgrpcpy(up->pgrp, opg);
			/* inherit noattach */
			up->pgrp->noattach = opg->noattach;
			closepgrp(opg);
		}
		if(flag & RFNOMNT)
			up->pgrp->noattach = 1;
		if(flag & RFREND) {
			org = up->rgrp;
			up->rgrp = newrgrp();
			closergrp(org);
		}
		if(flag & (RFENVG|RFCENVG)) {
			oeg = up->egrp;
			up->egrp = smalloc(sizeof(Egrp));
			up->egrp->ref = 1;
			if(flag & RFENVG)
				envcpy(up->egrp, oeg);
			closeegrp(oeg);
		}
		if(flag & RFNOTEG)
			up->noteid = incref(&noteidalloc);

		ar0->i = 0;
		return;
	}

	p = newproc();

	p->trace = up->trace;
	p->scallnr = up->scallnr;
	memmove(p->arg, up->arg, sizeof(up->arg));
	p->nerrlab = 0;
	p->slash = up->slash;
	p->dot = up->dot;
	incref(p->dot);

	memmove(p->note, up->note, sizeof(p->note));
	p->privatemem = up->privatemem;
	p->nnote = up->nnote;
	p->notified = 0;
	p->lastnote = up->lastnote;
	p->notify = up->notify;
	p->ureg = up->ureg;
	p->dbgreg = 0;

	/* Make a new set of memory segments */
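	/* a sketch of the rfork(2) rule: with RFMEM the data and bss segments
	 * are shared with the parent rather than copied; text is shared either way */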
	n = flag & RFMEM;
	qlock(&p->seglock);
	if(waserror()){
		qunlock(&p->seglock);
		nexterror();
	}
	for(i = 0; i < NSEG; i++)
		if(up->seg[i] != nil)
			p->seg[i] = dupseg(up->seg, i, n);
	qunlock(&p->seglock);
	poperror();

	/* File descriptors */
	if(flag & (RFFDG|RFCFDG)) {
		if(flag & RFFDG)
			p->fgrp = dupfgrp(up->fgrp);
		else
			p->fgrp = dupfgrp(nil);
	}
	else {
		p->fgrp = up->fgrp;
		incref(p->fgrp);
	}

	/* Process groups */
	if(flag & (RFNAMEG|RFCNAMEG)) {
		p->pgrp = newpgrp();
		if(flag & RFNAMEG)
			pgrpcpy(p->pgrp, up->pgrp);
		/* inherit noattach */
		p->pgrp->noattach = up->pgrp->noattach;
	}
	else {
		p->pgrp = up->pgrp;
		incref(p->pgrp);
	}
	if(flag & RFNOMNT)
		p->pgrp->noattach = 1;

	if(flag & RFREND)
		p->rgrp = newrgrp();
	else {
		incref(up->rgrp);
		p->rgrp = up->rgrp;
	}

	/* Environment group */
	if(flag & (RFENVG|RFCENVG)) {
		p->egrp = smalloc(sizeof(Egrp));
		p->egrp->ref = 1;
		if(flag & RFENVG)
			envcpy(p->egrp, up->egrp);
	}
	else {
		p->egrp = up->egrp;
		incref(p->egrp);
	}
	p->hang = up->hang;
	p->procmode = up->procmode;

	/* Craft a return frame which will cause the child to pop out of
	 * the scheduler in user mode with the return register zero
	 */
	sysrforkchild(p, up);

	p->parent = up;
	p->parentpid = up->pid;
	if(flag&RFNOWAIT)
		p->parentpid = 0;
	else {
		lock(&up->exl);
		up->nchild++;
		unlock(&up->exl);
	}
	if((flag&RFNOTEG) == 0)
		p->noteid = up->noteid;

	pid = p->pid;
	memset(p->time, 0, sizeof(p->time));
	p->time[TReal] = sys->ticks;

	kstrdup(&p->text, up->text);
	kstrdup(&p->user, up->user);
	/*
	 *  since the bss/data segments are now shareable,
	 *  any mmu info about this process is now stale
	 *  (i.e. has bad properties) and has to be discarded.
	 */
	mmuflush();
	p->basepri = up->basepri;
	p->priority = up->basepri;
	p->fixedpri = up->fixedpri;
	p->mp = up->mp;
	wm = up->wired;
	if(wm != nil)
		procwired(p, wm->machno);
	if(p->trace && (pt = proctrace) != nil){
		strncpy((char*)&ptarg, p->text, sizeof ptarg);
		pt(p, SName, 0, ptarg);
	}
	p->color = up->color;
	ready(p);
	sched();

	ar0->i = pid;
}

/* convert a big-endian 8-byte a.out header field to host byte order */
static uvlong
vl2be(uvlong v)
{
	uchar *p;

	p = (uchar*)&v;
	return ((uvlong)((p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3])<<32)
	      |((uvlong)(p[4]<<24)|(p[5]<<16)|(p[6]<<8)|p[7]);
}

/* convert a big-endian 4-byte a.out header field to host byte order */
ulong
l2be(long l)
{
	uchar *cp;

	cp = (uchar*)&l;
	return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3];
}

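/*
 * Exec is the standard a.out(6) header; Hdr extends it with one extra
 * 8-byte field which, when the header's HDR_MAGIC bit is set, carries
 * the 64-bit entry point read via vl2be below.
 */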
typedef struct {
	Exec;
	uvlong hdr[1];
} Hdr;

void
sysexec(Ar0* ar0, va_list list)
{
	Hdr hdr;
	Fgrp *f;
	Tos *tos;
	Chan *chan;
	Image *img;
	Segment *s;
	int argc, i, n, nargs;
	char *a, *args, **argv, elem[sizeof(up->genbuf)], *file, *p;
	char line[sizeof(Exec)], *progarg[sizeof(Exec)/2+1];
	long hdrsz, magic, textsz, datasz, bsssz;
	uintptr textlim, textmin, datalim, bsslim, entry, stack;
	void (*pt)(Proc*, int, vlong, vlong);
	u64int ptarg;

	/*
	 * void* exec(char* name, char* argv[]);
	 */

	/*
	 * Remember the full name of the file,
	 * open it, and remember the final element of the
	 * name left in up->genbuf by namec.
	 */
	p = va_arg(list, char*);
	p = validaddr(p, 1, 0);
	file = validnamedup(p, 1);
	if(waserror()){
		free(file);
		nexterror();
	}
	chan = namec(file, Aopen, OEXEC, 0);
	if(waserror()){
		cclose(chan);
		nexterror();
	}
	strncpy(elem, up->genbuf, sizeof(elem));

	/*
	 * Read the header.
	 * If it's a #!, fill in progarg[] with info then read a new header
	 * from the file indicated by the #!.
	 * The #! line must be less than sizeof(Exec) in size,
	 * including the terminating \n.
	 */
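	/*
	 * For example (illustrative, not taken from this file): a script whose
	 * first line is "#!/bin/rc -e" yields progarg[] = { "/bin/rc", "-e" };
	 * the original file name is appended as a final argument, progarg[0]
	 * is then replaced by the last element of the script's path, and
	 * /bin/rc is opened in place of the script itself.
	 */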
	hdrsz = chan->dev->read(chan, &hdr, sizeof(Hdr), 0);
	if(hdrsz < 2)
		error(Ebadexec);
	p = (char*)&hdr;
	argc = 0;
	if(p[0] == '#' && p[1] == '!'){
		p = memccpy(line, (char*)&hdr, '\n', MIN(sizeof(Exec), hdrsz));
		if(p == nil)
			error(Ebadexec);
		*(p-1) = '\0';
		argc = tokenize(line+2, progarg, nelem(progarg));
		if(argc == 0)
			error(Ebadexec);

		/* The original file becomes an extra arg after #! line */
		progarg[argc++] = file;

		/*
		 * Take the #! $0 as a file to open, and replace
		 * $0 with the original path's name.
		 */
		p = progarg[0];
		progarg[0] = elem;
		poperror();			/* chan */
		cclose(chan);

		chan = namec(p, Aopen, OEXEC, 0);
		if(waserror()){
			cclose(chan);
			nexterror();
		}
		hdrsz = chan->dev->read(chan, &hdr, sizeof(Hdr), 0);
		if(hdrsz < 2)
			error(Ebadexec);
	}

	/*
	 * #! has had its chance, now we need a real binary.
	 */
	magic = l2be(hdr.magic);
	if(hdrsz != sizeof(Hdr) || magic != AOUT_MAGIC)
		error(Ebadexec);
	if(magic & HDR_MAGIC){
		entry = vl2be(hdr.hdr[0]);
		hdrsz = sizeof(Hdr);
	}
	else{
		entry = l2be(hdr.entry);
		hdrsz = sizeof(Exec);
	}

	textsz = l2be(hdr.text);
	datasz = l2be(hdr.data);
	bsssz = l2be(hdr.bss);

	textmin = ROUNDUP(UTZERO+hdrsz+textsz, PGSZ);
	textlim = UTROUND(textmin);
	datalim = ROUNDUP(textlim+datasz, PGSZ);
	bsslim = ROUNDUP(textlim+datasz+bsssz, PGSZ);

	/*
	 * Check the binary header for consistency,
	 * e.g. the entry point is within the text segment and
	 * the segments don't overlap each other.
	 */
	if(entry < UTZERO+hdrsz || entry >= UTZERO+hdrsz+textsz)
		error(Ebadexec);

	if(textsz >= textlim || datasz > datalim || bsssz > bsslim
	|| textlim >= USTKTOP || datalim >= USTKTOP || bsslim >= USTKTOP
	|| datalim < textlim || bsslim < datalim)
		error(Ebadexec);

	up->color = corecolor(m->machno);

	/*
	 * The new stack is created in ESEG, temporarily mapped elsewhere.
	 * The stack contains, in descending address order:
	 *	a structure containing housekeeping and profiling data (Tos);
	 *	argument strings;
	 *	array of vectors to the argument strings with a terminating
	 *	nil (argv).
	 * When the exec is committed, this temporary stack in ESEG will
	 * become SSEG.
	 * The architecture-dependent code which jumps to the new image
	 * will also push a count of the argument array onto the stack (argc).
	 */
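	/*
	 * A rough picture of the finished stack (addresses grow downward;
	 * a sketch, not copied from elsewhere in the kernel):
	 *
	 *	TSTKTOP (becomes USTKTOP once ESEG replaces SSEG)
	 *		Tos structure
	 *		argument strings (#! arguments, then the caller's argv)
	 *		argv[argc] = nil, argv[argc-1], ..., argv[0]
	 *	argv <- value handed to sysexecregs along with argc
	 */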
	qlock(&up->seglock);
	if(waserror()){
		if(up->seg[ESEG] != nil){
			putseg(up->seg[ESEG]);
			up->seg[ESEG] = nil;
		}
		qunlock(&up->seglock);
		nexterror();
	}
	up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, TSTKTOP);
	up->seg[ESEG]->color = up->color;

	/*
	 * Stack is a pointer into the temporary stack
	 * segment, and will move as items are pushed.
	 */
	stack = TSTKTOP-sizeof(Tos);

	/*
	 * First, the top-of-stack structure.
	 */
	tos = (Tos*)stack;
	tos->cyclefreq = m->cyclefreq;
	cycles((uvlong*)&tos->pcycles);
	tos->pcycles = -tos->pcycles;
	tos->kcycles = tos->pcycles;
	tos->clock = 0;

	/*
	 * As the pass is made over the arguments and they are pushed onto
	 * the temporary stack, make a good faith copy in args for up->args.
	 */
	args = smalloc(128);
	if(waserror()){
		free(args);
		nexterror();
	}
	nargs = 0;

	/*
	 * Next push any arguments found from a #! header.
	 */
	for(i = 0; i < argc; i++){
		n = strlen(progarg[i])+1;
		stack -= n;
		memmove(UINT2PTR(stack), progarg[i], n);

		if((n = MIN(n, 128-nargs)) <= 0)
			continue;
		memmove(&args[nargs], progarg[i], n);
		nargs += n;
	}

	/*
	 * Copy the strings pointed to by the syscall argument argv into
	 * the temporary stack segment, being careful to check both argv and
	 * the strings it points to are valid.
	 */
	argv = va_arg(list, char**);
	evenaddr(PTR2UINT(argv));
	for(i = 0;; i++, argv++){
		a = *(char**)validaddr(argv, sizeof(char**), 0);
		if(a == nil)
			break;
		a = validaddr(a, 1, 0);
		n = ((char*)vmemchr(a, 0, 0x7fffffff) - a) + 1;

		/*
		 * This futzing is so argv[0] gets validated even
		 * though it will be thrown away if this is a shell
		 * script.
		 */
		if(argc > 0 && i == 0)
			continue;

		/*
		 * Before copying the string into the temporary stack,
		 * which might involve a demand-page, check the string
		 * will not overflow the bottom of the stack.
		 */
		stack -= n;
		if(stack < TSTKTOP-USTKSIZE)
			error(Enovmem);
		p = UINT2PTR(stack);
		memmove(p, a, n);
		p[n-1] = 0;
		argc++;

		if((n = MIN(n, 128-nargs)) <= 0)
			continue;
		memmove(&args[nargs], p, n);
		nargs += n;
	}
	if(argc < 1)
		error(Ebadexec);

	/*
	 * Before pushing the argument pointers onto the temporary stack,
	 * which might involve a demand-page, check there is room for the
	 * terminating nil pointer, plus pointers, plus some slop for however
	 * argc might be passed on the stack by sysexecregs (give a page
	 * of slop, it is an overestimate, but why not).
	 * Sysexecstack does any architecture-dependent stack alignment.
	 * Keep a copy of the start of the argument strings before alignment
	 * so up->args can be created later.
	 * Although the argument vectors are being pushed onto the stack in
	 * the temporary segment, the values must be adjusted to reflect
	 * the segment address after it replaces the current SSEG.
	 */
	p = UINT2PTR(stack);
	stack = sysexecstack(stack, argc);
	if(stack-(argc+1)*sizeof(char**)-segpgsize(up->seg[ESEG]) < TSTKTOP-USTKSIZE)
		error(Ebadexec);

	argv = (char**)stack;
	*--argv = nil;
	for(i = 0; i < argc; i++){
		*--argv = p + (USTKTOP-TSTKTOP);
		p += strlen(p) + 1;
	}

	/*
	 * Fix up the up->args copy in args. The length must be > 0 as it
	 * includes the \0 on the last argument and argc was checked earlier
	 * to be > 0. Compensate for any UTF character boundary before
	 * placing the terminating \0.
	 */
	if(nargs <= 0)
		error(Egreg);

	while(nargs > 0 && (args[nargs-1] & 0xc0) == 0x80)
		nargs--;
	args[nargs-1] = '\0';

	/*
	 * All the argument processing is now done, ready to commit.
	 */
	kstrdup(&up->text, elem);
	free(up->args);
	up->args = args;
	up->nargs = nargs;
	poperror();				/* args */

	/*
	 * Close on exec
	 */
	f = up->fgrp;
	for(i=0; i<=f->maxfd; i++)
		fdclose(i, CCEXEC);

	/*
	 * Free old memory.
	 * Special segments maintained across exec.
	 */
	for(i = SSEG; i <= BSEG; i++) {
		putseg(up->seg[i]);
		up->seg[i] = nil;		/* in case of error */
	}
	for(i = BSEG+1; i < NSEG; i++) {
		s = up->seg[i];
		if(s && (s->type&SG_CEXEC)) {
			putseg(s);
			up->seg[i] = nil;
		}
	}

	if(up->trace && (pt = proctrace) != nil){
		strncpy((char*)&ptarg, elem, sizeof ptarg);
		pt(up, SName, 0, ptarg);
	}

	/* Text.  Shared. Attaches to cache image if possible */
	/* attachimage returns a locked cache image */

	img = attachimage(SG_TEXT|SG_RONLY, chan, up->color, UTZERO, textmin);
	s = img->s;
	up->seg[TSEG] = s;
	s->flushme = 1;
	s->fstart = 0;
	s->flen = hdrsz+textsz;
	if(img->color != up->color){
		up->color = img->color;
	}
	unlock(img);

	/* Data. Shared. */
	s = newseg(SG_DATA, textlim, datalim);
	up->seg[DSEG] = s;
	s->color = up->color;

	/* Attached by hand */
	incref(img);
	s->image = img;
	s->fstart = hdrsz+textsz;
	s->flen = datasz;

	/* BSS. Zero fill on demand */
	up->seg[BSEG] = newseg(SG_BSS, datalim, bsslim);
	up->seg[BSEG]->color = up->color;

	/*
	 * Move the stack
	 */
	s = up->seg[ESEG];
	up->seg[ESEG] = nil;
	up->seg[SSEG] = s;
	qunlock(&up->seglock);
	poperror();				/* seglock */

	s->base = USTKTOP-USTKSIZE;
	s->top = USTKTOP;
	relocateseg(s, USTKTOP-TSTKTOP);

	/*
	 *  '/' processes are higher priority.
	 */
	if(chan->dev->dc == L'/')
		up->basepri = PriRoot;
	up->priority = up->basepri;
	poperror();				/* chan */
	cclose(chan);
	poperror();				/* file */
	free(file);

	/*
	 *  At this point, the mmu contains info about the old address
	 *  space and needs to be flushed
	 */
	mmuflush();
	qlock(&up->debug);
	up->nnote = 0;
	up->notify = 0;
	up->notified = 0;
	up->privatemem = 0;
	sysprocsetup(up);
	qunlock(&up->debug);
	if(up->hang)
		up->procctl = Proc_stopme;

	ar0->v = sysexecregs(entry, TSTKTOP - PTR2UINT(argv), argc);
}

int
return0(void*)
{
	return 0;
}

void
syssleep(Ar0* ar0, va_list list)
{
	long ms;

	/*
	 * int sleep(long millisecs);
	 */
	ms = va_arg(list, long);

	ar0->i = 0;
	if(ms <= 0) {
		if (up->edf && (up->edf->flags & Admitted))
			edfyield();
		else
			yield();
		return;
	}
	if(ms < TK2MS(1))
		ms = TK2MS(1);
	tsleep(&up->sleep, return0, 0, ms);
}

void
sysalarm(Ar0* ar0, va_list list)
{
	unsigned long ms;

	/*
	 * long alarm(unsigned long millisecs);
	 * Odd argument type...
	 */
	ms = va_arg(list, unsigned long);

	ar0->l = procalarm(ms);
}

void
sysexits(Ar0*, va_list list)
{
	char *status;
	char *inval = "invalid exit string";
	char buf[ERRMAX];

	/*
	 * void exits(char *msg);
	 */
	status = va_arg(list, char*);

	if(status){
		if(waserror())
			status = inval;
		else{
			status = validaddr(status, 1, 0);
			if(vmemchr(status, 0, ERRMAX) == 0){
				memmove(buf, status, ERRMAX);
				buf[ERRMAX-1] = 0;
				status = buf;
			}
			poperror();
		}

	}
	pexit(status, 1);
}

void
sys_wait(Ar0* ar0, va_list list)
{
	int pid;
	Waitmsg w;
	OWaitmsg *ow;

	/*
	 * int wait(Waitmsg* w);
	 *
	 * Deprecated; backwards compatibility only.
	 */
	ow = va_arg(list, OWaitmsg*);
	if(ow == nil){
		ar0->i = pwait(nil);
		return;
	}

	ow = validaddr(ow, sizeof(OWaitmsg), 1);
	evenaddr(PTR2UINT(ow));
	pid = pwait(&w);
	if(pid >= 0){
		readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
		readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
		readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
		readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
		strncpy(ow->msg, w.msg, sizeof(ow->msg));
		ow->msg[sizeof(ow->msg)-1] = '\0';
	}

	ar0->i = pid;
}

void
sysawait(Ar0* ar0, va_list list)
{
	int i;
	int pid;
	Waitmsg w;
	usize n;
	char *p;

	/*
	 * int await(char* s, int n);
	 * should really be
	 * usize await(char* s, usize n);
	 */
	p = va_arg(list, char*);
	n = va_arg(list, long);
	p = validaddr(p, n, 1);

	pid = pwait(&w);
	if(pid < 0){
		ar0->i = -1;
		return;
	}
	i = snprint(p, n, "%d %lud %lud %lud %q",
		w.pid,
		w.time[TUser], w.time[TSys], w.time[TReal],
		w.msg);

	ar0->i = i;
}

void
werrstr(char *fmt, ...)
{
	va_list va;

	if(up == nil)
		return;

	va_start(va, fmt);
	vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
	va_end(va);
}

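/*
 * Exchange the per-process error string with the caller's buffer:
 * the buffer receives the current error string and whatever it held
 * (NUL-terminated) becomes the new error string, as errstr(2) describes.
 */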
static void
generrstr(char *buf, long n)
{
	char *p, tmp[ERRMAX];

	if(n <= 0)
		error(Ebadarg);
	p = validaddr(buf, n, 1);
	if(n > sizeof tmp)
		n = sizeof tmp;
	memmove(tmp, p, n);

	/* make sure it's NUL-terminated */
	tmp[n-1] = '\0';
	memmove(p, up->syserrstr, n);
	p[n-1] = '\0';
	memmove(up->syserrstr, tmp, n);
}

void
syserrstr(Ar0* ar0, va_list list)
{
	char *err;
	usize nerr;

	/*
	 * int errstr(char* err, uint nerr);
	 * should really be
	 * usize errstr(char* err, usize nerr);
	 * but errstr always returns 0.
	 */
	err = va_arg(list, char*);
	nerr = va_arg(list, usize);
	generrstr(err, nerr);

	ar0->i = 0;
}

void
sys_errstr(Ar0* ar0, va_list list)
{
	char *p;

	/*
	 * int errstr(char* err);
	 *
	 * Deprecated; backwards compatibility only.
	 */
	p = va_arg(list, char*);
	generrstr(p, 64);

	ar0->i = 0;
}

void
sysnotify(Ar0* ar0, va_list list)
{
	void (*f)(void*, char*);

	/*
	 * int notify(void (*f)(void*, char*));
	 */
	f = (void (*)(void*, char*))va_arg(list, void*);

	if(f != nil)
		validaddr(f, sizeof(void (*)(void*, char*)), 0);
	up->notify = f;

	ar0->i = 0;
}

void
sysnoted(Ar0* ar0, va_list list)
{
	int v;

	/*
	 * int noted(int v);
	 */
	v = va_arg(list, int);

	if(v != NRSTR && !up->notified)
		error(Egreg);

	ar0->i = 0;
}

void
sysrendezvous(Ar0* ar0, va_list list)
{
	Proc *p, **l;
	uintptr tag, val, pc;
	void (*pt)(Proc*, int, vlong, vlong);

	/*
	 * void* rendezvous(void*, void*);
	 */
	tag = PTR2UINT(va_arg(list, void*));

	l = &REND(up->rgrp, tag);
	up->rendval = ~0;

	lock(up->rgrp);
	for(p = *l; p; p = p->rendhash) {
		if(p->rendtag == tag) {
			*l = p->rendhash;
			val = p->rendval;
			p->rendval = PTR2UINT(va_arg(list, void*));

			/* wait until the sleeper has fully switched off its processor before readying it */
			while(p->mach != 0)
				;
			ready(p);
			unlock(up->rgrp);

			ar0->v = UINT2PTR(val);
			return;
		}
		l = &p->rendhash;
	}

	/* Going to sleep here */
	up->rendtag = tag;
	up->rendval = PTR2UINT(va_arg(list, void*));
	up->rendhash = *l;
	*l = up;
	up->state = Rendezvous;
	if(up->trace && (pt = proctrace) != nil){
		pc = (uintptr)sysrendezvous;
		pt(up, SSleep, 0, Rendezvous|(pc<<8));
	}
	unlock(up->rgrp);

	sched();

	ar0->v = UINT2PTR(up->rendval);
}

/*
 * The implementation of semaphores is complicated by needing
 * to avoid rescheduling in syssemrelease, so that it is safe
 * to call from real-time processes.  This means syssemrelease
 * cannot acquire any qlocks, only spin locks.
 *
 * Semacquire and semrelease must both manipulate the semaphore
 * wait list.  Lock-free linked lists only exist in theory, not
 * in practice, so the wait list is protected by a spin lock.
 *
 * The semaphore value *addr is stored in user memory, so it
 * cannot be read or written while holding spin locks.
 *
 * Thus, we can access the list only when holding the lock, and
 * we can access the semaphore only when not holding the lock.
 * This makes things interesting.  Note that sleep's condition function
 * is called while holding two locks - r and up->rlock - so it cannot
 * access the semaphore value either.
 *
 * An acquirer announces its intention to try for the semaphore
 * by putting a Sema structure onto the wait list and then
 * setting Sema.waiting.  After one last check of the semaphore,
 * the acquirer sleeps until Sema.waiting==0.  A releaser of n
 * must wake up n acquirers who have Sema.waiting set.  It does
 * this by clearing Sema.waiting and then calling wakeup.
 *
 * There are three interesting races here.
 *
 * The first is that in this particular sleep/wakeup usage, a single
 * wakeup can rouse a process from two consecutive sleeps!
 * The ordering is:
 *
 * 	(a) set Sema.waiting = 1
 * 	(a) call sleep
 * 	(b) set Sema.waiting = 0
 * 	(a) check Sema.waiting inside sleep, return w/o sleeping
 * 	(a) try for semaphore, fail
 * 	(a) set Sema.waiting = 1
 * 	(a) call sleep
 * 	(b) call wakeup(a)
 * 	(a) wake up again
 *
 * This is okay - semacquire will just go around the loop
 * again.  It does mean that at the top of the for(;;) loop in
 * semacquire, phore.waiting might already be set to 1.
 *
 * The second is that a releaser might wake an acquirer who is
 * interrupted before he can acquire the lock.  Since
 * release(n) issues only n wakeup calls -- only n can be used
 * anyway -- if the interrupted process is not going to use his
 * wakeup call he must pass it on to another acquirer.
 *
 * The third race is similar to the second but more subtle.  An
 * acquirer sets waiting=1 and then does a final canacquire()
 * before going to sleep.  The opposite order would result in
 * missing wakeups that happen between canacquire and
 * waiting=1.  (In fact, the whole point of Sema.waiting is to
 * avoid missing wakeups between canacquire() and sleep().) But
 * there can be spurious wakeups between a successful
 * canacquire() and the following semdequeue().  This wakeup is
 * not useful to the acquirer, since he has already acquired
 * the semaphore.  Like in the previous case, though, the
 * acquirer must pass the wakeup call along.
 *
 * This is all rather subtle.  The code below has been verified
 * with the spin model /sys/src/9/port/semaphore.p.  The
 * original code anticipated the second race but not the first
 * or third, which were caught only with spin.  The first race
 * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
 * It was lucky that my abstract model of sleep/wakeup still managed
 * to preserve that behavior.
 *
 * I remain slightly concerned about memory coherence
 * outside of locks.  The spin model does not take
 * queued processor writes into account so we have to
 * think hard.  The only variables accessed outside locks
 * are the semaphore value itself and the boolean flag
 * Sema.waiting.  The value is only accessed with CAS,
 * whose job description includes doing the right thing as
 * far as memory coherence across processors.  That leaves
 * Sema.waiting.  To handle it, we call coherence() before each
 * read and after each write.		- rsc
 */

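/*
 * A minimal user-space sketch of how these calls pair up (assumes the
 * libc wrappers described in semacquire(2); illustrative, not part of
 * this file):
 *
 *	long sem = 1;			// shared memory visible to both procs
 *	...
 *	if(semacquire(&sem, 1) < 0)	// block until the count is positive
 *		sysfatal("semacquire: %r");
 *	// critical section
 *	semrelease(&sem, 1);		// add 1 back and wake one waiter
 */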
/* Add semaphore p with addr a to list in seg. */
static void
semqueue(Segment* s, int* addr, Sema* p)
{
	memset(p, 0, sizeof *p);
	p->addr = addr;

	lock(&s->sema);	/* uses s->sema.Rendez.Lock, but no one else is */
	p->next = &s->sema;
	p->prev = s->sema.prev;
	p->next->prev = p;
	p->prev->next = p;
	unlock(&s->sema);
}

/* Remove semaphore p from list in seg. */
static void
semdequeue(Segment* s, Sema* p)
{
	lock(&s->sema);
	p->next->prev = p->prev;
	p->prev->next = p->next;
	unlock(&s->sema);
}

/* Wake up n waiters with addr on list in seg. */
static void
semwakeup(Segment* s, int* addr, int n)
{
	Sema *p;

	lock(&s->sema);
	for(p = s->sema.next; p != &s->sema && n > 0; p = p->next){
		if(p->addr == addr && p->waiting){
			p->waiting = 0;
			coherence();
			wakeup(p);
			n--;
		}
	}
	unlock(&s->sema);
}

/* Add delta to semaphore and wake up waiters as appropriate. */
static int
semrelease(Segment* s, int* addr, int delta)
{
	int value;

	do
		value = *addr;
	while(!CASW(addr, value, value+delta));
	semwakeup(s, addr, delta);

	return value+delta;
}

/* Try to acquire semaphore using compare-and-swap */
static int
canacquire(int* addr)
{
	int value;

	while((value = *addr) > 0){
		if(CASW(addr, value, value-1))
			return 1;
	}

	return 0;
}

/* Should we wake up? */
static int
semawoke(void* p)
{
	coherence();
	return !((Sema*)p)->waiting;
}

/* Acquire semaphore (subtract 1). */
static int
semacquire(Segment* s, int* addr, int block)
{
	int acquired;
	Sema phore;

	if(canacquire(addr))
		return 1;
	if(!block)
		return 0;

	acquired = 0;
	semqueue(s, addr, &phore);
	for(;;){
		phore.waiting = 1;
		coherence();
		if(canacquire(addr)){
			acquired = 1;
			break;
		}
		if(waserror())
			break;
		sleep(&phore, semawoke, &phore);
		poperror();
	}
	semdequeue(s, &phore);
	coherence();	/* not strictly necessary due to lock in semdequeue */
	if(!phore.waiting)
		semwakeup(s, addr, 1);
	if(!acquired)
		nexterror();

	return 1;
}

/* Acquire semaphore or time-out */
static int
tsemacquire(Segment* s, int* addr, long ms)
{
	int acquired;
	ulong t;
	Sema phore;

	if(canacquire(addr))
		return 1;
	if(ms == 0)
		return 0;

	acquired = 0;
	semqueue(s, addr, &phore);
	for(;;){
		phore.waiting = 1;
		coherence();
		if(canacquire(addr)){
			acquired = 1;
			break;
		}
		if(waserror())
			break;
		t = m->ticks;
		tsleep(&phore, semawoke, &phore, ms);
		ms -= TK2MS(m->ticks-t);
		poperror();
		if(ms <= 0)
			break;
	}
	semdequeue(s, &phore);
	coherence();	/* not strictly necessary due to lock in semdequeue */
	if(!phore.waiting)
		semwakeup(s, addr, 1);
	if(ms <= 0)
		return 0;
	if(!acquired)
		nexterror();
	return 1;
}

void
syssemacquire(Ar0* ar0, va_list list)
{
	Segment *s;
	int *addr, block;

	/*
	 * int semacquire(long* addr, int block);
	 * should be (and will be implemented below as) perhaps
	 * int semacquire(int* addr, int block);
	 */
	addr = va_arg(list, int*);
	addr = validaddr(addr, sizeof(int), 1);
	evenaddr(PTR2UINT(addr));
	block = va_arg(list, int);

	if((s = seg(up, PTR2UINT(addr), 0)) == nil)
		error(Ebadarg);
	if(*addr < 0)
		error(Ebadarg);

	ar0->i = semacquire(s, addr, block);
}

void
systsemacquire(Ar0* ar0, va_list list)
{
	Segment *s;
	int *addr, ms;

	/*
	 * int tsemacquire(long* addr, ulong ms);
	 * should be (and will be implemented below as) perhaps
	 * int tsemacquire(int* addr, ulong ms);
	 */
	addr = va_arg(list, int*);
	addr = validaddr(addr, sizeof(int), 1);
	evenaddr(PTR2UINT(addr));
	ms = va_arg(list, ulong);

	if((s = seg(up, PTR2UINT(addr), 0)) == nil)
		error(Ebadarg);
	if(*addr < 0)
		error(Ebadarg);

	ar0->i = tsemacquire(s, addr, ms);
}

void
syssemrelease(Ar0* ar0, va_list list)
{
	Segment *s;
	int *addr, delta;

	/*
	 * long semrelease(long* addr, long count);
	 * should be (and will be implemented below as) perhaps
	 * int semrelease(int* addr, int count);
	 */
	addr = va_arg(list, int*);
	addr = validaddr(addr, sizeof(int), 1);
	evenaddr(PTR2UINT(addr));
	delta = va_arg(list, int);

	if((s = seg(up, PTR2UINT(addr), 0)) == nil)
		error(Ebadarg);
	if(delta < 0 || *addr < 0)
		error(Ebadarg);

	ar0->i = semrelease(s, addr, delta);
}

void
sysnsec(Ar0* ar0, va_list)
{
	ar0->vl = todget(nil);
}