xref: /plan9/sys/src/9/port/sysproc.c (revision f9e1cf08d3be51592e03e639fc848a68dc31a55e)
1 #include	"u.h"
2 #include	"tos.h"
3 #include	"../port/lib.h"
4 #include	"mem.h"
5 #include	"dat.h"
6 #include	"fns.h"
7 #include	"../port/error.h"
8 #include	"edf.h"
9 
10 #include	<a.out.h>
11 
12 int	shargs(char*, int, char**);
13 
14 extern void checkpages(void);
15 extern void checkpagerefs(void);
16 
17 long
18 sysr1(ulong*)
19 {
20 	checkpagerefs();
21 	return 0;
22 }
23 
24 long
25 sysrfork(ulong *arg)
26 {
27 	Proc *p;
28 	int n, i;
29 	Fgrp *ofg;
30 	Pgrp *opg;
31 	Rgrp *org;
32 	Egrp *oeg;
33 	ulong pid, flag;
34 	Mach *wm;
35 
36 	flag = arg[0];
37 	/* Check flags before we commit */
38 	if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
39 		error(Ebadarg);
40 	if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
41 		error(Ebadarg);
42 	if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
43 		error(Ebadarg);
44 
45 	if((flag&RFPROC) == 0) {
46 		if(flag & (RFMEM|RFNOWAIT))
47 			error(Ebadarg);
48 		if(flag & (RFFDG|RFCFDG)) {
49 			ofg = up->fgrp;
50 			if(flag & RFFDG)
51 				up->fgrp = dupfgrp(ofg);
52 			else
53 				up->fgrp = dupfgrp(nil);
54 			closefgrp(ofg);
55 		}
56 		if(flag & (RFNAMEG|RFCNAMEG)) {
57 			opg = up->pgrp;
58 			up->pgrp = newpgrp();
59 			if(flag & RFNAMEG)
60 				pgrpcpy(up->pgrp, opg);
61 			/* inherit noattach */
62 			up->pgrp->noattach = opg->noattach;
63 			closepgrp(opg);
64 		}
65 		if(flag & RFNOMNT)
66 			up->pgrp->noattach = 1;
67 		if(flag & RFREND) {
68 			org = up->rgrp;
69 			up->rgrp = newrgrp();
70 			closergrp(org);
71 		}
72 		if(flag & (RFENVG|RFCENVG)) {
73 			oeg = up->egrp;
74 			up->egrp = smalloc(sizeof(Egrp));
75 			up->egrp->ref = 1;
76 			if(flag & RFENVG)
77 				envcpy(up->egrp, oeg);
78 			closeegrp(oeg);
79 		}
80 		if(flag & RFNOTEG)
81 			up->noteid = incref(&noteidalloc);
82 		return 0;
83 	}
84 
85 	p = newproc();
86 
87 	p->fpsave = up->fpsave;
88 	p->scallnr = up->scallnr;
89 	p->s = up->s;
90 	p->nerrlab = 0;
91 	p->slash = up->slash;
92 	p->dot = up->dot;
93 	incref(p->dot);
94 
95 	memmove(p->note, up->note, sizeof(p->note));
96 	p->privatemem = up->privatemem;
97 	p->noswap = up->noswap;
98 	p->nnote = up->nnote;
99 	p->notified = 0;
100 	p->lastnote = up->lastnote;
101 	p->notify = up->notify;
102 	p->ureg = up->ureg;
103 	p->dbgreg = 0;
104 
105 	/* Make a new set of memory segments */
106 	n = flag & RFMEM;
107 	qlock(&p->seglock);
108 	if(waserror()){
109 		qunlock(&p->seglock);
110 		nexterror();
111 	}
112 	for(i = 0; i < NSEG; i++)
113 		if(up->seg[i])
114 			p->seg[i] = dupseg(up->seg, i, n);
115 	qunlock(&p->seglock);
116 	poperror();
117 
118 	/* File descriptors */
119 	if(flag & (RFFDG|RFCFDG)) {
120 		if(flag & RFFDG)
121 			p->fgrp = dupfgrp(up->fgrp);
122 		else
123 			p->fgrp = dupfgrp(nil);
124 	}
125 	else {
126 		p->fgrp = up->fgrp;
127 		incref(p->fgrp);
128 	}
129 
130 	/* Process groups */
131 	if(flag & (RFNAMEG|RFCNAMEG)) {
132 		p->pgrp = newpgrp();
133 		if(flag & RFNAMEG)
134 			pgrpcpy(p->pgrp, up->pgrp);
135 		/* inherit noattach */
136 		p->pgrp->noattach = up->pgrp->noattach;
137 	}
138 	else {
139 		p->pgrp = up->pgrp;
140 		incref(p->pgrp);
141 	}
142 	if(flag & RFNOMNT)
143 		up->pgrp->noattach = 1;
144 
145 	if(flag & RFREND)
146 		p->rgrp = newrgrp();
147 	else {
148 		incref(up->rgrp);
149 		p->rgrp = up->rgrp;
150 	}
151 
152 	/* Environment group */
153 	if(flag & (RFENVG|RFCENVG)) {
154 		p->egrp = smalloc(sizeof(Egrp));
155 		p->egrp->ref = 1;
156 		if(flag & RFENVG)
157 			envcpy(p->egrp, up->egrp);
158 	}
159 	else {
160 		p->egrp = up->egrp;
161 		incref(p->egrp);
162 	}
163 	p->hang = up->hang;
164 	p->procmode = up->procmode;
165 
166 	/* Craft a return frame which will cause the child to pop out of
167 	 * the scheduler in user mode with the return register zero
168 	 */
169 	forkchild(p, up->dbgreg);
170 
171 	p->parent = up;
172 	p->parentpid = up->pid;
173 	if(flag&RFNOWAIT)
174 		p->parentpid = 0;
175 	else {
176 		lock(&up->exl);
177 		up->nchild++;
178 		unlock(&up->exl);
179 	}
180 	if((flag&RFNOTEG) == 0)
181 		p->noteid = up->noteid;
182 
183 	p->fpstate = up->fpstate;
184 	pid = p->pid;
185 	memset(p->time, 0, sizeof(p->time));
186 	p->time[TReal] = MACHP(0)->ticks;
187 
188 	kstrdup(&p->text, up->text);
189 	kstrdup(&p->user, up->user);
190 	/*
191 	 *  since the bss/data segments are now shareable,
192 	 *  any mmu info about this process is now stale
193 	 *  (i.e. has bad properties) and has to be discarded.
194 	 */
195 	flushmmu();
196 	p->basepri = up->basepri;
197 	p->priority = up->basepri;
198 	p->fixedpri = up->fixedpri;
199 	p->mp = up->mp;
200 	wm = up->wired;
201 	if(wm)
202 		procwired(p, wm->machno);
203 	ready(p);
204 	sched();
205 	return pid;
206 }
207 
/*
 * l2be: read the first four bytes of l, in memory order, as a
 * big-endian 32-bit quantity and return it.
 *
 * Each byte is widened to ulong before shifting.  Shifting the
 * int-promoted byte instead would move a set top bit of cp[0] into
 * the sign bit of an int, which overflows and, where ulong is wider
 * than int, sign-extends into the result.
 */
static ulong
l2be(long l)
{
	uchar *cp;

	cp = (uchar*)&l;
	return ((ulong)cp[0]<<24) | ((ulong)cp[1]<<16) | ((ulong)cp[2]<<8) | (ulong)cp[3];
}
216 
/*
 * sysexec: exec system call.  arg[0] is the user address of the file
 * name and arg[1] the user address of the nil-terminated argument
 * vector.
 *
 * Outline, in the order the code performs it:
 *	1. open the file and read an Exec header; if it is not an
 *	   AOUT_MAGIC image, parse it as a "#!" line (one level only)
 *	   and loop to open the named interpreter instead;
 *	2. derive the page-rounded text (t), data (d) and bss (b) limits;
 *	3. count the arguments and the bytes they need (pass 1);
 *	4. create a temporary stack segment in ESEG and copy the
 *	   arguments into it (pass 2);
 *	5. record the program name and a short copy of the arguments
 *	   in up->text and up->args;
 *	6. release the old segments and CCEXEC descriptors, install
 *	   text, data and bss, and move the new stack to SSEG;
 *	7. reset note state and return through execregs().
 *
 * Failures are reported with error(); Ebadexec and Enovmem are the
 * ones raised directly in this function.
 */
217 long
218 sysexec(ulong *arg)
219 {
220 	Segment *s, *ts;
221 	ulong t, d, b;
222 	int i;
223 	Chan *tc;
224 	char **argv, **argp;
225 	char *a, *charp, *args, *file;
226 	char *progarg[sizeof(Exec)/2+1], *elem, progelem[64];
227 	ulong ssize, spage, nargs, nbytes, n, bssend;
228 	int indir;
229 	Exec exec;
230 	char line[sizeof(Exec)];
231 	Fgrp *f;
232 	Image *img;
233 	ulong magic, text, entry, data, bss;
234 	Tos *tos;
235 
236 	validaddr(arg[0], 1, 0);
237 	file = (char*)arg[0];
238 	indir = 0;
239 	elem = nil;
	/* any error raised from here on releases the saved name in elem */
240 	if(waserror()){
241 		free(elem);
242 		nexterror();
243 	}
244 	for(;;){
245 		tc = namec(file, Aopen, OEXEC, 0);
246 		if(waserror()){
247 			cclose(tc);
248 			nexterror();
249 		}
250 		if(!indir)
251 			kstrdup(&elem, up->genbuf);
252 
253 		n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0);
		/* need at least the two bytes of a possible "#!" */
254 		if(n < 2)
255 			error(Ebadexec);
256 		magic = l2be(exec.magic);
257 		text = l2be(exec.text);
258 		entry = l2be(exec.entry);
259 		if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){
			/* entry must fall inside the text that follows the header */
260 			if(text >= USTKTOP-UTZERO
261 			|| entry < UTZERO+sizeof(Exec)
262 			|| entry >= UTZERO+sizeof(Exec)+text)
263 				error(Ebadexec);
264 			break; /* for binary */
265 		}
266 
267 		/*
268 		 * Process #! /bin/sh args ...
269 		 */
270 		memmove(line, &exec, sizeof(Exec));
		/* indir already set means the interpreter is itself not a binary */
271 		if(indir || line[0]!='#' || line[1]!='!')
272 			error(Ebadexec);
273 		n = shargs(line, n, progarg);
274 		if(n == 0)
275 			error(Ebadexec);
276 		indir = 1;
277 		/*
278 		 * First arg becomes complete file name
279 		 */
280 		progarg[n++] = file;
281 		progarg[n] = 0;
		/* step past the first word of the caller's vector */
282 		validaddr(arg[1], BY2WD, 1);
283 		arg[1] += BY2WD;
284 		file = progarg[0];
		/* progarg[0] becomes a copy of elem instead of the interpreter path */
285 		if(strlen(elem) >= sizeof progelem)
286 			error(Ebadexec);
287 		strcpy(progelem, elem);
288 		progarg[0] = progelem;
289 		poperror();
290 		cclose(tc);
291 	}
292 
	/* t, d and b are the page-rounded ends of text, data and bss */
293 	data = l2be(exec.data);
294 	bss = l2be(exec.bss);
295 	t = (UTZERO+sizeof(Exec)+text+(BY2PG-1)) & ~(BY2PG-1);
296 	d = (t + data + (BY2PG-1)) & ~(BY2PG-1);
297 	bssend = t + data + bss;
298 	b = (bssend + (BY2PG-1)) & ~(BY2PG-1);
299 	if(t >= KZERO || d >= KZERO || b >= KZERO)
300 		error(Ebadexec);
301 
302 	/*
303 	 * Args: pass 1: count
304 	 */
305 	nbytes = sizeof(Tos);		/* hole for profiling clock at top of stack (and more) */
306 	nargs = 0;
307 	if(indir){
308 		argp = progarg;
309 		while(*argp){
310 			a = *argp++;
311 			nbytes += strlen(a) + 1;
312 			nargs++;
313 		}
314 	}
315 	evenaddr(arg[1]);
316 	argp = (char**)arg[1];
317 	validaddr((ulong)argp, BY2WD, 0);
318 	while(*argp){
319 		a = *argp++;
		/* revalidate the vector each time argp lands at the start of a page */
320 		if(((ulong)argp&(BY2PG-1)) < BY2WD)
321 			validaddr((ulong)argp, BY2WD, 0);
322 		validaddr((ulong)a, 1, 0);
323 		nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1;
324 		nargs++;
325 	}
	/* pointer table (nargs plus a terminator) followed by word-rounded string bytes */
326 	ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
327 
328 	/*
329 	 * 8-byte align SP for those (e.g. sparc) that need it.
330 	 * execregs() will subtract another 4 bytes for argc.
331 	 */
332 	if((ssize+4) & 7)
333 		ssize += 4;
334 	spage = (ssize+(BY2PG-1)) >> PGSHIFT;
335 
336 	/*
337 	 * Build the stack segment, putting it in kernel virtual for the moment
338 	 */
339 	if(spage > TSTKSIZ)
340 		error(Enovmem);
341 
342 	qlock(&up->seglock);
343 	if(waserror()){
344 		qunlock(&up->seglock);
345 		nexterror();
346 	}
347 	up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG);
348 
349 	/*
350 	 * Args: pass 2: assemble; the pages will be faulted in
351 	 */
352 	tos = (Tos*)(TSTKTOP - sizeof(Tos));
353 	tos->cyclefreq = m->cyclefreq;
354 	cycles((uvlong*)&tos->pcycles);
355 	tos->pcycles = -tos->pcycles;
356 	tos->kcycles = tos->pcycles;
357 	tos->clock = 0;
358 	argv = (char**)(TSTKTOP - ssize);
359 	charp = (char*)(TSTKTOP - nbytes);
360 	args = charp;
361 	if(indir)
362 		argp = progarg;
363 	else
364 		argp = (char**)arg[1];
365 
366 	for(i=0; i<nargs; i++){
		/* progarg exhausted: continue with the caller's vector */
367 		if(indir && *argp==0) {
368 			indir = 0;
369 			argp = (char**)arg[1];
370 		}
		/* store the address the string will have after the move to USTKTOP */
371 		*argv++ = charp + (USTKTOP-TSTKTOP);
372 		n = strlen(*argp) + 1;
373 		memmove(charp, *argp++, n);
374 		charp += n;
375 	}
376 
377 	free(up->text);
378 	up->text = elem;
379 	elem = nil;	/* so waserror() won't free elem */
380 	USED(elem);
381 
382 	/* copy args; easiest from new process's stack */
383 	n = charp - args;
384 	if(n > 128)	/* don't waste too much space on huge arg lists */
385 		n = 128;
386 	a = up->args;
387 	up->args = nil;
388 	free(a);
389 	up->args = smalloc(n);
390 	memmove(up->args, args, n);
391 	if(n>0 && up->args[n-1]!='\0'){
392 		/* make sure last arg is NUL-terminated */
393 		/* put NUL at UTF-8 character boundary */
394 		for(i=n-1; i>0; --i)
395 			if(fullrune(up->args+i, n-i))
396 				break;
397 		up->args[i] = 0;
398 		n = i+1;
399 	}
400 	up->nargs = n;
401 
402 	/*
403 	 * Committed.
404 	 * Free old memory.
405 	 * Special segments are maintained across exec
406 	 */
407 	for(i = SSEG; i <= BSEG; i++) {
408 		putseg(up->seg[i]);
409 		/* prevent a second free if we have an error */
410 		up->seg[i] = 0;
411 	}
	/* of the remaining segments only those marked SG_CEXEC are dropped */
412 	for(i = BSEG+1; i < NSEG; i++) {
413 		s = up->seg[i];
414 		if(s != 0 && (s->type&SG_CEXEC)) {
415 			putseg(s);
416 			up->seg[i] = 0;
417 		}
418 	}
419 
420 	/*
421 	 * Close on exec
422 	 */
423 	f = up->fgrp;
424 	for(i=0; i<=f->maxfd; i++)
425 		fdclose(i, CCEXEC);
426 
427 	/* Text.  Shared. Attaches to cache image if possible */
428 	/* attachimage returns a locked cache image */
429 	img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
430 	ts = img->s;
431 	up->seg[TSEG] = ts;
432 	ts->flushme = 1;
433 	ts->fstart = 0;
434 	ts->flen = sizeof(Exec)+text;
435 	unlock(img);
436 
437 	/* Data. Shared. */
438 	s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
439 	up->seg[DSEG] = s;
440 
441 	/* Attached by hand */
442 	incref(img);
443 	s->image = img;
	/* the data bytes follow the header and text in the file */
444 	s->fstart = ts->fstart+ts->flen;
445 	s->flen = data;
446 
447 	/* BSS. Zero fill on demand */
448 	up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
449 
450 	/*
451 	 * Move the stack
452 	 */
453 	s = up->seg[ESEG];
454 	up->seg[ESEG] = 0;
455 	up->seg[SSEG] = s;
456 	qunlock(&up->seglock);
	/*
	 * NOTE(review): three handlers are outstanding here, pushed in the
	 * order free(elem), cclose(tc), qunlock(seglock).  If poperror()
	 * removes the most recent one first, the second call below drops
	 * the cclose(tc) handler and the final poperror() before cclose(tc)
	 * drops the free(elem) one - confirm against poperror().
	 */
457 	poperror();	/* seglock */
458 	poperror();	/* elem */
459 	s->base = USTKTOP-USTKSIZE;
460 	s->top = USTKTOP;
461 	relocateseg(s, USTKTOP-TSTKTOP);
462 
463 	/*
464 	 *  '/' processes are higher priority (hack to make /ip more responsive).
465 	 */
466 	if(devtab[tc->type]->dc == L'/')
467 		up->basepri = PriRoot;
468 	up->priority = up->basepri;
469 	poperror();
470 	cclose(tc);
471 
472 	/*
473 	 *  At this point, the mmu contains info about the old address
474 	 *  space and needs to be flushed
475 	 */
476 	flushmmu();
	/* note state is cleared under up->debug */
477 	qlock(&up->debug);
478 	up->nnote = 0;
479 	up->notify = 0;
480 	up->notified = 0;
481 	up->privatemem = 0;
482 	procsetup(up);
483 	qunlock(&up->debug);
484 	if(up->hang)
485 		up->procctl = Proc_stopme;
486 
487 	return execregs(entry, ssize, nargs);
488 }
489 
/*
 * shargs: split a "#!" interpreter line into words.
 *
 * s points at a buffer holding n valid bytes that begins with "#!".
 * The text between "#!" and the first newline is cut into words
 * separated by blanks and tabs.  The buffer is modified in place:
 * the newline and the separator after each word are overwritten
 * with NUL.  Pointers to the words are stored in ap, followed by a
 * nil entry; the caller must supply room for them.
 *
 * Returns the number of words, or 0 if no newline is found within
 * the n bytes (or the line holds no words).
 *
 * The newline search never looks at or beyond s[n].  Comparing
 * only for i == n-1, as an unbounded loop would, fails to stop when
 * n is 2 (nothing after "#!") and runs off the end of the buffer.
 */
int
shargs(char *s, int n, char **ap)
{
	int i;

	s += 2;
	n -= 2;		/* skip #! */
	for(i=0; ; i++){
		if(i >= n)
			return 0;
		if(s[i] == '\n')
			break;
	}
	s[i] = 0;
	*ap = 0;
	i = 0;
	for(;;) {
		/* skip leading separators */
		while(*s==' ' || *s=='\t')
			s++;
		if(*s == 0)
			break;
		i++;
		*ap++ = s;
		*ap = 0;	/* keep the vector terminated as it grows */
		while(*s && *s!=' ' && *s!='\t')
			s++;
		if(*s == 0)
			break;
		else
			*s++ = 0;
	}
	return i;
}
520 
/*
 * return0: predicate that ignores its argument and always answers 0.
 */
int
return0(void*)
{
	int never;

	never = 0;
	return never;
}
526 
527 long
528 syssleep(ulong *arg)
529 {
530 
531 	int n;
532 
533 	n = arg[0];
534 	if(n <= 0) {
535 		if (up->edf && (up->edf->flags & Admitted))
536 			edfyield();
537 		else
538 			yield();
539 		return 0;
540 	}
541 	if(n < TK2MS(1))
542 		n = TK2MS(1);
543 	tsleep(&up->sleep, return0, 0, n);
544 	return 0;
545 }
546 
547 long
548 sysalarm(ulong *arg)
549 {
550 	return procalarm(arg[0]);
551 }
552 
553 long
554 sysexits(ulong *arg)
555 {
556 	char *status;
557 	char *inval = "invalid exit string";
558 	char buf[ERRMAX];
559 
560 	status = (char*)arg[0];
561 	if(status){
562 		if(waserror())
563 			status = inval;
564 		else{
565 			validaddr((ulong)status, 1, 0);
566 			if(vmemchr(status, 0, ERRMAX) == 0){
567 				memmove(buf, status, ERRMAX);
568 				buf[ERRMAX-1] = 0;
569 				status = buf;
570 			}
571 			poperror();
572 		}
573 
574 	}
575 	pexit(status, 1);
576 	return 0;		/* not reached */
577 }
578 
579 long
580 sys_wait(ulong *arg)
581 {
582 	int pid;
583 	Waitmsg w;
584 	OWaitmsg *ow;
585 
586 	if(arg[0] == 0)
587 		return pwait(nil);
588 
589 	validaddr(arg[0], sizeof(OWaitmsg), 1);
590 	evenaddr(arg[0]);
591 	pid = pwait(&w);
592 	if(pid >= 0){
593 		ow = (OWaitmsg*)arg[0];
594 		readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
595 		readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
596 		readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
597 		readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
598 		strncpy(ow->msg, w.msg, sizeof(ow->msg));
599 		ow->msg[sizeof(ow->msg)-1] = '\0';
600 	}
601 	return pid;
602 }
603 
604 long
605 sysawait(ulong *arg)
606 {
607 	int i;
608 	int pid;
609 	Waitmsg w;
610 	ulong n;
611 
612 	n = arg[1];
613 	validaddr(arg[0], n, 1);
614 	pid = pwait(&w);
615 	if(pid < 0)
616 		return -1;
617 	i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q",
618 		w.pid,
619 		w.time[TUser], w.time[TSys], w.time[TReal],
620 		w.msg);
621 
622 	return i;
623 }
624 
625 void
626 werrstr(char *fmt, ...)
627 {
628 	va_list va;
629 
630 	if(up == nil)
631 		return;
632 
633 	va_start(va, fmt);
634 	vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
635 	va_end(va);
636 }
637 
638 static long
639 generrstr(char *buf, uint nbuf)
640 {
641 	char tmp[ERRMAX];
642 
643 	if(nbuf == 0)
644 		error(Ebadarg);
645 	validaddr((ulong)buf, nbuf, 1);
646 	if(nbuf > sizeof tmp)
647 		nbuf = sizeof tmp;
648 	memmove(tmp, buf, nbuf);
649 
650 	/* make sure it's NUL-terminated */
651 	tmp[nbuf-1] = '\0';
652 	memmove(buf, up->syserrstr, nbuf);
653 	buf[nbuf-1] = '\0';
654 	memmove(up->syserrstr, tmp, nbuf);
655 	return 0;
656 }
657 
658 long
659 syserrstr(ulong *arg)
660 {
661 	return generrstr((char*)arg[0], arg[1]);
662 }
663 
664 /* compatibility for old binaries */
665 long
666 sys_errstr(ulong *arg)
667 {
668 	return generrstr((char*)arg[0], 64);
669 }
670 
671 long
672 sysnotify(ulong *arg)
673 {
674 	if(arg[0] != 0)
675 		validaddr(arg[0], sizeof(ulong), 0);
676 	up->notify = (int(*)(void*, char*))(arg[0]);
677 	return 0;
678 }
679 
680 long
681 sysnoted(ulong *arg)
682 {
683 	if(arg[0]!=NRSTR && !up->notified)
684 		error(Egreg);
685 	return 0;
686 }
687 
688 long
689 syssegbrk(ulong *arg)
690 {
691 	int i;
692 	ulong addr;
693 	Segment *s;
694 
695 	addr = arg[0];
696 	for(i = 0; i < NSEG; i++) {
697 		s = up->seg[i];
698 		if(s == 0 || addr < s->base || addr >= s->top)
699 			continue;
700 		switch(s->type&SG_TYPE) {
701 		case SG_TEXT:
702 		case SG_DATA:
703 		case SG_STACK:
704 			error(Ebadarg);
705 		default:
706 			return ibrk(arg[1], i);
707 		}
708 	}
709 
710 	error(Ebadarg);
711 	return 0;		/* not reached */
712 }
713 
714 long
715 syssegattach(ulong *arg)
716 {
717 	return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]);
718 }
719 
720 long
721 syssegdetach(ulong *arg)
722 {
723 	int i;
724 	ulong addr;
725 	Segment *s;
726 
727 	qlock(&up->seglock);
728 	if(waserror()){
729 		qunlock(&up->seglock);
730 		nexterror();
731 	}
732 
733 	s = 0;
734 	addr = arg[0];
735 	for(i = 0; i < NSEG; i++)
736 		if(s = up->seg[i]) {
737 			qlock(&s->lk);
738 			if((addr >= s->base && addr < s->top) ||
739 			   (s->top == s->base && addr == s->base))
740 				goto found;
741 			qunlock(&s->lk);
742 		}
743 
744 	error(Ebadarg);
745 
746 found:
747 	/*
748 	 * Check we are not detaching the initial stack segment.
749 	 */
750 	if(s == up->seg[SSEG]){
751 		qunlock(&s->lk);
752 		error(Ebadarg);
753 	}
754 	up->seg[i] = 0;
755 	qunlock(&s->lk);
756 	putseg(s);
757 	qunlock(&up->seglock);
758 	poperror();
759 
760 	/* Ensure we flush any entries from the lost segment */
761 	flushmmu();
762 	return 0;
763 }
764 
765 long
766 syssegfree(ulong *arg)
767 {
768 	Segment *s;
769 	ulong from, to;
770 
771 	from = arg[0];
772 	s = seg(up, from, 1);
773 	if(s == nil)
774 		error(Ebadarg);
775 	to = (from + arg[1]) & ~(BY2PG-1);
776 	from = PGROUND(from);
777 
778 	if(to > s->top) {
779 		qunlock(&s->lk);
780 		error(Ebadarg);
781 	}
782 
783 	mfreeseg(s, from, (to - from) / BY2PG);
784 	qunlock(&s->lk);
785 	flushmmu();
786 
787 	return 0;
788 }
789 
790 /* For binary compatibility */
791 long
792 sysbrk_(ulong *arg)
793 {
794 	return ibrk(arg[0], BSEG);
795 }
796 
797 long
798 sysrendezvous(ulong *arg)
799 {
800 	uintptr tag, val;
801 	Proc *p, **l;
802 
803 	tag = arg[0];
804 	l = &REND(up->rgrp, tag);
805 	up->rendval = ~(uintptr)0;
806 
807 	lock(up->rgrp);
808 	for(p = *l; p; p = p->rendhash) {
809 		if(p->rendtag == tag) {
810 			*l = p->rendhash;
811 			val = p->rendval;
812 			p->rendval = arg[1];
813 
814 			while(p->mach != 0)
815 				;
816 			ready(p);
817 			unlock(up->rgrp);
818 			return val;
819 		}
820 		l = &p->rendhash;
821 	}
822 
823 	/* Going to sleep here */
824 	up->rendtag = tag;
825 	up->rendval = arg[1];
826 	up->rendhash = *l;
827 	*l = up;
828 	up->state = Rendezvous;
829 	unlock(up->rgrp);
830 
831 	sched();
832 
833 	return up->rendval;
834 }
835 
836 /*
837  * The implementation of semaphores is complicated by needing
838  * to avoid rescheduling in syssemrelease, so that it is safe
839  * to call from real-time processes.  This means syssemrelease
840  * cannot acquire any qlocks, only spin locks.
841  *
842  * Semacquire and semrelease must both manipulate the semaphore
843  * wait list.  Lock-free linked lists only exist in theory, not
844  * in practice, so the wait list is protected by a spin lock.
845  *
846  * The semaphore value *addr is stored in user memory, so it
847  * cannot be read or written while holding spin locks.
848  *
849  * Thus, we can access the list only when holding the lock, and
850  * we can access the semaphore only when not holding the lock.
851  * This makes things interesting.  Note that sleep's condition function
852  * is called while holding two locks - r and up->rlock - so it cannot
853  * access the semaphore value either.
854  *
855  * An acquirer announces its intention to try for the semaphore
856  * by putting a Sema structure onto the wait list and then
857  * setting Sema.waiting.  After one last check of semaphore,
858  * the acquirer sleeps until Sema.waiting==0.  A releaser of n
859  * must wake up n acquirers who have Sema.waiting set.  It does
860  * this by clearing Sema.waiting and then calling wakeup.
861  *
862  * There are three interesting races here.
863  *
864  * The first is that in this particular sleep/wakeup usage, a single
865  * wakeup can rouse a process from two consecutive sleeps!
866  * The ordering is:
867  *
868  * 	(a) set Sema.waiting = 1
869  * 	(a) call sleep
870  * 	(b) set Sema.waiting = 0
871  * 	(a) check Sema.waiting inside sleep, return w/o sleeping
872  * 	(a) try for semaphore, fail
873  * 	(a) set Sema.waiting = 1
874  * 	(a) call sleep
875  * 	(b) call wakeup(a)
876  * 	(a) wake up again
877  *
878  * This is okay - semacquire will just go around the loop
879  * again.  It does mean that at the top of the for(;;) loop in
880  * semacquire, phore.waiting might already be set to 1.
881  *
882  * The second is that a releaser might wake an acquirer who is
883  * interrupted before he can acquire the lock.  Since
884  * release(n) issues only n wakeup calls -- only n can be used
885  * anyway -- if the interrupted process is not going to use his
886  * wakeup call he must pass it on to another acquirer.
887  *
888  * The third race is similar to the second but more subtle.  An
889  * acquirer sets waiting=1 and then does a final canacquire()
890  * before going to sleep.  The opposite order would result in
891  * missing wakeups that happen between canacquire and
892  * waiting=1.  (In fact, the whole point of Sema.waiting is to
893  * avoid missing wakeups between canacquire() and sleep().) But
894  * there can be spurious wakeups between a successful
895  * canacquire() and the following semdequeue().  This wakeup is
896  * not useful to the acquirer, since he has already acquired
897  * the semaphore.  Like in the previous case, though, the
898  * acquirer must pass the wakeup call along.
899  *
900  * This is all rather subtle.  The code below has been verified
901  * with the spin model /sys/src/9/port/semaphore.p.  The
902  * original code anticipated the second race but not the first
903  * or third, which were caught only with spin.  The first race
904  * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
905  * It was lucky that my abstract model of sleep/wakeup still managed
906  * to preserve that behavior.
907  *
908  * I remain slightly concerned about memory coherence
909  * outside of locks.  The spin model does not take
910  * queued processor writes into account so we have to
911  * think hard.  The only variables accessed outside locks
912  * are the semaphore value itself and the boolean flag
913  * Sema.waiting.  The value is only accessed with cmpswap,
914  * whose job description includes doing the right thing as
915  * far as memory coherence across processors.  That leaves
916  * Sema.waiting.  To handle it, we call coherence() before each
917  * read and after each write.		- rsc
918  */
919 
920 /* Add semaphore p with addr a to list in seg. */
921 static void
922 semqueue(Segment *s, long *a, Sema *p)
923 {
924 	memset(p, 0, sizeof *p);
925 	p->addr = a;
926 	lock(&s->sema);	/* uses s->sema.Rendez.Lock, but no one else is */
927 	p->next = &s->sema;
928 	p->prev = s->sema.prev;
929 	p->next->prev = p;
930 	p->prev->next = p;
931 	unlock(&s->sema);
932 }
933 
934 /* Remove semaphore p from list in seg. */
935 static void
936 semdequeue(Segment *s, Sema *p)
937 {
938 	lock(&s->sema);
939 	p->next->prev = p->prev;
940 	p->prev->next = p->next;
941 	unlock(&s->sema);
942 }
943 
944 /* Wake up n waiters with addr a on list in seg. */
945 static void
946 semwakeup(Segment *s, long *a, long n)
947 {
948 	Sema *p;
949 
950 	lock(&s->sema);
951 	for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
952 		if(p->addr == a && p->waiting){
953 			p->waiting = 0;
954 			coherence();
955 			wakeup(p);
956 			n--;
957 		}
958 	}
959 	unlock(&s->sema);
960 }
961 
962 /* Add delta to semaphore and wake up waiters as appropriate. */
963 static long
964 semrelease(Segment *s, long *addr, long delta)
965 {
966 	long value;
967 
968 	do
969 		value = *addr;
970 	while(!cmpswap(addr, value, value+delta));
971 	semwakeup(s, addr, delta);
972 	return value+delta;
973 }
974 
/*
 * canacquire: try to take one unit from the semaphore at addr
 * without blocking.  While the value is positive, attempt to swap in
 * value-1; return 1 on the first swap that succeeds, or 0 as soon as
 * the value is seen to be zero or negative.
 */
static int
canacquire(long *addr)
{
	long seen;

	for(;;){
		seen = *addr;
		if(seen <= 0)
			return 0;
		if(cmpswap(addr, seen, seen-1))
			return 1;
	}
}
986 
987 /* Should we wake up? */
988 static int
989 semawoke(void *p)
990 {
991 	coherence();
992 	return !((Sema*)p)->waiting;
993 }
994 
995 /* Acquire semaphore (subtract 1). */
996 static int
997 semacquire(Segment *s, long *addr, int block)
998 {
999 	int acquired;
1000 	Sema phore;
1001 
1002 	if(canacquire(addr))
1003 		return 1;
1004 	if(!block)
1005 		return 0;
1006 
1007 	acquired = 0;
1008 	semqueue(s, addr, &phore);
1009 	for(;;){
1010 		phore.waiting = 1;
1011 		coherence();
1012 		if(canacquire(addr)){
1013 			acquired = 1;
1014 			break;
1015 		}
1016 		if(waserror())
1017 			break;
1018 		sleep(&phore, semawoke, &phore);
1019 		poperror();
1020 	}
1021 	semdequeue(s, &phore);
1022 	coherence();	/* not strictly necessary due to lock in semdequeue */
1023 	if(!phore.waiting)
1024 		semwakeup(s, addr, 1);
1025 	if(!acquired)
1026 		nexterror();
1027 	return 1;
1028 }
1029 
1030 long
1031 syssemacquire(ulong *arg)
1032 {
1033 	int block;
1034 	long *addr;
1035 	Segment *s;
1036 
1037 	validaddr(arg[0], sizeof(long), 1);
1038 	evenaddr(arg[0]);
1039 	addr = (long*)arg[0];
1040 	block = arg[1];
1041 
1042 	if((s = seg(up, (ulong)addr, 0)) == nil)
1043 		error(Ebadarg);
1044 	if(*addr < 0)
1045 		error(Ebadarg);
1046 	return semacquire(s, addr, block);
1047 }
1048 
1049 long
1050 syssemrelease(ulong *arg)
1051 {
1052 	long *addr, delta;
1053 	Segment *s;
1054 
1055 	validaddr(arg[0], sizeof(long), 1);
1056 	evenaddr(arg[0]);
1057 	addr = (long*)arg[0];
1058 	delta = arg[1];
1059 
1060 	if((s = seg(up, (ulong)addr, 0)) == nil)
1061 		error(Ebadarg);
1062 	if(delta < 0 || *addr < 0)
1063 		error(Ebadarg);
1064 	return semrelease(s, addr, arg[1]);
1065 }
1066