xref: /plan9/sys/src/9/port/sysproc.c (revision ec59a3ddbfceee0efe34584c2c9981a5e5ff1ec4)
1 #include	"u.h"
2 #include	"tos.h"
3 #include	"../port/lib.h"
4 #include	"mem.h"
5 #include	"dat.h"
6 #include	"fns.h"
7 #include	"../port/error.h"
8 #include	"edf.h"
9 
10 #include	<a.out.h>
11 
12 int	shargs(char*, int, char**);
13 
14 extern void checkpages(void);
15 extern void checkpagerefs(void);
16 
17 long
18 sysr1(ulong*)
19 {
20 	checkpagerefs();
21 	return 0;
22 }
23 
24 long
25 sysrfork(ulong *arg)
26 {
27 	Proc *p;
28 	int n, i;
29 	Fgrp *ofg;
30 	Pgrp *opg;
31 	Rgrp *org;
32 	Egrp *oeg;
33 	ulong pid, flag;
34 	Mach *wm;
35 
36 	flag = arg[0];
37 	/* Check flags before we commit */
38 	if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
39 		error(Ebadarg);
40 	if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
41 		error(Ebadarg);
42 	if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
43 		error(Ebadarg);
44 
45 	if((flag&RFPROC) == 0) {
46 		if(flag & (RFMEM|RFNOWAIT))
47 			error(Ebadarg);
48 		if(flag & (RFFDG|RFCFDG)) {
49 			ofg = up->fgrp;
50 			if(flag & RFFDG)
51 				up->fgrp = dupfgrp(ofg);
52 			else
53 				up->fgrp = dupfgrp(nil);
54 			closefgrp(ofg);
55 		}
56 		if(flag & (RFNAMEG|RFCNAMEG)) {
57 			opg = up->pgrp;
58 			up->pgrp = newpgrp();
59 			if(flag & RFNAMEG)
60 				pgrpcpy(up->pgrp, opg);
61 			/* inherit noattach */
62 			up->pgrp->noattach = opg->noattach;
63 			closepgrp(opg);
64 		}
65 		if(flag & RFNOMNT)
66 			up->pgrp->noattach = 1;
67 		if(flag & RFREND) {
68 			org = up->rgrp;
69 			up->rgrp = newrgrp();
70 			closergrp(org);
71 		}
72 		if(flag & (RFENVG|RFCENVG)) {
73 			oeg = up->egrp;
74 			up->egrp = smalloc(sizeof(Egrp));
75 			up->egrp->ref = 1;
76 			if(flag & RFENVG)
77 				envcpy(up->egrp, oeg);
78 			closeegrp(oeg);
79 		}
80 		if(flag & RFNOTEG)
81 			up->noteid = incref(&noteidalloc);
82 		return 0;
83 	}
84 
85 	p = newproc();
86 
87 	p->fpsave = up->fpsave;
88 	p->scallnr = up->scallnr;
89 	p->s = up->s;
90 	p->nerrlab = 0;
91 	p->slash = up->slash;
92 	p->dot = up->dot;
93 	incref(p->dot);
94 
95 	memmove(p->note, up->note, sizeof(p->note));
96 	p->privatemem = up->privatemem;
97 	p->noswap = up->noswap;
98 	p->nnote = up->nnote;
99 	p->notified = 0;
100 	p->lastnote = up->lastnote;
101 	p->notify = up->notify;
102 	p->ureg = up->ureg;
103 	p->dbgreg = 0;
104 
105 	/* Make a new set of memory segments */
106 	n = flag & RFMEM;
107 	qlock(&p->seglock);
108 	if(waserror()){
109 		qunlock(&p->seglock);
110 		nexterror();
111 	}
112 	for(i = 0; i < NSEG; i++)
113 		if(up->seg[i])
114 			p->seg[i] = dupseg(up->seg, i, n);
115 	qunlock(&p->seglock);
116 	poperror();
117 
118 	/* File descriptors */
119 	if(flag & (RFFDG|RFCFDG)) {
120 		if(flag & RFFDG)
121 			p->fgrp = dupfgrp(up->fgrp);
122 		else
123 			p->fgrp = dupfgrp(nil);
124 	}
125 	else {
126 		p->fgrp = up->fgrp;
127 		incref(p->fgrp);
128 	}
129 
130 	/* Process groups */
131 	if(flag & (RFNAMEG|RFCNAMEG)) {
132 		p->pgrp = newpgrp();
133 		if(flag & RFNAMEG)
134 			pgrpcpy(p->pgrp, up->pgrp);
135 		/* inherit noattach */
136 		p->pgrp->noattach = up->pgrp->noattach;
137 	}
138 	else {
139 		p->pgrp = up->pgrp;
140 		incref(p->pgrp);
141 	}
142 	if(flag & RFNOMNT)
143 		up->pgrp->noattach = 1;
144 
145 	if(flag & RFREND)
146 		p->rgrp = newrgrp();
147 	else {
148 		incref(up->rgrp);
149 		p->rgrp = up->rgrp;
150 	}
151 
152 	/* Environment group */
153 	if(flag & (RFENVG|RFCENVG)) {
154 		p->egrp = smalloc(sizeof(Egrp));
155 		p->egrp->ref = 1;
156 		if(flag & RFENVG)
157 			envcpy(p->egrp, up->egrp);
158 	}
159 	else {
160 		p->egrp = up->egrp;
161 		incref(p->egrp);
162 	}
163 	p->hang = up->hang;
164 	p->procmode = up->procmode;
165 
166 	/* Craft a return frame which will cause the child to pop out of
167 	 * the scheduler in user mode with the return register zero
168 	 */
169 	forkchild(p, up->dbgreg);
170 
171 	p->parent = up;
172 	p->parentpid = up->pid;
173 	if(flag&RFNOWAIT)
174 		p->parentpid = 0;
175 	else {
176 		lock(&up->exl);
177 		up->nchild++;
178 		unlock(&up->exl);
179 	}
180 	if((flag&RFNOTEG) == 0)
181 		p->noteid = up->noteid;
182 
183 	p->fpstate = up->fpstate;
184 	pid = p->pid;
185 	memset(p->time, 0, sizeof(p->time));
186 	p->time[TReal] = MACHP(0)->ticks;
187 
188 	kstrdup(&p->text, up->text);
189 	kstrdup(&p->user, up->user);
190 	/*
191 	 *  since the bss/data segments are now shareable,
192 	 *  any mmu info about this process is now stale
193 	 *  (i.e. has bad properties) and has to be discarded.
194 	 */
195 	flushmmu();
196 	p->basepri = up->basepri;
197 	p->priority = up->basepri;
198 	p->fixedpri = up->fixedpri;
199 	p->mp = up->mp;
200 	wm = up->wired;
201 	if(wm)
202 		procwired(p, wm->machno);
203 	ready(p);
204 	sched();
205 	return pid;
206 }
207 
208 static ulong
209 l2be(long l)
210 {
211 	uchar *cp;
212 
213 	cp = (uchar*)&l;
214 	return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3];
215 }
216 
217 long
218 sysexec(ulong *arg)
219 {
220 	Segment *s, *ts;
221 	ulong t, d, b;
222 	int i;
223 	Chan *tc;
224 	char **argv, **argp;
225 	char *a, *charp, *args, *file;
226 	char *progarg[sizeof(Exec)/2+1], *elem, progelem[64];
227 	ulong ssize, spage, nargs, nbytes, n, bssend;
228 	int indir;
229 	Exec exec;
230 	char line[sizeof(Exec)];
231 	Fgrp *f;
232 	Image *img;
233 	ulong magic, text, entry, data, bss;
234 	Tos *tos;
235 
236 	validaddr(arg[0], 1, 0);
237 	file = (char*)arg[0];
238 	indir = 0;
239 	elem = nil;
240 	if(waserror()){
241 		free(elem);
242 		nexterror();
243 	}
244 	for(;;){
245 		tc = namec(file, Aopen, OEXEC, 0);
246 		if(waserror()){
247 			cclose(tc);
248 			nexterror();
249 		}
250 		if(!indir)
251 			kstrdup(&elem, up->genbuf);
252 
253 		n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0);
254 		if(n < 2)
255 			error(Ebadexec);
256 		magic = l2be(exec.magic);
257 		text = l2be(exec.text);
258 		entry = l2be(exec.entry);
259 		if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){
260 			if(text >= USTKTOP-UTZERO
261 			|| entry < UTZERO+sizeof(Exec)
262 			|| entry >= UTZERO+sizeof(Exec)+text)
263 				error(Ebadexec);
264 			break; /* for binary */
265 		}
266 
267 		/*
268 		 * Process #! /bin/sh args ...
269 		 */
270 		memmove(line, &exec, sizeof(Exec));
271 		if(indir || line[0]!='#' || line[1]!='!')
272 			error(Ebadexec);
273 		n = shargs(line, n, progarg);
274 		if(n == 0)
275 			error(Ebadexec);
276 		indir = 1;
277 		/*
278 		 * First arg becomes complete file name
279 		 */
280 		progarg[n++] = file;
281 		progarg[n] = 0;
282 		validaddr(arg[1], BY2WD, 1);
283 		arg[1] += BY2WD;
284 		file = progarg[0];
285 		if(strlen(elem) >= sizeof progelem)
286 			error(Ebadexec);
287 		strcpy(progelem, elem);
288 		progarg[0] = progelem;
289 		poperror();
290 		cclose(tc);
291 	}
292 
293 	data = l2be(exec.data);
294 	bss = l2be(exec.bss);
295 	t = (UTZERO+sizeof(Exec)+text+(BY2PG-1)) & ~(BY2PG-1);
296 	d = (t + data + (BY2PG-1)) & ~(BY2PG-1);
297 	bssend = t + data + bss;
298 	b = (bssend + (BY2PG-1)) & ~(BY2PG-1);
299 	if(t >= KZERO || d >= KZERO || b >= KZERO)
300 		error(Ebadexec);
301 
302 	/*
303 	 * Args: pass 1: count
304 	 */
305 	nbytes = sizeof(Tos);		/* hole for profiling clock at top of stack (and more) */
306 	nargs = 0;
307 	if(indir){
308 		argp = progarg;
309 		while(*argp){
310 			a = *argp++;
311 			nbytes += strlen(a) + 1;
312 			nargs++;
313 		}
314 	}
315 	evenaddr(arg[1]);
316 	argp = (char**)arg[1];
317 	validaddr((ulong)argp, BY2WD, 0);
318 	while(*argp){
319 		a = *argp++;
320 		if(((ulong)argp&(BY2PG-1)) < BY2WD)
321 			validaddr((ulong)argp, BY2WD, 0);
322 		validaddr((ulong)a, 1, 0);
323 		nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1;
324 		nargs++;
325 	}
326 	ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
327 
328 	/*
329 	 * 8-byte align SP for those (e.g. sparc) that need it.
330 	 * execregs() will subtract another 4 bytes for argc.
331 	 */
332 	if((ssize+4) & 7)
333 		ssize += 4;
334 	spage = (ssize+(BY2PG-1)) >> PGSHIFT;
335 
336 	/*
337 	 * Build the stack segment, putting it in kernel virtual for the moment
338 	 */
339 	if(spage > TSTKSIZ)
340 		error(Enovmem);
341 
342 	qlock(&up->seglock);
343 	if(waserror()){
344 		qunlock(&up->seglock);
345 		nexterror();
346 	}
347 	up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG);
348 
349 	/*
350 	 * Args: pass 2: assemble; the pages will be faulted in
351 	 */
352 	tos = (Tos*)(TSTKTOP - sizeof(Tos));
353 	tos->cyclefreq = m->cyclefreq;
354 	cycles((uvlong*)&tos->pcycles);
355 	tos->pcycles = -tos->pcycles;
356 	tos->kcycles = tos->pcycles;
357 	tos->clock = 0;
358 	argv = (char**)(TSTKTOP - ssize);
359 	charp = (char*)(TSTKTOP - nbytes);
360 	args = charp;
361 	if(indir)
362 		argp = progarg;
363 	else
364 		argp = (char**)arg[1];
365 
366 	for(i=0; i<nargs; i++){
367 		if(indir && *argp==0) {
368 			indir = 0;
369 			argp = (char**)arg[1];
370 		}
371 		*argv++ = charp + (USTKTOP-TSTKTOP);
372 		n = strlen(*argp) + 1;
373 		memmove(charp, *argp++, n);
374 		charp += n;
375 	}
376 
377 	free(up->text);
378 	up->text = elem;
379 	elem = nil;	/* so waserror() won't free elem */
380 	USED(elem);
381 
382 	/* copy args; easiest from new process's stack */
383 	n = charp - args;
384 	if(n > 128)	/* don't waste too much space on huge arg lists */
385 		n = 128;
386 	a = up->args;
387 	up->args = nil;
388 	free(a);
389 	up->args = smalloc(n);
390 	memmove(up->args, args, n);
391 	if(n>0 && up->args[n-1]!='\0'){
392 		/* make sure last arg is NUL-terminated */
393 		/* put NUL at UTF-8 character boundary */
394 		for(i=n-1; i>0; --i)
395 			if(fullrune(up->args+i, n-i))
396 				break;
397 		up->args[i] = 0;
398 		n = i+1;
399 	}
400 	up->nargs = n;
401 
402 	/*
403 	 * Committed.
404 	 * Free old memory.
405 	 * Special segments are maintained across exec
406 	 */
407 	for(i = SSEG; i <= BSEG; i++) {
408 		putseg(up->seg[i]);
409 		/* prevent a second free if we have an error */
410 		up->seg[i] = 0;
411 	}
412 	for(i = BSEG+1; i < NSEG; i++) {
413 		s = up->seg[i];
414 		if(s != 0 && (s->type&SG_CEXEC)) {
415 			putseg(s);
416 			up->seg[i] = 0;
417 		}
418 	}
419 
420 	/*
421 	 * Close on exec
422 	 */
423 	f = up->fgrp;
424 	for(i=0; i<=f->maxfd; i++)
425 		fdclose(i, CCEXEC);
426 
427 	/* Text.  Shared. Attaches to cache image if possible */
428 	/* attachimage returns a locked cache image */
429 	img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
430 	ts = img->s;
431 	up->seg[TSEG] = ts;
432 	ts->flushme = 1;
433 	ts->fstart = 0;
434 	ts->flen = sizeof(Exec)+text;
435 	unlock(img);
436 
437 	/* Data. Shared. */
438 	s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
439 	up->seg[DSEG] = s;
440 
441 	/* Attached by hand */
442 	incref(img);
443 	s->image = img;
444 	s->fstart = ts->fstart+ts->flen;
445 	s->flen = data;
446 
447 	/* BSS. Zero fill on demand */
448 	up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
449 
450 	/*
451 	 * Move the stack
452 	 */
453 	s = up->seg[ESEG];
454 	up->seg[ESEG] = 0;
455 	up->seg[SSEG] = s;
456 	qunlock(&up->seglock);
457 	poperror();	/* seglock */
458 	poperror();	/* elem */
459 	s->base = USTKTOP-USTKSIZE;
460 	s->top = USTKTOP;
461 	relocateseg(s, USTKTOP-TSTKTOP);
462 
463 	/*
464 	 *  '/' processes are higher priority (hack to make /ip more responsive).
465 	 */
466 	if(devtab[tc->type]->dc == L'/')
467 		up->basepri = PriRoot;
468 	up->priority = up->basepri;
469 	poperror();
470 	cclose(tc);
471 
472 	/*
473 	 *  At this point, the mmu contains info about the old address
474 	 *  space and needs to be flushed
475 	 */
476 	flushmmu();
477 	qlock(&up->debug);
478 	up->nnote = 0;
479 	up->notify = 0;
480 	up->notified = 0;
481 	up->privatemem = 0;
482 	procsetup(up);
483 	qunlock(&up->debug);
484 	if(up->hang)
485 		up->procctl = Proc_stopme;
486 
487 	return execregs(entry, ssize, nargs);
488 }
489 
490 int
491 shargs(char *s, int n, char **ap)
492 {
493 	int i;
494 
495 	s += 2;
496 	n -= 2;		/* skip #! */
497 	for(i=0; s[i]!='\n'; i++)
498 		if(i == n-1)
499 			return 0;
500 	s[i] = 0;
501 	*ap = 0;
502 	i = 0;
503 	for(;;) {
504 		while(*s==' ' || *s=='\t')
505 			s++;
506 		if(*s == 0)
507 			break;
508 		i++;
509 		*ap++ = s;
510 		*ap = 0;
511 		while(*s && *s!=' ' && *s!='\t')
512 			s++;
513 		if(*s == 0)
514 			break;
515 		else
516 			*s++ = 0;
517 	}
518 	return i;
519 }
520 
521 int
522 return0(void*)
523 {
524 	return 0;
525 }
526 
527 long
528 syssleep(ulong *arg)
529 {
530 
531 	int n;
532 
533 	n = arg[0];
534 	if(n <= 0) {
535 		if (up->edf && (up->edf->flags & Admitted))
536 			edfyield();
537 		else
538 			yield();
539 		return 0;
540 	}
541 	if(n < TK2MS(1))
542 		n = TK2MS(1);
543 	tsleep(&up->sleep, return0, 0, n);
544 	return 0;
545 }
546 
547 long
548 sysalarm(ulong *arg)
549 {
550 	return procalarm(arg[0]);
551 }
552 
553 long
554 sysexits(ulong *arg)
555 {
556 	char *status;
557 	char *inval = "invalid exit string";
558 	char buf[ERRMAX];
559 
560 	status = (char*)arg[0];
561 	if(status){
562 		if(waserror())
563 			status = inval;
564 		else{
565 			validaddr((ulong)status, 1, 0);
566 			if(vmemchr(status, 0, ERRMAX) == 0){
567 				memmove(buf, status, ERRMAX);
568 				buf[ERRMAX-1] = 0;
569 				status = buf;
570 			}
571 			poperror();
572 		}
573 
574 	}
575 	pexit(status, 1);
576 	return 0;		/* not reached */
577 }
578 
579 long
580 sys_wait(ulong *arg)
581 {
582 	int pid;
583 	Waitmsg w;
584 	OWaitmsg *ow;
585 
586 	if(arg[0] == 0)
587 		return pwait(nil);
588 
589 	validaddr(arg[0], sizeof(OWaitmsg), 1);
590 	evenaddr(arg[0]);
591 	pid = pwait(&w);
592 	if(pid >= 0){
593 		ow = (OWaitmsg*)arg[0];
594 		readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
595 		readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
596 		readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
597 		readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
598 		strncpy(ow->msg, w.msg, sizeof(ow->msg));
599 		ow->msg[sizeof(ow->msg)-1] = '\0';
600 	}
601 	return pid;
602 }
603 
604 long
605 sysawait(ulong *arg)
606 {
607 	int i;
608 	int pid;
609 	Waitmsg w;
610 	ulong n;
611 
612 	n = arg[1];
613 	validaddr(arg[0], n, 1);
614 	pid = pwait(&w);
615 	if(pid < 0)
616 		return -1;
617 	i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q",
618 		w.pid,
619 		w.time[TUser], w.time[TSys], w.time[TReal],
620 		w.msg);
621 
622 	return i;
623 }
624 
625 long
626 sysdeath(ulong*)
627 {
628 	pprint("deprecated system call\n");
629 	pexit("Suicide", 0);
630 	return 0;	/* not reached */
631 }
632 
633 void
634 werrstr(char *fmt, ...)
635 {
636 	va_list va;
637 
638 	if(up == nil)
639 		return;
640 
641 	va_start(va, fmt);
642 	vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
643 	va_end(va);
644 }
645 
646 static long
647 generrstr(char *buf, uint nbuf)
648 {
649 	char tmp[ERRMAX];
650 
651 	if(nbuf == 0)
652 		error(Ebadarg);
653 	validaddr((ulong)buf, nbuf, 1);
654 	if(nbuf > sizeof tmp)
655 		nbuf = sizeof tmp;
656 	memmove(tmp, buf, nbuf);
657 
658 	/* make sure it's NUL-terminated */
659 	tmp[nbuf-1] = '\0';
660 	memmove(buf, up->syserrstr, nbuf);
661 	buf[nbuf-1] = '\0';
662 	memmove(up->syserrstr, tmp, nbuf);
663 	return 0;
664 }
665 
666 long
667 syserrstr(ulong *arg)
668 {
669 	return generrstr((char*)arg[0], arg[1]);
670 }
671 
672 /* compatibility for old binaries */
673 long
674 sys_errstr(ulong *arg)
675 {
676 	return generrstr((char*)arg[0], 64);
677 }
678 
679 long
680 sysnotify(ulong *arg)
681 {
682 	if(arg[0] != 0)
683 		validaddr(arg[0], sizeof(ulong), 0);
684 	up->notify = (int(*)(void*, char*))(arg[0]);
685 	return 0;
686 }
687 
688 long
689 sysnoted(ulong *arg)
690 {
691 	if(arg[0]!=NRSTR && !up->notified)
692 		error(Egreg);
693 	return 0;
694 }
695 
696 long
697 syssegbrk(ulong *arg)
698 {
699 	int i;
700 	ulong addr;
701 	Segment *s;
702 
703 	addr = arg[0];
704 	for(i = 0; i < NSEG; i++) {
705 		s = up->seg[i];
706 		if(s == 0 || addr < s->base || addr >= s->top)
707 			continue;
708 		switch(s->type&SG_TYPE) {
709 		case SG_TEXT:
710 		case SG_DATA:
711 		case SG_STACK:
712 			error(Ebadarg);
713 		default:
714 			return ibrk(arg[1], i);
715 		}
716 	}
717 
718 	error(Ebadarg);
719 	return 0;		/* not reached */
720 }
721 
722 long
723 syssegattach(ulong *arg)
724 {
725 	return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]);
726 }
727 
728 long
729 syssegdetach(ulong *arg)
730 {
731 	int i;
732 	ulong addr;
733 	Segment *s;
734 
735 	qlock(&up->seglock);
736 	if(waserror()){
737 		qunlock(&up->seglock);
738 		nexterror();
739 	}
740 
741 	s = 0;
742 	addr = arg[0];
743 	for(i = 0; i < NSEG; i++)
744 		if(s = up->seg[i]) {
745 			qlock(&s->lk);
746 			if((addr >= s->base && addr < s->top) ||
747 			   (s->top == s->base && addr == s->base))
748 				goto found;
749 			qunlock(&s->lk);
750 		}
751 
752 	error(Ebadarg);
753 
754 found:
755 	/*
756 	 * Check we are not detaching the initial stack segment.
757 	 */
758 	if(s == up->seg[SSEG]){
759 		qunlock(&s->lk);
760 		error(Ebadarg);
761 	}
762 	up->seg[i] = 0;
763 	qunlock(&s->lk);
764 	putseg(s);
765 	qunlock(&up->seglock);
766 	poperror();
767 
768 	/* Ensure we flush any entries from the lost segment */
769 	flushmmu();
770 	return 0;
771 }
772 
773 long
774 syssegfree(ulong *arg)
775 {
776 	Segment *s;
777 	ulong from, to;
778 
779 	from = arg[0];
780 	s = seg(up, from, 1);
781 	if(s == nil)
782 		error(Ebadarg);
783 	to = (from + arg[1]) & ~(BY2PG-1);
784 	from = PGROUND(from);
785 
786 	if(to > s->top) {
787 		qunlock(&s->lk);
788 		error(Ebadarg);
789 	}
790 
791 	mfreeseg(s, from, (to - from) / BY2PG);
792 	qunlock(&s->lk);
793 	flushmmu();
794 
795 	return 0;
796 }
797 
798 /* For binary compatibility */
799 long
800 sysbrk_(ulong *arg)
801 {
802 	return ibrk(arg[0], BSEG);
803 }
804 
805 long
806 sysrendezvous(ulong *arg)
807 {
808 	uintptr tag, val;
809 	Proc *p, **l;
810 
811 	tag = arg[0];
812 	l = &REND(up->rgrp, tag);
813 	up->rendval = ~(uintptr)0;
814 
815 	lock(up->rgrp);
816 	for(p = *l; p; p = p->rendhash) {
817 		if(p->rendtag == tag) {
818 			*l = p->rendhash;
819 			val = p->rendval;
820 			p->rendval = arg[1];
821 
822 			while(p->mach != 0)
823 				;
824 			ready(p);
825 			unlock(up->rgrp);
826 			return val;
827 		}
828 		l = &p->rendhash;
829 	}
830 
831 	/* Going to sleep here */
832 	up->rendtag = tag;
833 	up->rendval = arg[1];
834 	up->rendhash = *l;
835 	*l = up;
836 	up->state = Rendezvous;
837 	unlock(up->rgrp);
838 
839 	sched();
840 
841 	return up->rendval;
842 }
843 
844 /*
845  * The implementation of semaphores is complicated by needing
846  * to avoid rescheduling in syssemrelease, so that it is safe
847  * to call from real-time processes.  This means syssemrelease
848  * cannot acquire any qlocks, only spin locks.
849  *
850  * Semacquire and semrelease must both manipulate the semaphore
851  * wait list.  Lock-free linked lists only exist in theory, not
852  * in practice, so the wait list is protected by a spin lock.
853  *
854  * The semaphore value *addr is stored in user memory, so it
855  * cannot be read or written while holding spin locks.
856  *
857  * Thus, we can access the list only when holding the lock, and
858  * we can access the semaphore only when not holding the lock.
859  * This makes things interesting.  Note that sleep's condition function
860  * is called while holding two locks - r and up->rlock - so it cannot
861  * access the semaphore value either.
862  *
863  * An acquirer announces its intention to try for the semaphore
864  * by putting a Sema structure onto the wait list and then
865  * setting Sema.waiting.  After one last check of semaphore,
866  * the acquirer sleeps until Sema.waiting==0.  A releaser of n
867  * must wake up n acquirers who have Sema.waiting set.  It does
868  * this by clearing Sema.waiting and then calling wakeup.
869  *
870  * There are three interesting races here.
871 
872  * The first is that in this particular sleep/wakeup usage, a single
873  * wakeup can rouse a process from two consecutive sleeps!
874  * The ordering is:
875  *
876  * 	(a) set Sema.waiting = 1
877  * 	(a) call sleep
878  * 	(b) set Sema.waiting = 0
879  * 	(a) check Sema.waiting inside sleep, return w/o sleeping
880  * 	(a) try for semaphore, fail
881  * 	(a) set Sema.waiting = 1
882  * 	(a) call sleep
883  * 	(b) call wakeup(a)
884  * 	(a) wake up again
885  *
886  * This is okay - semacquire will just go around the loop
887  * again.  It does mean that at the top of the for(;;) loop in
888  * semacquire, phore.waiting might already be set to 1.
889  *
890  * The second is that a releaser might wake an acquirer who is
891  * interrupted before he can acquire the lock.  Since
892  * release(n) issues only n wakeup calls -- only n can be used
893  * anyway -- if the interrupted process is not going to use his
894  * wakeup call he must pass it on to another acquirer.
895  *
896  * The third race is similar to the second but more subtle.  An
897  * acquirer sets waiting=1 and then does a final canacquire()
898  * before going to sleep.  The opposite order would result in
899  * missing wakeups that happen between canacquire and
900  * waiting=1.  (In fact, the whole point of Sema.waiting is to
901  * avoid missing wakeups between canacquire() and sleep().) But
902  * there can be spurious wakeups between a successful
903  * canacquire() and the following semdequeue().  This wakeup is
904  * not useful to the acquirer, since he has already acquired
905  * the semaphore.  Like in the previous case, though, the
906  * acquirer must pass the wakeup call along.
907  *
908  * This is all rather subtle.  The code below has been verified
909  * with the spin model /sys/src/9/port/semaphore.p.  The
910  * original code anticipated the second race but not the first
911  * or third, which were caught only with spin.  The first race
912  * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
913  * It was lucky that my abstract model of sleep/wakeup still managed
914  * to preserve that behavior.
915  *
916  * I remain slightly concerned about memory coherence
917  * outside of locks.  The spin model does not take
918  * queued processor writes into account so we have to
919  * think hard.  The only variables accessed outside locks
920  * are the semaphore value itself and the boolean flag
921  * Sema.waiting.  The value is only accessed with cmpswap,
922  * whose job description includes doing the right thing as
923  * far as memory coherence across processors.  That leaves
924  * Sema.waiting.  To handle it, we call coherence() before each
925  * read and after each write.		- rsc
926  */
927 
928 /* Add semaphore p with addr a to list in seg. */
929 static void
930 semqueue(Segment *s, long *a, Sema *p)
931 {
932 	memset(p, 0, sizeof *p);
933 	p->addr = a;
934 	lock(&s->sema);	/* uses s->sema.Rendez.Lock, but no one else is */
935 	p->next = &s->sema;
936 	p->prev = s->sema.prev;
937 	p->next->prev = p;
938 	p->prev->next = p;
939 	unlock(&s->sema);
940 }
941 
942 /* Remove semaphore p from list in seg. */
943 static void
944 semdequeue(Segment *s, Sema *p)
945 {
946 	lock(&s->sema);
947 	p->next->prev = p->prev;
948 	p->prev->next = p->next;
949 	unlock(&s->sema);
950 }
951 
952 /* Wake up n waiters with addr a on list in seg. */
953 static void
954 semwakeup(Segment *s, long *a, long n)
955 {
956 	Sema *p;
957 
958 	lock(&s->sema);
959 	for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
960 		if(p->addr == a && p->waiting){
961 			p->waiting = 0;
962 			coherence();
963 			wakeup(p);
964 			n--;
965 		}
966 	}
967 	unlock(&s->sema);
968 }
969 
970 /* Add delta to semaphore and wake up waiters as appropriate. */
971 static long
972 semrelease(Segment *s, long *addr, long delta)
973 {
974 	long value;
975 
976 	do
977 		value = *addr;
978 	while(!cmpswap(addr, value, value+delta));
979 	semwakeup(s, addr, delta);
980 	return value+delta;
981 }
982 
983 /* Try to acquire semaphore using compare-and-swap */
984 static int
985 canacquire(long *addr)
986 {
987 	long value;
988 
989 	while((value=*addr) > 0)
990 		if(cmpswap(addr, value, value-1))
991 			return 1;
992 	return 0;
993 }
994 
995 /* Should we wake up? */
996 static int
997 semawoke(void *p)
998 {
999 	coherence();
1000 	return !((Sema*)p)->waiting;
1001 }
1002 
1003 /* Acquire semaphore (subtract 1). */
1004 static int
1005 semacquire(Segment *s, long *addr, int block)
1006 {
1007 	int acquired;
1008 	Sema phore;
1009 
1010 	if(canacquire(addr))
1011 		return 1;
1012 	if(!block)
1013 		return 0;
1014 
1015 	acquired = 0;
1016 	semqueue(s, addr, &phore);
1017 	for(;;){
1018 		phore.waiting = 1;
1019 		coherence();
1020 		if(canacquire(addr)){
1021 			acquired = 1;
1022 			break;
1023 		}
1024 		if(waserror())
1025 			break;
1026 		sleep(&phore, semawoke, &phore);
1027 		poperror();
1028 	}
1029 	semdequeue(s, &phore);
1030 	coherence();	/* not strictly necessary due to lock in semdequeue */
1031 	if(!phore.waiting)
1032 		semwakeup(s, addr, 1);
1033 	if(!acquired)
1034 		nexterror();
1035 	return 1;
1036 }
1037 
1038 long
1039 syssemacquire(ulong *arg)
1040 {
1041 	int block;
1042 	long *addr;
1043 	Segment *s;
1044 
1045 	validaddr(arg[0], sizeof(long), 1);
1046 	evenaddr(arg[0]);
1047 	addr = (long*)arg[0];
1048 	block = arg[1];
1049 
1050 	if((s = seg(up, (ulong)addr, 0)) == nil)
1051 		error(Ebadarg);
1052 	if(*addr < 0)
1053 		error(Ebadarg);
1054 	return semacquire(s, addr, block);
1055 }
1056 
1057 long
1058 syssemrelease(ulong *arg)
1059 {
1060 	long *addr, delta;
1061 	Segment *s;
1062 
1063 	validaddr(arg[0], sizeof(long), 1);
1064 	evenaddr(arg[0]);
1065 	addr = (long*)arg[0];
1066 	delta = arg[1];
1067 
1068 	if((s = seg(up, (ulong)addr, 0)) == nil)
1069 		error(Ebadarg);
1070 	if(delta < 0 || *addr < 0)
1071 		error(Ebadarg);
1072 	return semrelease(s, addr, arg[1]);
1073 }
1074