xref: /plan9-contrib/sys/src/9/port/sysproc.c (revision 3468a4915d661daa200976acc4f80f51aae144b2)
1 #include	"u.h"
2 #include	"tos.h"
3 #include	"../port/lib.h"
4 #include	"mem.h"
5 #include	"dat.h"
6 #include	"fns.h"
7 #include	"../port/error.h"
8 #include	"edf.h"
9 
10 #include	<a.out.h>
11 
12 int	shargs(char*, int, char**);
13 
14 extern void checkpages(void);
15 extern void checkpagerefs(void);
16 
17 long
18 sysr1(ulong*)
19 {
20 	checkpagerefs();
21 	return 0;
22 }
23 
24 long
25 sysrfork(ulong *arg)
26 {
27 	Proc *p;
28 	int n, i;
29 	Fgrp *ofg;
30 	Pgrp *opg;
31 	Rgrp *org;
32 	Egrp *oeg;
33 	ulong pid, flag;
34 	Mach *wm;
35 
36 	flag = arg[0];
37 	/* Check flags before we commit */
38 	if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
39 		error(Ebadarg);
40 	if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
41 		error(Ebadarg);
42 	if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
43 		error(Ebadarg);
44 
45 	if((flag&RFPROC) == 0) {
46 		if(flag & (RFMEM|RFNOWAIT))
47 			error(Ebadarg);
48 		if(flag & (RFFDG|RFCFDG)) {
49 			ofg = up->fgrp;
50 			if(flag & RFFDG)
51 				up->fgrp = dupfgrp(ofg);
52 			else
53 				up->fgrp = dupfgrp(nil);
54 			closefgrp(ofg);
55 		}
56 		if(flag & (RFNAMEG|RFCNAMEG)) {
57 			opg = up->pgrp;
58 			up->pgrp = newpgrp();
59 			if(flag & RFNAMEG)
60 				pgrpcpy(up->pgrp, opg);
61 			/* inherit noattach */
62 			up->pgrp->noattach = opg->noattach;
63 			closepgrp(opg);
64 		}
65 		if(flag & RFNOMNT)
66 			up->pgrp->noattach = 1;
67 		if(flag & RFREND) {
68 			org = up->rgrp;
69 			up->rgrp = newrgrp();
70 			closergrp(org);
71 		}
72 		if(flag & (RFENVG|RFCENVG)) {
73 			oeg = up->egrp;
74 			up->egrp = smalloc(sizeof(Egrp));
75 			up->egrp->ref = 1;
76 			if(flag & RFENVG)
77 				envcpy(up->egrp, oeg);
78 			closeegrp(oeg);
79 		}
80 		if(flag & RFNOTEG)
81 			up->noteid = incref(&noteidalloc);
82 		return 0;
83 	}
84 
85 	p = newproc();
86 
87 	p->fpsave = up->fpsave;
88 	p->scallnr = up->scallnr;
89 	p->s = up->s;
90 	p->nerrlab = 0;
91 	p->slash = up->slash;
92 	p->dot = up->dot;
93 	incref(p->dot);
94 
95 	memmove(p->note, up->note, sizeof(p->note));
96 	p->privatemem = up->privatemem;
97 	p->noswap = up->noswap;
98 	p->nnote = up->nnote;
99 	p->notified = 0;
100 	p->lastnote = up->lastnote;
101 	p->notify = up->notify;
102 	p->ureg = up->ureg;
103 	p->dbgreg = 0;
104 
105 	/* Make a new set of memory segments */
106 	n = flag & RFMEM;
107 	qlock(&p->seglock);
108 	if(waserror()){
109 		qunlock(&p->seglock);
110 		nexterror();
111 	}
112 	for(i = 0; i < NSEG; i++)
113 		if(up->seg[i])
114 			p->seg[i] = dupseg(up->seg, i, n);
115 	qunlock(&p->seglock);
116 	poperror();
117 
118 	/* File descriptors */
119 	if(flag & (RFFDG|RFCFDG)) {
120 		if(flag & RFFDG)
121 			p->fgrp = dupfgrp(up->fgrp);
122 		else
123 			p->fgrp = dupfgrp(nil);
124 	}
125 	else {
126 		p->fgrp = up->fgrp;
127 		incref(p->fgrp);
128 	}
129 
130 	/* Process groups */
131 	if(flag & (RFNAMEG|RFCNAMEG)) {
132 		p->pgrp = newpgrp();
133 		if(flag & RFNAMEG)
134 			pgrpcpy(p->pgrp, up->pgrp);
135 		/* inherit noattach */
136 		p->pgrp->noattach = up->pgrp->noattach;
137 	}
138 	else {
139 		p->pgrp = up->pgrp;
140 		incref(p->pgrp);
141 	}
142 	if(flag & RFNOMNT)
143 		up->pgrp->noattach = 1;
144 
145 	if(flag & RFREND)
146 		p->rgrp = newrgrp();
147 	else {
148 		incref(up->rgrp);
149 		p->rgrp = up->rgrp;
150 	}
151 
152 	/* Environment group */
153 	if(flag & (RFENVG|RFCENVG)) {
154 		p->egrp = smalloc(sizeof(Egrp));
155 		p->egrp->ref = 1;
156 		if(flag & RFENVG)
157 			envcpy(p->egrp, up->egrp);
158 	}
159 	else {
160 		p->egrp = up->egrp;
161 		incref(p->egrp);
162 	}
163 	p->hang = up->hang;
164 	p->procmode = up->procmode;
165 
166 	/* Craft a return frame which will cause the child to pop out of
167 	 * the scheduler in user mode with the return register zero
168 	 */
169 	forkchild(p, up->dbgreg);
170 
171 	p->parent = up;
172 	p->parentpid = up->pid;
173 	if(flag&RFNOWAIT)
174 		p->parentpid = 0;
175 	else {
176 		lock(&up->exl);
177 		up->nchild++;
178 		unlock(&up->exl);
179 	}
180 	if((flag&RFNOTEG) == 0)
181 		p->noteid = up->noteid;
182 
183 	p->fpstate = up->fpstate;
184 	pid = p->pid;
185 	memset(p->time, 0, sizeof(p->time));
186 	p->time[TReal] = MACHP(0)->ticks;
187 
188 	kstrdup(&p->text, up->text);
189 	kstrdup(&p->user, up->user);
190 	/*
191 	 *  since the bss/data segments are now shareable,
192 	 *  any mmu info about this process is now stale
193 	 *  (i.e. has bad properties) and has to be discarded.
194 	 */
195 	flushmmu();
196 	p->basepri = up->basepri;
197 	p->priority = up->basepri;
198 	p->fixedpri = up->fixedpri;
199 	p->mp = up->mp;
200 	wm = up->wired;
201 	if(wm)
202 		procwired(p, wm->machno);
203 	ready(p);
204 	sched();
205 	return pid;
206 }
207 
208 static ulong
209 l2be(long l)
210 {
211 	uchar *cp;
212 
213 	cp = (uchar*)&l;
214 	return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3];
215 }
216 
217 long
218 sysexec(ulong *arg)
219 {
220 	Segment *s, *ts;
221 	ulong t, d, b;
222 	int i;
223 	Chan *tc;
224 	char **argv, **argp;
225 	char *a, *charp, *args, *file, *file0;
226 	char *progarg[sizeof(Exec)/2+1], *elem, progelem[64];
227 	ulong ssize, spage, nargs, nbytes, n, bssend;
228 	int indir;
229 	Exec exec;
230 	char line[sizeof(Exec)];
231 	Fgrp *f;
232 	Image *img;
233 	ulong magic, text, entry, data, bss;
234 	Tos *tos;
235 
236 	indir = 0;
237 	elem = nil;
238 	validaddr(arg[0], 1, 0);
239 	file0 = validnamedup((char*)arg[0], 1);
240 	if(waserror()){
241 		free(file0);
242 		free(elem);
243 		nexterror();
244 	}
245 	file = file0;
246 	for(;;){
247 		tc = namec(file, Aopen, OEXEC, 0);
248 		if(waserror()){
249 			cclose(tc);
250 			nexterror();
251 		}
252 		if(!indir)
253 			kstrdup(&elem, up->genbuf);
254 
255 		n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0);
256 		if(n < 2)
257 			error(Ebadexec);
258 		magic = l2be(exec.magic);
259 		text = l2be(exec.text);
260 		entry = l2be(exec.entry);
261 		if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){
262 			if(text >= USTKTOP-UTZERO
263 			|| entry < UTZERO+sizeof(Exec)
264 			|| entry >= UTZERO+sizeof(Exec)+text)
265 				error(Ebadexec);
266 			break; /* for binary */
267 		}
268 
269 		/*
270 		 * Process #! /bin/sh args ...
271 		 */
272 		memmove(line, &exec, sizeof(Exec));
273 		if(indir || line[0]!='#' || line[1]!='!')
274 			error(Ebadexec);
275 		n = shargs(line, n, progarg);
276 		if(n == 0)
277 			error(Ebadexec);
278 		indir = 1;
279 		/*
280 		 * First arg becomes complete file name
281 		 */
282 		progarg[n++] = file;
283 		progarg[n] = 0;
284 		validaddr(arg[1], BY2WD, 1);
285 		arg[1] += BY2WD;
286 		file = progarg[0];
287 		if(strlen(elem) >= sizeof progelem)
288 			error(Ebadexec);
289 		strcpy(progelem, elem);
290 		progarg[0] = progelem;
291 		poperror();
292 		cclose(tc);
293 	}
294 
295 	data = l2be(exec.data);
296 	bss = l2be(exec.bss);
297 	t = (UTZERO+sizeof(Exec)+text+(BY2PG-1)) & ~(BY2PG-1);
298 	d = (t + data + (BY2PG-1)) & ~(BY2PG-1);
299 	bssend = t + data + bss;
300 	b = (bssend + (BY2PG-1)) & ~(BY2PG-1);
301 	if(t >= KZERO || d >= KZERO || b >= KZERO)
302 		error(Ebadexec);
303 
304 	/*
305 	 * Args: pass 1: count
306 	 */
307 	nbytes = sizeof(Tos);		/* hole for profiling clock at top of stack (and more) */
308 	nargs = 0;
309 	if(indir){
310 		argp = progarg;
311 		while(*argp){
312 			a = *argp++;
313 			nbytes += strlen(a) + 1;
314 			nargs++;
315 		}
316 	}
317 	evenaddr(arg[1]);
318 	argp = (char**)arg[1];
319 	validaddr((ulong)argp, BY2WD, 0);
320 	while(*argp){
321 		a = *argp++;
322 		if(((ulong)argp&(BY2PG-1)) < BY2WD)
323 			validaddr((ulong)argp, BY2WD, 0);
324 		validaddr((ulong)a, 1, 0);
325 		nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1;
326 		nargs++;
327 	}
328 	ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
329 
330 	/*
331 	 * 8-byte align SP for those (e.g. sparc) that need it.
332 	 * execregs() will subtract another 4 bytes for argc.
333 	 */
334 	if((ssize+4) & 7)
335 		ssize += 4;
336 	spage = (ssize+(BY2PG-1)) >> PGSHIFT;
337 
338 	/*
339 	 * Build the stack segment, putting it in kernel virtual for the moment
340 	 */
341 	if(spage > TSTKSIZ)
342 		error(Enovmem);
343 
344 	qlock(&up->seglock);
345 	if(waserror()){
346 		qunlock(&up->seglock);
347 		nexterror();
348 	}
349 	up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG);
350 
351 	/*
352 	 * Args: pass 2: assemble; the pages will be faulted in
353 	 */
354 	tos = (Tos*)(TSTKTOP - sizeof(Tos));
355 	tos->cyclefreq = m->cyclefreq;
356 	cycles((uvlong*)&tos->pcycles);
357 	tos->pcycles = -tos->pcycles;
358 	tos->kcycles = tos->pcycles;
359 	tos->clock = 0;
360 	argv = (char**)(TSTKTOP - ssize);
361 	charp = (char*)(TSTKTOP - nbytes);
362 	args = charp;
363 	if(indir)
364 		argp = progarg;
365 	else
366 		argp = (char**)arg[1];
367 
368 	for(i=0; i<nargs; i++){
369 		if(indir && *argp==0) {
370 			indir = 0;
371 			argp = (char**)arg[1];
372 		}
373 		*argv++ = charp + (USTKTOP-TSTKTOP);
374 		n = strlen(*argp) + 1;
375 		memmove(charp, *argp++, n);
376 		charp += n;
377 	}
378 	free(file0);
379 
380 	free(up->text);
381 	up->text = elem;
382 	elem = nil;	/* so waserror() won't free elem */
383 	USED(elem);
384 
385 	/* copy args; easiest from new process's stack */
386 	n = charp - args;
387 	if(n > 128)	/* don't waste too much space on huge arg lists */
388 		n = 128;
389 	a = up->args;
390 	up->args = nil;
391 	free(a);
392 	up->args = smalloc(n);
393 	memmove(up->args, args, n);
394 	if(n>0 && up->args[n-1]!='\0'){
395 		/* make sure last arg is NUL-terminated */
396 		/* put NUL at UTF-8 character boundary */
397 		for(i=n-1; i>0; --i)
398 			if(fullrune(up->args+i, n-i))
399 				break;
400 		up->args[i] = 0;
401 		n = i+1;
402 	}
403 	up->nargs = n;
404 
405 	/*
406 	 * Committed.
407 	 * Free old memory.
408 	 * Special segments are maintained across exec
409 	 */
410 	for(i = SSEG; i <= BSEG; i++) {
411 		putseg(up->seg[i]);
412 		/* prevent a second free if we have an error */
413 		up->seg[i] = 0;
414 	}
415 	for(i = BSEG+1; i < NSEG; i++) {
416 		s = up->seg[i];
417 		if(s != 0 && (s->type&SG_CEXEC)) {
418 			putseg(s);
419 			up->seg[i] = 0;
420 		}
421 	}
422 
423 	/*
424 	 * Close on exec
425 	 */
426 	f = up->fgrp;
427 	for(i=0; i<=f->maxfd; i++)
428 		fdclose(i, CCEXEC);
429 
430 	/* Text.  Shared. Attaches to cache image if possible */
431 	/* attachimage returns a locked cache image */
432 	img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
433 	ts = img->s;
434 	up->seg[TSEG] = ts;
435 	ts->flushme = 1;
436 	ts->fstart = 0;
437 	ts->flen = sizeof(Exec)+text;
438 	unlock(img);
439 
440 	/* Data. Shared. */
441 	s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
442 	up->seg[DSEG] = s;
443 
444 	/* Attached by hand */
445 	incref(img);
446 	s->image = img;
447 	s->fstart = ts->fstart+ts->flen;
448 	s->flen = data;
449 
450 	/* BSS. Zero fill on demand */
451 	up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
452 
453 	/*
454 	 * Move the stack
455 	 */
456 	s = up->seg[ESEG];
457 	up->seg[ESEG] = 0;
458 	up->seg[SSEG] = s;
459 	qunlock(&up->seglock);
460 	poperror();	/* seglock */
461 	poperror();	/* elem */
462 	s->base = USTKTOP-USTKSIZE;
463 	s->top = USTKTOP;
464 	relocateseg(s, USTKTOP-TSTKTOP);
465 
466 	/*
467 	 *  '/' processes are higher priority (hack to make /ip more responsive).
468 	 */
469 	if(devtab[tc->type]->dc == L'/')
470 		up->basepri = PriRoot;
471 	up->priority = up->basepri;
472 	poperror();
473 	cclose(tc);
474 
475 	/*
476 	 *  At this point, the mmu contains info about the old address
477 	 *  space and needs to be flushed
478 	 */
479 	flushmmu();
480 	qlock(&up->debug);
481 	up->nnote = 0;
482 	up->notify = 0;
483 	up->notified = 0;
484 	up->privatemem = 0;
485 	procsetup(up);
486 	qunlock(&up->debug);
487 	if(up->hang)
488 		up->procctl = Proc_stopme;
489 
490 	return execregs(entry, ssize, nargs);
491 }
492 
493 int
494 shargs(char *s, int n, char **ap)
495 {
496 	int i;
497 
498 	s += 2;
499 	n -= 2;		/* skip #! */
500 	for(i=0; s[i]!='\n'; i++)
501 		if(i == n-1)
502 			return 0;
503 	s[i] = 0;
504 	*ap = 0;
505 	i = 0;
506 	for(;;) {
507 		while(*s==' ' || *s=='\t')
508 			s++;
509 		if(*s == 0)
510 			break;
511 		i++;
512 		*ap++ = s;
513 		*ap = 0;
514 		while(*s && *s!=' ' && *s!='\t')
515 			s++;
516 		if(*s == 0)
517 			break;
518 		else
519 			*s++ = 0;
520 	}
521 	return i;
522 }
523 
524 int
525 return0(void*)
526 {
527 	return 0;
528 }
529 
530 long
531 syssleep(ulong *arg)
532 {
533 
534 	int n;
535 
536 	n = arg[0];
537 	if(n <= 0) {
538 		if (up->edf && (up->edf->flags & Admitted))
539 			edfyield();
540 		else
541 			yield();
542 		return 0;
543 	}
544 	if(n < TK2MS(1))
545 		n = TK2MS(1);
546 	tsleep(&up->sleep, return0, 0, n);
547 	return 0;
548 }
549 
550 long
551 sysalarm(ulong *arg)
552 {
553 	return procalarm(arg[0]);
554 }
555 
556 long
557 sysexits(ulong *arg)
558 {
559 	char *status;
560 	char *inval = "invalid exit string";
561 	char buf[ERRMAX];
562 
563 	status = (char*)arg[0];
564 	if(status){
565 		if(waserror())
566 			status = inval;
567 		else{
568 			validaddr((ulong)status, 1, 0);
569 			if(vmemchr(status, 0, ERRMAX) == 0){
570 				memmove(buf, status, ERRMAX);
571 				buf[ERRMAX-1] = 0;
572 				status = buf;
573 			}
574 			poperror();
575 		}
576 
577 	}
578 	pexit(status, 1);
579 	return 0;		/* not reached */
580 }
581 
582 long
583 sys_wait(ulong *arg)
584 {
585 	int pid;
586 	Waitmsg w;
587 	OWaitmsg *ow;
588 
589 	if(arg[0] == 0)
590 		return pwait(nil);
591 
592 	validaddr(arg[0], sizeof(OWaitmsg), 1);
593 	evenaddr(arg[0]);
594 	pid = pwait(&w);
595 	if(pid >= 0){
596 		ow = (OWaitmsg*)arg[0];
597 		readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
598 		readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
599 		readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
600 		readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
601 		strncpy(ow->msg, w.msg, sizeof(ow->msg));
602 		ow->msg[sizeof(ow->msg)-1] = '\0';
603 	}
604 	return pid;
605 }
606 
607 long
608 sysawait(ulong *arg)
609 {
610 	int i;
611 	int pid;
612 	Waitmsg w;
613 	ulong n;
614 
615 	n = arg[1];
616 	validaddr(arg[0], n, 1);
617 	pid = pwait(&w);
618 	if(pid < 0)
619 		return -1;
620 	i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q",
621 		w.pid,
622 		w.time[TUser], w.time[TSys], w.time[TReal],
623 		w.msg);
624 
625 	return i;
626 }
627 
628 void
629 werrstr(char *fmt, ...)
630 {
631 	va_list va;
632 
633 	if(up == nil)
634 		return;
635 
636 	va_start(va, fmt);
637 	vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
638 	va_end(va);
639 }
640 
641 static long
642 generrstr(char *buf, uint nbuf)
643 {
644 	char tmp[ERRMAX];
645 
646 	if(nbuf == 0)
647 		error(Ebadarg);
648 	validaddr((ulong)buf, nbuf, 1);
649 	if(nbuf > sizeof tmp)
650 		nbuf = sizeof tmp;
651 	memmove(tmp, buf, nbuf);
652 
653 	/* make sure it's NUL-terminated */
654 	tmp[nbuf-1] = '\0';
655 	memmove(buf, up->syserrstr, nbuf);
656 	buf[nbuf-1] = '\0';
657 	memmove(up->syserrstr, tmp, nbuf);
658 	return 0;
659 }
660 
661 long
662 syserrstr(ulong *arg)
663 {
664 	return generrstr((char*)arg[0], arg[1]);
665 }
666 
667 /* compatibility for old binaries */
668 long
669 sys_errstr(ulong *arg)
670 {
671 	return generrstr((char*)arg[0], 64);
672 }
673 
674 long
675 sysnotify(ulong *arg)
676 {
677 	if(arg[0] != 0)
678 		validaddr(arg[0], sizeof(ulong), 0);
679 	up->notify = (int(*)(void*, char*))(arg[0]);
680 	return 0;
681 }
682 
683 long
684 sysnoted(ulong *arg)
685 {
686 	if(arg[0]!=NRSTR && !up->notified)
687 		error(Egreg);
688 	return 0;
689 }
690 
691 long
692 syssegbrk(ulong *arg)
693 {
694 	int i;
695 	ulong addr;
696 	Segment *s;
697 
698 	addr = arg[0];
699 	for(i = 0; i < NSEG; i++) {
700 		s = up->seg[i];
701 		if(s == 0 || addr < s->base || addr >= s->top)
702 			continue;
703 		switch(s->type&SG_TYPE) {
704 		case SG_TEXT:
705 		case SG_DATA:
706 		case SG_STACK:
707 			error(Ebadarg);
708 		default:
709 			return ibrk(arg[1], i);
710 		}
711 	}
712 
713 	error(Ebadarg);
714 	return 0;		/* not reached */
715 }
716 
717 long
718 syssegattach(ulong *arg)
719 {
720 	return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]);
721 }
722 
723 long
724 syssegdetach(ulong *arg)
725 {
726 	int i;
727 	ulong addr;
728 	Segment *s;
729 
730 	qlock(&up->seglock);
731 	if(waserror()){
732 		qunlock(&up->seglock);
733 		nexterror();
734 	}
735 
736 	s = 0;
737 	addr = arg[0];
738 	for(i = 0; i < NSEG; i++)
739 		if(s = up->seg[i]) {
740 			qlock(&s->lk);
741 			if((addr >= s->base && addr < s->top) ||
742 			   (s->top == s->base && addr == s->base))
743 				goto found;
744 			qunlock(&s->lk);
745 		}
746 
747 	error(Ebadarg);
748 
749 found:
750 	/*
751 	 * Check we are not detaching the initial stack segment.
752 	 */
753 	if(s == up->seg[SSEG]){
754 		qunlock(&s->lk);
755 		error(Ebadarg);
756 	}
757 	up->seg[i] = 0;
758 	qunlock(&s->lk);
759 	putseg(s);
760 	qunlock(&up->seglock);
761 	poperror();
762 
763 	/* Ensure we flush any entries from the lost segment */
764 	flushmmu();
765 	return 0;
766 }
767 
768 long
769 syssegfree(ulong *arg)
770 {
771 	Segment *s;
772 	ulong from, to;
773 
774 	from = arg[0];
775 	s = seg(up, from, 1);
776 	if(s == nil)
777 		error(Ebadarg);
778 	to = (from + arg[1]) & ~(BY2PG-1);
779 	from = PGROUND(from);
780 
781 	if(to > s->top) {
782 		qunlock(&s->lk);
783 		error(Ebadarg);
784 	}
785 
786 	mfreeseg(s, from, (to - from) / BY2PG);
787 	qunlock(&s->lk);
788 	flushmmu();
789 
790 	return 0;
791 }
792 
793 /* For binary compatibility */
794 long
795 sysbrk_(ulong *arg)
796 {
797 	return ibrk(arg[0], BSEG);
798 }
799 
800 long
801 sysrendezvous(ulong *arg)
802 {
803 	uintptr tag, val;
804 	Proc *p, **l;
805 
806 	tag = arg[0];
807 	l = &REND(up->rgrp, tag);
808 	up->rendval = ~(uintptr)0;
809 
810 	lock(up->rgrp);
811 	for(p = *l; p; p = p->rendhash) {
812 		if(p->rendtag == tag) {
813 			*l = p->rendhash;
814 			val = p->rendval;
815 			p->rendval = arg[1];
816 
817 			while(p->mach != 0)
818 				;
819 			ready(p);
820 			unlock(up->rgrp);
821 			return val;
822 		}
823 		l = &p->rendhash;
824 	}
825 
826 	/* Going to sleep here */
827 	up->rendtag = tag;
828 	up->rendval = arg[1];
829 	up->rendhash = *l;
830 	*l = up;
831 	up->state = Rendezvous;
832 	unlock(up->rgrp);
833 
834 	sched();
835 
836 	return up->rendval;
837 }
838 
839 /*
840  * The implementation of semaphores is complicated by needing
841  * to avoid rescheduling in syssemrelease, so that it is safe
842  * to call from real-time processes.  This means syssemrelease
843  * cannot acquire any qlocks, only spin locks.
844  *
845  * Semacquire and semrelease must both manipulate the semaphore
846  * wait list.  Lock-free linked lists only exist in theory, not
847  * in practice, so the wait list is protected by a spin lock.
848  *
849  * The semaphore value *addr is stored in user memory, so it
850  * cannot be read or written while holding spin locks.
851  *
852  * Thus, we can access the list only when holding the lock, and
853  * we can access the semaphore only when not holding the lock.
854  * This makes things interesting.  Note that sleep's condition function
855  * is called while holding two locks - r and up->rlock - so it cannot
856  * access the semaphore value either.
857  *
858  * An acquirer announces its intention to try for the semaphore
859  * by putting a Sema structure onto the wait list and then
860  * setting Sema.waiting.  After one last check of semaphore,
861  * the acquirer sleeps until Sema.waiting==0.  A releaser of n
862  * must wake up n acquirers who have Sema.waiting set.  It does
863  * this by clearing Sema.waiting and then calling wakeup.
864  *
865  * There are three interesting races here.
866 
867  * The first is that in this particular sleep/wakeup usage, a single
868  * wakeup can rouse a process from two consecutive sleeps!
869  * The ordering is:
870  *
871  * 	(a) set Sema.waiting = 1
872  * 	(a) call sleep
873  * 	(b) set Sema.waiting = 0
874  * 	(a) check Sema.waiting inside sleep, return w/o sleeping
875  * 	(a) try for semaphore, fail
876  * 	(a) set Sema.waiting = 1
877  * 	(a) call sleep
878  * 	(b) call wakeup(a)
879  * 	(a) wake up again
880  *
881  * This is okay - semacquire will just go around the loop
882  * again.  It does mean that at the top of the for(;;) loop in
883  * semacquire, phore.waiting might already be set to 1.
884  *
885  * The second is that a releaser might wake an acquirer who is
886  * interrupted before he can acquire the lock.  Since
887  * release(n) issues only n wakeup calls -- only n can be used
888  * anyway -- if the interrupted process is not going to use his
889  * wakeup call he must pass it on to another acquirer.
890  *
891  * The third race is similar to the second but more subtle.  An
892  * acquirer sets waiting=1 and then does a final canacquire()
893  * before going to sleep.  The opposite order would result in
894  * missing wakeups that happen between canacquire and
895  * waiting=1.  (In fact, the whole point of Sema.waiting is to
896  * avoid missing wakeups between canacquire() and sleep().) But
897  * there can be spurious wakeups between a successful
898  * canacquire() and the following semdequeue().  This wakeup is
899  * not useful to the acquirer, since he has already acquired
900  * the semaphore.  Like in the previous case, though, the
901  * acquirer must pass the wakeup call along.
902  *
903  * This is all rather subtle.  The code below has been verified
904  * with the spin model /sys/src/9/port/semaphore.p.  The
905  * original code anticipated the second race but not the first
906  * or third, which were caught only with spin.  The first race
907  * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
908  * It was lucky that my abstract model of sleep/wakeup still managed
909  * to preserve that behavior.
910  *
911  * I remain slightly concerned about memory coherence
912  * outside of locks.  The spin model does not take
913  * queued processor writes into account so we have to
914  * think hard.  The only variables accessed outside locks
915  * are the semaphore value itself and the boolean flag
916  * Sema.waiting.  The value is only accessed with cmpswap,
917  * whose job description includes doing the right thing as
918  * far as memory coherence across processors.  That leaves
919  * Sema.waiting.  To handle it, we call coherence() before each
920  * read and after each write.		- rsc
921  */
922 
923 /* Add semaphore p with addr a to list in seg. */
924 static void
925 semqueue(Segment *s, long *a, Sema *p)
926 {
927 	memset(p, 0, sizeof *p);
928 	p->addr = a;
929 	lock(&s->sema);	/* uses s->sema.Rendez.Lock, but no one else is */
930 	p->next = &s->sema;
931 	p->prev = s->sema.prev;
932 	p->next->prev = p;
933 	p->prev->next = p;
934 	unlock(&s->sema);
935 }
936 
937 /* Remove semaphore p from list in seg. */
938 static void
939 semdequeue(Segment *s, Sema *p)
940 {
941 	lock(&s->sema);
942 	p->next->prev = p->prev;
943 	p->prev->next = p->next;
944 	unlock(&s->sema);
945 }
946 
947 /* Wake up n waiters with addr a on list in seg. */
948 static void
949 semwakeup(Segment *s, long *a, long n)
950 {
951 	Sema *p;
952 
953 	lock(&s->sema);
954 	for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
955 		if(p->addr == a && p->waiting){
956 			p->waiting = 0;
957 			coherence();
958 			wakeup(p);
959 			n--;
960 		}
961 	}
962 	unlock(&s->sema);
963 }
964 
965 /* Add delta to semaphore and wake up waiters as appropriate. */
966 static long
967 semrelease(Segment *s, long *addr, long delta)
968 {
969 	long value;
970 
971 	do
972 		value = *addr;
973 	while(!cmpswap(addr, value, value+delta));
974 	semwakeup(s, addr, delta);
975 	return value+delta;
976 }
977 
978 /* Try to acquire semaphore using compare-and-swap */
979 static int
980 canacquire(long *addr)
981 {
982 	long value;
983 
984 	while((value=*addr) > 0)
985 		if(cmpswap(addr, value, value-1))
986 			return 1;
987 	return 0;
988 }
989 
990 /* Should we wake up? */
991 static int
992 semawoke(void *p)
993 {
994 	coherence();
995 	return !((Sema*)p)->waiting;
996 }
997 
998 /* Acquire semaphore (subtract 1). */
999 static int
1000 semacquire(Segment *s, long *addr, int block)
1001 {
1002 	int acquired;
1003 	Sema phore;
1004 
1005 	if(canacquire(addr))
1006 		return 1;
1007 	if(!block)
1008 		return 0;
1009 
1010 	acquired = 0;
1011 	semqueue(s, addr, &phore);
1012 	for(;;){
1013 		phore.waiting = 1;
1014 		coherence();
1015 		if(canacquire(addr)){
1016 			acquired = 1;
1017 			break;
1018 		}
1019 		if(waserror())
1020 			break;
1021 		sleep(&phore, semawoke, &phore);
1022 		poperror();
1023 	}
1024 	semdequeue(s, &phore);
1025 	coherence();	/* not strictly necessary due to lock in semdequeue */
1026 	if(!phore.waiting)
1027 		semwakeup(s, addr, 1);
1028 	if(!acquired)
1029 		nexterror();
1030 	return 1;
1031 }
1032 
1033 long
1034 syssemacquire(ulong *arg)
1035 {
1036 	int block;
1037 	long *addr;
1038 	Segment *s;
1039 
1040 	validaddr(arg[0], sizeof(long), 1);
1041 	evenaddr(arg[0]);
1042 	addr = (long*)arg[0];
1043 	block = arg[1];
1044 
1045 	if((s = seg(up, (ulong)addr, 0)) == nil)
1046 		error(Ebadarg);
1047 	if(*addr < 0)
1048 		error(Ebadarg);
1049 	return semacquire(s, addr, block);
1050 }
1051 
1052 long
1053 syssemrelease(ulong *arg)
1054 {
1055 	long *addr, delta;
1056 	Segment *s;
1057 
1058 	validaddr(arg[0], sizeof(long), 1);
1059 	evenaddr(arg[0]);
1060 	addr = (long*)arg[0];
1061 	delta = arg[1];
1062 
1063 	if((s = seg(up, (ulong)addr, 0)) == nil)
1064 		error(Ebadarg);
1065 	if(delta < 0 || *addr < 0)
1066 		error(Ebadarg);
1067 	return semrelease(s, addr, arg[1]);
1068 }
1069