xref: /plan9/sys/src/9/port/sysproc.c (revision fac6300f1f1b25611e114fc0bdda9cf428c13da4)
1 #include	"u.h"
2 #include	"tos.h"
3 #include	"../port/lib.h"
4 #include	"mem.h"
5 #include	"dat.h"
6 #include	"fns.h"
7 #include	"../port/error.h"
8 #include	"../port/edf.h"
9 
10 #include	<a.out.h>
11 
12 int	shargs(char*, int, char**);
13 
14 extern void checkpages(void);
15 extern void checkpagerefs(void);
16 
17 long
sysr1(ulong *)18 sysr1(ulong*)
19 {
20 	checkpagerefs();
21 	return 0;
22 }
23 
/*
 * sysrfork: rfork system call.  arg[0] holds the RF* flag bits.
 *
 * Without RFPROC the calling process's own resource groups (file
 * descriptors, name space, rendezvous group, environment, note group)
 * are replaced as the flags request, and 0 is returned.
 *
 * With RFPROC a new process is obtained from newproc(), given copies of
 * or references to the caller's segments and groups according to the
 * flags, made ready, and its pid is returned to the caller.
 *
 * Raises Ebadarg if both members of an RFxG/RFCxG pair are set, or if
 * RFMEM or RFNOWAIT is given without RFPROC.
 */
24 long
sysrfork(ulong * arg)25 sysrfork(ulong *arg)
26 {
27 	Proc *p;
28 	int n, i;
29 	Fgrp *ofg;
30 	Pgrp *opg;
31 	Rgrp *org;
32 	Egrp *oeg;
33 	ulong pid, flag;
34 	Mach *wm;
35 
36 	flag = arg[0];
37 	/* Check flags before we commit */
38 	if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
39 		error(Ebadarg);
40 	if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
41 		error(Ebadarg);
42 	if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
43 		error(Ebadarg);
44 
	/* no RFPROC: change the caller's own groups in place and return */
45 	if((flag&RFPROC) == 0) {
46 		if(flag & (RFMEM|RFNOWAIT))
47 			error(Ebadarg);
48 		if(flag & (RFFDG|RFCFDG)) {
49 			ofg = up->fgrp;
50 			if(flag & RFFDG)
51 				up->fgrp = dupfgrp(ofg);
52 			else
53 				up->fgrp = dupfgrp(nil);
54 			closefgrp(ofg);
55 		}
56 		if(flag & (RFNAMEG|RFCNAMEG)) {
57 			opg = up->pgrp;
58 			up->pgrp = newpgrp();
59 			if(flag & RFNAMEG)
60 				pgrpcpy(up->pgrp, opg);
61 			/* inherit noattach */
62 			up->pgrp->noattach = opg->noattach;
63 			closepgrp(opg);
64 		}
65 		if(flag & RFNOMNT)
66 			up->pgrp->noattach = 1;
67 		if(flag & RFREND) {
68 			org = up->rgrp;
69 			up->rgrp = newrgrp();
70 			closergrp(org);
71 		}
72 		if(flag & (RFENVG|RFCENVG)) {
73 			oeg = up->egrp;
74 			up->egrp = smalloc(sizeof(Egrp));
75 			up->egrp->ref = 1;
76 			if(flag & RFENVG)
77 				envcpy(up->egrp, oeg);
78 			closeegrp(oeg);
79 		}
80 		if(flag & RFNOTEG)
81 			up->noteid = incref(&noteidalloc);
82 		return 0;
83 	}
84 
	/* RFPROC: build the child, starting from the caller's per-process state */
85 	p = newproc();
86 
87 	p->fpsave = up->fpsave;
88 	p->scallnr = up->scallnr;
89 	p->s = up->s;
90 	p->nerrlab = 0;
91 	p->slash = up->slash;
92 	p->dot = up->dot;
93 	incref(p->dot);
94 
95 	memmove(p->note, up->note, sizeof(p->note));
96 	p->privatemem = up->privatemem;
97 	p->noswap = up->noswap;
98 	p->nnote = up->nnote;
99 	p->notified = 0;
100 	p->lastnote = up->lastnote;
101 	p->notify = up->notify;
102 	p->ureg = up->ureg;
103 	p->dbgreg = 0;
104 
105 	/* Make a new set of memory segments */
	/* n is nonzero when RFMEM is set; it is passed through to dupseg() */
106 	n = flag & RFMEM;
107 	qlock(&p->seglock);
108 	if(waserror()){
109 		qunlock(&p->seglock);
110 		nexterror();
111 	}
112 	for(i = 0; i < NSEG; i++)
113 		if(up->seg[i])
114 			p->seg[i] = dupseg(up->seg, i, n);
115 	qunlock(&p->seglock);
116 	poperror();
117 
118 	/* File descriptors */
119 	if(flag & (RFFDG|RFCFDG)) {
120 		if(flag & RFFDG)
121 			p->fgrp = dupfgrp(up->fgrp);
122 		else
123 			p->fgrp = dupfgrp(nil);
124 	}
125 	else {
126 		p->fgrp = up->fgrp;
127 		incref(p->fgrp);
128 	}
129 
130 	/* Process groups */
131 	if(flag & (RFNAMEG|RFCNAMEG)) {
132 		p->pgrp = newpgrp();
133 		if(flag & RFNAMEG)
134 			pgrpcpy(p->pgrp, up->pgrp);
135 		/* inherit noattach */
136 		p->pgrp->noattach = up->pgrp->noattach;
137 	}
138 	else {
139 		p->pgrp = up->pgrp;
140 		incref(p->pgrp);
141 	}
142 	if(flag & RFNOMNT)
143 		p->pgrp->noattach = 1;
144 
	/* Rendezvous group: fresh one with RFREND, otherwise shared with the caller */
145 	if(flag & RFREND)
146 		p->rgrp = newrgrp();
147 	else {
148 		incref(up->rgrp);
149 		p->rgrp = up->rgrp;
150 	}
151 
152 	/* Environment group */
153 	if(flag & (RFENVG|RFCENVG)) {
154 		p->egrp = smalloc(sizeof(Egrp));
155 		p->egrp->ref = 1;
156 		if(flag & RFENVG)
157 			envcpy(p->egrp, up->egrp);
158 	}
159 	else {
160 		p->egrp = up->egrp;
161 		incref(p->egrp);
162 	}
163 	p->hang = up->hang;
164 	p->procmode = up->procmode;
165 
166 	/* Craft a return frame which will cause the child to pop out of
167 	 * the scheduler in user mode with the return register zero
168 	 */
169 	forkchild(p, up->dbgreg);
170 
	/* with RFNOWAIT the child records no parent pid and is not counted in up->nchild */
171 	p->parent = up;
172 	p->parentpid = up->pid;
173 	if(flag&RFNOWAIT)
174 		p->parentpid = 0;
175 	else {
176 		lock(&up->exl);
177 		up->nchild++;
178 		unlock(&up->exl);
179 	}
180 	if((flag&RFNOTEG) == 0)
181 		p->noteid = up->noteid;
182 
183 	/* don't penalize the child, it hasn't done FP in a note handler. */
184 	p->fpstate = up->fpstate & ~FPillegal;
185 	pid = p->pid;
186 	memset(p->time, 0, sizeof(p->time));
187 	p->time[TReal] = MACHP(0)->ticks;
188 
189 	kstrdup(&p->text, up->text);
190 	kstrdup(&p->user, up->user);
191 	/*
192 	 *  since the bss/data segments are now shareable,
193 	 *  any mmu info about this process is now stale
194 	 *  (i.e. has bad properties) and has to be discarded.
195 	 */
196 	flushmmu();
197 	p->basepri = up->basepri;
198 	p->priority = up->basepri;
199 	p->fixedpri = up->fixedpri;
200 	p->mp = up->mp;
	/* if the caller is wired to a processor, wire the child to the same one */
201 	wm = up->wired;
202 	if(wm)
203 		procwired(p, wm->machno);
	/* make the child runnable, then give up the processor */
204 	ready(p);
205 	sched();
206 	return pid;
207 }
208 
209 ulong
l2be(long l)210 l2be(long l)
211 {
212 	uchar *cp;
213 
214 	cp = (uchar*)&l;
215 	return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3];
216 }
217 
/*
 * sysexec: exec system call.  arg[0] is the user address of the file
 * name and arg[1] the user address of the nil-terminated argument vector.
 *
 * Opens the file and reads its a.out header, following one level of
 * "#!" indirection with shargs().  It then builds the new stack, with
 * the arguments, in a temporary segment (ESEG), releases the old text,
 * data, bss and stack segments, installs the new ones, and returns
 * through execregs().
 *
 * Raises Ebadexec for a short or malformed header or "#!" line, and
 * Enovmem if the argument stack needs more than TSTKSIZ pages.
 */
218 long
sysexec(ulong * arg)219 sysexec(ulong *arg)
220 {
221 	Segment *s, *ts;
222 	ulong t, d, b;
223 	int i;
224 	Chan *tc;
225 	char **argv, **argp;
226 	char *a, *charp, *args, *file, *file0;
227 	char *progarg[sizeof(Exec)/2+1], *elem, progelem[64];
228 	ulong ssize, spage, nargs, nbytes, n, bssend;
229 	int indir;
230 	Exec exec;
231 	char line[sizeof(Exec)];
232 	Fgrp *f;
233 	Image *img;
234 	ulong magic, text, entry, data, bss;
235 	Tos *tos;
236 
237 	indir = 0;
238 	elem = nil;
239 	validaddr(arg[0], 1, 0);
240 	file0 = validnamedup((char*)arg[0], 1);
	/* outermost error label: releases the name copy and elem */
241 	if(waserror()){
242 		free(file0);
243 		free(elem);
244 		nexterror();
245 	}
246 	file = file0;
	/*
	 * At most two passes: the named file and, if it begins with "#!",
	 * its interpreter.  A second "#!" is rejected with Ebadexec.
	 */
247 	for(;;){
248 		tc = namec(file, Aopen, OEXEC, 0);
249 		if(waserror()){
250 			cclose(tc);
251 			nexterror();
252 		}
253 		if(!indir)
254 			kstrdup(&elem, up->genbuf);
255 
256 		n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0);
257 		if(n < 2)
258 			error(Ebadexec);
259 		magic = l2be(exec.magic);
260 		text = l2be(exec.text);
261 		entry = l2be(exec.entry);
262 		if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){
263 			if(text >= USTKTOP-UTZERO
264 			|| entry < UTZERO+sizeof(Exec)
265 			|| entry >= UTZERO+sizeof(Exec)+text)
266 				error(Ebadexec);
267 			break; /* for binary */
268 		}
269 
270 		/*
271 		 * Process #! /bin/sh args ...
272 		 */
273 		memmove(line, &exec, sizeof(Exec));
274 		if(indir || line[0]!='#' || line[1]!='!')
275 			error(Ebadexec);
276 		n = shargs(line, n, progarg);
277 		if(n == 0)
278 			error(Ebadexec);
279 		indir = 1;
280 		/*
281 		 * First arg becomes complete file name
282 		 */
283 		progarg[n++] = file;
284 		progarg[n] = 0;
		/* step past the caller's argv[0]; progarg supplies the leading arguments */
285 		validaddr(arg[1], BY2WD, 1);
286 		arg[1] += BY2WD;
287 		file = progarg[0];
288 		if(strlen(elem) >= sizeof progelem)
289 			error(Ebadexec);
290 		strcpy(progelem, elem);
291 		progarg[0] = progelem;
292 		poperror();
293 		cclose(tc);
294 	}
295 
	/* t, d and b are the page-rounded ends of text, data and bss */
296 	data = l2be(exec.data);
297 	bss = l2be(exec.bss);
298 	t = UTROUND(UTZERO+sizeof(Exec)+text);
299 	d = (t + data + (BY2PG-1)) & ~(BY2PG-1);
300 	bssend = t + data + bss;
301 	b = (bssend + (BY2PG-1)) & ~(BY2PG-1);
302 	if(t >= KZERO || d >= KZERO || b >= KZERO)
303 		error(Ebadexec);
304 
305 	/*
306 	 * Args: pass 1: count
307 	 */
308 	nbytes = sizeof(Tos);		/* hole for profiling clock at top of stack (and more) */
309 	nargs = 0;
310 	if(indir){
311 		argp = progarg;
312 		while(*argp){
313 			a = *argp++;
314 			nbytes += strlen(a) + 1;
315 			nargs++;
316 		}
317 	}
318 	validalign(arg[1], sizeof(char**));
319 	argp = (char**)arg[1];
320 	validaddr((ulong)argp, BY2WD, 0);
321 	while(*argp){
322 		a = *argp++;
		/* argp has moved into a new page: validate it before the next read */
323 		if(((ulong)argp&(BY2PG-1)) < BY2WD)
324 			validaddr((ulong)argp, BY2WD, 0);
325 		validaddr((ulong)a, 1, 0);
326 		nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1;
327 		nargs++;
328 	}
	/* room for the argv pointers plus a terminator, and the word-rounded strings and Tos */
329 	ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
330 
331 	/*
332 	 * 8-byte align SP for those (e.g. sparc) that need it.
333 	 * execregs() will subtract another 4 bytes for argc.
334 	 */
335 	if((ssize+4) & 7)
336 		ssize += 4;
337 	spage = (ssize+(BY2PG-1)) >> PGSHIFT;
338 
339 	/*
340 	 * Build the stack segment, putting it in kernel virtual for the moment
341 	 */
342 	if(spage > TSTKSIZ)
343 		error(Enovmem);
344 
345 	qlock(&up->seglock);
346 	if(waserror()){
347 		qunlock(&up->seglock);
348 		nexterror();
349 	}
350 	up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG);
351 
352 	/*
353 	 * Args: pass 2: assemble; the pages will be faulted in
354 	 */
355 	tos = (Tos*)(TSTKTOP - sizeof(Tos));
356 	tos->cyclefreq = m->cyclefreq;
357 	cycles((uvlong*)&tos->pcycles);
358 	tos->pcycles = -tos->pcycles;
359 	tos->kcycles = tos->pcycles;
360 	tos->clock = 0;
361 	argv = (char**)(TSTKTOP - ssize);
362 	charp = (char*)(TSTKTOP - nbytes);
363 	args = charp;
364 	if(indir)
365 		argp = progarg;
366 	else
367 		argp = (char**)arg[1];
368 
369 	for(i=0; i<nargs; i++){
		/* the "#!" arguments are used up: continue with the caller's vector */
370 		if(indir && *argp==0) {
371 			indir = 0;
372 			argp = (char**)arg[1];
373 		}
		/* store the address the string will have once the stack is moved to USTKTOP */
374 		*argv++ = charp + (USTKTOP-TSTKTOP);
375 		n = strlen(*argp) + 1;
376 		memmove(charp, *argp++, n);
377 		charp += n;
378 	}
379 	free(file0);
380 
381 	free(up->text);
382 	up->text = elem;
383 	elem = nil;	/* so waserror() won't free elem */
384 	USED(elem);
385 
386 	/* copy args; easiest from new process's stack */
387 	n = charp - args;
388 	if(n > 128)	/* don't waste too much space on huge arg lists */
389 		n = 128;
390 	a = up->args;
391 	up->args = nil;
392 	free(a);
393 	up->args = smalloc(n);
394 	memmove(up->args, args, n);
395 	if(n>0 && up->args[n-1]!='\0'){
396 		/* make sure last arg is NUL-terminated */
397 		/* put NUL at UTF-8 character boundary */
398 		for(i=n-1; i>0; --i)
399 			if(fullrune(up->args+i, n-i))
400 				break;
401 		up->args[i] = 0;
402 		n = i+1;
403 	}
404 	up->nargs = n;
405 
406 	/*
407 	 * Committed.
408 	 * Free old memory.
409 	 * Special segments are maintained across exec
410 	 */
411 	for(i = SSEG; i <= BSEG; i++) {
412 		putseg(up->seg[i]);
413 		/* prevent a second free if we have an error */
414 		up->seg[i] = 0;
415 	}
	/* of the remaining segments, only those marked SG_CEXEC are released */
416 	for(i = BSEG+1; i < NSEG; i++) {
417 		s = up->seg[i];
418 		if(s != 0 && (s->type&SG_CEXEC)) {
419 			putseg(s);
420 			up->seg[i] = 0;
421 		}
422 	}
423 
424 	/*
425 	 * Close on exec
426 	 */
427 	f = up->fgrp;
428 	for(i=0; i<=f->maxfd; i++)
429 		fdclose(i, CCEXEC);
430 
431 	/* Text.  Shared. Attaches to cache image if possible */
432 	/* attachimage returns a locked cache image */
433 	img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
434 	ts = img->s;
435 	up->seg[TSEG] = ts;
436 	ts->flushme = 1;
437 	ts->fstart = 0;
438 	ts->flen = sizeof(Exec)+text;
439 	unlock(img);
440 
441 	/* Data. Shared. */
442 	s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
443 	up->seg[DSEG] = s;
444 
445 	/* Attached by hand */
446 	incref(img);
447 	s->image = img;
448 	s->fstart = ts->fstart+ts->flen;
449 	s->flen = data;
450 
451 	/* BSS. Zero fill on demand */
452 	up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
453 
454 	/*
455 	 * Move the stack
456 	 */
457 	s = up->seg[ESEG];
458 	up->seg[ESEG] = 0;
459 	up->seg[SSEG] = s;
460 	qunlock(&up->seglock);
461 	poperror();	/* seglock */
462 	poperror();	/* elem */
463 	s->base = USTKTOP-USTKSIZE;
464 	s->top = USTKTOP;
465 	relocateseg(s, USTKTOP-TSTKTOP);
466 
467 	/*
468 	 *  '/' processes are higher priority (hack to make /ip more responsive).
469 	 */
470 	if(devtab[tc->type]->dc == L'/')
471 		up->basepri = PriRoot;
472 	up->priority = up->basepri;
	/* drop the label pushed for tc in the loop above, then close the channel */
473 	poperror();
474 	cclose(tc);
475 
476 	/*
477 	 *  At this point, the mmu contains info about the old address
478 	 *  space and needs to be flushed
479 	 */
480 	flushmmu();
	/* the new image starts with no pending notes and no note handler */
481 	qlock(&up->debug);
482 	up->nnote = 0;
483 	up->notify = 0;
484 	up->notified = 0;
485 	up->privatemem = 0;
486 	procsetup(up);
487 	qunlock(&up->debug);
488 	if(up->hang)
489 		up->procctl = Proc_stopme;
490 
491 	return execregs(entry, ssize, nargs);
492 }
493 
/*
 * shargs: split the "#!" line of an interpreted script into words.
 *
 * s points at the first n bytes read from the file, beginning with
 * "#!".  The buffer is modified in place: the newline that ends the
 * line, and the blank or tab after each word, are overwritten with NUL.
 * ap receives a pointer to each word, followed by a nil entry.
 *
 * Returns the number of words found.  Returns 0, leaving ap untouched,
 * if there is no newline within the n bytes supplied; that includes
 * the case where nothing follows the "#!".
 */
int
shargs(char *s, int n, char **ap)
{
	int i;

	s += 2;
	n -= 2;		/* skip #! */

	/*
	 * Find the newline, never looking past the n bytes that were
	 * actually read.  Testing the bound before each access also
	 * covers n <= 0.
	 */
	for(i=0; i<n && s[i]!='\n'; i++)
		;
	if(i >= n)
		return 0;
	s[i] = 0;
	*ap = 0;
	i = 0;
	for(;;) {
		/* skip leading white space */
		while(*s==' ' || *s=='\t')
			s++;
		if(*s == 0)
			break;
		i++;
		*ap++ = s;
		*ap = 0;
		/* find the end of the word and terminate it */
		while(*s && *s!=' ' && *s!='\t')
			s++;
		if(*s == 0)
			break;
		else
			*s++ = 0;
	}
	return i;
}
524 
/*
 * return0: a condition function that is never satisfied.
 * Ignores its argument and returns 0.
 */
int
return0(void*)
{
	return 0;
}
530 
531 long
syssleep(ulong * arg)532 syssleep(ulong *arg)
533 {
534 
535 	int n;
536 
537 	n = arg[0];
538 	if(n <= 0) {
539 		if (up->edf && (up->edf->flags & Admitted))
540 			edfyield();
541 		else
542 			yield();
543 		return 0;
544 	}
545 	if(n < TK2MS(1))
546 		n = TK2MS(1);
547 	tsleep(&up->sleep, return0, 0, n);
548 	return 0;
549 }
550 
551 long
sysalarm(ulong * arg)552 sysalarm(ulong *arg)
553 {
554 	return procalarm(arg[0]);
555 }
556 
557 long
sysexits(ulong * arg)558 sysexits(ulong *arg)
559 {
560 	char *status;
561 	char *inval = "invalid exit string";
562 	char buf[ERRMAX];
563 
564 	status = (char*)arg[0];
565 	if(status){
566 		if(waserror())
567 			status = inval;
568 		else{
569 			validaddr((ulong)status, 1, 0);
570 			if(vmemchr(status, 0, ERRMAX) == 0){
571 				memmove(buf, status, ERRMAX);
572 				buf[ERRMAX-1] = 0;
573 				status = buf;
574 			}
575 			poperror();
576 		}
577 
578 	}
579 	pexit(status, 1);
580 	return 0;		/* not reached */
581 }
582 
583 long
sys_wait(ulong * arg)584 sys_wait(ulong *arg)
585 {
586 	int pid;
587 	Waitmsg w;
588 	OWaitmsg *ow;
589 
590 	if(arg[0] == 0)
591 		return pwait(nil);
592 
593 	validaddr(arg[0], sizeof(OWaitmsg), 1);
594 	validalign(arg[0], BY2WD);			/* who cares? */
595 	pid = pwait(&w);
596 	if(pid >= 0){
597 		ow = (OWaitmsg*)arg[0];
598 		readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
599 		readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
600 		readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
601 		readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
602 		strncpy(ow->msg, w.msg, sizeof(ow->msg));
603 		ow->msg[sizeof(ow->msg)-1] = '\0';
604 	}
605 	return pid;
606 }
607 
608 long
sysawait(ulong * arg)609 sysawait(ulong *arg)
610 {
611 	int i;
612 	int pid;
613 	Waitmsg w;
614 	ulong n;
615 
616 	n = arg[1];
617 	validaddr(arg[0], n, 1);
618 	pid = pwait(&w);
619 	if(pid < 0)
620 		return -1;
621 	i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q",
622 		w.pid,
623 		w.time[TUser], w.time[TSys], w.time[TReal],
624 		w.msg);
625 
626 	return i;
627 }
628 
629 void
werrstr(char * fmt,...)630 werrstr(char *fmt, ...)
631 {
632 	va_list va;
633 
634 	if(up == nil)
635 		return;
636 
637 	va_start(va, fmt);
638 	vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
639 	va_end(va);
640 }
641 
642 static long
generrstr(char * buf,uint nbuf)643 generrstr(char *buf, uint nbuf)
644 {
645 	char tmp[ERRMAX];
646 
647 	if(nbuf == 0)
648 		error(Ebadarg);
649 	validaddr((ulong)buf, nbuf, 1);
650 	if(nbuf > sizeof tmp)
651 		nbuf = sizeof tmp;
652 	memmove(tmp, buf, nbuf);
653 
654 	/* make sure it's NUL-terminated */
655 	tmp[nbuf-1] = '\0';
656 	memmove(buf, up->syserrstr, nbuf);
657 	buf[nbuf-1] = '\0';
658 	memmove(up->syserrstr, tmp, nbuf);
659 	return 0;
660 }
661 
662 long
syserrstr(ulong * arg)663 syserrstr(ulong *arg)
664 {
665 	return generrstr((char*)arg[0], arg[1]);
666 }
667 
668 /* compatibility for old binaries */
669 long
sys_errstr(ulong * arg)670 sys_errstr(ulong *arg)
671 {
672 	return generrstr((char*)arg[0], 64);
673 }
674 
675 long
sysnotify(ulong * arg)676 sysnotify(ulong *arg)
677 {
678 	if(arg[0] != 0)
679 		validaddr(arg[0], sizeof(ulong), 0);
680 	up->notify = (int(*)(void*, char*))(arg[0]);
681 	return 0;
682 }
683 
684 long
sysnoted(ulong * arg)685 sysnoted(ulong *arg)
686 {
687 	if(arg[0]!=NRSTR && !up->notified)
688 		error(Egreg);
689 	return 0;
690 }
691 
692 long
syssegbrk(ulong * arg)693 syssegbrk(ulong *arg)
694 {
695 	int i;
696 	ulong addr;
697 	Segment *s;
698 
699 	addr = arg[0];
700 	for(i = 0; i < NSEG; i++) {
701 		s = up->seg[i];
702 		if(s == 0 || addr < s->base || addr >= s->top)
703 			continue;
704 		switch(s->type&SG_TYPE) {
705 		case SG_TEXT:
706 		case SG_DATA:
707 		case SG_STACK:
708 			error(Ebadarg);
709 		default:
710 			return ibrk(arg[1], i);
711 		}
712 	}
713 
714 	error(Ebadarg);
715 	return 0;		/* not reached */
716 }
717 
718 long
syssegattach(ulong * arg)719 syssegattach(ulong *arg)
720 {
721 	return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]);
722 }
723 
724 long
syssegdetach(ulong * arg)725 syssegdetach(ulong *arg)
726 {
727 	int i;
728 	ulong addr;
729 	Segment *s;
730 
731 	qlock(&up->seglock);
732 	if(waserror()){
733 		qunlock(&up->seglock);
734 		nexterror();
735 	}
736 
737 	s = 0;
738 	addr = arg[0];
739 	for(i = 0; i < NSEG; i++)
740 		if(s = up->seg[i]) {
741 			qlock(&s->lk);
742 			if((addr >= s->base && addr < s->top) ||
743 			   (s->top == s->base && addr == s->base))
744 				goto found;
745 			qunlock(&s->lk);
746 		}
747 
748 	error(Ebadarg);
749 
750 found:
751 	/*
752 	 * Check we are not detaching the initial stack segment.
753 	 */
754 	if(s == up->seg[SSEG]){
755 		qunlock(&s->lk);
756 		error(Ebadarg);
757 	}
758 	up->seg[i] = 0;
759 	qunlock(&s->lk);
760 	putseg(s);
761 	qunlock(&up->seglock);
762 	poperror();
763 
764 	/* Ensure we flush any entries from the lost segment */
765 	flushmmu();
766 	return 0;
767 }
768 
769 long
syssegfree(ulong * arg)770 syssegfree(ulong *arg)
771 {
772 	Segment *s;
773 	ulong from, to;
774 
775 	from = arg[0];
776 	s = seg(up, from, 1);
777 	if(s == nil)
778 		error(Ebadarg);
779 	to = (from + arg[1]) & ~(BY2PG-1);
780 	from = PGROUND(from);
781 
782 	if(to > s->top) {
783 		qunlock(&s->lk);
784 		error(Ebadarg);
785 	}
786 
787 	mfreeseg(s, from, (to - from) / BY2PG);
788 	qunlock(&s->lk);
789 	flushmmu();
790 
791 	return 0;
792 }
793 
794 /* For binary compatibility */
795 long
sysbrk_(ulong * arg)796 sysbrk_(ulong *arg)
797 {
798 	return ibrk(arg[0], BSEG);
799 }
800 
/*
 * sysrendezvous: rendezvous system call.  arg[0] is the tag and arg[1]
 * the value offered.
 *
 * If a process with the same tag is already queued on the hash chain
 * of up->rgrp, it is unlinked, given arg[1] as its rendval and made
 * ready, and the value it had stored is returned to the caller.
 * Otherwise the caller queues itself in state Rendezvous, calls
 * sched(), and returns whatever up->rendval holds when it runs again.
 */
801 long
sysrendezvous(ulong * arg)802 sysrendezvous(ulong *arg)
803 {
804 	uintptr tag, val;
805 	Proc *p, **l;
806 
807 	tag = arg[0];
808 	l = &REND(up->rgrp, tag);
	/* up->rendval starts as all ones; it is overwritten with arg[1] below if we queue */
809 	up->rendval = ~(uintptr)0;
810 
811 	lock(up->rgrp);
812 	for(p = *l; p; p = p->rendhash) {
813 		if(p->rendtag == tag) {
			/* unlink p from the chain and swap values with it */
814 			*l = p->rendhash;
815 			val = p->rendval;
816 			p->rendval = arg[1];
817 
			/*
			 * spin until p->mach is clear before readying p
			 * NOTE(review): presumably waits for p to be off its processor - confirm
			 */
818 			while(p->mach != 0)
819 				;
820 			ready(p);
821 			unlock(up->rgrp);
822 			return val;
823 		}
824 		l = &p->rendhash;
825 	}
826 
827 	/* Going to sleep here */
828 	up->rendtag = tag;
829 	up->rendval = arg[1];
830 	up->rendhash = *l;
831 	*l = up;
832 	up->state = Rendezvous;
833 	unlock(up->rgrp);
834 
835 	sched();
836 
837 	return up->rendval;
838 }
839 
840 /*
841  * The implementation of semaphores is complicated by needing
842  * to avoid rescheduling in syssemrelease, so that it is safe
843  * to call from real-time processes.  This means syssemrelease
844  * cannot acquire any qlocks, only spin locks.
845  *
846  * Semacquire and semrelease must both manipulate the semaphore
847  * wait list.  Lock-free linked lists only exist in theory, not
848  * in practice, so the wait list is protected by a spin lock.
849  *
850  * The semaphore value *addr is stored in user memory, so it
851  * cannot be read or written while holding spin locks.
852  *
853  * Thus, we can access the list only when holding the lock, and
854  * we can access the semaphore only when not holding the lock.
855  * This makes things interesting.  Note that sleep's condition function
856  * is called while holding two locks - r and up->rlock - so it cannot
857  * access the semaphore value either.
858  *
859  * An acquirer announces its intention to try for the semaphore
860  * by putting a Sema structure onto the wait list and then
861  * setting Sema.waiting.  After one last check of semaphore,
862  * the acquirer sleeps until Sema.waiting==0.  A releaser of n
863  * must wake up n acquirers who have Sema.waiting set.  It does
864  * this by clearing Sema.waiting and then calling wakeup.
865  *
866  * There are three interesting races here.
867 
868  * The first is that in this particular sleep/wakeup usage, a single
869  * wakeup can rouse a process from two consecutive sleeps!
870  * The ordering is:
871  *
872  * 	(a) set Sema.waiting = 1
873  * 	(a) call sleep
874  * 	(b) set Sema.waiting = 0
875  * 	(a) check Sema.waiting inside sleep, return w/o sleeping
876  * 	(a) try for semaphore, fail
877  * 	(a) set Sema.waiting = 1
878  * 	(a) call sleep
879  * 	(b) call wakeup(a)
880  * 	(a) wake up again
881  *
882  * This is okay - semacquire will just go around the loop
883  * again.  It does mean that at the top of the for(;;) loop in
884  * semacquire, phore.waiting might already be set to 1.
885  *
886  * The second is that a releaser might wake an acquirer who is
887  * interrupted before he can acquire the lock.  Since
888  * release(n) issues only n wakeup calls -- only n can be used
889  * anyway -- if the interrupted process is not going to use his
890  * wakeup call he must pass it on to another acquirer.
891  *
892  * The third race is similar to the second but more subtle.  An
893  * acquirer sets waiting=1 and then does a final canacquire()
894  * before going to sleep.  The opposite order would result in
895  * missing wakeups that happen between canacquire and
896  * waiting=1.  (In fact, the whole point of Sema.waiting is to
897  * avoid missing wakeups between canacquire() and sleep().) But
898  * there can be spurious wakeups between a successful
899  * canacquire() and the following semdequeue().  This wakeup is
900  * not useful to the acquirer, since he has already acquired
901  * the semaphore.  Like in the previous case, though, the
902  * acquirer must pass the wakeup call along.
903  *
904  * This is all rather subtle.  The code below has been verified
905  * with the spin model /sys/src/9/port/semaphore.p.  The
906  * original code anticipated the second race but not the first
907  * or third, which were caught only with spin.  The first race
908  * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
909  * It was lucky that my abstract model of sleep/wakeup still managed
910  * to preserve that behavior.
911  *
912  * I remain slightly concerned about memory coherence
913  * outside of locks.  The spin model does not take
914  * queued processor writes into account so we have to
915  * think hard.  The only variables accessed outside locks
916  * are the semaphore value itself and the boolean flag
917  * Sema.waiting.  The value is only accessed with cmpswap,
918  * whose job description includes doing the right thing as
919  * far as memory coherence across processors.  That leaves
920  * Sema.waiting.  To handle it, we call coherence() before each
921  * read and after each write.		- rsc
922  */
923 
924 /* Add semaphore p with addr a to list in seg. */
925 static void
semqueue(Segment * s,long * a,Sema * p)926 semqueue(Segment *s, long *a, Sema *p)
927 {
928 	memset(p, 0, sizeof *p);
929 	p->addr = a;
930 	lock(&s->sema);	/* uses s->sema.Rendez.Lock, but no one else is */
931 	p->next = &s->sema;
932 	p->prev = s->sema.prev;
933 	p->next->prev = p;
934 	p->prev->next = p;
935 	unlock(&s->sema);
936 }
937 
938 /* Remove semaphore p from list in seg. */
939 static void
semdequeue(Segment * s,Sema * p)940 semdequeue(Segment *s, Sema *p)
941 {
942 	lock(&s->sema);
943 	p->next->prev = p->prev;
944 	p->prev->next = p->next;
945 	unlock(&s->sema);
946 }
947 
948 /* Wake up n waiters with addr a on list in seg. */
949 static void
semwakeup(Segment * s,long * a,long n)950 semwakeup(Segment *s, long *a, long n)
951 {
952 	Sema *p;
953 
954 	lock(&s->sema);
955 	for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
956 		if(p->addr == a && p->waiting){
957 			p->waiting = 0;
958 			coherence();
959 			wakeup(p);
960 			n--;
961 		}
962 	}
963 	unlock(&s->sema);
964 }
965 
966 /* Add delta to semaphore and wake up waiters as appropriate. */
967 static long
semrelease(Segment * s,long * addr,long delta)968 semrelease(Segment *s, long *addr, long delta)
969 {
970 	long value;
971 
972 	do
973 		value = *addr;
974 	while(!cmpswap(addr, value, value+delta));
975 	semwakeup(s, addr, delta);
976 	return value+delta;
977 }
978 
979 /* Try to acquire semaphore using compare-and-swap */
/*
 * canacquire: try to decrement *addr without blocking.
 * Retries the compare-and-swap while the value is positive.
 * Returns 1 if the decrement succeeded, 0 if the value was seen to be
 * zero or negative.
 */
static int
canacquire(long *addr)
{
	long value;

	for(;;){
		value = *addr;
		if(value <= 0)
			return 0;
		if(cmpswap(addr, value, value-1))
			return 1;
	}
}
990 
991 /* Should we wake up? */
992 static int
semawoke(void * p)993 semawoke(void *p)
994 {
995 	coherence();
996 	return !((Sema*)p)->waiting;
997 }
998 
999 /* Acquire semaphore (subtract 1). */
/*
 * Returns 1 once *addr has been decremented.  Returns 0 if the
 * semaphore could not be taken at once and block is zero.
 * If the sleep is interrupted by an error, the Sema is taken off the
 * wait list and the error is raised again with nexterror().
 */
1000 static int
semacquire(Segment * s,long * addr,int block)1001 semacquire(Segment *s, long *addr, int block)
1002 {
1003 	int acquired;
1004 	Sema phore;
1005 
1006 	if(canacquire(addr))
1007 		return 1;
1008 	if(!block)
1009 		return 0;
1010 
1011 	acquired = 0;
1012 	semqueue(s, addr, &phore);
1013 	for(;;){
		/* set waiting before the last canacquire() so a release in between is not missed */
1014 		phore.waiting = 1;
1015 		coherence();
1016 		if(canacquire(addr)){
1017 			acquired = 1;
1018 			break;
1019 		}
		/* an error raised during sleep leaves the loop with acquired == 0 */
1020 		if(waserror())
1021 			break;
1022 		sleep(&phore, semawoke, &phore);
1023 		poperror();
1024 	}
1025 	semdequeue(s, &phore);
1026 	coherence();	/* not strictly necessary due to lock in semdequeue */
	/* a releaser cleared waiting for us but we are leaving: pass that wakeup on */
1027 	if(!phore.waiting)
1028 		semwakeup(s, addr, 1);
1029 	if(!acquired)
1030 		nexterror();
1031 	return 1;
1032 }
1033 
1034 /* Acquire semaphore or time-out */
/*
 * Returns 1 once *addr has been decremented.  Returns 0 if the
 * semaphore could not be taken at once and ms is zero, or if ms
 * milliseconds pass without it being taken.
 * If the sleep is interrupted by an error, the Sema is taken off the
 * wait list and the error is raised again with nexterror().
 */
1035 static int
tsemacquire(Segment * s,long * addr,ulong ms)1036 tsemacquire(Segment *s, long *addr, ulong ms)
1037 {
1038 	int acquired, timedout;
1039 	ulong t, elms;
1040 	Sema phore;
1041 
1042 	if(canacquire(addr))
1043 		return 1;
1044 	if(ms == 0)
1045 		return 0;
1046 	acquired = timedout = 0;
1047 	semqueue(s, addr, &phore);
1048 	for(;;){
		/* set waiting before the last canacquire() so a release in between is not missed */
1049 		phore.waiting = 1;
1050 		coherence();
1051 		if(canacquire(addr)){
1052 			acquired = 1;
1053 			break;
1054 		}
		/* an error raised during tsleep leaves the loop with acquired == 0 */
1055 		if(waserror())
1056 			break;
		/* measure this sleep in ticks and charge it against the remaining time */
1057 		t = m->ticks;
1058 		tsleep(&phore, semawoke, &phore, ms);
1059 		elms = TK2MS(m->ticks - t);
1060 		poperror();
1061 		if(elms >= ms){
1062 			timedout = 1;
1063 			break;
1064 		}
1065 		ms -= elms;
1066 	}
1067 	semdequeue(s, &phore);
1068 	coherence();	/* not strictly necessary due to lock in semdequeue */
	/* a releaser cleared waiting for us but we are leaving: pass that wakeup on */
1069 	if(!phore.waiting)
1070 		semwakeup(s, addr, 1);
1071 	if(timedout)
1072 		return 0;
1073 	if(!acquired)
1074 		nexterror();
1075 	return 1;
1076 }
1077 
1078 long
syssemacquire(ulong * arg)1079 syssemacquire(ulong *arg)
1080 {
1081 	int block;
1082 	long *addr;
1083 	Segment *s;
1084 
1085 	validaddr(arg[0], sizeof(long), 1);
1086 	validalign(arg[0], sizeof(long));
1087 	addr = (long*)arg[0];
1088 	block = arg[1];
1089 
1090 	if((s = seg(up, (ulong)addr, 0)) == nil)
1091 		error(Ebadarg);
1092 	if(*addr < 0)
1093 		error(Ebadarg);
1094 	return semacquire(s, addr, block);
1095 }
1096 
1097 long
systsemacquire(ulong * arg)1098 systsemacquire(ulong *arg)
1099 {
1100 	long *addr;
1101 	ulong ms;
1102 	Segment *s;
1103 
1104 	validaddr(arg[0], sizeof(long), 1);
1105 	validalign(arg[0], sizeof(long));
1106 	addr = (long*)arg[0];
1107 	ms = arg[1];
1108 
1109 	if((s = seg(up, (ulong)addr, 0)) == nil)
1110 		error(Ebadarg);
1111 	if(*addr < 0)
1112 		error(Ebadarg);
1113 	return tsemacquire(s, addr, ms);
1114 }
1115 
1116 long
syssemrelease(ulong * arg)1117 syssemrelease(ulong *arg)
1118 {
1119 	long *addr, delta;
1120 	Segment *s;
1121 
1122 	validaddr(arg[0], sizeof(long), 1);
1123 	validalign(arg[0], sizeof(long));
1124 	addr = (long*)arg[0];
1125 	delta = arg[1];
1126 
1127 	if((s = seg(up, (ulong)addr, 0)) == nil)
1128 		error(Ebadarg);
1129 	/* delta == 0 is a no-op, not a release */
1130 	if(delta < 0 || *addr < 0)
1131 		error(Ebadarg);
1132 	return semrelease(s, addr, delta);
1133 }
1134 
1135 long
sysnsec(ulong * arg)1136 sysnsec(ulong *arg)
1137 {
1138 	validaddr(arg[0], sizeof(vlong), 1);
1139 	validalign(arg[0], sizeof(vlong));
1140 
1141 	*(vlong*)arg[0] = todget(nil);
1142 
1143 	return 0;
1144 }
1145