1 #include "u.h"
2 #include "tos.h"
3 #include "../port/lib.h"
4 #include "mem.h"
5 #include "dat.h"
6 #include "fns.h"
7 #include "../port/error.h"
8 #include "../port/edf.h"
9
10 #include <a.out.h>
11
12 int shargs(char*, int, char**);
13
14 extern void checkpages(void);
15 extern void checkpagerefs(void);
16
17 long
sysr1(ulong *)18 sysr1(ulong*)
19 {
20 checkpagerefs();
21 return 0;
22 }
23
24 long
sysrfork(ulong * arg)25 sysrfork(ulong *arg)
26 {
27 Proc *p;
28 int n, i;
29 Fgrp *ofg;
30 Pgrp *opg;
31 Rgrp *org;
32 Egrp *oeg;
33 ulong pid, flag;
34 Mach *wm;
35
36 flag = arg[0];
37 /* Check flags before we commit */
38 if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
39 error(Ebadarg);
40 if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
41 error(Ebadarg);
42 if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
43 error(Ebadarg);
44
45 if((flag&RFPROC) == 0) {
46 if(flag & (RFMEM|RFNOWAIT))
47 error(Ebadarg);
48 if(flag & (RFFDG|RFCFDG)) {
49 ofg = up->fgrp;
50 if(flag & RFFDG)
51 up->fgrp = dupfgrp(ofg);
52 else
53 up->fgrp = dupfgrp(nil);
54 closefgrp(ofg);
55 }
56 if(flag & (RFNAMEG|RFCNAMEG)) {
57 opg = up->pgrp;
58 up->pgrp = newpgrp();
59 if(flag & RFNAMEG)
60 pgrpcpy(up->pgrp, opg);
61 /* inherit noattach */
62 up->pgrp->noattach = opg->noattach;
63 closepgrp(opg);
64 }
65 if(flag & RFNOMNT)
66 up->pgrp->noattach = 1;
67 if(flag & RFREND) {
68 org = up->rgrp;
69 up->rgrp = newrgrp();
70 closergrp(org);
71 }
72 if(flag & (RFENVG|RFCENVG)) {
73 oeg = up->egrp;
74 up->egrp = smalloc(sizeof(Egrp));
75 up->egrp->ref = 1;
76 if(flag & RFENVG)
77 envcpy(up->egrp, oeg);
78 closeegrp(oeg);
79 }
80 if(flag & RFNOTEG)
81 up->noteid = incref(¬eidalloc);
82 return 0;
83 }
84
85 p = newproc();
86
87 p->fpsave = up->fpsave;
88 p->scallnr = up->scallnr;
89 p->s = up->s;
90 p->nerrlab = 0;
91 p->slash = up->slash;
92 p->dot = up->dot;
93 incref(p->dot);
94
95 memmove(p->note, up->note, sizeof(p->note));
96 p->privatemem = up->privatemem;
97 p->noswap = up->noswap;
98 p->nnote = up->nnote;
99 p->notified = 0;
100 p->lastnote = up->lastnote;
101 p->notify = up->notify;
102 p->ureg = up->ureg;
103 p->dbgreg = 0;
104
105 /* Make a new set of memory segments */
106 n = flag & RFMEM;
107 qlock(&p->seglock);
108 if(waserror()){
109 qunlock(&p->seglock);
110 nexterror();
111 }
112 for(i = 0; i < NSEG; i++)
113 if(up->seg[i])
114 p->seg[i] = dupseg(up->seg, i, n);
115 qunlock(&p->seglock);
116 poperror();
117
118 /* File descriptors */
119 if(flag & (RFFDG|RFCFDG)) {
120 if(flag & RFFDG)
121 p->fgrp = dupfgrp(up->fgrp);
122 else
123 p->fgrp = dupfgrp(nil);
124 }
125 else {
126 p->fgrp = up->fgrp;
127 incref(p->fgrp);
128 }
129
130 /* Process groups */
131 if(flag & (RFNAMEG|RFCNAMEG)) {
132 p->pgrp = newpgrp();
133 if(flag & RFNAMEG)
134 pgrpcpy(p->pgrp, up->pgrp);
135 /* inherit noattach */
136 p->pgrp->noattach = up->pgrp->noattach;
137 }
138 else {
139 p->pgrp = up->pgrp;
140 incref(p->pgrp);
141 }
142 if(flag & RFNOMNT)
143 p->pgrp->noattach = 1;
144
145 if(flag & RFREND)
146 p->rgrp = newrgrp();
147 else {
148 incref(up->rgrp);
149 p->rgrp = up->rgrp;
150 }
151
152 /* Environment group */
153 if(flag & (RFENVG|RFCENVG)) {
154 p->egrp = smalloc(sizeof(Egrp));
155 p->egrp->ref = 1;
156 if(flag & RFENVG)
157 envcpy(p->egrp, up->egrp);
158 }
159 else {
160 p->egrp = up->egrp;
161 incref(p->egrp);
162 }
163 p->hang = up->hang;
164 p->procmode = up->procmode;
165
166 /* Craft a return frame which will cause the child to pop out of
167 * the scheduler in user mode with the return register zero
168 */
169 forkchild(p, up->dbgreg);
170
171 p->parent = up;
172 p->parentpid = up->pid;
173 if(flag&RFNOWAIT)
174 p->parentpid = 0;
175 else {
176 lock(&up->exl);
177 up->nchild++;
178 unlock(&up->exl);
179 }
180 if((flag&RFNOTEG) == 0)
181 p->noteid = up->noteid;
182
183 /* don't penalize the child, it hasn't done FP in a note handler. */
184 p->fpstate = up->fpstate & ~FPillegal;
185 pid = p->pid;
186 memset(p->time, 0, sizeof(p->time));
187 p->time[TReal] = MACHP(0)->ticks;
188
189 kstrdup(&p->text, up->text);
190 kstrdup(&p->user, up->user);
191 /*
192 * since the bss/data segments are now shareable,
193 * any mmu info about this process is now stale
194 * (i.e. has bad properties) and has to be discarded.
195 */
196 flushmmu();
197 p->basepri = up->basepri;
198 p->priority = up->basepri;
199 p->fixedpri = up->fixedpri;
200 p->mp = up->mp;
201 wm = up->wired;
202 if(wm)
203 procwired(p, wm->machno);
204 ready(p);
205 sched();
206 return pid;
207 }
208
209 ulong
l2be(long l)210 l2be(long l)
211 {
212 uchar *cp;
213
214 cp = (uchar*)&l;
215 return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3];
216 }
217
218 long
sysexec(ulong * arg)219 sysexec(ulong *arg)
220 {
221 Segment *s, *ts;
222 ulong t, d, b;
223 int i;
224 Chan *tc;
225 char **argv, **argp;
226 char *a, *charp, *args, *file, *file0;
227 char *progarg[sizeof(Exec)/2+1], *elem, progelem[64];
228 ulong ssize, spage, nargs, nbytes, n, bssend;
229 int indir;
230 Exec exec;
231 char line[sizeof(Exec)];
232 Fgrp *f;
233 Image *img;
234 ulong magic, text, entry, data, bss;
235 Tos *tos;
236
237 indir = 0;
238 elem = nil;
239 validaddr(arg[0], 1, 0);
240 file0 = validnamedup((char*)arg[0], 1);
241 if(waserror()){
242 free(file0);
243 free(elem);
244 nexterror();
245 }
246 file = file0;
247 for(;;){
248 tc = namec(file, Aopen, OEXEC, 0);
249 if(waserror()){
250 cclose(tc);
251 nexterror();
252 }
253 if(!indir)
254 kstrdup(&elem, up->genbuf);
255
256 n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0);
257 if(n < 2)
258 error(Ebadexec);
259 magic = l2be(exec.magic);
260 text = l2be(exec.text);
261 entry = l2be(exec.entry);
262 if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){
263 if(text >= USTKTOP-UTZERO
264 || entry < UTZERO+sizeof(Exec)
265 || entry >= UTZERO+sizeof(Exec)+text)
266 error(Ebadexec);
267 break; /* for binary */
268 }
269
270 /*
271 * Process #! /bin/sh args ...
272 */
273 memmove(line, &exec, sizeof(Exec));
274 if(indir || line[0]!='#' || line[1]!='!')
275 error(Ebadexec);
276 n = shargs(line, n, progarg);
277 if(n == 0)
278 error(Ebadexec);
279 indir = 1;
280 /*
281 * First arg becomes complete file name
282 */
283 progarg[n++] = file;
284 progarg[n] = 0;
285 validaddr(arg[1], BY2WD, 1);
286 arg[1] += BY2WD;
287 file = progarg[0];
288 if(strlen(elem) >= sizeof progelem)
289 error(Ebadexec);
290 strcpy(progelem, elem);
291 progarg[0] = progelem;
292 poperror();
293 cclose(tc);
294 }
295
296 data = l2be(exec.data);
297 bss = l2be(exec.bss);
298 t = UTROUND(UTZERO+sizeof(Exec)+text);
299 d = (t + data + (BY2PG-1)) & ~(BY2PG-1);
300 bssend = t + data + bss;
301 b = (bssend + (BY2PG-1)) & ~(BY2PG-1);
302 if(t >= KZERO || d >= KZERO || b >= KZERO)
303 error(Ebadexec);
304
305 /*
306 * Args: pass 1: count
307 */
308 nbytes = sizeof(Tos); /* hole for profiling clock at top of stack (and more) */
309 nargs = 0;
310 if(indir){
311 argp = progarg;
312 while(*argp){
313 a = *argp++;
314 nbytes += strlen(a) + 1;
315 nargs++;
316 }
317 }
318 validalign(arg[1], sizeof(char**));
319 argp = (char**)arg[1];
320 validaddr((ulong)argp, BY2WD, 0);
321 while(*argp){
322 a = *argp++;
323 if(((ulong)argp&(BY2PG-1)) < BY2WD)
324 validaddr((ulong)argp, BY2WD, 0);
325 validaddr((ulong)a, 1, 0);
326 nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1;
327 nargs++;
328 }
329 ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
330
331 /*
332 * 8-byte align SP for those (e.g. sparc) that need it.
333 * execregs() will subtract another 4 bytes for argc.
334 */
335 if((ssize+4) & 7)
336 ssize += 4;
337 spage = (ssize+(BY2PG-1)) >> PGSHIFT;
338
339 /*
340 * Build the stack segment, putting it in kernel virtual for the moment
341 */
342 if(spage > TSTKSIZ)
343 error(Enovmem);
344
345 qlock(&up->seglock);
346 if(waserror()){
347 qunlock(&up->seglock);
348 nexterror();
349 }
350 up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG);
351
352 /*
353 * Args: pass 2: assemble; the pages will be faulted in
354 */
355 tos = (Tos*)(TSTKTOP - sizeof(Tos));
356 tos->cyclefreq = m->cyclefreq;
357 cycles((uvlong*)&tos->pcycles);
358 tos->pcycles = -tos->pcycles;
359 tos->kcycles = tos->pcycles;
360 tos->clock = 0;
361 argv = (char**)(TSTKTOP - ssize);
362 charp = (char*)(TSTKTOP - nbytes);
363 args = charp;
364 if(indir)
365 argp = progarg;
366 else
367 argp = (char**)arg[1];
368
369 for(i=0; i<nargs; i++){
370 if(indir && *argp==0) {
371 indir = 0;
372 argp = (char**)arg[1];
373 }
374 *argv++ = charp + (USTKTOP-TSTKTOP);
375 n = strlen(*argp) + 1;
376 memmove(charp, *argp++, n);
377 charp += n;
378 }
379 free(file0);
380
381 free(up->text);
382 up->text = elem;
383 elem = nil; /* so waserror() won't free elem */
384 USED(elem);
385
386 /* copy args; easiest from new process's stack */
387 n = charp - args;
388 if(n > 128) /* don't waste too much space on huge arg lists */
389 n = 128;
390 a = up->args;
391 up->args = nil;
392 free(a);
393 up->args = smalloc(n);
394 memmove(up->args, args, n);
395 if(n>0 && up->args[n-1]!='\0'){
396 /* make sure last arg is NUL-terminated */
397 /* put NUL at UTF-8 character boundary */
398 for(i=n-1; i>0; --i)
399 if(fullrune(up->args+i, n-i))
400 break;
401 up->args[i] = 0;
402 n = i+1;
403 }
404 up->nargs = n;
405
406 /*
407 * Committed.
408 * Free old memory.
409 * Special segments are maintained across exec
410 */
411 for(i = SSEG; i <= BSEG; i++) {
412 putseg(up->seg[i]);
413 /* prevent a second free if we have an error */
414 up->seg[i] = 0;
415 }
416 for(i = BSEG+1; i < NSEG; i++) {
417 s = up->seg[i];
418 if(s != 0 && (s->type&SG_CEXEC)) {
419 putseg(s);
420 up->seg[i] = 0;
421 }
422 }
423
424 /* Text. Shared. Attaches to cache image if possible */
425 /* attachimage returns a locked cache image */
426 img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
427 ts = img->s;
428 up->seg[TSEG] = ts;
429 ts->flushme = 1;
430 ts->fstart = 0;
431 ts->flen = sizeof(Exec)+text;
432 unlock(img);
433
434 /* Data. Shared. */
435 s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
436 up->seg[DSEG] = s;
437
438 /* Attached by hand */
439 incref(img);
440 s->image = img;
441 s->fstart = ts->fstart+ts->flen;
442 s->flen = data;
443
444 /* BSS. Zero fill on demand */
445 up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
446
447 /*
448 * Move the stack
449 */
450 s = up->seg[ESEG];
451 up->seg[ESEG] = 0;
452 up->seg[SSEG] = s;
453 qunlock(&up->seglock);
454 poperror(); /* seglock */
455 poperror(); /* elem */
456 s->base = USTKTOP-USTKSIZE;
457 s->top = USTKTOP;
458 relocateseg(s, USTKTOP-TSTKTOP);
459
460 /*
461 * '/' processes are higher priority (hack to make /ip more responsive).
462 */
463 if(devtab[tc->type]->dc == L'/')
464 up->basepri = PriRoot;
465 up->priority = up->basepri;
466 poperror();
467 cclose(tc);
468
469 /*
470 * At this point, the mmu contains info about the old address
471 * space and needs to be flushed
472 */
473 flushmmu();
474 qlock(&up->debug);
475 up->nnote = 0;
476 up->notepending = 0;
477 up->notify = 0;
478 up->notified = 0;
479 up->privatemem = 0;
480 procsetup(up);
481 qunlock(&up->debug);
482 if(up->hang)
483 up->procctl = Proc_stopme;
484
485 /*
486 * Close on exec
487 */
488 f = up->fgrp;
489 for(i=0; i<=f->maxfd; i++)
490 fdclose(i, CCEXEC);
491
492 return execregs(entry, ssize, nargs);
493 }
494
/*
 * Split a "#!" line into words.
 *
 * s points at the n bytes read from the start of the file,
 * beginning with "#!".  The text up to the first newline is cut
 * in place into blank- or tab-separated words; pointers to the
 * words are stored in ap, followed by a nil entry.
 *
 * Returns the number of words.  Returns 0 when there is no
 * newline within the n bytes (including when nothing follows
 * the "#!") or when the line holds no words.
 */
int
shargs(char *s, int n, char **ap)
{
	int i;

	s += 2;
	n -= 2;		/* skip #! */
	/* find the newline, never looking beyond the n bytes supplied */
	for(i=0; ; i++){
		if(i >= n)
			return 0;
		if(s[i] == '\n')
			break;
	}
	s[i] = 0;
	*ap = 0;
	i = 0;
	for(;;) {
		while(*s==' ' || *s=='\t')
			s++;
		if(*s == 0)
			break;
		i++;
		*ap++ = s;
		*ap = 0;
		while(*s && *s!=' ' && *s!='\t')
			s++;
		if(*s == 0)
			break;
		else
			*s++ = 0;
	}
	return i;
}
525
/*
 * Condition function that is never satisfied: ignores its
 * argument and always yields 0.  syssleep passes it to tsleep.
 */
int
return0(void*)
{
	return 0;
}
531
532 long
syssleep(ulong * arg)533 syssleep(ulong *arg)
534 {
535
536 int n;
537
538 n = arg[0];
539 if(n <= 0) {
540 if (up->edf && (up->edf->flags & Admitted))
541 edfyield();
542 else
543 yield();
544 return 0;
545 }
546 if(n < TK2MS(1))
547 n = TK2MS(1);
548 tsleep(&up->sleep, return0, 0, n);
549 return 0;
550 }
551
552 long
sysalarm(ulong * arg)553 sysalarm(ulong *arg)
554 {
555 return procalarm(arg[0]);
556 }
557
558 long
sysexits(ulong * arg)559 sysexits(ulong *arg)
560 {
561 char *status;
562 char *inval = "invalid exit string";
563 char buf[ERRMAX];
564
565 status = (char*)arg[0];
566 if(status){
567 if(waserror())
568 status = inval;
569 else{
570 validaddr((ulong)status, 1, 0);
571 if(vmemchr(status, 0, ERRMAX) == 0){
572 memmove(buf, status, ERRMAX);
573 buf[ERRMAX-1] = 0;
574 status = buf;
575 }
576 poperror();
577 }
578
579 }
580 pexit(status, 1);
581 return 0; /* not reached */
582 }
583
584 long
sys_wait(ulong * arg)585 sys_wait(ulong *arg)
586 {
587 int pid;
588 Waitmsg w;
589 OWaitmsg *ow;
590
591 if(arg[0] == 0)
592 return pwait(nil);
593
594 validaddr(arg[0], sizeof(OWaitmsg), 1);
595 validalign(arg[0], BY2WD); /* who cares? */
596 pid = pwait(&w);
597 if(pid >= 0){
598 ow = (OWaitmsg*)arg[0];
599 readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
600 readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
601 readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
602 readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
603 strncpy(ow->msg, w.msg, sizeof(ow->msg));
604 ow->msg[sizeof(ow->msg)-1] = '\0';
605 }
606 return pid;
607 }
608
609 long
sysawait(ulong * arg)610 sysawait(ulong *arg)
611 {
612 int i;
613 int pid;
614 Waitmsg w;
615 ulong n;
616
617 n = arg[1];
618 validaddr(arg[0], n, 1);
619 pid = pwait(&w);
620 if(pid < 0)
621 return -1;
622 i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q",
623 w.pid,
624 w.time[TUser], w.time[TSys], w.time[TReal],
625 w.msg);
626
627 return i;
628 }
629
630 void
werrstr(char * fmt,...)631 werrstr(char *fmt, ...)
632 {
633 va_list va;
634
635 if(up == nil)
636 return;
637
638 va_start(va, fmt);
639 vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
640 va_end(va);
641 }
642
643 static long
generrstr(char * buf,uint nbuf)644 generrstr(char *buf, uint nbuf)
645 {
646 char tmp[ERRMAX];
647
648 if(nbuf == 0)
649 error(Ebadarg);
650 validaddr((ulong)buf, nbuf, 1);
651 if(nbuf > sizeof tmp)
652 nbuf = sizeof tmp;
653 memmove(tmp, buf, nbuf);
654
655 /* make sure it's NUL-terminated */
656 tmp[nbuf-1] = '\0';
657 memmove(buf, up->syserrstr, nbuf);
658 buf[nbuf-1] = '\0';
659 memmove(up->syserrstr, tmp, nbuf);
660 return 0;
661 }
662
663 long
syserrstr(ulong * arg)664 syserrstr(ulong *arg)
665 {
666 return generrstr((char*)arg[0], arg[1]);
667 }
668
669 /* compatibility for old binaries */
670 long
sys_errstr(ulong * arg)671 sys_errstr(ulong *arg)
672 {
673 return generrstr((char*)arg[0], 64);
674 }
675
676 long
sysnotify(ulong * arg)677 sysnotify(ulong *arg)
678 {
679 if(arg[0] != 0)
680 validaddr(arg[0], sizeof(ulong), 0);
681 up->notify = (int(*)(void*, char*))(arg[0]);
682 return 0;
683 }
684
685 long
sysnoted(ulong * arg)686 sysnoted(ulong *arg)
687 {
688 if(arg[0]!=NRSTR && !up->notified)
689 error(Egreg);
690 return 0;
691 }
692
693 long
syssegbrk(ulong * arg)694 syssegbrk(ulong *arg)
695 {
696 int i;
697 ulong addr;
698 Segment *s;
699
700 addr = arg[0];
701 for(i = 0; i < NSEG; i++) {
702 s = up->seg[i];
703 if(s == 0 || addr < s->base || addr >= s->top)
704 continue;
705 switch(s->type&SG_TYPE) {
706 case SG_TEXT:
707 case SG_DATA:
708 case SG_STACK:
709 error(Ebadarg);
710 default:
711 return ibrk(arg[1], i);
712 }
713 }
714
715 error(Ebadarg);
716 return 0; /* not reached */
717 }
718
719 long
syssegattach(ulong * arg)720 syssegattach(ulong *arg)
721 {
722 return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]);
723 }
724
725 long
syssegdetach(ulong * arg)726 syssegdetach(ulong *arg)
727 {
728 int i;
729 ulong addr;
730 Segment *s;
731
732 qlock(&up->seglock);
733 if(waserror()){
734 qunlock(&up->seglock);
735 nexterror();
736 }
737
738 s = 0;
739 addr = arg[0];
740 for(i = 0; i < NSEG; i++)
741 if(s = up->seg[i]) {
742 qlock(&s->lk);
743 if((addr >= s->base && addr < s->top) ||
744 (s->top == s->base && addr == s->base))
745 goto found;
746 qunlock(&s->lk);
747 }
748
749 error(Ebadarg);
750
751 found:
752 /*
753 * Check we are not detaching the initial stack segment.
754 */
755 if(s == up->seg[SSEG]){
756 qunlock(&s->lk);
757 error(Ebadarg);
758 }
759 up->seg[i] = 0;
760 qunlock(&s->lk);
761 putseg(s);
762 qunlock(&up->seglock);
763 poperror();
764
765 /* Ensure we flush any entries from the lost segment */
766 flushmmu();
767 return 0;
768 }
769
770 long
syssegfree(ulong * arg)771 syssegfree(ulong *arg)
772 {
773 Segment *s;
774 ulong from, to;
775
776 from = arg[0];
777 s = seg(up, from, 1);
778 if(s == nil)
779 error(Ebadarg);
780 to = (from + arg[1]) & ~(BY2PG-1);
781 from = PGROUND(from);
782
783 if(to > s->top) {
784 qunlock(&s->lk);
785 error(Ebadarg);
786 }
787
788 mfreeseg(s, from, (to - from) / BY2PG);
789 qunlock(&s->lk);
790 flushmmu();
791
792 return 0;
793 }
794
795 /* For binary compatibility */
796 long
sysbrk_(ulong * arg)797 sysbrk_(ulong *arg)
798 {
799 return ibrk(arg[0], BSEG);
800 }
801
802 long
sysrendezvous(ulong * arg)803 sysrendezvous(ulong *arg)
804 {
805 uintptr tag, val;
806 Proc *p, **l;
807
808 tag = arg[0];
809 l = &REND(up->rgrp, tag);
810 up->rendval = ~(uintptr)0;
811
812 lock(up->rgrp);
813 for(p = *l; p; p = p->rendhash) {
814 if(p->rendtag == tag) {
815 *l = p->rendhash;
816 val = p->rendval;
817 p->rendval = arg[1];
818
819 while(p->mach != 0)
820 ;
821 ready(p);
822 unlock(up->rgrp);
823 return val;
824 }
825 l = &p->rendhash;
826 }
827
828 /* Going to sleep here */
829 up->rendtag = tag;
830 up->rendval = arg[1];
831 up->rendhash = *l;
832 *l = up;
833 up->state = Rendezvous;
834 unlock(up->rgrp);
835
836 sched();
837
838 return up->rendval;
839 }
840
841 /*
842 * The implementation of semaphores is complicated by needing
843 * to avoid rescheduling in syssemrelease, so that it is safe
844 * to call from real-time processes. This means syssemrelease
845 * cannot acquire any qlocks, only spin locks.
846 *
847 * Semacquire and semrelease must both manipulate the semaphore
848 * wait list. Lock-free linked lists only exist in theory, not
849 * in practice, so the wait list is protected by a spin lock.
850 *
851 * The semaphore value *addr is stored in user memory, so it
852 * cannot be read or written while holding spin locks.
853 *
854 * Thus, we can access the list only when holding the lock, and
855 * we can access the semaphore only when not holding the lock.
856 * This makes things interesting. Note that sleep's condition function
857 * is called while holding two locks - r and up->rlock - so it cannot
858 * access the semaphore value either.
859 *
860 * An acquirer announces its intention to try for the semaphore
861 * by putting a Sema structure onto the wait list and then
862 * setting Sema.waiting. After one last check of semaphore,
863 * the acquirer sleeps until Sema.waiting==0. A releaser of n
864 * must wake up n acquirers who have Sema.waiting set. It does
865 * this by clearing Sema.waiting and then calling wakeup.
866 *
867 * There are three interesting races here.
868
869 * The first is that in this particular sleep/wakeup usage, a single
870 * wakeup can rouse a process from two consecutive sleeps!
871 * The ordering is:
872 *
873 * (a) set Sema.waiting = 1
874 * (a) call sleep
875 * (b) set Sema.waiting = 0
876 * (a) check Sema.waiting inside sleep, return w/o sleeping
877 * (a) try for semaphore, fail
878 * (a) set Sema.waiting = 1
879 * (a) call sleep
880 * (b) call wakeup(a)
881 * (a) wake up again
882 *
883 * This is okay - semacquire will just go around the loop
884 * again. It does mean that at the top of the for(;;) loop in
885 * semacquire, phore.waiting might already be set to 1.
886 *
887 * The second is that a releaser might wake an acquirer who is
888 * interrupted before he can acquire the lock. Since
889 * release(n) issues only n wakeup calls -- only n can be used
890 * anyway -- if the interrupted process is not going to use his
891 * wakeup call he must pass it on to another acquirer.
892 *
893 * The third race is similar to the second but more subtle. An
894 * acquirer sets waiting=1 and then does a final canacquire()
895 * before going to sleep. The opposite order would result in
896 * missing wakeups that happen between canacquire and
897 * waiting=1. (In fact, the whole point of Sema.waiting is to
898 * avoid missing wakeups between canacquire() and sleep().) But
899 * there can be spurious wakeups between a successful
900 * canacquire() and the following semdequeue(). This wakeup is
901 * not useful to the acquirer, since he has already acquired
902 * the semaphore. Like in the previous case, though, the
903 * acquirer must pass the wakeup call along.
904 *
905 * This is all rather subtle. The code below has been verified
906 * with the spin model /sys/src/9/port/semaphore.p. The
907 * original code anticipated the second race but not the first
908 * or third, which were caught only with spin. The first race
909 * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
910 * It was lucky that my abstract model of sleep/wakeup still managed
911 * to preserve that behavior.
912 *
913 * I remain slightly concerned about memory coherence
914 * outside of locks. The spin model does not take
915 * queued processor writes into account so we have to
916 * think hard. The only variables accessed outside locks
917 * are the semaphore value itself and the boolean flag
918 * Sema.waiting. The value is only accessed with cmpswap,
919 * whose job description includes doing the right thing as
920 * far as memory coherence across processors. That leaves
921 * Sema.waiting. To handle it, we call coherence() before each
922 * read and after each write. - rsc
923 */
924
925 /* Add semaphore p with addr a to list in seg. */
926 static void
semqueue(Segment * s,long * a,Sema * p)927 semqueue(Segment *s, long *a, Sema *p)
928 {
929 memset(p, 0, sizeof *p);
930 p->addr = a;
931 lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */
932 p->next = &s->sema;
933 p->prev = s->sema.prev;
934 p->next->prev = p;
935 p->prev->next = p;
936 unlock(&s->sema);
937 }
938
939 /* Remove semaphore p from list in seg. */
940 static void
semdequeue(Segment * s,Sema * p)941 semdequeue(Segment *s, Sema *p)
942 {
943 lock(&s->sema);
944 p->next->prev = p->prev;
945 p->prev->next = p->next;
946 unlock(&s->sema);
947 }
948
949 /* Wake up n waiters with addr a on list in seg. */
950 static void
semwakeup(Segment * s,long * a,long n)951 semwakeup(Segment *s, long *a, long n)
952 {
953 Sema *p;
954
955 lock(&s->sema);
956 for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
957 if(p->addr == a && p->waiting){
958 p->waiting = 0;
959 coherence();
960 wakeup(p);
961 n--;
962 }
963 }
964 unlock(&s->sema);
965 }
966
967 /* Add delta to semaphore and wake up waiters as appropriate. */
968 static long
semrelease(Segment * s,long * addr,long delta)969 semrelease(Segment *s, long *addr, long delta)
970 {
971 long value;
972
973 do
974 value = *addr;
975 while(!cmpswap(addr, value, value+delta));
976 semwakeup(s, addr, delta);
977 return value+delta;
978 }
979
/*
 * Try once to take the semaphore: while its value is positive,
 * attempt to decrement it with compare-and-swap.  Returns 1 on
 * success, 0 as soon as the value is seen to be <= 0.
 */
static int
canacquire(long *addr)
{
	long v;

	for(;;){
		v = *addr;
		if(v <= 0)
			return 0;
		if(cmpswap(addr, v, v-1))
			return 1;
	}
}
991
992 /* Should we wake up? */
993 static int
semawoke(void * p)994 semawoke(void *p)
995 {
996 coherence();
997 return !((Sema*)p)->waiting;
998 }
999
1000 /* Acquire semaphore (subtract 1). */
1001 static int
semacquire(Segment * s,long * addr,int block)1002 semacquire(Segment *s, long *addr, int block)
1003 {
1004 int acquired;
1005 Sema phore;
1006
1007 if(canacquire(addr))
1008 return 1;
1009 if(!block)
1010 return 0;
1011
1012 acquired = 0;
1013 semqueue(s, addr, &phore);
1014 for(;;){
1015 phore.waiting = 1;
1016 coherence();
1017 if(canacquire(addr)){
1018 acquired = 1;
1019 break;
1020 }
1021 if(waserror())
1022 break;
1023 sleep(&phore, semawoke, &phore);
1024 poperror();
1025 }
1026 semdequeue(s, &phore);
1027 coherence(); /* not strictly necessary due to lock in semdequeue */
1028 if(!phore.waiting)
1029 semwakeup(s, addr, 1);
1030 if(!acquired)
1031 nexterror();
1032 return 1;
1033 }
1034
1035 /* Acquire semaphore or time-out */
1036 static int
tsemacquire(Segment * s,long * addr,ulong ms)1037 tsemacquire(Segment *s, long *addr, ulong ms)
1038 {
1039 int acquired, timedout;
1040 ulong t, elms;
1041 Sema phore;
1042
1043 if(canacquire(addr))
1044 return 1;
1045 if(ms == 0)
1046 return 0;
1047 acquired = timedout = 0;
1048 semqueue(s, addr, &phore);
1049 for(;;){
1050 phore.waiting = 1;
1051 coherence();
1052 if(canacquire(addr)){
1053 acquired = 1;
1054 break;
1055 }
1056 if(waserror())
1057 break;
1058 t = m->ticks;
1059 tsleep(&phore, semawoke, &phore, ms);
1060 elms = TK2MS(m->ticks - t);
1061 poperror();
1062 if(elms >= ms){
1063 timedout = 1;
1064 break;
1065 }
1066 ms -= elms;
1067 }
1068 semdequeue(s, &phore);
1069 coherence(); /* not strictly necessary due to lock in semdequeue */
1070 if(!phore.waiting)
1071 semwakeup(s, addr, 1);
1072 if(timedout)
1073 return 0;
1074 if(!acquired)
1075 nexterror();
1076 return 1;
1077 }
1078
1079 long
syssemacquire(ulong * arg)1080 syssemacquire(ulong *arg)
1081 {
1082 int block;
1083 long *addr;
1084 Segment *s;
1085
1086 validaddr(arg[0], sizeof(long), 1);
1087 validalign(arg[0], sizeof(long));
1088 addr = (long*)arg[0];
1089 block = arg[1];
1090
1091 if((s = seg(up, (ulong)addr, 0)) == nil)
1092 error(Ebadarg);
1093 if(*addr < 0)
1094 error(Ebadarg);
1095 return semacquire(s, addr, block);
1096 }
1097
1098 long
systsemacquire(ulong * arg)1099 systsemacquire(ulong *arg)
1100 {
1101 long *addr;
1102 ulong ms;
1103 Segment *s;
1104
1105 validaddr(arg[0], sizeof(long), 1);
1106 validalign(arg[0], sizeof(long));
1107 addr = (long*)arg[0];
1108 ms = arg[1];
1109
1110 if((s = seg(up, (ulong)addr, 0)) == nil)
1111 error(Ebadarg);
1112 if(*addr < 0)
1113 error(Ebadarg);
1114 return tsemacquire(s, addr, ms);
1115 }
1116
1117 long
syssemrelease(ulong * arg)1118 syssemrelease(ulong *arg)
1119 {
1120 long *addr, delta;
1121 Segment *s;
1122
1123 validaddr(arg[0], sizeof(long), 1);
1124 validalign(arg[0], sizeof(long));
1125 addr = (long*)arg[0];
1126 delta = arg[1];
1127
1128 if((s = seg(up, (ulong)addr, 0)) == nil)
1129 error(Ebadarg);
1130 /* delta == 0 is a no-op, not a release */
1131 if(delta < 0 || *addr < 0)
1132 error(Ebadarg);
1133 return semrelease(s, addr, delta);
1134 }
1135
1136 long
sysnsec(ulong * arg)1137 sysnsec(ulong *arg)
1138 {
1139 validaddr(arg[0], sizeof(vlong), 1);
1140 validalign(arg[0], sizeof(vlong));
1141
1142 *(vlong*)arg[0] = todget(nil);
1143
1144 return 0;
1145 }
1146