1 #include "u.h" 2 #include "tos.h" 3 #include "../port/lib.h" 4 #include "mem.h" 5 #include "dat.h" 6 #include "fns.h" 7 #include "../port/error.h" 8 #include "edf.h" 9 10 #include <a.out.h> 11 12 int shargs(char*, int, char**); 13 14 extern void checkpages(void); 15 extern void checkpagerefs(void); 16 17 long 18 sysr1(ulong*) 19 { 20 checkpagerefs(); 21 return 0; 22 } 23 24 long 25 sysrfork(ulong *arg) 26 { 27 Proc *p; 28 int n, i; 29 Fgrp *ofg; 30 Pgrp *opg; 31 Rgrp *org; 32 Egrp *oeg; 33 ulong pid, flag; 34 Mach *wm; 35 36 flag = arg[0]; 37 /* Check flags before we commit */ 38 if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) 39 error(Ebadarg); 40 if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG)) 41 error(Ebadarg); 42 if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG)) 43 error(Ebadarg); 44 45 if((flag&RFPROC) == 0) { 46 if(flag & (RFMEM|RFNOWAIT)) 47 error(Ebadarg); 48 if(flag & (RFFDG|RFCFDG)) { 49 ofg = up->fgrp; 50 if(flag & RFFDG) 51 up->fgrp = dupfgrp(ofg); 52 else 53 up->fgrp = dupfgrp(nil); 54 closefgrp(ofg); 55 } 56 if(flag & (RFNAMEG|RFCNAMEG)) { 57 opg = up->pgrp; 58 up->pgrp = newpgrp(); 59 if(flag & RFNAMEG) 60 pgrpcpy(up->pgrp, opg); 61 /* inherit noattach */ 62 up->pgrp->noattach = opg->noattach; 63 closepgrp(opg); 64 } 65 if(flag & RFNOMNT) 66 up->pgrp->noattach = 1; 67 if(flag & RFREND) { 68 org = up->rgrp; 69 up->rgrp = newrgrp(); 70 closergrp(org); 71 } 72 if(flag & (RFENVG|RFCENVG)) { 73 oeg = up->egrp; 74 up->egrp = smalloc(sizeof(Egrp)); 75 up->egrp->ref = 1; 76 if(flag & RFENVG) 77 envcpy(up->egrp, oeg); 78 closeegrp(oeg); 79 } 80 if(flag & RFNOTEG) 81 up->noteid = incref(¬eidalloc); 82 return 0; 83 } 84 85 p = newproc(); 86 87 p->fpsave = up->fpsave; 88 p->scallnr = up->scallnr; 89 p->s = up->s; 90 p->nerrlab = 0; 91 p->slash = up->slash; 92 p->dot = up->dot; 93 incref(p->dot); 94 95 memmove(p->note, up->note, sizeof(p->note)); 96 p->privatemem = up->privatemem; 97 p->noswap = up->noswap; 98 p->nnote = up->nnote; 99 p->notified = 0; 100 p->lastnote = up->lastnote; 101 p->notify = up->notify; 102 p->ureg = up->ureg; 103 p->dbgreg = 0; 104 105 /* Make a new set of memory segments */ 106 n = flag & RFMEM; 107 qlock(&p->seglock); 108 if(waserror()){ 109 qunlock(&p->seglock); 110 nexterror(); 111 } 112 for(i = 0; i < NSEG; i++) 113 if(up->seg[i]) 114 p->seg[i] = dupseg(up->seg, i, n); 115 qunlock(&p->seglock); 116 poperror(); 117 118 /* File descriptors */ 119 if(flag & (RFFDG|RFCFDG)) { 120 if(flag & RFFDG) 121 p->fgrp = dupfgrp(up->fgrp); 122 else 123 p->fgrp = dupfgrp(nil); 124 } 125 else { 126 p->fgrp = up->fgrp; 127 incref(p->fgrp); 128 } 129 130 /* Process groups */ 131 if(flag & (RFNAMEG|RFCNAMEG)) { 132 p->pgrp = newpgrp(); 133 if(flag & RFNAMEG) 134 pgrpcpy(p->pgrp, up->pgrp); 135 /* inherit noattach */ 136 p->pgrp->noattach = up->pgrp->noattach; 137 } 138 else { 139 p->pgrp = up->pgrp; 140 incref(p->pgrp); 141 } 142 if(flag & RFNOMNT) 143 up->pgrp->noattach = 1; 144 145 if(flag & RFREND) 146 p->rgrp = newrgrp(); 147 else { 148 incref(up->rgrp); 149 p->rgrp = up->rgrp; 150 } 151 152 /* Environment group */ 153 if(flag & (RFENVG|RFCENVG)) { 154 p->egrp = smalloc(sizeof(Egrp)); 155 p->egrp->ref = 1; 156 if(flag & RFENVG) 157 envcpy(p->egrp, up->egrp); 158 } 159 else { 160 p->egrp = up->egrp; 161 incref(p->egrp); 162 } 163 p->hang = up->hang; 164 p->procmode = up->procmode; 165 166 /* Craft a return frame which will cause the child to pop out of 167 * the scheduler in user mode with the return register zero 168 */ 169 forkchild(p, up->dbgreg); 170 171 p->parent = up; 172 p->parentpid = up->pid; 173 if(flag&RFNOWAIT) 174 p->parentpid = 0; 175 else { 176 lock(&up->exl); 177 up->nchild++; 178 unlock(&up->exl); 179 } 180 if((flag&RFNOTEG) == 0) 181 p->noteid = up->noteid; 182 183 p->fpstate = up->fpstate; 184 pid = p->pid; 185 memset(p->time, 0, sizeof(p->time)); 186 p->time[TReal] = MACHP(0)->ticks; 187 188 kstrdup(&p->text, up->text); 189 kstrdup(&p->user, up->user); 190 /* 191 * since the bss/data segments are now shareable, 192 * any mmu info about this process is now stale 193 * (i.e. has bad properties) and has to be discarded. 194 */ 195 flushmmu(); 196 p->basepri = up->basepri; 197 p->priority = up->basepri; 198 p->fixedpri = up->fixedpri; 199 p->mp = up->mp; 200 wm = up->wired; 201 if(wm) 202 procwired(p, wm->machno); 203 ready(p); 204 sched(); 205 return pid; 206 } 207 208 static ulong 209 l2be(long l) 210 { 211 uchar *cp; 212 213 cp = (uchar*)&l; 214 return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3]; 215 } 216 217 long 218 sysexec(ulong *arg) 219 { 220 Segment *s, *ts; 221 ulong t, d, b; 222 int i; 223 Chan *tc; 224 char **argv, **argp; 225 char *a, *charp, *args, *file, *file0; 226 char *progarg[sizeof(Exec)/2+1], *elem, progelem[64]; 227 ulong ssize, spage, nargs, nbytes, n, bssend; 228 int indir; 229 Exec exec; 230 char line[sizeof(Exec)]; 231 Fgrp *f; 232 Image *img; 233 ulong magic, text, entry, data, bss; 234 Tos *tos; 235 236 indir = 0; 237 elem = nil; 238 validaddr(arg[0], 1, 0); 239 file0 = validnamedup((char*)arg[0], 1); 240 if(waserror()){ 241 free(file0); 242 free(elem); 243 nexterror(); 244 } 245 file = file0; 246 for(;;){ 247 tc = namec(file, Aopen, OEXEC, 0); 248 if(waserror()){ 249 cclose(tc); 250 nexterror(); 251 } 252 if(!indir) 253 kstrdup(&elem, up->genbuf); 254 255 n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0); 256 if(n < 2) 257 error(Ebadexec); 258 magic = l2be(exec.magic); 259 text = l2be(exec.text); 260 entry = l2be(exec.entry); 261 if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){ 262 if(text >= USTKTOP-UTZERO 263 || entry < UTZERO+sizeof(Exec) 264 || entry >= UTZERO+sizeof(Exec)+text) 265 error(Ebadexec); 266 break; /* for binary */ 267 } 268 269 /* 270 * Process #! /bin/sh args ... 271 */ 272 memmove(line, &exec, sizeof(Exec)); 273 if(indir || line[0]!='#' || line[1]!='!') 274 error(Ebadexec); 275 n = shargs(line, n, progarg); 276 if(n == 0) 277 error(Ebadexec); 278 indir = 1; 279 /* 280 * First arg becomes complete file name 281 */ 282 progarg[n++] = file; 283 progarg[n] = 0; 284 validaddr(arg[1], BY2WD, 1); 285 arg[1] += BY2WD; 286 file = progarg[0]; 287 if(strlen(elem) >= sizeof progelem) 288 error(Ebadexec); 289 strcpy(progelem, elem); 290 progarg[0] = progelem; 291 poperror(); 292 cclose(tc); 293 } 294 295 data = l2be(exec.data); 296 bss = l2be(exec.bss); 297 t = (UTZERO+sizeof(Exec)+text+(BY2PG-1)) & ~(BY2PG-1); 298 d = (t + data + (BY2PG-1)) & ~(BY2PG-1); 299 bssend = t + data + bss; 300 b = (bssend + (BY2PG-1)) & ~(BY2PG-1); 301 if(t >= KZERO || d >= KZERO || b >= KZERO) 302 error(Ebadexec); 303 304 /* 305 * Args: pass 1: count 306 */ 307 nbytes = sizeof(Tos); /* hole for profiling clock at top of stack (and more) */ 308 nargs = 0; 309 if(indir){ 310 argp = progarg; 311 while(*argp){ 312 a = *argp++; 313 nbytes += strlen(a) + 1; 314 nargs++; 315 } 316 } 317 evenaddr(arg[1]); 318 argp = (char**)arg[1]; 319 validaddr((ulong)argp, BY2WD, 0); 320 while(*argp){ 321 a = *argp++; 322 if(((ulong)argp&(BY2PG-1)) < BY2WD) 323 validaddr((ulong)argp, BY2WD, 0); 324 validaddr((ulong)a, 1, 0); 325 nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1; 326 nargs++; 327 } 328 ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1)); 329 330 /* 331 * 8-byte align SP for those (e.g. sparc) that need it. 332 * execregs() will subtract another 4 bytes for argc. 333 */ 334 if((ssize+4) & 7) 335 ssize += 4; 336 spage = (ssize+(BY2PG-1)) >> PGSHIFT; 337 338 /* 339 * Build the stack segment, putting it in kernel virtual for the moment 340 */ 341 if(spage > TSTKSIZ) 342 error(Enovmem); 343 344 qlock(&up->seglock); 345 if(waserror()){ 346 qunlock(&up->seglock); 347 nexterror(); 348 } 349 up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG); 350 351 /* 352 * Args: pass 2: assemble; the pages will be faulted in 353 */ 354 tos = (Tos*)(TSTKTOP - sizeof(Tos)); 355 tos->cyclefreq = m->cyclefreq; 356 cycles((uvlong*)&tos->pcycles); 357 tos->pcycles = -tos->pcycles; 358 tos->kcycles = tos->pcycles; 359 tos->clock = 0; 360 argv = (char**)(TSTKTOP - ssize); 361 charp = (char*)(TSTKTOP - nbytes); 362 args = charp; 363 if(indir) 364 argp = progarg; 365 else 366 argp = (char**)arg[1]; 367 368 for(i=0; i<nargs; i++){ 369 if(indir && *argp==0) { 370 indir = 0; 371 argp = (char**)arg[1]; 372 } 373 *argv++ = charp + (USTKTOP-TSTKTOP); 374 n = strlen(*argp) + 1; 375 memmove(charp, *argp++, n); 376 charp += n; 377 } 378 free(file0); 379 380 free(up->text); 381 up->text = elem; 382 elem = nil; /* so waserror() won't free elem */ 383 USED(elem); 384 385 /* copy args; easiest from new process's stack */ 386 n = charp - args; 387 if(n > 128) /* don't waste too much space on huge arg lists */ 388 n = 128; 389 a = up->args; 390 up->args = nil; 391 free(a); 392 up->args = smalloc(n); 393 memmove(up->args, args, n); 394 if(n>0 && up->args[n-1]!='\0'){ 395 /* make sure last arg is NUL-terminated */ 396 /* put NUL at UTF-8 character boundary */ 397 for(i=n-1; i>0; --i) 398 if(fullrune(up->args+i, n-i)) 399 break; 400 up->args[i] = 0; 401 n = i+1; 402 } 403 up->nargs = n; 404 405 /* 406 * Committed. 407 * Free old memory. 408 * Special segments are maintained across exec 409 */ 410 for(i = SSEG; i <= BSEG; i++) { 411 putseg(up->seg[i]); 412 /* prevent a second free if we have an error */ 413 up->seg[i] = 0; 414 } 415 for(i = BSEG+1; i < NSEG; i++) { 416 s = up->seg[i]; 417 if(s != 0 && (s->type&SG_CEXEC)) { 418 putseg(s); 419 up->seg[i] = 0; 420 } 421 } 422 423 /* 424 * Close on exec 425 */ 426 f = up->fgrp; 427 for(i=0; i<=f->maxfd; i++) 428 fdclose(i, CCEXEC); 429 430 /* Text. Shared. Attaches to cache image if possible */ 431 /* attachimage returns a locked cache image */ 432 img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT); 433 ts = img->s; 434 up->seg[TSEG] = ts; 435 ts->flushme = 1; 436 ts->fstart = 0; 437 ts->flen = sizeof(Exec)+text; 438 unlock(img); 439 440 /* Data. Shared. */ 441 s = newseg(SG_DATA, t, (d-t)>>PGSHIFT); 442 up->seg[DSEG] = s; 443 444 /* Attached by hand */ 445 incref(img); 446 s->image = img; 447 s->fstart = ts->fstart+ts->flen; 448 s->flen = data; 449 450 /* BSS. Zero fill on demand */ 451 up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT); 452 453 /* 454 * Move the stack 455 */ 456 s = up->seg[ESEG]; 457 up->seg[ESEG] = 0; 458 up->seg[SSEG] = s; 459 qunlock(&up->seglock); 460 poperror(); /* seglock */ 461 poperror(); /* elem */ 462 s->base = USTKTOP-USTKSIZE; 463 s->top = USTKTOP; 464 relocateseg(s, USTKTOP-TSTKTOP); 465 466 /* 467 * '/' processes are higher priority (hack to make /ip more responsive). 468 */ 469 if(devtab[tc->type]->dc == L'/') 470 up->basepri = PriRoot; 471 up->priority = up->basepri; 472 poperror(); 473 cclose(tc); 474 475 /* 476 * At this point, the mmu contains info about the old address 477 * space and needs to be flushed 478 */ 479 flushmmu(); 480 qlock(&up->debug); 481 up->nnote = 0; 482 up->notify = 0; 483 up->notified = 0; 484 up->privatemem = 0; 485 procsetup(up); 486 qunlock(&up->debug); 487 if(up->hang) 488 up->procctl = Proc_stopme; 489 490 return execregs(entry, ssize, nargs); 491 } 492 493 int 494 shargs(char *s, int n, char **ap) 495 { 496 int i; 497 498 s += 2; 499 n -= 2; /* skip #! */ 500 for(i=0; s[i]!='\n'; i++) 501 if(i == n-1) 502 return 0; 503 s[i] = 0; 504 *ap = 0; 505 i = 0; 506 for(;;) { 507 while(*s==' ' || *s=='\t') 508 s++; 509 if(*s == 0) 510 break; 511 i++; 512 *ap++ = s; 513 *ap = 0; 514 while(*s && *s!=' ' && *s!='\t') 515 s++; 516 if(*s == 0) 517 break; 518 else 519 *s++ = 0; 520 } 521 return i; 522 } 523 524 int 525 return0(void*) 526 { 527 return 0; 528 } 529 530 long 531 syssleep(ulong *arg) 532 { 533 534 int n; 535 536 n = arg[0]; 537 if(n <= 0) { 538 if (up->edf && (up->edf->flags & Admitted)) 539 edfyield(); 540 else 541 yield(); 542 return 0; 543 } 544 if(n < TK2MS(1)) 545 n = TK2MS(1); 546 tsleep(&up->sleep, return0, 0, n); 547 return 0; 548 } 549 550 long 551 sysalarm(ulong *arg) 552 { 553 return procalarm(arg[0]); 554 } 555 556 long 557 sysexits(ulong *arg) 558 { 559 char *status; 560 char *inval = "invalid exit string"; 561 char buf[ERRMAX]; 562 563 status = (char*)arg[0]; 564 if(status){ 565 if(waserror()) 566 status = inval; 567 else{ 568 validaddr((ulong)status, 1, 0); 569 if(vmemchr(status, 0, ERRMAX) == 0){ 570 memmove(buf, status, ERRMAX); 571 buf[ERRMAX-1] = 0; 572 status = buf; 573 } 574 poperror(); 575 } 576 577 } 578 pexit(status, 1); 579 return 0; /* not reached */ 580 } 581 582 long 583 sys_wait(ulong *arg) 584 { 585 int pid; 586 Waitmsg w; 587 OWaitmsg *ow; 588 589 if(arg[0] == 0) 590 return pwait(nil); 591 592 validaddr(arg[0], sizeof(OWaitmsg), 1); 593 evenaddr(arg[0]); 594 pid = pwait(&w); 595 if(pid >= 0){ 596 ow = (OWaitmsg*)arg[0]; 597 readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE); 598 readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE); 599 readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE); 600 readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE); 601 strncpy(ow->msg, w.msg, sizeof(ow->msg)); 602 ow->msg[sizeof(ow->msg)-1] = '\0'; 603 } 604 return pid; 605 } 606 607 long 608 sysawait(ulong *arg) 609 { 610 int i; 611 int pid; 612 Waitmsg w; 613 ulong n; 614 615 n = arg[1]; 616 validaddr(arg[0], n, 1); 617 pid = pwait(&w); 618 if(pid < 0) 619 return -1; 620 i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q", 621 w.pid, 622 w.time[TUser], w.time[TSys], w.time[TReal], 623 w.msg); 624 625 return i; 626 } 627 628 void 629 werrstr(char *fmt, ...) 630 { 631 va_list va; 632 633 if(up == nil) 634 return; 635 636 va_start(va, fmt); 637 vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va); 638 va_end(va); 639 } 640 641 static long 642 generrstr(char *buf, uint nbuf) 643 { 644 char tmp[ERRMAX]; 645 646 if(nbuf == 0) 647 error(Ebadarg); 648 validaddr((ulong)buf, nbuf, 1); 649 if(nbuf > sizeof tmp) 650 nbuf = sizeof tmp; 651 memmove(tmp, buf, nbuf); 652 653 /* make sure it's NUL-terminated */ 654 tmp[nbuf-1] = '\0'; 655 memmove(buf, up->syserrstr, nbuf); 656 buf[nbuf-1] = '\0'; 657 memmove(up->syserrstr, tmp, nbuf); 658 return 0; 659 } 660 661 long 662 syserrstr(ulong *arg) 663 { 664 return generrstr((char*)arg[0], arg[1]); 665 } 666 667 /* compatibility for old binaries */ 668 long 669 sys_errstr(ulong *arg) 670 { 671 return generrstr((char*)arg[0], 64); 672 } 673 674 long 675 sysnotify(ulong *arg) 676 { 677 if(arg[0] != 0) 678 validaddr(arg[0], sizeof(ulong), 0); 679 up->notify = (int(*)(void*, char*))(arg[0]); 680 return 0; 681 } 682 683 long 684 sysnoted(ulong *arg) 685 { 686 if(arg[0]!=NRSTR && !up->notified) 687 error(Egreg); 688 return 0; 689 } 690 691 long 692 syssegbrk(ulong *arg) 693 { 694 int i; 695 ulong addr; 696 Segment *s; 697 698 addr = arg[0]; 699 for(i = 0; i < NSEG; i++) { 700 s = up->seg[i]; 701 if(s == 0 || addr < s->base || addr >= s->top) 702 continue; 703 switch(s->type&SG_TYPE) { 704 case SG_TEXT: 705 case SG_DATA: 706 case SG_STACK: 707 error(Ebadarg); 708 default: 709 return ibrk(arg[1], i); 710 } 711 } 712 713 error(Ebadarg); 714 return 0; /* not reached */ 715 } 716 717 long 718 syssegattach(ulong *arg) 719 { 720 return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]); 721 } 722 723 long 724 syssegdetach(ulong *arg) 725 { 726 int i; 727 ulong addr; 728 Segment *s; 729 730 qlock(&up->seglock); 731 if(waserror()){ 732 qunlock(&up->seglock); 733 nexterror(); 734 } 735 736 s = 0; 737 addr = arg[0]; 738 for(i = 0; i < NSEG; i++) 739 if(s = up->seg[i]) { 740 qlock(&s->lk); 741 if((addr >= s->base && addr < s->top) || 742 (s->top == s->base && addr == s->base)) 743 goto found; 744 qunlock(&s->lk); 745 } 746 747 error(Ebadarg); 748 749 found: 750 /* 751 * Check we are not detaching the initial stack segment. 752 */ 753 if(s == up->seg[SSEG]){ 754 qunlock(&s->lk); 755 error(Ebadarg); 756 } 757 up->seg[i] = 0; 758 qunlock(&s->lk); 759 putseg(s); 760 qunlock(&up->seglock); 761 poperror(); 762 763 /* Ensure we flush any entries from the lost segment */ 764 flushmmu(); 765 return 0; 766 } 767 768 long 769 syssegfree(ulong *arg) 770 { 771 Segment *s; 772 ulong from, to; 773 774 from = arg[0]; 775 s = seg(up, from, 1); 776 if(s == nil) 777 error(Ebadarg); 778 to = (from + arg[1]) & ~(BY2PG-1); 779 from = PGROUND(from); 780 781 if(to > s->top) { 782 qunlock(&s->lk); 783 error(Ebadarg); 784 } 785 786 mfreeseg(s, from, (to - from) / BY2PG); 787 qunlock(&s->lk); 788 flushmmu(); 789 790 return 0; 791 } 792 793 /* For binary compatibility */ 794 long 795 sysbrk_(ulong *arg) 796 { 797 return ibrk(arg[0], BSEG); 798 } 799 800 long 801 sysrendezvous(ulong *arg) 802 { 803 uintptr tag, val; 804 Proc *p, **l; 805 806 tag = arg[0]; 807 l = &REND(up->rgrp, tag); 808 up->rendval = ~(uintptr)0; 809 810 lock(up->rgrp); 811 for(p = *l; p; p = p->rendhash) { 812 if(p->rendtag == tag) { 813 *l = p->rendhash; 814 val = p->rendval; 815 p->rendval = arg[1]; 816 817 while(p->mach != 0) 818 ; 819 ready(p); 820 unlock(up->rgrp); 821 return val; 822 } 823 l = &p->rendhash; 824 } 825 826 /* Going to sleep here */ 827 up->rendtag = tag; 828 up->rendval = arg[1]; 829 up->rendhash = *l; 830 *l = up; 831 up->state = Rendezvous; 832 unlock(up->rgrp); 833 834 sched(); 835 836 return up->rendval; 837 } 838 839 /* 840 * The implementation of semaphores is complicated by needing 841 * to avoid rescheduling in syssemrelease, so that it is safe 842 * to call from real-time processes. This means syssemrelease 843 * cannot acquire any qlocks, only spin locks. 844 * 845 * Semacquire and semrelease must both manipulate the semaphore 846 * wait list. Lock-free linked lists only exist in theory, not 847 * in practice, so the wait list is protected by a spin lock. 848 * 849 * The semaphore value *addr is stored in user memory, so it 850 * cannot be read or written while holding spin locks. 851 * 852 * Thus, we can access the list only when holding the lock, and 853 * we can access the semaphore only when not holding the lock. 854 * This makes things interesting. Note that sleep's condition function 855 * is called while holding two locks - r and up->rlock - so it cannot 856 * access the semaphore value either. 857 * 858 * An acquirer announces its intention to try for the semaphore 859 * by putting a Sema structure onto the wait list and then 860 * setting Sema.waiting. After one last check of semaphore, 861 * the acquirer sleeps until Sema.waiting==0. A releaser of n 862 * must wake up n acquirers who have Sema.waiting set. It does 863 * this by clearing Sema.waiting and then calling wakeup. 864 * 865 * There are three interesting races here. 866 867 * The first is that in this particular sleep/wakeup usage, a single 868 * wakeup can rouse a process from two consecutive sleeps! 869 * The ordering is: 870 * 871 * (a) set Sema.waiting = 1 872 * (a) call sleep 873 * (b) set Sema.waiting = 0 874 * (a) check Sema.waiting inside sleep, return w/o sleeping 875 * (a) try for semaphore, fail 876 * (a) set Sema.waiting = 1 877 * (a) call sleep 878 * (b) call wakeup(a) 879 * (a) wake up again 880 * 881 * This is okay - semacquire will just go around the loop 882 * again. It does mean that at the top of the for(;;) loop in 883 * semacquire, phore.waiting might already be set to 1. 884 * 885 * The second is that a releaser might wake an acquirer who is 886 * interrupted before he can acquire the lock. Since 887 * release(n) issues only n wakeup calls -- only n can be used 888 * anyway -- if the interrupted process is not going to use his 889 * wakeup call he must pass it on to another acquirer. 890 * 891 * The third race is similar to the second but more subtle. An 892 * acquirer sets waiting=1 and then does a final canacquire() 893 * before going to sleep. The opposite order would result in 894 * missing wakeups that happen between canacquire and 895 * waiting=1. (In fact, the whole point of Sema.waiting is to 896 * avoid missing wakeups between canacquire() and sleep().) But 897 * there can be spurious wakeups between a successful 898 * canacquire() and the following semdequeue(). This wakeup is 899 * not useful to the acquirer, since he has already acquired 900 * the semaphore. Like in the previous case, though, the 901 * acquirer must pass the wakeup call along. 902 * 903 * This is all rather subtle. The code below has been verified 904 * with the spin model /sys/src/9/port/semaphore.p. The 905 * original code anticipated the second race but not the first 906 * or third, which were caught only with spin. The first race 907 * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it. 908 * It was lucky that my abstract model of sleep/wakeup still managed 909 * to preserve that behavior. 910 * 911 * I remain slightly concerned about memory coherence 912 * outside of locks. The spin model does not take 913 * queued processor writes into account so we have to 914 * think hard. The only variables accessed outside locks 915 * are the semaphore value itself and the boolean flag 916 * Sema.waiting. The value is only accessed with cmpswap, 917 * whose job description includes doing the right thing as 918 * far as memory coherence across processors. That leaves 919 * Sema.waiting. To handle it, we call coherence() before each 920 * read and after each write. - rsc 921 */ 922 923 /* Add semaphore p with addr a to list in seg. */ 924 static void 925 semqueue(Segment *s, long *a, Sema *p) 926 { 927 memset(p, 0, sizeof *p); 928 p->addr = a; 929 lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */ 930 p->next = &s->sema; 931 p->prev = s->sema.prev; 932 p->next->prev = p; 933 p->prev->next = p; 934 unlock(&s->sema); 935 } 936 937 /* Remove semaphore p from list in seg. */ 938 static void 939 semdequeue(Segment *s, Sema *p) 940 { 941 lock(&s->sema); 942 p->next->prev = p->prev; 943 p->prev->next = p->next; 944 unlock(&s->sema); 945 } 946 947 /* Wake up n waiters with addr a on list in seg. */ 948 static void 949 semwakeup(Segment *s, long *a, long n) 950 { 951 Sema *p; 952 953 lock(&s->sema); 954 for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){ 955 if(p->addr == a && p->waiting){ 956 p->waiting = 0; 957 coherence(); 958 wakeup(p); 959 n--; 960 } 961 } 962 unlock(&s->sema); 963 } 964 965 /* Add delta to semaphore and wake up waiters as appropriate. */ 966 static long 967 semrelease(Segment *s, long *addr, long delta) 968 { 969 long value; 970 971 do 972 value = *addr; 973 while(!cmpswap(addr, value, value+delta)); 974 semwakeup(s, addr, delta); 975 return value+delta; 976 } 977 978 /* Try to acquire semaphore using compare-and-swap */ 979 static int 980 canacquire(long *addr) 981 { 982 long value; 983 984 while((value=*addr) > 0) 985 if(cmpswap(addr, value, value-1)) 986 return 1; 987 return 0; 988 } 989 990 /* Should we wake up? */ 991 static int 992 semawoke(void *p) 993 { 994 coherence(); 995 return !((Sema*)p)->waiting; 996 } 997 998 /* Acquire semaphore (subtract 1). */ 999 static int 1000 semacquire(Segment *s, long *addr, int block) 1001 { 1002 int acquired; 1003 Sema phore; 1004 1005 if(canacquire(addr)) 1006 return 1; 1007 if(!block) 1008 return 0; 1009 1010 acquired = 0; 1011 semqueue(s, addr, &phore); 1012 for(;;){ 1013 phore.waiting = 1; 1014 coherence(); 1015 if(canacquire(addr)){ 1016 acquired = 1; 1017 break; 1018 } 1019 if(waserror()) 1020 break; 1021 sleep(&phore, semawoke, &phore); 1022 poperror(); 1023 } 1024 semdequeue(s, &phore); 1025 coherence(); /* not strictly necessary due to lock in semdequeue */ 1026 if(!phore.waiting) 1027 semwakeup(s, addr, 1); 1028 if(!acquired) 1029 nexterror(); 1030 return 1; 1031 } 1032 1033 long 1034 syssemacquire(ulong *arg) 1035 { 1036 int block; 1037 long *addr; 1038 Segment *s; 1039 1040 validaddr(arg[0], sizeof(long), 1); 1041 evenaddr(arg[0]); 1042 addr = (long*)arg[0]; 1043 block = arg[1]; 1044 1045 if((s = seg(up, (ulong)addr, 0)) == nil) 1046 error(Ebadarg); 1047 if(*addr < 0) 1048 error(Ebadarg); 1049 return semacquire(s, addr, block); 1050 } 1051 1052 long 1053 syssemrelease(ulong *arg) 1054 { 1055 long *addr, delta; 1056 Segment *s; 1057 1058 validaddr(arg[0], sizeof(long), 1); 1059 evenaddr(arg[0]); 1060 addr = (long*)arg[0]; 1061 delta = arg[1]; 1062 1063 if((s = seg(up, (ulong)addr, 0)) == nil) 1064 error(Ebadarg); 1065 if(delta < 0 || *addr < 0) 1066 error(Ebadarg); 1067 return semrelease(s, addr, arg[1]); 1068 } 1069