1 #include "u.h" 2 #include "tos.h" 3 #include "../port/lib.h" 4 #include "mem.h" 5 #include "dat.h" 6 #include "fns.h" 7 #include "../port/error.h" 8 #include "edf.h" 9 10 #include <a.out.h> 11 12 int shargs(char*, int, char**); 13 14 extern void checkpages(void); 15 extern void checkpagerefs(void); 16 17 long 18 sysr1(ulong*) 19 { 20 checkpagerefs(); 21 return 0; 22 } 23 24 long 25 sysrfork(ulong *arg) 26 { 27 Proc *p; 28 int n, i; 29 Fgrp *ofg; 30 Pgrp *opg; 31 Rgrp *org; 32 Egrp *oeg; 33 ulong pid, flag; 34 Mach *wm; 35 36 flag = arg[0]; 37 /* Check flags before we commit */ 38 if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) 39 error(Ebadarg); 40 if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG)) 41 error(Ebadarg); 42 if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG)) 43 error(Ebadarg); 44 45 if((flag&RFPROC) == 0) { 46 if(flag & (RFMEM|RFNOWAIT)) 47 error(Ebadarg); 48 if(flag & (RFFDG|RFCFDG)) { 49 ofg = up->fgrp; 50 if(flag & RFFDG) 51 up->fgrp = dupfgrp(ofg); 52 else 53 up->fgrp = dupfgrp(nil); 54 closefgrp(ofg); 55 } 56 if(flag & (RFNAMEG|RFCNAMEG)) { 57 opg = up->pgrp; 58 up->pgrp = newpgrp(); 59 if(flag & RFNAMEG) 60 pgrpcpy(up->pgrp, opg); 61 /* inherit noattach */ 62 up->pgrp->noattach = opg->noattach; 63 closepgrp(opg); 64 } 65 if(flag & RFNOMNT) 66 up->pgrp->noattach = 1; 67 if(flag & RFREND) { 68 org = up->rgrp; 69 up->rgrp = newrgrp(); 70 closergrp(org); 71 } 72 if(flag & (RFENVG|RFCENVG)) { 73 oeg = up->egrp; 74 up->egrp = smalloc(sizeof(Egrp)); 75 up->egrp->ref = 1; 76 if(flag & RFENVG) 77 envcpy(up->egrp, oeg); 78 closeegrp(oeg); 79 } 80 if(flag & RFNOTEG) 81 up->noteid = incref(¬eidalloc); 82 return 0; 83 } 84 85 p = newproc(); 86 87 p->fpsave = up->fpsave; 88 p->scallnr = up->scallnr; 89 p->s = up->s; 90 p->nerrlab = 0; 91 p->slash = up->slash; 92 p->dot = up->dot; 93 incref(p->dot); 94 95 memmove(p->note, up->note, sizeof(p->note)); 96 p->privatemem = up->privatemem; 97 p->noswap = up->noswap; 98 p->nnote = up->nnote; 99 p->notified = 0; 100 p->lastnote = up->lastnote; 101 p->notify = up->notify; 102 p->ureg = up->ureg; 103 p->dbgreg = 0; 104 105 /* Make a new set of memory segments */ 106 n = flag & RFMEM; 107 qlock(&p->seglock); 108 if(waserror()){ 109 qunlock(&p->seglock); 110 nexterror(); 111 } 112 for(i = 0; i < NSEG; i++) 113 if(up->seg[i]) 114 p->seg[i] = dupseg(up->seg, i, n); 115 qunlock(&p->seglock); 116 poperror(); 117 118 /* File descriptors */ 119 if(flag & (RFFDG|RFCFDG)) { 120 if(flag & RFFDG) 121 p->fgrp = dupfgrp(up->fgrp); 122 else 123 p->fgrp = dupfgrp(nil); 124 } 125 else { 126 p->fgrp = up->fgrp; 127 incref(p->fgrp); 128 } 129 130 /* Process groups */ 131 if(flag & (RFNAMEG|RFCNAMEG)) { 132 p->pgrp = newpgrp(); 133 if(flag & RFNAMEG) 134 pgrpcpy(p->pgrp, up->pgrp); 135 /* inherit noattach */ 136 p->pgrp->noattach = up->pgrp->noattach; 137 } 138 else { 139 p->pgrp = up->pgrp; 140 incref(p->pgrp); 141 } 142 if(flag & RFNOMNT) 143 up->pgrp->noattach = 1; 144 145 if(flag & RFREND) 146 p->rgrp = newrgrp(); 147 else { 148 incref(up->rgrp); 149 p->rgrp = up->rgrp; 150 } 151 152 /* Environment group */ 153 if(flag & (RFENVG|RFCENVG)) { 154 p->egrp = smalloc(sizeof(Egrp)); 155 p->egrp->ref = 1; 156 if(flag & RFENVG) 157 envcpy(p->egrp, up->egrp); 158 } 159 else { 160 p->egrp = up->egrp; 161 incref(p->egrp); 162 } 163 p->hang = up->hang; 164 p->procmode = up->procmode; 165 166 /* Craft a return frame which will cause the child to pop out of 167 * the scheduler in user mode with the return register zero 168 */ 169 forkchild(p, up->dbgreg); 170 171 p->parent = up; 172 p->parentpid = up->pid; 173 if(flag&RFNOWAIT) 174 p->parentpid = 0; 175 else { 176 lock(&up->exl); 177 up->nchild++; 178 unlock(&up->exl); 179 } 180 if((flag&RFNOTEG) == 0) 181 p->noteid = up->noteid; 182 183 p->fpstate = up->fpstate; 184 pid = p->pid; 185 memset(p->time, 0, sizeof(p->time)); 186 p->time[TReal] = MACHP(0)->ticks; 187 188 kstrdup(&p->text, up->text); 189 kstrdup(&p->user, up->user); 190 /* 191 * since the bss/data segments are now shareable, 192 * any mmu info about this process is now stale 193 * (i.e. has bad properties) and has to be discarded. 194 */ 195 flushmmu(); 196 p->basepri = up->basepri; 197 p->priority = up->basepri; 198 p->fixedpri = up->fixedpri; 199 p->mp = up->mp; 200 wm = up->wired; 201 if(wm) 202 procwired(p, wm->machno); 203 ready(p); 204 sched(); 205 return pid; 206 } 207 208 static ulong 209 l2be(long l) 210 { 211 uchar *cp; 212 213 cp = (uchar*)&l; 214 return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3]; 215 } 216 217 long 218 sysexec(ulong *arg) 219 { 220 Segment *s, *ts; 221 ulong t, d, b; 222 int i; 223 Chan *tc; 224 char **argv, **argp; 225 char *a, *charp, *args, *file; 226 char *progarg[sizeof(Exec)/2+1], *elem, progelem[64]; 227 ulong ssize, spage, nargs, nbytes, n, bssend; 228 int indir; 229 Exec exec; 230 char line[sizeof(Exec)]; 231 Fgrp *f; 232 Image *img; 233 ulong magic, text, entry, data, bss; 234 Tos *tos; 235 236 validaddr(arg[0], 1, 0); 237 file = (char*)arg[0]; 238 indir = 0; 239 elem = nil; 240 if(waserror()){ 241 free(elem); 242 nexterror(); 243 } 244 for(;;){ 245 tc = namec(file, Aopen, OEXEC, 0); 246 if(waserror()){ 247 cclose(tc); 248 nexterror(); 249 } 250 if(!indir) 251 kstrdup(&elem, up->genbuf); 252 253 n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0); 254 if(n < 2) 255 error(Ebadexec); 256 magic = l2be(exec.magic); 257 text = l2be(exec.text); 258 entry = l2be(exec.entry); 259 if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){ 260 if(text >= USTKTOP-UTZERO 261 || entry < UTZERO+sizeof(Exec) 262 || entry >= UTZERO+sizeof(Exec)+text) 263 error(Ebadexec); 264 break; /* for binary */ 265 } 266 267 /* 268 * Process #! /bin/sh args ... 269 */ 270 memmove(line, &exec, sizeof(Exec)); 271 if(indir || line[0]!='#' || line[1]!='!') 272 error(Ebadexec); 273 n = shargs(line, n, progarg); 274 if(n == 0) 275 error(Ebadexec); 276 indir = 1; 277 /* 278 * First arg becomes complete file name 279 */ 280 progarg[n++] = file; 281 progarg[n] = 0; 282 validaddr(arg[1], BY2WD, 1); 283 arg[1] += BY2WD; 284 file = progarg[0]; 285 if(strlen(elem) >= sizeof progelem) 286 error(Ebadexec); 287 strcpy(progelem, elem); 288 progarg[0] = progelem; 289 poperror(); 290 cclose(tc); 291 } 292 293 data = l2be(exec.data); 294 bss = l2be(exec.bss); 295 t = (UTZERO+sizeof(Exec)+text+(BY2PG-1)) & ~(BY2PG-1); 296 d = (t + data + (BY2PG-1)) & ~(BY2PG-1); 297 bssend = t + data + bss; 298 b = (bssend + (BY2PG-1)) & ~(BY2PG-1); 299 if(t >= KZERO || d >= KZERO || b >= KZERO) 300 error(Ebadexec); 301 302 /* 303 * Args: pass 1: count 304 */ 305 nbytes = sizeof(Tos); /* hole for profiling clock at top of stack (and more) */ 306 nargs = 0; 307 if(indir){ 308 argp = progarg; 309 while(*argp){ 310 a = *argp++; 311 nbytes += strlen(a) + 1; 312 nargs++; 313 } 314 } 315 evenaddr(arg[1]); 316 argp = (char**)arg[1]; 317 validaddr((ulong)argp, BY2WD, 0); 318 while(*argp){ 319 a = *argp++; 320 if(((ulong)argp&(BY2PG-1)) < BY2WD) 321 validaddr((ulong)argp, BY2WD, 0); 322 validaddr((ulong)a, 1, 0); 323 nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1; 324 nargs++; 325 } 326 ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1)); 327 328 /* 329 * 8-byte align SP for those (e.g. sparc) that need it. 330 * execregs() will subtract another 4 bytes for argc. 331 */ 332 if((ssize+4) & 7) 333 ssize += 4; 334 spage = (ssize+(BY2PG-1)) >> PGSHIFT; 335 336 /* 337 * Build the stack segment, putting it in kernel virtual for the moment 338 */ 339 if(spage > TSTKSIZ) 340 error(Enovmem); 341 342 qlock(&up->seglock); 343 if(waserror()){ 344 qunlock(&up->seglock); 345 nexterror(); 346 } 347 up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG); 348 349 /* 350 * Args: pass 2: assemble; the pages will be faulted in 351 */ 352 tos = (Tos*)(TSTKTOP - sizeof(Tos)); 353 tos->cyclefreq = m->cyclefreq; 354 cycles((uvlong*)&tos->pcycles); 355 tos->pcycles = -tos->pcycles; 356 tos->kcycles = tos->pcycles; 357 tos->clock = 0; 358 argv = (char**)(TSTKTOP - ssize); 359 charp = (char*)(TSTKTOP - nbytes); 360 args = charp; 361 if(indir) 362 argp = progarg; 363 else 364 argp = (char**)arg[1]; 365 366 for(i=0; i<nargs; i++){ 367 if(indir && *argp==0) { 368 indir = 0; 369 argp = (char**)arg[1]; 370 } 371 *argv++ = charp + (USTKTOP-TSTKTOP); 372 n = strlen(*argp) + 1; 373 memmove(charp, *argp++, n); 374 charp += n; 375 } 376 377 free(up->text); 378 up->text = elem; 379 elem = nil; /* so waserror() won't free elem */ 380 USED(elem); 381 382 /* copy args; easiest from new process's stack */ 383 n = charp - args; 384 if(n > 128) /* don't waste too much space on huge arg lists */ 385 n = 128; 386 a = up->args; 387 up->args = nil; 388 free(a); 389 up->args = smalloc(n); 390 memmove(up->args, args, n); 391 if(n>0 && up->args[n-1]!='\0'){ 392 /* make sure last arg is NUL-terminated */ 393 /* put NUL at UTF-8 character boundary */ 394 for(i=n-1; i>0; --i) 395 if(fullrune(up->args+i, n-i)) 396 break; 397 up->args[i] = 0; 398 n = i+1; 399 } 400 up->nargs = n; 401 402 /* 403 * Committed. 404 * Free old memory. 405 * Special segments are maintained across exec 406 */ 407 for(i = SSEG; i <= BSEG; i++) { 408 putseg(up->seg[i]); 409 /* prevent a second free if we have an error */ 410 up->seg[i] = 0; 411 } 412 for(i = BSEG+1; i < NSEG; i++) { 413 s = up->seg[i]; 414 if(s != 0 && (s->type&SG_CEXEC)) { 415 putseg(s); 416 up->seg[i] = 0; 417 } 418 } 419 420 /* 421 * Close on exec 422 */ 423 f = up->fgrp; 424 for(i=0; i<=f->maxfd; i++) 425 fdclose(i, CCEXEC); 426 427 /* Text. Shared. Attaches to cache image if possible */ 428 /* attachimage returns a locked cache image */ 429 img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT); 430 ts = img->s; 431 up->seg[TSEG] = ts; 432 ts->flushme = 1; 433 ts->fstart = 0; 434 ts->flen = sizeof(Exec)+text; 435 unlock(img); 436 437 /* Data. Shared. */ 438 s = newseg(SG_DATA, t, (d-t)>>PGSHIFT); 439 up->seg[DSEG] = s; 440 441 /* Attached by hand */ 442 incref(img); 443 s->image = img; 444 s->fstart = ts->fstart+ts->flen; 445 s->flen = data; 446 447 /* BSS. Zero fill on demand */ 448 up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT); 449 450 /* 451 * Move the stack 452 */ 453 s = up->seg[ESEG]; 454 up->seg[ESEG] = 0; 455 up->seg[SSEG] = s; 456 qunlock(&up->seglock); 457 poperror(); /* seglock */ 458 poperror(); /* elem */ 459 s->base = USTKTOP-USTKSIZE; 460 s->top = USTKTOP; 461 relocateseg(s, USTKTOP-TSTKTOP); 462 463 /* 464 * '/' processes are higher priority (hack to make /ip more responsive). 465 */ 466 if(devtab[tc->type]->dc == L'/') 467 up->basepri = PriRoot; 468 up->priority = up->basepri; 469 poperror(); 470 cclose(tc); 471 472 /* 473 * At this point, the mmu contains info about the old address 474 * space and needs to be flushed 475 */ 476 flushmmu(); 477 qlock(&up->debug); 478 up->nnote = 0; 479 up->notify = 0; 480 up->notified = 0; 481 up->privatemem = 0; 482 procsetup(up); 483 qunlock(&up->debug); 484 if(up->hang) 485 up->procctl = Proc_stopme; 486 487 return execregs(entry, ssize, nargs); 488 } 489 490 int 491 shargs(char *s, int n, char **ap) 492 { 493 int i; 494 495 s += 2; 496 n -= 2; /* skip #! */ 497 for(i=0; s[i]!='\n'; i++) 498 if(i == n-1) 499 return 0; 500 s[i] = 0; 501 *ap = 0; 502 i = 0; 503 for(;;) { 504 while(*s==' ' || *s=='\t') 505 s++; 506 if(*s == 0) 507 break; 508 i++; 509 *ap++ = s; 510 *ap = 0; 511 while(*s && *s!=' ' && *s!='\t') 512 s++; 513 if(*s == 0) 514 break; 515 else 516 *s++ = 0; 517 } 518 return i; 519 } 520 521 int 522 return0(void*) 523 { 524 return 0; 525 } 526 527 long 528 syssleep(ulong *arg) 529 { 530 531 int n; 532 533 n = arg[0]; 534 if(n <= 0) { 535 if (up->edf && (up->edf->flags & Admitted)) 536 edfyield(); 537 else 538 yield(); 539 return 0; 540 } 541 if(n < TK2MS(1)) 542 n = TK2MS(1); 543 tsleep(&up->sleep, return0, 0, n); 544 return 0; 545 } 546 547 long 548 sysalarm(ulong *arg) 549 { 550 return procalarm(arg[0]); 551 } 552 553 long 554 sysexits(ulong *arg) 555 { 556 char *status; 557 char *inval = "invalid exit string"; 558 char buf[ERRMAX]; 559 560 status = (char*)arg[0]; 561 if(status){ 562 if(waserror()) 563 status = inval; 564 else{ 565 validaddr((ulong)status, 1, 0); 566 if(vmemchr(status, 0, ERRMAX) == 0){ 567 memmove(buf, status, ERRMAX); 568 buf[ERRMAX-1] = 0; 569 status = buf; 570 } 571 poperror(); 572 } 573 574 } 575 pexit(status, 1); 576 return 0; /* not reached */ 577 } 578 579 long 580 sys_wait(ulong *arg) 581 { 582 int pid; 583 Waitmsg w; 584 OWaitmsg *ow; 585 586 if(arg[0] == 0) 587 return pwait(nil); 588 589 validaddr(arg[0], sizeof(OWaitmsg), 1); 590 evenaddr(arg[0]); 591 pid = pwait(&w); 592 if(pid >= 0){ 593 ow = (OWaitmsg*)arg[0]; 594 readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE); 595 readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE); 596 readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE); 597 readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE); 598 strncpy(ow->msg, w.msg, sizeof(ow->msg)); 599 ow->msg[sizeof(ow->msg)-1] = '\0'; 600 } 601 return pid; 602 } 603 604 long 605 sysawait(ulong *arg) 606 { 607 int i; 608 int pid; 609 Waitmsg w; 610 ulong n; 611 612 n = arg[1]; 613 validaddr(arg[0], n, 1); 614 pid = pwait(&w); 615 if(pid < 0) 616 return -1; 617 i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q", 618 w.pid, 619 w.time[TUser], w.time[TSys], w.time[TReal], 620 w.msg); 621 622 return i; 623 } 624 625 long 626 sysdeath(ulong*) 627 { 628 pprint("deprecated system call\n"); 629 pexit("Suicide", 0); 630 return 0; /* not reached */ 631 } 632 633 void 634 werrstr(char *fmt, ...) 635 { 636 va_list va; 637 638 if(up == nil) 639 return; 640 641 va_start(va, fmt); 642 vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va); 643 va_end(va); 644 } 645 646 static long 647 generrstr(char *buf, uint nbuf) 648 { 649 char tmp[ERRMAX]; 650 651 if(nbuf == 0) 652 error(Ebadarg); 653 validaddr((ulong)buf, nbuf, 1); 654 if(nbuf > sizeof tmp) 655 nbuf = sizeof tmp; 656 memmove(tmp, buf, nbuf); 657 658 /* make sure it's NUL-terminated */ 659 tmp[nbuf-1] = '\0'; 660 memmove(buf, up->syserrstr, nbuf); 661 buf[nbuf-1] = '\0'; 662 memmove(up->syserrstr, tmp, nbuf); 663 return 0; 664 } 665 666 long 667 syserrstr(ulong *arg) 668 { 669 return generrstr((char*)arg[0], arg[1]); 670 } 671 672 /* compatibility for old binaries */ 673 long 674 sys_errstr(ulong *arg) 675 { 676 return generrstr((char*)arg[0], 64); 677 } 678 679 long 680 sysnotify(ulong *arg) 681 { 682 if(arg[0] != 0) 683 validaddr(arg[0], sizeof(ulong), 0); 684 up->notify = (int(*)(void*, char*))(arg[0]); 685 return 0; 686 } 687 688 long 689 sysnoted(ulong *arg) 690 { 691 if(arg[0]!=NRSTR && !up->notified) 692 error(Egreg); 693 return 0; 694 } 695 696 long 697 syssegbrk(ulong *arg) 698 { 699 int i; 700 ulong addr; 701 Segment *s; 702 703 addr = arg[0]; 704 for(i = 0; i < NSEG; i++) { 705 s = up->seg[i]; 706 if(s == 0 || addr < s->base || addr >= s->top) 707 continue; 708 switch(s->type&SG_TYPE) { 709 case SG_TEXT: 710 case SG_DATA: 711 case SG_STACK: 712 error(Ebadarg); 713 default: 714 return ibrk(arg[1], i); 715 } 716 } 717 718 error(Ebadarg); 719 return 0; /* not reached */ 720 } 721 722 long 723 syssegattach(ulong *arg) 724 { 725 return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]); 726 } 727 728 long 729 syssegdetach(ulong *arg) 730 { 731 int i; 732 ulong addr; 733 Segment *s; 734 735 qlock(&up->seglock); 736 if(waserror()){ 737 qunlock(&up->seglock); 738 nexterror(); 739 } 740 741 s = 0; 742 addr = arg[0]; 743 for(i = 0; i < NSEG; i++) 744 if(s = up->seg[i]) { 745 qlock(&s->lk); 746 if((addr >= s->base && addr < s->top) || 747 (s->top == s->base && addr == s->base)) 748 goto found; 749 qunlock(&s->lk); 750 } 751 752 error(Ebadarg); 753 754 found: 755 /* 756 * Check we are not detaching the initial stack segment. 757 */ 758 if(s == up->seg[SSEG]){ 759 qunlock(&s->lk); 760 error(Ebadarg); 761 } 762 up->seg[i] = 0; 763 qunlock(&s->lk); 764 putseg(s); 765 qunlock(&up->seglock); 766 poperror(); 767 768 /* Ensure we flush any entries from the lost segment */ 769 flushmmu(); 770 return 0; 771 } 772 773 long 774 syssegfree(ulong *arg) 775 { 776 Segment *s; 777 ulong from, to; 778 779 from = arg[0]; 780 s = seg(up, from, 1); 781 if(s == nil) 782 error(Ebadarg); 783 to = (from + arg[1]) & ~(BY2PG-1); 784 from = PGROUND(from); 785 786 if(to > s->top) { 787 qunlock(&s->lk); 788 error(Ebadarg); 789 } 790 791 mfreeseg(s, from, (to - from) / BY2PG); 792 qunlock(&s->lk); 793 flushmmu(); 794 795 return 0; 796 } 797 798 /* For binary compatibility */ 799 long 800 sysbrk_(ulong *arg) 801 { 802 return ibrk(arg[0], BSEG); 803 } 804 805 long 806 sysrendezvous(ulong *arg) 807 { 808 uintptr tag, val; 809 Proc *p, **l; 810 811 tag = arg[0]; 812 l = &REND(up->rgrp, tag); 813 up->rendval = ~(uintptr)0; 814 815 lock(up->rgrp); 816 for(p = *l; p; p = p->rendhash) { 817 if(p->rendtag == tag) { 818 *l = p->rendhash; 819 val = p->rendval; 820 p->rendval = arg[1]; 821 822 while(p->mach != 0) 823 ; 824 ready(p); 825 unlock(up->rgrp); 826 return val; 827 } 828 l = &p->rendhash; 829 } 830 831 /* Going to sleep here */ 832 up->rendtag = tag; 833 up->rendval = arg[1]; 834 up->rendhash = *l; 835 *l = up; 836 up->state = Rendezvous; 837 unlock(up->rgrp); 838 839 sched(); 840 841 return up->rendval; 842 } 843 844 /* 845 * The implementation of semaphores is complicated by needing 846 * to avoid rescheduling in syssemrelease, so that it is safe 847 * to call from real-time processes. This means syssemrelease 848 * cannot acquire any qlocks, only spin locks. 849 * 850 * Semacquire and semrelease must both manipulate the semaphore 851 * wait list. Lock-free linked lists only exist in theory, not 852 * in practice, so the wait list is protected by a spin lock. 853 * 854 * The semaphore value *addr is stored in user memory, so it 855 * cannot be read or written while holding spin locks. 856 * 857 * Thus, we can access the list only when holding the lock, and 858 * we can access the semaphore only when not holding the lock. 859 * This makes things interesting. Note that sleep's condition function 860 * is called while holding two locks - r and up->rlock - so it cannot 861 * access the semaphore value either. 862 * 863 * An acquirer announces its intention to try for the semaphore 864 * by putting a Sema structure onto the wait list and then 865 * setting Sema.waiting. After one last check of semaphore, 866 * the acquirer sleeps until Sema.waiting==0. A releaser of n 867 * must wake up n acquirers who have Sema.waiting set. It does 868 * this by clearing Sema.waiting and then calling wakeup. 869 * 870 * There are three interesting races here. 871 872 * The first is that in this particular sleep/wakeup usage, a single 873 * wakeup can rouse a process from two consecutive sleeps! 874 * The ordering is: 875 * 876 * (a) set Sema.waiting = 1 877 * (a) call sleep 878 * (b) set Sema.waiting = 0 879 * (a) check Sema.waiting inside sleep, return w/o sleeping 880 * (a) try for semaphore, fail 881 * (a) set Sema.waiting = 1 882 * (a) call sleep 883 * (b) call wakeup(a) 884 * (a) wake up again 885 * 886 * This is okay - semacquire will just go around the loop 887 * again. It does mean that at the top of the for(;;) loop in 888 * semacquire, phore.waiting might already be set to 1. 889 * 890 * The second is that a releaser might wake an acquirer who is 891 * interrupted before he can acquire the lock. Since 892 * release(n) issues only n wakeup calls -- only n can be used 893 * anyway -- if the interrupted process is not going to use his 894 * wakeup call he must pass it on to another acquirer. 895 * 896 * The third race is similar to the second but more subtle. An 897 * acquirer sets waiting=1 and then does a final canacquire() 898 * before going to sleep. The opposite order would result in 899 * missing wakeups that happen between canacquire and 900 * waiting=1. (In fact, the whole point of Sema.waiting is to 901 * avoid missing wakeups between canacquire() and sleep().) But 902 * there can be spurious wakeups between a successful 903 * canacquire() and the following semdequeue(). This wakeup is 904 * not useful to the acquirer, since he has already acquired 905 * the semaphore. Like in the previous case, though, the 906 * acquirer must pass the wakeup call along. 907 * 908 * This is all rather subtle. The code below has been verified 909 * with the spin model /sys/src/9/port/semaphore.p. The 910 * original code anticipated the second race but not the first 911 * or third, which were caught only with spin. The first race 912 * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it. 913 * It was lucky that my abstract model of sleep/wakeup still managed 914 * to preserve that behavior. 915 * 916 * I remain slightly concerned about memory coherence 917 * outside of locks. The spin model does not take 918 * queued processor writes into account so we have to 919 * think hard. The only variables accessed outside locks 920 * are the semaphore value itself and the boolean flag 921 * Sema.waiting. The value is only accessed with cmpswap, 922 * whose job description includes doing the right thing as 923 * far as memory coherence across processors. That leaves 924 * Sema.waiting. To handle it, we call coherence() before each 925 * read and after each write. - rsc 926 */ 927 928 /* Add semaphore p with addr a to list in seg. */ 929 static void 930 semqueue(Segment *s, long *a, Sema *p) 931 { 932 memset(p, 0, sizeof *p); 933 p->addr = a; 934 lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */ 935 p->next = &s->sema; 936 p->prev = s->sema.prev; 937 p->next->prev = p; 938 p->prev->next = p; 939 unlock(&s->sema); 940 } 941 942 /* Remove semaphore p from list in seg. */ 943 static void 944 semdequeue(Segment *s, Sema *p) 945 { 946 lock(&s->sema); 947 p->next->prev = p->prev; 948 p->prev->next = p->next; 949 unlock(&s->sema); 950 } 951 952 /* Wake up n waiters with addr a on list in seg. */ 953 static void 954 semwakeup(Segment *s, long *a, long n) 955 { 956 Sema *p; 957 958 lock(&s->sema); 959 for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){ 960 if(p->addr == a && p->waiting){ 961 p->waiting = 0; 962 coherence(); 963 wakeup(p); 964 n--; 965 } 966 } 967 unlock(&s->sema); 968 } 969 970 /* Add delta to semaphore and wake up waiters as appropriate. */ 971 static long 972 semrelease(Segment *s, long *addr, long delta) 973 { 974 long value; 975 976 do 977 value = *addr; 978 while(!cmpswap(addr, value, value+delta)); 979 semwakeup(s, addr, delta); 980 return value+delta; 981 } 982 983 /* Try to acquire semaphore using compare-and-swap */ 984 static int 985 canacquire(long *addr) 986 { 987 long value; 988 989 while((value=*addr) > 0) 990 if(cmpswap(addr, value, value-1)) 991 return 1; 992 return 0; 993 } 994 995 /* Should we wake up? */ 996 static int 997 semawoke(void *p) 998 { 999 coherence(); 1000 return !((Sema*)p)->waiting; 1001 } 1002 1003 /* Acquire semaphore (subtract 1). */ 1004 static int 1005 semacquire(Segment *s, long *addr, int block) 1006 { 1007 int acquired; 1008 Sema phore; 1009 1010 if(canacquire(addr)) 1011 return 1; 1012 if(!block) 1013 return 0; 1014 1015 acquired = 0; 1016 semqueue(s, addr, &phore); 1017 for(;;){ 1018 phore.waiting = 1; 1019 coherence(); 1020 if(canacquire(addr)){ 1021 acquired = 1; 1022 break; 1023 } 1024 if(waserror()) 1025 break; 1026 sleep(&phore, semawoke, &phore); 1027 poperror(); 1028 } 1029 semdequeue(s, &phore); 1030 coherence(); /* not strictly necessary due to lock in semdequeue */ 1031 if(!phore.waiting) 1032 semwakeup(s, addr, 1); 1033 if(!acquired) 1034 nexterror(); 1035 return 1; 1036 } 1037 1038 long 1039 syssemacquire(ulong *arg) 1040 { 1041 int block; 1042 long *addr; 1043 Segment *s; 1044 1045 validaddr(arg[0], sizeof(long), 1); 1046 evenaddr(arg[0]); 1047 addr = (long*)arg[0]; 1048 block = arg[1]; 1049 1050 if((s = seg(up, (ulong)addr, 0)) == nil) 1051 error(Ebadarg); 1052 if(*addr < 0) 1053 error(Ebadarg); 1054 return semacquire(s, addr, block); 1055 } 1056 1057 long 1058 syssemrelease(ulong *arg) 1059 { 1060 long *addr, delta; 1061 Segment *s; 1062 1063 validaddr(arg[0], sizeof(long), 1); 1064 evenaddr(arg[0]); 1065 addr = (long*)arg[0]; 1066 delta = arg[1]; 1067 1068 if((s = seg(up, (ulong)addr, 0)) == nil) 1069 error(Ebadarg); 1070 if(delta < 0 || *addr < 0) 1071 error(Ebadarg); 1072 return semrelease(s, addr, arg[1]); 1073 } 1074