1 #include "u.h" 2 #include "tos.h" 3 #include "../port/lib.h" 4 #include "mem.h" 5 #include "dat.h" 6 #include "fns.h" 7 #include "../port/error.h" 8 #include "../port/edf.h" 9 10 #include <a.out.h> 11 12 int shargs(char*, int, char**); 13 14 extern void checkpages(void); 15 extern void checkpagerefs(void); 16 17 long 18 sysr1(ulong*) 19 { 20 checkpagerefs(); 21 return 0; 22 } 23 24 long 25 sysrfork(ulong *arg) 26 { 27 Proc *p; 28 int n, i; 29 Fgrp *ofg; 30 Pgrp *opg; 31 Rgrp *org; 32 Egrp *oeg; 33 ulong pid, flag; 34 Mach *wm; 35 36 flag = arg[0]; 37 /* Check flags before we commit */ 38 if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) 39 error(Ebadarg); 40 if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG)) 41 error(Ebadarg); 42 if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG)) 43 error(Ebadarg); 44 45 if((flag&RFPROC) == 0) { 46 if(flag & (RFMEM|RFNOWAIT)) 47 error(Ebadarg); 48 if(flag & (RFFDG|RFCFDG)) { 49 ofg = up->fgrp; 50 if(flag & RFFDG) 51 up->fgrp = dupfgrp(ofg); 52 else 53 up->fgrp = dupfgrp(nil); 54 closefgrp(ofg); 55 } 56 if(flag & (RFNAMEG|RFCNAMEG)) { 57 opg = up->pgrp; 58 up->pgrp = newpgrp(); 59 if(flag & RFNAMEG) 60 pgrpcpy(up->pgrp, opg); 61 /* inherit noattach */ 62 up->pgrp->noattach = opg->noattach; 63 closepgrp(opg); 64 } 65 if(flag & RFNOMNT) 66 up->pgrp->noattach = 1; 67 if(flag & RFREND) { 68 org = up->rgrp; 69 up->rgrp = newrgrp(); 70 closergrp(org); 71 } 72 if(flag & (RFENVG|RFCENVG)) { 73 oeg = up->egrp; 74 up->egrp = smalloc(sizeof(Egrp)); 75 up->egrp->ref = 1; 76 if(flag & RFENVG) 77 envcpy(up->egrp, oeg); 78 closeegrp(oeg); 79 } 80 if(flag & RFNOTEG) 81 up->noteid = incref(¬eidalloc); 82 return 0; 83 } 84 85 p = newproc(); 86 87 p->fpsave = up->fpsave; 88 p->scallnr = up->scallnr; 89 p->s = up->s; 90 p->nerrlab = 0; 91 p->slash = up->slash; 92 p->dot = up->dot; 93 incref(p->dot); 94 95 memmove(p->note, up->note, sizeof(p->note)); 96 p->privatemem = up->privatemem; 97 p->noswap = up->noswap; 98 p->nnote = up->nnote; 99 p->notified = 0; 100 p->lastnote = up->lastnote; 101 p->notify = up->notify; 102 p->ureg = up->ureg; 103 p->dbgreg = 0; 104 105 /* Make a new set of memory segments */ 106 n = flag & RFMEM; 107 qlock(&p->seglock); 108 if(waserror()){ 109 qunlock(&p->seglock); 110 nexterror(); 111 } 112 for(i = 0; i < NSEG; i++) 113 if(up->seg[i]) 114 p->seg[i] = dupseg(up->seg, i, n); 115 qunlock(&p->seglock); 116 poperror(); 117 118 /* File descriptors */ 119 if(flag & (RFFDG|RFCFDG)) { 120 if(flag & RFFDG) 121 p->fgrp = dupfgrp(up->fgrp); 122 else 123 p->fgrp = dupfgrp(nil); 124 } 125 else { 126 p->fgrp = up->fgrp; 127 incref(p->fgrp); 128 } 129 130 /* Process groups */ 131 if(flag & (RFNAMEG|RFCNAMEG)) { 132 p->pgrp = newpgrp(); 133 if(flag & RFNAMEG) 134 pgrpcpy(p->pgrp, up->pgrp); 135 /* inherit noattach */ 136 p->pgrp->noattach = up->pgrp->noattach; 137 } 138 else { 139 p->pgrp = up->pgrp; 140 incref(p->pgrp); 141 } 142 if(flag & RFNOMNT) 143 p->pgrp->noattach = 1; 144 145 if(flag & RFREND) 146 p->rgrp = newrgrp(); 147 else { 148 incref(up->rgrp); 149 p->rgrp = up->rgrp; 150 } 151 152 /* Environment group */ 153 if(flag & (RFENVG|RFCENVG)) { 154 p->egrp = smalloc(sizeof(Egrp)); 155 p->egrp->ref = 1; 156 if(flag & RFENVG) 157 envcpy(p->egrp, up->egrp); 158 } 159 else { 160 p->egrp = up->egrp; 161 incref(p->egrp); 162 } 163 p->hang = up->hang; 164 p->procmode = up->procmode; 165 166 /* Craft a return frame which will cause the child to pop out of 167 * the scheduler in user mode with the return register zero 168 */ 169 forkchild(p, up->dbgreg); 170 171 p->parent = up; 172 p->parentpid = up->pid; 173 if(flag&RFNOWAIT) 174 p->parentpid = 0; 175 else { 176 lock(&up->exl); 177 up->nchild++; 178 unlock(&up->exl); 179 } 180 if((flag&RFNOTEG) == 0) 181 p->noteid = up->noteid; 182 183 /* don't penalize the child, it hasn't done FP in a note handler. */ 184 p->fpstate = up->fpstate & ~FPillegal; 185 pid = p->pid; 186 memset(p->time, 0, sizeof(p->time)); 187 p->time[TReal] = MACHP(0)->ticks; 188 189 kstrdup(&p->text, up->text); 190 kstrdup(&p->user, up->user); 191 /* 192 * since the bss/data segments are now shareable, 193 * any mmu info about this process is now stale 194 * (i.e. has bad properties) and has to be discarded. 195 */ 196 flushmmu(); 197 p->basepri = up->basepri; 198 p->priority = up->basepri; 199 p->fixedpri = up->fixedpri; 200 p->mp = up->mp; 201 wm = up->wired; 202 if(wm) 203 procwired(p, wm->machno); 204 ready(p); 205 sched(); 206 return pid; 207 } 208 209 ulong 210 l2be(long l) 211 { 212 uchar *cp; 213 214 cp = (uchar*)&l; 215 return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3]; 216 } 217 218 long 219 sysexec(ulong *arg) 220 { 221 Segment *s, *ts; 222 ulong t, d, b; 223 int i; 224 Chan *tc; 225 char **argv, **argp; 226 char *a, *charp, *args, *file, *file0; 227 char *progarg[sizeof(Exec)/2+1], *elem, progelem[64]; 228 ulong ssize, spage, nargs, nbytes, n, bssend; 229 int indir; 230 Exec exec; 231 char line[sizeof(Exec)]; 232 Fgrp *f; 233 Image *img; 234 ulong magic, text, entry, data, bss; 235 Tos *tos; 236 237 indir = 0; 238 elem = nil; 239 validaddr(arg[0], 1, 0); 240 file0 = validnamedup((char*)arg[0], 1); 241 if(waserror()){ 242 free(file0); 243 free(elem); 244 nexterror(); 245 } 246 file = file0; 247 for(;;){ 248 tc = namec(file, Aopen, OEXEC, 0); 249 if(waserror()){ 250 cclose(tc); 251 nexterror(); 252 } 253 if(!indir) 254 kstrdup(&elem, up->genbuf); 255 256 n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0); 257 if(n < 2) 258 error(Ebadexec); 259 magic = l2be(exec.magic); 260 text = l2be(exec.text); 261 entry = l2be(exec.entry); 262 if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){ 263 if(text >= USTKTOP-UTZERO 264 || entry < UTZERO+sizeof(Exec) 265 || entry >= UTZERO+sizeof(Exec)+text) 266 error(Ebadexec); 267 break; /* for binary */ 268 } 269 270 /* 271 * Process #! /bin/sh args ... 272 */ 273 memmove(line, &exec, sizeof(Exec)); 274 if(indir || line[0]!='#' || line[1]!='!') 275 error(Ebadexec); 276 n = shargs(line, n, progarg); 277 if(n == 0) 278 error(Ebadexec); 279 indir = 1; 280 /* 281 * First arg becomes complete file name 282 */ 283 progarg[n++] = file; 284 progarg[n] = 0; 285 validaddr(arg[1], BY2WD, 1); 286 arg[1] += BY2WD; 287 file = progarg[0]; 288 if(strlen(elem) >= sizeof progelem) 289 error(Ebadexec); 290 strcpy(progelem, elem); 291 progarg[0] = progelem; 292 poperror(); 293 cclose(tc); 294 } 295 296 data = l2be(exec.data); 297 bss = l2be(exec.bss); 298 t = UTROUND(UTZERO+sizeof(Exec)+text); 299 d = (t + data + (BY2PG-1)) & ~(BY2PG-1); 300 bssend = t + data + bss; 301 b = (bssend + (BY2PG-1)) & ~(BY2PG-1); 302 if(t >= KZERO || d >= KZERO || b >= KZERO) 303 error(Ebadexec); 304 305 /* 306 * Args: pass 1: count 307 */ 308 nbytes = sizeof(Tos); /* hole for profiling clock at top of stack (and more) */ 309 nargs = 0; 310 if(indir){ 311 argp = progarg; 312 while(*argp){ 313 a = *argp++; 314 nbytes += strlen(a) + 1; 315 nargs++; 316 } 317 } 318 evenaddr(arg[1]); 319 argp = (char**)arg[1]; 320 validaddr((ulong)argp, BY2WD, 0); 321 while(*argp){ 322 a = *argp++; 323 if(((ulong)argp&(BY2PG-1)) < BY2WD) 324 validaddr((ulong)argp, BY2WD, 0); 325 validaddr((ulong)a, 1, 0); 326 nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1; 327 nargs++; 328 } 329 ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1)); 330 331 /* 332 * 8-byte align SP for those (e.g. sparc) that need it. 333 * execregs() will subtract another 4 bytes for argc. 334 */ 335 if((ssize+4) & 7) 336 ssize += 4; 337 spage = (ssize+(BY2PG-1)) >> PGSHIFT; 338 339 /* 340 * Build the stack segment, putting it in kernel virtual for the moment 341 */ 342 if(spage > TSTKSIZ) 343 error(Enovmem); 344 345 qlock(&up->seglock); 346 if(waserror()){ 347 qunlock(&up->seglock); 348 nexterror(); 349 } 350 up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG); 351 352 /* 353 * Args: pass 2: assemble; the pages will be faulted in 354 */ 355 tos = (Tos*)(TSTKTOP - sizeof(Tos)); 356 tos->cyclefreq = m->cyclefreq; 357 cycles((uvlong*)&tos->pcycles); 358 tos->pcycles = -tos->pcycles; 359 tos->kcycles = tos->pcycles; 360 tos->clock = 0; 361 argv = (char**)(TSTKTOP - ssize); 362 charp = (char*)(TSTKTOP - nbytes); 363 args = charp; 364 if(indir) 365 argp = progarg; 366 else 367 argp = (char**)arg[1]; 368 369 for(i=0; i<nargs; i++){ 370 if(indir && *argp==0) { 371 indir = 0; 372 argp = (char**)arg[1]; 373 } 374 *argv++ = charp + (USTKTOP-TSTKTOP); 375 n = strlen(*argp) + 1; 376 memmove(charp, *argp++, n); 377 charp += n; 378 } 379 free(file0); 380 381 free(up->text); 382 up->text = elem; 383 elem = nil; /* so waserror() won't free elem */ 384 USED(elem); 385 386 /* copy args; easiest from new process's stack */ 387 n = charp - args; 388 if(n > 128) /* don't waste too much space on huge arg lists */ 389 n = 128; 390 a = up->args; 391 up->args = nil; 392 free(a); 393 up->args = smalloc(n); 394 memmove(up->args, args, n); 395 if(n>0 && up->args[n-1]!='\0'){ 396 /* make sure last arg is NUL-terminated */ 397 /* put NUL at UTF-8 character boundary */ 398 for(i=n-1; i>0; --i) 399 if(fullrune(up->args+i, n-i)) 400 break; 401 up->args[i] = 0; 402 n = i+1; 403 } 404 up->nargs = n; 405 406 /* 407 * Committed. 408 * Free old memory. 409 * Special segments are maintained across exec 410 */ 411 for(i = SSEG; i <= BSEG; i++) { 412 putseg(up->seg[i]); 413 /* prevent a second free if we have an error */ 414 up->seg[i] = 0; 415 } 416 for(i = BSEG+1; i < NSEG; i++) { 417 s = up->seg[i]; 418 if(s != 0 && (s->type&SG_CEXEC)) { 419 putseg(s); 420 up->seg[i] = 0; 421 } 422 } 423 424 /* 425 * Close on exec 426 */ 427 f = up->fgrp; 428 for(i=0; i<=f->maxfd; i++) 429 fdclose(i, CCEXEC); 430 431 /* Text. Shared. Attaches to cache image if possible */ 432 /* attachimage returns a locked cache image */ 433 img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT); 434 ts = img->s; 435 up->seg[TSEG] = ts; 436 ts->flushme = 1; 437 ts->fstart = 0; 438 ts->flen = sizeof(Exec)+text; 439 unlock(img); 440 441 /* Data. Shared. */ 442 s = newseg(SG_DATA, t, (d-t)>>PGSHIFT); 443 up->seg[DSEG] = s; 444 445 /* Attached by hand */ 446 incref(img); 447 s->image = img; 448 s->fstart = ts->fstart+ts->flen; 449 s->flen = data; 450 451 /* BSS. Zero fill on demand */ 452 up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT); 453 454 /* 455 * Move the stack 456 */ 457 s = up->seg[ESEG]; 458 up->seg[ESEG] = 0; 459 up->seg[SSEG] = s; 460 qunlock(&up->seglock); 461 poperror(); /* seglock */ 462 poperror(); /* elem */ 463 s->base = USTKTOP-USTKSIZE; 464 s->top = USTKTOP; 465 relocateseg(s, USTKTOP-TSTKTOP); 466 467 /* 468 * '/' processes are higher priority (hack to make /ip more responsive). 469 */ 470 if(devtab[tc->type]->dc == L'/') 471 up->basepri = PriRoot; 472 up->priority = up->basepri; 473 poperror(); 474 cclose(tc); 475 476 /* 477 * At this point, the mmu contains info about the old address 478 * space and needs to be flushed 479 */ 480 flushmmu(); 481 qlock(&up->debug); 482 up->nnote = 0; 483 up->notify = 0; 484 up->notified = 0; 485 up->privatemem = 0; 486 procsetup(up); 487 qunlock(&up->debug); 488 if(up->hang) 489 up->procctl = Proc_stopme; 490 491 return execregs(entry, ssize, nargs); 492 } 493 494 int 495 shargs(char *s, int n, char **ap) 496 { 497 int i; 498 499 s += 2; 500 n -= 2; /* skip #! */ 501 for(i=0; s[i]!='\n'; i++) 502 if(i == n-1) 503 return 0; 504 s[i] = 0; 505 *ap = 0; 506 i = 0; 507 for(;;) { 508 while(*s==' ' || *s=='\t') 509 s++; 510 if(*s == 0) 511 break; 512 i++; 513 *ap++ = s; 514 *ap = 0; 515 while(*s && *s!=' ' && *s!='\t') 516 s++; 517 if(*s == 0) 518 break; 519 else 520 *s++ = 0; 521 } 522 return i; 523 } 524 525 int 526 return0(void*) 527 { 528 return 0; 529 } 530 531 long 532 syssleep(ulong *arg) 533 { 534 535 int n; 536 537 n = arg[0]; 538 if(n <= 0) { 539 if (up->edf && (up->edf->flags & Admitted)) 540 edfyield(); 541 else 542 yield(); 543 return 0; 544 } 545 if(n < TK2MS(1)) 546 n = TK2MS(1); 547 tsleep(&up->sleep, return0, 0, n); 548 return 0; 549 } 550 551 long 552 sysalarm(ulong *arg) 553 { 554 return procalarm(arg[0]); 555 } 556 557 long 558 sysexits(ulong *arg) 559 { 560 char *status; 561 char *inval = "invalid exit string"; 562 char buf[ERRMAX]; 563 564 status = (char*)arg[0]; 565 if(status){ 566 if(waserror()) 567 status = inval; 568 else{ 569 validaddr((ulong)status, 1, 0); 570 if(vmemchr(status, 0, ERRMAX) == 0){ 571 memmove(buf, status, ERRMAX); 572 buf[ERRMAX-1] = 0; 573 status = buf; 574 } 575 poperror(); 576 } 577 578 } 579 pexit(status, 1); 580 return 0; /* not reached */ 581 } 582 583 long 584 sys_wait(ulong *arg) 585 { 586 int pid; 587 Waitmsg w; 588 OWaitmsg *ow; 589 590 if(arg[0] == 0) 591 return pwait(nil); 592 593 validaddr(arg[0], sizeof(OWaitmsg), 1); 594 evenaddr(arg[0]); 595 pid = pwait(&w); 596 if(pid >= 0){ 597 ow = (OWaitmsg*)arg[0]; 598 readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE); 599 readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE); 600 readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE); 601 readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE); 602 strncpy(ow->msg, w.msg, sizeof(ow->msg)); 603 ow->msg[sizeof(ow->msg)-1] = '\0'; 604 } 605 return pid; 606 } 607 608 long 609 sysawait(ulong *arg) 610 { 611 int i; 612 int pid; 613 Waitmsg w; 614 ulong n; 615 616 n = arg[1]; 617 validaddr(arg[0], n, 1); 618 pid = pwait(&w); 619 if(pid < 0) 620 return -1; 621 i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q", 622 w.pid, 623 w.time[TUser], w.time[TSys], w.time[TReal], 624 w.msg); 625 626 return i; 627 } 628 629 void 630 werrstr(char *fmt, ...) 631 { 632 va_list va; 633 634 if(up == nil) 635 return; 636 637 va_start(va, fmt); 638 vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va); 639 va_end(va); 640 } 641 642 static long 643 generrstr(char *buf, uint nbuf) 644 { 645 char tmp[ERRMAX]; 646 647 if(nbuf == 0) 648 error(Ebadarg); 649 validaddr((ulong)buf, nbuf, 1); 650 if(nbuf > sizeof tmp) 651 nbuf = sizeof tmp; 652 memmove(tmp, buf, nbuf); 653 654 /* make sure it's NUL-terminated */ 655 tmp[nbuf-1] = '\0'; 656 memmove(buf, up->syserrstr, nbuf); 657 buf[nbuf-1] = '\0'; 658 memmove(up->syserrstr, tmp, nbuf); 659 return 0; 660 } 661 662 long 663 syserrstr(ulong *arg) 664 { 665 return generrstr((char*)arg[0], arg[1]); 666 } 667 668 /* compatibility for old binaries */ 669 long 670 sys_errstr(ulong *arg) 671 { 672 return generrstr((char*)arg[0], 64); 673 } 674 675 long 676 sysnotify(ulong *arg) 677 { 678 if(arg[0] != 0) 679 validaddr(arg[0], sizeof(ulong), 0); 680 up->notify = (int(*)(void*, char*))(arg[0]); 681 return 0; 682 } 683 684 long 685 sysnoted(ulong *arg) 686 { 687 if(arg[0]!=NRSTR && !up->notified) 688 error(Egreg); 689 return 0; 690 } 691 692 long 693 syssegbrk(ulong *arg) 694 { 695 int i; 696 ulong addr; 697 Segment *s; 698 699 addr = arg[0]; 700 for(i = 0; i < NSEG; i++) { 701 s = up->seg[i]; 702 if(s == 0 || addr < s->base || addr >= s->top) 703 continue; 704 switch(s->type&SG_TYPE) { 705 case SG_TEXT: 706 case SG_DATA: 707 case SG_STACK: 708 error(Ebadarg); 709 default: 710 return ibrk(arg[1], i); 711 } 712 } 713 714 error(Ebadarg); 715 return 0; /* not reached */ 716 } 717 718 long 719 syssegattach(ulong *arg) 720 { 721 return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]); 722 } 723 724 long 725 syssegdetach(ulong *arg) 726 { 727 int i; 728 ulong addr; 729 Segment *s; 730 731 qlock(&up->seglock); 732 if(waserror()){ 733 qunlock(&up->seglock); 734 nexterror(); 735 } 736 737 s = 0; 738 addr = arg[0]; 739 for(i = 0; i < NSEG; i++) 740 if(s = up->seg[i]) { 741 qlock(&s->lk); 742 if((addr >= s->base && addr < s->top) || 743 (s->top == s->base && addr == s->base)) 744 goto found; 745 qunlock(&s->lk); 746 } 747 748 error(Ebadarg); 749 750 found: 751 /* 752 * Check we are not detaching the initial stack segment. 753 */ 754 if(s == up->seg[SSEG]){ 755 qunlock(&s->lk); 756 error(Ebadarg); 757 } 758 up->seg[i] = 0; 759 qunlock(&s->lk); 760 putseg(s); 761 qunlock(&up->seglock); 762 poperror(); 763 764 /* Ensure we flush any entries from the lost segment */ 765 flushmmu(); 766 return 0; 767 } 768 769 long 770 syssegfree(ulong *arg) 771 { 772 Segment *s; 773 ulong from, to; 774 775 from = arg[0]; 776 s = seg(up, from, 1); 777 if(s == nil) 778 error(Ebadarg); 779 to = (from + arg[1]) & ~(BY2PG-1); 780 from = PGROUND(from); 781 782 if(to > s->top) { 783 qunlock(&s->lk); 784 error(Ebadarg); 785 } 786 787 mfreeseg(s, from, (to - from) / BY2PG); 788 qunlock(&s->lk); 789 flushmmu(); 790 791 return 0; 792 } 793 794 /* For binary compatibility */ 795 long 796 sysbrk_(ulong *arg) 797 { 798 return ibrk(arg[0], BSEG); 799 } 800 801 long 802 sysrendezvous(ulong *arg) 803 { 804 uintptr tag, val; 805 Proc *p, **l; 806 807 tag = arg[0]; 808 l = &REND(up->rgrp, tag); 809 up->rendval = ~(uintptr)0; 810 811 lock(up->rgrp); 812 for(p = *l; p; p = p->rendhash) { 813 if(p->rendtag == tag) { 814 *l = p->rendhash; 815 val = p->rendval; 816 p->rendval = arg[1]; 817 818 while(p->mach != 0) 819 ; 820 ready(p); 821 unlock(up->rgrp); 822 return val; 823 } 824 l = &p->rendhash; 825 } 826 827 /* Going to sleep here */ 828 up->rendtag = tag; 829 up->rendval = arg[1]; 830 up->rendhash = *l; 831 *l = up; 832 up->state = Rendezvous; 833 unlock(up->rgrp); 834 835 sched(); 836 837 return up->rendval; 838 } 839 840 /* 841 * The implementation of semaphores is complicated by needing 842 * to avoid rescheduling in syssemrelease, so that it is safe 843 * to call from real-time processes. This means syssemrelease 844 * cannot acquire any qlocks, only spin locks. 845 * 846 * Semacquire and semrelease must both manipulate the semaphore 847 * wait list. Lock-free linked lists only exist in theory, not 848 * in practice, so the wait list is protected by a spin lock. 849 * 850 * The semaphore value *addr is stored in user memory, so it 851 * cannot be read or written while holding spin locks. 852 * 853 * Thus, we can access the list only when holding the lock, and 854 * we can access the semaphore only when not holding the lock. 855 * This makes things interesting. Note that sleep's condition function 856 * is called while holding two locks - r and up->rlock - so it cannot 857 * access the semaphore value either. 858 * 859 * An acquirer announces its intention to try for the semaphore 860 * by putting a Sema structure onto the wait list and then 861 * setting Sema.waiting. After one last check of semaphore, 862 * the acquirer sleeps until Sema.waiting==0. A releaser of n 863 * must wake up n acquirers who have Sema.waiting set. It does 864 * this by clearing Sema.waiting and then calling wakeup. 865 * 866 * There are three interesting races here. 867 868 * The first is that in this particular sleep/wakeup usage, a single 869 * wakeup can rouse a process from two consecutive sleeps! 870 * The ordering is: 871 * 872 * (a) set Sema.waiting = 1 873 * (a) call sleep 874 * (b) set Sema.waiting = 0 875 * (a) check Sema.waiting inside sleep, return w/o sleeping 876 * (a) try for semaphore, fail 877 * (a) set Sema.waiting = 1 878 * (a) call sleep 879 * (b) call wakeup(a) 880 * (a) wake up again 881 * 882 * This is okay - semacquire will just go around the loop 883 * again. It does mean that at the top of the for(;;) loop in 884 * semacquire, phore.waiting might already be set to 1. 885 * 886 * The second is that a releaser might wake an acquirer who is 887 * interrupted before he can acquire the lock. Since 888 * release(n) issues only n wakeup calls -- only n can be used 889 * anyway -- if the interrupted process is not going to use his 890 * wakeup call he must pass it on to another acquirer. 891 * 892 * The third race is similar to the second but more subtle. An 893 * acquirer sets waiting=1 and then does a final canacquire() 894 * before going to sleep. The opposite order would result in 895 * missing wakeups that happen between canacquire and 896 * waiting=1. (In fact, the whole point of Sema.waiting is to 897 * avoid missing wakeups between canacquire() and sleep().) But 898 * there can be spurious wakeups between a successful 899 * canacquire() and the following semdequeue(). This wakeup is 900 * not useful to the acquirer, since he has already acquired 901 * the semaphore. Like in the previous case, though, the 902 * acquirer must pass the wakeup call along. 903 * 904 * This is all rather subtle. The code below has been verified 905 * with the spin model /sys/src/9/port/semaphore.p. The 906 * original code anticipated the second race but not the first 907 * or third, which were caught only with spin. The first race 908 * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it. 909 * It was lucky that my abstract model of sleep/wakeup still managed 910 * to preserve that behavior. 911 * 912 * I remain slightly concerned about memory coherence 913 * outside of locks. The spin model does not take 914 * queued processor writes into account so we have to 915 * think hard. The only variables accessed outside locks 916 * are the semaphore value itself and the boolean flag 917 * Sema.waiting. The value is only accessed with cmpswap, 918 * whose job description includes doing the right thing as 919 * far as memory coherence across processors. That leaves 920 * Sema.waiting. To handle it, we call coherence() before each 921 * read and after each write. - rsc 922 */ 923 924 /* Add semaphore p with addr a to list in seg. */ 925 static void 926 semqueue(Segment *s, long *a, Sema *p) 927 { 928 memset(p, 0, sizeof *p); 929 p->addr = a; 930 lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */ 931 p->next = &s->sema; 932 p->prev = s->sema.prev; 933 p->next->prev = p; 934 p->prev->next = p; 935 unlock(&s->sema); 936 } 937 938 /* Remove semaphore p from list in seg. */ 939 static void 940 semdequeue(Segment *s, Sema *p) 941 { 942 lock(&s->sema); 943 p->next->prev = p->prev; 944 p->prev->next = p->next; 945 unlock(&s->sema); 946 } 947 948 /* Wake up n waiters with addr a on list in seg. */ 949 static void 950 semwakeup(Segment *s, long *a, long n) 951 { 952 Sema *p; 953 954 lock(&s->sema); 955 for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){ 956 if(p->addr == a && p->waiting){ 957 p->waiting = 0; 958 coherence(); 959 wakeup(p); 960 n--; 961 } 962 } 963 unlock(&s->sema); 964 } 965 966 /* Add delta to semaphore and wake up waiters as appropriate. */ 967 static long 968 semrelease(Segment *s, long *addr, long delta) 969 { 970 long value; 971 972 do 973 value = *addr; 974 while(!cmpswap(addr, value, value+delta)); 975 semwakeup(s, addr, delta); 976 return value+delta; 977 } 978 979 /* Try to acquire semaphore using compare-and-swap */ 980 static int 981 canacquire(long *addr) 982 { 983 long value; 984 985 while((value=*addr) > 0) 986 if(cmpswap(addr, value, value-1)) 987 return 1; 988 return 0; 989 } 990 991 /* Should we wake up? */ 992 static int 993 semawoke(void *p) 994 { 995 coherence(); 996 return !((Sema*)p)->waiting; 997 } 998 999 /* Acquire semaphore (subtract 1). */ 1000 static int 1001 semacquire(Segment *s, long *addr, int block) 1002 { 1003 int acquired; 1004 Sema phore; 1005 1006 if(canacquire(addr)) 1007 return 1; 1008 if(!block) 1009 return 0; 1010 1011 acquired = 0; 1012 semqueue(s, addr, &phore); 1013 for(;;){ 1014 phore.waiting = 1; 1015 coherence(); 1016 if(canacquire(addr)){ 1017 acquired = 1; 1018 break; 1019 } 1020 if(waserror()) 1021 break; 1022 sleep(&phore, semawoke, &phore); 1023 poperror(); 1024 } 1025 semdequeue(s, &phore); 1026 coherence(); /* not strictly necessary due to lock in semdequeue */ 1027 if(!phore.waiting) 1028 semwakeup(s, addr, 1); 1029 if(!acquired) 1030 nexterror(); 1031 return 1; 1032 } 1033 1034 /* Acquire semaphore or time-out */ 1035 static int 1036 tsemacquire(Segment *s, long *addr, ulong ms) 1037 { 1038 int acquired, timedout; 1039 ulong t, elms; 1040 Sema phore; 1041 1042 if(canacquire(addr)) 1043 return 1; 1044 if(ms == 0) 1045 return 0; 1046 acquired = timedout = 0; 1047 semqueue(s, addr, &phore); 1048 for(;;){ 1049 phore.waiting = 1; 1050 coherence(); 1051 if(canacquire(addr)){ 1052 acquired = 1; 1053 break; 1054 } 1055 if(waserror()) 1056 break; 1057 t = m->ticks; 1058 tsleep(&phore, semawoke, &phore, ms); 1059 elms = TK2MS(m->ticks - t); 1060 poperror(); 1061 if(elms >= ms){ 1062 timedout = 1; 1063 break; 1064 } 1065 ms -= elms; 1066 } 1067 semdequeue(s, &phore); 1068 coherence(); /* not strictly necessary due to lock in semdequeue */ 1069 if(!phore.waiting) 1070 semwakeup(s, addr, 1); 1071 if(timedout) 1072 return 0; 1073 if(!acquired) 1074 nexterror(); 1075 return 1; 1076 } 1077 1078 long 1079 syssemacquire(ulong *arg) 1080 { 1081 int block; 1082 long *addr; 1083 Segment *s; 1084 1085 validaddr(arg[0], sizeof(long), 1); 1086 evenaddr(arg[0]); 1087 addr = (long*)arg[0]; 1088 block = arg[1]; 1089 1090 if((s = seg(up, (ulong)addr, 0)) == nil) 1091 error(Ebadarg); 1092 if(*addr < 0) 1093 error(Ebadarg); 1094 return semacquire(s, addr, block); 1095 } 1096 1097 long 1098 systsemacquire(ulong *arg) 1099 { 1100 long *addr; 1101 ulong ms; 1102 Segment *s; 1103 1104 validaddr(arg[0], sizeof(long), 1); 1105 evenaddr(arg[0]); 1106 addr = (long*)arg[0]; 1107 ms = arg[1]; 1108 1109 if((s = seg(up, (ulong)addr, 0)) == nil) 1110 error(Ebadarg); 1111 if(*addr < 0) 1112 error(Ebadarg); 1113 return tsemacquire(s, addr, ms); 1114 } 1115 1116 long 1117 syssemrelease(ulong *arg) 1118 { 1119 long *addr, delta; 1120 Segment *s; 1121 1122 validaddr(arg[0], sizeof(long), 1); 1123 evenaddr(arg[0]); 1124 addr = (long*)arg[0]; 1125 delta = arg[1]; 1126 1127 if((s = seg(up, (ulong)addr, 0)) == nil) 1128 error(Ebadarg); 1129 /* delta == 0 is a no-op, not a release */ 1130 if(delta < 0 || *addr < 0) 1131 error(Ebadarg); 1132 return semrelease(s, addr, delta); 1133 } 1134