xref: /plan9-contrib/sys/src/9/port/proc.c (revision ec59a3ddbfceee0efe34584c2c9981a5e5ff1ec4)
#include	<u.h>
#include	"../port/lib.h"
#include	"mem.h"
#include	"dat.h"
#include	"fns.h"
#include	"../port/error.h"
#include	"edf.h"
#include	<trace.h>

int	coopsched;
int	schedgain = 30;	/* units in seconds */
int	nrdy;
Ref	noteidalloc;

void updatecpu(Proc*);
int reprioritize(Proc*);

ulong	delayedscheds;	/* statistics */
long skipscheds;
long preempts;
ulong load;

static Ref	pidalloc;

static struct Procalloc
{
	Lock;
	Proc*	ht[128];
	Proc*	arena;
	Proc*	free;
} procalloc;

enum
{
	Q=10,
	DQ=4,
	Scaling=2,
};

Schedq	runq[Nrq];
ulong	runvec;

char *statename[] =
{	/* BUG: generate automatically */
	"Dead",
	"Moribund",
	"Ready",
	"Scheding",
	"Running",
	"Queueing",
	"QueueingR",
	"QueueingW",
	"Wakeme",
	"Broken",
	"Stopped",
	"Rendez",
	"Waitrelease",
};

static void pidhash(Proc*);
static void pidunhash(Proc*);
static void rebalance(void);

/*
 * Always splhi()'ed.
 */
void
schedinit(void)		/* never returns */
{
	Edf *e;

	setlabel(&m->sched);
	if(up) {
		if((e = up->edf) && (e->flags & Admitted))
			edfrecord(up);
		m->proc = 0;
		switch(up->state) {
		case Running:
			ready(up);
			break;
		case Moribund:
			up->state = Dead;
			edfstop(up);
			if (up->edf)
				free(up->edf);
			up->edf = nil;

			/*
			 * Holding locks from pexit:
			 * 	procalloc
			 *	palloc
			 */
			mmurelease(up);

			up->qnext = procalloc.free;
			procalloc.free = up;

			unlock(&palloc);
			unlock(&procalloc);
			break;
		}
		up->mach = nil;
		updatecpu(up);
		up = nil;
	}
	sched();
}

/*
 *  If changing this routine, look also at sleep().  It
 *  contains a copy of the guts of sched().
 */
void
sched(void)
{
	Proc *p;

	if(m->ilockdepth)
		panic("ilockdepth %d, last lock 0x%p at 0x%lux, sched called from 0x%lux",
			m->ilockdepth, up?up->lastilock:nil,
			(up && up->lastilock)?up->lastilock->pc:0,
			getcallerpc(&p+2));

	if(up){
		/*
		 * Delay the sched until the process gives up the locks
		 * it is holding.  This avoids dumb lock loops.
		 * Don't delay if the process is Moribund.
		 * It called sched to die.
		 * But do sched eventually.  This avoids a missing unlock
		 * from hanging the entire kernel.
		 * But don't reschedule procs holding palloc or procalloc.
		 * Those are far too important to be holding while asleep.
		 *
		 * This test is not exact.  There can still be a few instructions
		 * in the middle of taslock when a process holds a lock
		 * but Lock.p has not yet been initialized.
		 */
		if(up->nlocks.ref)
		if(up->state != Moribund)
		if(up->delaysched < 20
		|| palloc.Lock.p == up
		|| procalloc.Lock.p == up){
			up->delaysched++;
			delayedscheds++;
			return;
		}
		up->delaysched = 0;

		splhi();

		/* statistics */
		m->cs++;

		procsave(up);
		if(setlabel(&up->sched)){
			procrestore(up);
			spllo();
			return;
		}
		gotolabel(&m->sched);
	}
	p = runproc();
	if(!p->edf){
		updatecpu(p);
		p->priority = reprioritize(p);
	}
	if(p != m->readied)
		m->schedticks = m->ticks + HZ/10;
	m->readied = 0;
	up = p;
	up->state = Running;
	up->mach = MACHP(m->machno);
	m->proc = up;
	mmuswitch(up);
	gotolabel(&up->sched);
}
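
/*
 * A sketch of the context-switch idiom used above: setlabel(&l)
 * records the current pc/sp and returns 0; each port's gotolabel(&l)
 * resumes at that setlabel, which then appears to return nonzero.
 * So a running process does
 *
 *	if(setlabel(&up->sched)){	resumed here when rescheduled
 *		procrestore(up);
 *		return;
 *	}
 *	gotolabel(&m->sched);		hop onto the scheduler context
 *
 * and the scheduler side, after runproc() picks p, does
 * gotolabel(&p->sched) to land in the nonzero branch above.
 */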

int
anyready(void)
{
	return runvec;
}

int
anyhigher(void)
{
	return runvec & ~((1<<(up->priority+1))-1);
}
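
/*
 * Worked example of the runvec tests: bit i of runvec is set exactly
 * when runq[i] is non-empty, so anyready() is a single load.  In
 * anyhigher() with up->priority == 3, (1<<(3+1))-1 == 0x0F; masking
 * those low bits off leaves a nonzero value only if some queue of
 * priority 4 or above holds a ready process.
 */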

/*
 *  here once per clock tick to see if we should resched
 */
void
hzsched(void)
{
	/* once a second, rebalance will reprioritize ready procs */
	if(m->machno == 0)
		rebalance();

	/* unless preempted, get to run for at least 100ms */
	if(anyhigher()
	|| (!up->fixedpri && m->ticks > m->schedticks && anyready())){
		m->readied = nil;	/* avoid cooperative scheduling */
		up->delaysched++;
	}
}

/*
 *  here at the end of non-clock interrupts to see if we should preempt the
 *  current process.  Returns 1 if preempted, 0 otherwise.
 */
int
preempted(void)
{
	if(up && up->state == Running)
	if(up->preempted == 0)
	if(anyhigher())
	if(!active.exiting){
		m->readied = nil;	/* avoid cooperative scheduling */
		up->preempted = 1;
		sched();
		splhi();
		up->preempted = 0;
		return 1;
	}
	return 0;
}

/*
 * Update the cpu time average for this particular process,
 * which is about to change from up -> not up or vice versa.
 * p->lastupdate is the last time an updatecpu happened.
 *
 * The cpu time average is a decaying average that lasts
 * about D clock ticks.  D is chosen to be approximately
 * the cpu time of a cpu-intensive "quick job".  A job has to run
 * for approximately D clock ticks before we home in on its
 * actual cpu usage.  Thus if you manage to get in and get out
 * quickly, you won't be penalized during your burst.  Once you
 * start using your share of the cpu for more than about D
 * clock ticks though, your p->cpu hits 1000 (1.0) and you end up
 * below all the other quick jobs.  Interactive tasks, because
 * they basically always use less than their fair share of cpu,
 * will be rewarded.
 *
 * If the process has not been running, then we want to
 * apply the filter
 *
 *	cpu = cpu * (D-1)/D
 *
 * n times, yielding
 *
 *	cpu = cpu * ((D-1)/D)^n
 *
 * but D is big enough that this is approximately
 *
 * 	cpu = cpu * (D-n)/D
 *
 * so we use that instead.
 *
 * If the process has been running, we apply the filter to
 * 1 - cpu, yielding a similar equation.  Note that cpu is
 * stored in fixed point (* 1000).
 *
 * Updatecpu must be called before changing up, in order
 * to maintain accurate cpu usage statistics.  It can be called
 * at any time to bring the stats for a given proc up-to-date.
 */
void
updatecpu(Proc *p)
{
	int n, t, ocpu;
	int D = schedgain*HZ*Scaling;

	if(p->edf)
		return;

	t = MACHP(0)->ticks*Scaling + Scaling/2;
	n = t - p->lastupdate;
	p->lastupdate = t;

	if(n == 0)
		return;
	if(n > D)
		n = D;

	ocpu = p->cpu;
	if(p != up)
		p->cpu = (ocpu*(D-n))/D;
	else{
		t = 1000 - ocpu;
		t = (t*(D-n))/D;
		p->cpu = 1000 - t;
	}

//iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
}
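
/*
 * Worked example, assuming HZ = 100 for illustration (HZ is set per
 * port): D = schedgain*HZ*Scaling = 30*100*2 = 6000 scaled ticks.
 * A proc with p->cpu == 800 that stays off the processor for
 * n = 1500 decays to 800*(6000-1500)/6000 == 600.  If it keeps
 * running instead, the complement decays: t = 1000-800 == 200,
 * t = 200*(6000-1500)/6000 == 150, so p->cpu rises to 850 and
 * creeps toward 1000 (a full cpu) the longer it stays on.
 */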

/*
 * On average, p has used p->cpu of a cpu recently.
 * Its fair share is conf.nmach/m->load of a cpu.  If it has been getting
 * too much, penalize it.  If it has been getting not enough, reward it.
 * I don't think you can get much more than your fair share that
 * often, so most of the queues are for using less.  Having a priority
 * of 3 means you're just right.  Having a higher priority (up to p->basepri)
 * means you're not using as much as you could.
 */
int
reprioritize(Proc *p)
{
	int fairshare, n, load, ratio;

	load = MACHP(0)->load;
	if(load == 0)
		return p->basepri;

	/*
	 *  fairshare = 1.000 * conf.nmach * 1.000/load,
	 * except the decimal point is moved three places
	 * on both load and fairshare.
	 */
	fairshare = (conf.nmach*1000*1000)/load;
	n = p->cpu;
	if(n == 0)
		n = 1;
	ratio = (fairshare+n/2) / n;
	if(ratio > p->basepri)
		ratio = p->basepri;
	if(ratio < 0)
		panic("reprioritize");
//iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
	return ratio;
}
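
/*
 * Worked example: on a 2-processor machine with load == 1500
 * (1.5 runnable procs, fixed point * 1000), fairshare =
 * (2*1000*1000)/1500 == 1333.  A proc with p->cpu == 400 gets
 * ratio = (1333+200)/400 == 3, the "just right" priority; a hog
 * with p->cpu == 1000 gets (1333+500)/1000 == 1 and sinks; a
 * nearly idle proc with p->cpu == 50 would compute 27 and is
 * clamped to p->basepri.
 */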

/*
 * add a process to a scheduling queue
 */
void
queueproc(Schedq *rq, Proc *p)
{
	int pri;

	pri = rq - runq;
	lock(runq);
	p->priority = pri;
	p->rnext = 0;
	if(rq->tail)
		rq->tail->rnext = p;
	else
		rq->head = p;
	rq->tail = p;
	rq->n++;
	nrdy++;
	runvec |= 1<<pri;
	unlock(runq);
}

/*
 *  try to remove a process from a scheduling queue (called splhi)
 */
Proc*
dequeueproc(Schedq *rq, Proc *tp)
{
	Proc *l, *p;

	if(!canlock(runq))
		return nil;

	/*
	 *  the queue may have changed before we locked runq,
	 *  refind the target process.
	 */
	l = 0;
	for(p = rq->head; p; p = p->rnext){
		if(p == tp)
			break;
		l = p;
	}

	/*
	 *  p->mach==0 only when process state is saved
	 */
	if(p == 0 || p->mach){
		unlock(runq);
		return nil;
	}
	if(p->rnext == 0)
		rq->tail = l;
	if(l)
		l->rnext = p->rnext;
	else
		rq->head = p->rnext;
	if(rq->head == nil)
		runvec &= ~(1<<(rq-runq));
	rq->n--;
	nrdy--;
	if(p->state != Ready)
		print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]);

	unlock(runq);
	return p;
}

/*
 *  ready(p) picks a new priority for a process and sticks it in the
 *  runq for that priority.
 */
void
ready(Proc *p)
{
	int s, pri;
	Schedq *rq;
	void (*pt)(Proc*, int, vlong);

	s = splhi();
	if(edfready(p)){
		splx(s);
		return;
	}

	if(up != p)
		m->readied = p;	/* group scheduling */

	updatecpu(p);
	pri = reprioritize(p);
	p->priority = pri;
	rq = &runq[pri];
	p->state = Ready;
	queueproc(rq, p);
	pt = proctrace;
	if(pt)
		pt(p, SReady, 0);
	splx(s);
}

/*
 *  yield the processor and drop our priority
 */
void
yield(void)
{
	if(anyready()){
		/* pretend we just used 1/2 tick */
		up->lastupdate -= Scaling/2;
		sched();
	}
}
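
/*
 * Note on the 1/2-tick trick: updatecpu() measures time in ticks
 * scaled by Scaling, so subtracting Scaling/2 from lastupdate makes
 * the next updatecpu() observe an extra half tick of apparent run
 * time.  That pushes p->cpu up and the priority recomputed via
 * ready()/reprioritize() down, so a proc that yields repeatedly
 * drifts below ones that don't.
 */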

/*
 *  recalculate priorities once a second.  We need to do this
 *  since priorities will otherwise only be recalculated when
 *  the running process blocks.
 */
ulong balancetime;

static void
rebalance(void)
{
	int pri, npri, t, x;
	Schedq *rq;
	Proc *p;

	t = m->ticks;
	if(t - balancetime < HZ)
		return;
	balancetime = t;

	for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
another:
		p = rq->head;
		if(p == nil)
			continue;
		if(p->mp != MACHP(m->machno))
			continue;
		if(pri == p->basepri)
			continue;
		updatecpu(p);
		npri = reprioritize(p);
		if(npri != pri){
			x = splhi();
			p = dequeueproc(rq, p);
			if(p)
				queueproc(&runq[npri], p);
			splx(x);
			goto another;
		}
	}
}


/*
 *  pick a process to run
 */
Proc*
runproc(void)
{
	Schedq *rq;
	Proc *p;
	ulong start, now;
	int i;
	void (*pt)(Proc*, int, vlong);

	start = perfticks();

	/* cooperative scheduling until the clock ticks */
	if(coopsched && (p=m->readied) && p->mach==0 && p->state==Ready
	&& runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
		skipscheds++;
		rq = &runq[p->priority];
		goto found;
	}

	preempts++;

loop:
	/*
	 *  find a process that last ran on this processor (affinity),
	 *  or one that hasn't moved in a while (load balancing).  Every
	 *  time around the loop affinity goes down.
	 */
	spllo();
	for(i = 0;; i++){
		/*
		 *  find the highest priority target process that this
		 *  processor can run given affinity constraints.
		 */
		for(rq = &runq[Nrq-1]; rq >= runq; rq--){
			for(p = rq->head; p; p = p->rnext){
				if(p->mp == nil || p->mp == MACHP(m->machno)
				|| (!p->wired && i > 0))
					goto found;
			}
		}

		/* waste time or halt the CPU */
		idlehands();

		/* remember how much time we've spent idling */
		now = perfticks();
		m->perf.inidle += now-start;
		start = now;
	}

found:
	splhi();
	p = dequeueproc(rq, p);
	if(p == nil)
		goto loop;

	p->state = Scheding;
	p->mp = MACHP(m->machno);

	if(edflock(p)){
		edfrun(p, rq == &runq[PriEdf]);	/* start deadline timer and do admin */
		edfunlock();
	}
	pt = proctrace;
	if(pt)
		pt(p, SRun, 0);
	return p;
}

int
canpage(Proc *p)
{
	int ok = 0;

	splhi();
	lock(runq);
	/* Only reliable way to see if we are Running */
	if(p->mach == 0) {
		p->newtlb = 1;
		ok = 1;
	}
	unlock(runq);
	spllo();

	return ok;
}

Proc*
newproc(void)
{
	Proc *p;

	lock(&procalloc);
	for(;;) {
		if(p = procalloc.free)
			break;

		unlock(&procalloc);
		resrcwait("no procs");
		lock(&procalloc);
	}
	procalloc.free = p->qnext;
	unlock(&procalloc);

	p->state = Scheding;
	p->psstate = "New";
	p->mach = 0;
	p->qnext = 0;
	p->nchild = 0;
	p->nwait = 0;
	p->waitq = 0;
	p->parent = 0;
	p->pgrp = 0;
	p->egrp = 0;
	p->fgrp = 0;
	p->rgrp = 0;
	p->pdbg = 0;
	p->fpstate = FPinit;
	p->kp = 0;
	p->procctl = 0;
	p->notepending = 0;
	p->ureg = 0;
	p->privatemem = 0;
	p->noswap = 0;
	p->errstr = p->errbuf0;
	p->syserrstr = p->errbuf1;
	p->errbuf0[0] = '\0';
	p->errbuf1[0] = '\0';
	p->nlocks.ref = 0;
	p->delaysched = 0;
	p->trace = 0;
	kstrdup(&p->user, "*nouser");
	kstrdup(&p->text, "*notext");
	kstrdup(&p->args, "");
	p->nargs = 0;
	p->setargs = 0;
	memset(p->seg, 0, sizeof p->seg);
	p->pid = incref(&pidalloc);
	pidhash(p);
	p->noteid = incref(&noteidalloc);
	if(p->pid==0 || p->noteid==0)
		panic("pidalloc");
	if(p->kstack == 0)
		p->kstack = smalloc(KSTACK);

	/* sched params */
	p->mp = 0;
	p->wired = 0;
	procpriority(p, PriNormal, 0);
	p->cpu = 0;
	p->lastupdate = MACHP(0)->ticks*Scaling;
	p->edf = nil;

	return p;
}

/*
 * wire this proc to a machine
 */
void
procwired(Proc *p, int bm)
{
	Proc *pp;
	int i;
	char nwired[MAXMACH];
	Mach *wm;

	if(bm < 0){
		/* pick a machine to wire to */
		memset(nwired, 0, sizeof(nwired));
		p->wired = 0;
		pp = proctab(0);
		for(i=0; i<conf.nproc; i++, pp++){
			wm = pp->wired;
			if(wm && pp->pid)
				nwired[wm->machno]++;
		}
		bm = 0;
		for(i=0; i<conf.nmach; i++)
			if(nwired[i] < nwired[bm])
				bm = i;
	} else {
		/* use the virtual machine requested */
		bm = bm % conf.nmach;
	}

	p->wired = MACHP(bm);
	p->mp = p->wired;
}

void
procpriority(Proc *p, int pri, int fixed)
{
	if(pri >= Npriq)
		pri = Npriq - 1;
	else if(pri < 0)
		pri = 0;
	p->basepri = pri;
	p->priority = pri;
	if(fixed){
		p->fixedpri = 1;
	} else {
		p->fixedpri = 0;
	}
}

void
procinit0(void)		/* bad planning - clashes with devproc.c */
{
	Proc *p;
	int i;

	procalloc.free = xalloc(conf.nproc*sizeof(Proc));
	if(procalloc.free == nil){
		xsummary();
		panic("cannot allocate %lud procs (%ludMB)\n", conf.nproc, conf.nproc*sizeof(Proc)/(1024*1024));
	}
	procalloc.arena = procalloc.free;

	p = procalloc.free;
	for(i=0; i<conf.nproc-1; i++,p++)
		p->qnext = p+1;
	p->qnext = 0;
}

/*
 *  sleep if a condition is not true.  Another process will
 *  awaken us after it sets the condition.  When we awaken
 *  the condition may no longer be true.
 *
 *  we lock both the process and the rendezvous to keep r->p
 *  and p->r synchronized.
 */
void
sleep(Rendez *r, int (*f)(void*), void *arg)
{
	int s;
	void (*pt)(Proc*, int, vlong);

	s = splhi();

	if(up->nlocks.ref)
		print("process %lud sleeps with %lud locks held, last lock 0x%p locked at pc 0x%lux, sleep called from 0x%lux\n",
			up->pid, up->nlocks.ref, up->lastlock, up->lastlock->pc, getcallerpc(&r));
	lock(r);
	lock(&up->rlock);
	if(r->p){
		print("double sleep called from 0x%lux, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
		dumpstack();
	}

	/*
	 *  Wakeup only knows there may be something to do by testing
	 *  r->p in order to get something to lock on.
	 *  Flush that information out to memory in case the sleep is
	 *  committed.
	 */
	r->p = up;

	if((*f)(arg) || up->notepending){
		/*
		 *  if condition happened or a note is pending
		 *  never mind
		 */
		r->p = nil;
		unlock(&up->rlock);
		unlock(r);
	} else {
		/*
		 *  now we are committed to
		 *  change state and call scheduler
		 */
		pt = proctrace;
		if(pt)
			pt(up, SSleep, 0);
		up->state = Wakeme;
		up->r = r;

		/* statistics */
		m->cs++;

		procsave(up);
		if(setlabel(&up->sched)) {
			/*
			 *  here when the process is awakened
			 */
			procrestore(up);
			spllo();
		} else {
			/*
			 *  here to go to sleep (i.e. stop Running)
			 */
			unlock(&up->rlock);
			unlock(r);
			gotolabel(&m->sched);
		}
	}

	if(up->notepending) {
		up->notepending = 0;
		splx(s);
		if(up->procctl == Proc_exitme && up->closingfgrp)
			forceclosefgrp();
		error(Eintr);
	}

	splx(s);
}
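
/*
 * Usage sketch (hypothetical caller): the condition function must be
 * safe to call with r locked and must test shared state that the
 * waker changes before calling wakeup.  A typical producer/consumer
 * pair looks like
 *
 *	static int
 *	haveinput(void *a)
 *	{
 *		return ((Queue*)a)->count > 0;
 *	}
 *
 *	consumer:	sleep(&q->r, haveinput, q);
 *	producer:	q->count++; wakeup(&q->r);
 *
 * Because sleep rechecks (*f)(arg) after setting r->p, a wakeup that
 * races with the sleep is never lost; the names haveinput, Queue and
 * count here are illustrative, not from this file.
 */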

static int
tfn(void *arg)
{
	return up->trend == nil || up->tfn(arg);
}

void
twakeup(Ureg*, Timer *t)
{
	Proc *p;
	Rendez *trend;

	p = t->ta;
	trend = p->trend;
	p->trend = 0;
	if(trend)
		wakeup(trend);
}

void
tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms)
{
	if (up->tt){
		print("tsleep: timer active: mode %d, tf 0x%lux\n", up->tmode, up->tf);
		timerdel(up);
	}
	up->tns = MS2NS(ms);
	up->tf = twakeup;
	up->tmode = Trelative;
	up->ta = up;
	up->trend = r;
	up->tfn = fn;
	timeradd(up);

	if(waserror()){
		timerdel(up);
		nexterror();
	}
	sleep(r, tfn, arg);
	if (up->tt)
		timerdel(up);
	up->twhen = 0;
	poperror();
}
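
/*
 * Example: a common idiom in the kernel is a plain bounded pause,
 *
 *	tsleep(&up->sleep, return0, 0, 100);
 *
 * which uses the stock return0 condition (always false) so the sleep
 * ends only when the 100 ms timer fires twakeup.  Passing a real
 * condition function instead makes tsleep a sleep() with a timeout:
 * it returns when the condition becomes true or the timer expires,
 * whichever comes first.
 */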

/*
 *  Expects that only one process can call wakeup for any given Rendez.
 *  We hold both locks to ensure that r->p and p->r remain consistent.
 *  Richard Miller has a better solution that doesn't require both to
 *  be held simultaneously, but I'm a paranoid - presotto.
 */
Proc*
wakeup(Rendez *r)
{
	Proc *p;
	int s;

	s = splhi();

	lock(r);
	p = r->p;

	if(p != nil){
		lock(&p->rlock);
		if(p->state != Wakeme || p->r != r)
			panic("wakeup: state");
		r->p = nil;
		p->r = nil;
		ready(p);
		unlock(&p->rlock);
	}
	unlock(r);

	splx(s);

	return p;
}

/*
 *  if waking a sleeping process, this routine must hold both
 *  p->rlock and r->lock.  However, it can't acquire them in
 *  the same order as wakeup, which opens a possible lock ordering
 *  deadlock.  We break the deadlock by giving up the p->rlock
 *  lock if we can't get the r->lock and retrying.
 */
int
postnote(Proc *p, int dolock, char *n, int flag)
{
	int s, ret;
	Rendez *r;
	Proc *d, **l;

	if(dolock)
		qlock(&p->debug);

	if(flag != NUser && (p->notify == 0 || p->notified))
		p->nnote = 0;

	ret = 0;
	if(p->nnote < NNOTE) {
		strcpy(p->note[p->nnote].msg, n);
		p->note[p->nnote++].flag = flag;
		ret = 1;
	}
	p->notepending = 1;
	if(dolock)
		qunlock(&p->debug);

	/* this loop is to avoid lock ordering problems. */
	for(;;){
		s = splhi();
		lock(&p->rlock);
		r = p->r;

		/* waiting for a wakeup? */
		if(r == nil)
			break;	/* no */

		/* try for the second lock */
		if(canlock(r)){
			if(p->state != Wakeme || r->p != p)
				panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
			p->r = nil;
			r->p = nil;
			ready(p);
			unlock(r);
			break;
		}

		/* give other process time to get out of critical section and try again */
		unlock(&p->rlock);
		splx(s);
		sched();
	}
	unlock(&p->rlock);
	splx(s);

	if(p->state != Rendezvous)
		return ret;

	/* Try and pull out of a rendezvous */
	lock(p->rgrp);
	if(p->state == Rendezvous) {
		p->rendval = ~0;
		l = &REND(p->rgrp, p->rendtag);
		for(d = *l; d; d = d->rendhash) {
			if(d == p) {
				*l = p->rendhash;
				break;
			}
			l = &d->rendhash;
		}
		ready(p);
	}
	unlock(p->rgrp);
	return ret;
}

/*
 * weird thing: keep at most NBROKEN around
 */
#define	NBROKEN 4
struct
{
	QLock;
	int	n;
	Proc	*p[NBROKEN];
}broken;

void
addbroken(Proc *p)
{
	qlock(&broken);
	if(broken.n == NBROKEN) {
		ready(broken.p[0]);
		memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
		--broken.n;
	}
	broken.p[broken.n++] = p;
	qunlock(&broken);

	edfstop(up);
	p->state = Broken;
	p->psstate = 0;
	sched();
}

void
unbreak(Proc *p)
{
	int b;

	qlock(&broken);
	for(b=0; b < broken.n; b++)
		if(broken.p[b] == p) {
			broken.n--;
			memmove(&broken.p[b], &broken.p[b+1],
					sizeof(Proc*)*(NBROKEN-(b+1)));
			ready(p);
			break;
		}
	qunlock(&broken);
}

int
freebroken(void)
{
	int i, n;

	qlock(&broken);
	n = broken.n;
	for(i=0; i<n; i++) {
		ready(broken.p[i]);
		broken.p[i] = 0;
	}
	broken.n = 0;
	qunlock(&broken);
	return n;
}

void
pexit(char *exitstr, int freemem)
{
	Proc *p;
	Segment **s, **es;
	long utime, stime;
	Waitq *wq, *f, *next;
	Fgrp *fgrp;
	Egrp *egrp;
	Rgrp *rgrp;
	Pgrp *pgrp;
	Chan *dot;
	void (*pt)(Proc*, int, vlong);

	up->alarm = 0;
	if (up->tt)
		timerdel(up);
	pt = proctrace;
	if(pt)
		pt(up, SDead, 0);

	/* nil out all the resources under lock (free later) */
	qlock(&up->debug);
	fgrp = up->fgrp;
	up->fgrp = nil;
	egrp = up->egrp;
	up->egrp = nil;
	rgrp = up->rgrp;
	up->rgrp = nil;
	pgrp = up->pgrp;
	up->pgrp = nil;
	dot = up->dot;
	up->dot = nil;
	qunlock(&up->debug);

	if(fgrp)
		closefgrp(fgrp);
	if(egrp)
		closeegrp(egrp);
	if(rgrp)
		closergrp(rgrp);
	if(dot)
		cclose(dot);
	if(pgrp)
		closepgrp(pgrp);

	/*
	 * if not a kernel process and have a parent,
	 * do some housekeeping.
	 */
	if(up->kp == 0) {
		p = up->parent;
		if(p == 0) {
			if(exitstr == 0)
				exitstr = "unknown";
			panic("boot process died: %s", exitstr);
		}

		while(waserror())
			;

		wq = smalloc(sizeof(Waitq));
		poperror();

		wq->w.pid = up->pid;
		utime = up->time[TUser] + up->time[TCUser];
		stime = up->time[TSys] + up->time[TCSys];
		wq->w.time[TUser] = tk2ms(utime);
		wq->w.time[TSys] = tk2ms(stime);
		wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]);
		if(exitstr && exitstr[0])
			snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr);
		else
			wq->w.msg[0] = '\0';

		lock(&p->exl);
		/*
		 *  If my parent is no longer alive, or if there would be more
		 *  than 128 zombie child processes for my parent, then don't
		 *  leave a wait record behind.  This helps prevent badly
		 *  written daemon processes from accumulating lots of wait
		 *  records.
		 */
		if(p->pid == up->parentpid && p->state != Broken && p->nwait < 128) {
			p->nchild--;
			p->time[TCUser] += utime;
			p->time[TCSys] += stime;

			wq->next = p->waitq;
			p->waitq = wq;
			p->nwait++;

			wakeup(&p->waitr);
			unlock(&p->exl);
		}
		else {
			unlock(&p->exl);
			free(wq);
		}
	}

	if(!freemem)
		addbroken(up);

	qlock(&up->seglock);
	es = &up->seg[NSEG];
	for(s = up->seg; s < es; s++) {
		if(*s) {
			putseg(*s);
			*s = 0;
		}
	}
	qunlock(&up->seglock);

	lock(&up->exl);		/* Prevent my children from leaving waits */
	pidunhash(up);
	up->pid = 0;
	wakeup(&up->waitr);
	unlock(&up->exl);

	for(f = up->waitq; f; f = next) {
		next = f->next;
		free(f);
	}

	/* release debuggers */
	qlock(&up->debug);
	if(up->pdbg) {
		wakeup(&up->pdbg->sleep);
		up->pdbg = 0;
	}
	qunlock(&up->debug);

	/* Sched must not loop for these locks */
	lock(&procalloc);
	lock(&palloc);

	edfstop(up);
	up->state = Moribund;
	sched();
	panic("pexit");
}

int
haswaitq(void *x)
{
	Proc *p;

	p = (Proc *)x;
	return p->waitq != 0;
}

ulong
pwait(Waitmsg *w)
{
	ulong cpid;
	Waitq *wq;

	if(!canqlock(&up->qwaitr))
		error(Einuse);

	if(waserror()) {
		qunlock(&up->qwaitr);
		nexterror();
	}

	lock(&up->exl);
	if(up->nchild == 0 && up->waitq == 0) {
		unlock(&up->exl);
		error(Enochild);
	}
	unlock(&up->exl);

	sleep(&up->waitr, haswaitq, up);

	lock(&up->exl);
	wq = up->waitq;
	up->waitq = wq->next;
	up->nwait--;
	unlock(&up->exl);

	qunlock(&up->qwaitr);
	poperror();

	if(w)
		memmove(w, &wq->w, sizeof(Waitmsg));
	cpid = wq->w.pid;
	free(wq);
	return cpid;
}

Proc*
proctab(int i)
{
	return &procalloc.arena[i];
}

void
dumpaproc(Proc *p)
{
	ulong bss;
	char *s;

	if(p == 0)
		return;

	bss = 0;
	if(p->seg[BSEG])
		bss = p->seg[BSEG]->top;

	s = p->psstate;
	if(s == 0)
		s = statename[p->state];
	print("%3lud:%10s pc %8lux dbgpc %8lux  %8s (%s) ut %ld st %ld bss %lux qpc %lux nl %lud nd %lud lpc %lux pri %lud\n",
		p->pid, p->text, p->pc, dbgpc(p),  s, statename[p->state],
		p->time[0], p->time[1], bss, p->qpc, p->nlocks.ref, p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority);
}

void
procdump(void)
{
	int i;
	Proc *p;

	if(up)
		print("up %lud\n", up->pid);
	else
		print("no current process\n");
	for(i=0; i<conf.nproc; i++) {
		p = &procalloc.arena[i];
		if(p->state == Dead)
			continue;

		dumpaproc(p);
	}
}

/*
 *  wait till all processes have flushed their mmu
 *  state about segment s
 */
void
procflushseg(Segment *s)
{
	int i, ns, nm, nwait;
	Proc *p;

	/*
	 *  tell all processes with this
	 *  segment to flush their mmu's
	 */
	nwait = 0;
	for(i=0; i<conf.nproc; i++) {
		p = &procalloc.arena[i];
		if(p->state == Dead)
			continue;
		for(ns = 0; ns < NSEG; ns++)
			if(p->seg[ns] == s){
				p->newtlb = 1;
				for(nm = 0; nm < conf.nmach; nm++){
					if(MACHP(nm)->proc == p){
						MACHP(nm)->flushmmu = 1;
						nwait++;
					}
				}
				break;
			}
	}

	if(nwait == 0)
		return;

	/*
	 *  wait for all processors to take a clock interrupt
	 *  and flush their mmu's
	 */
	for(nm = 0; nm < conf.nmach; nm++)
		if(MACHP(nm) != m)
			while(MACHP(nm)->flushmmu)
				sched();
}

void
scheddump(void)
{
	Proc *p;
	Schedq *rq;

	for(rq = &runq[Nrq-1]; rq >= runq; rq--){
		if(rq->head == 0)
			continue;
		print("rq%ld:", rq-runq);
		for(p = rq->head; p; p = p->rnext)
			print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
		print("\n");
		delay(150);
	}
	print("nrdy %d\n", nrdy);
}

void
kproc(char *name, void (*func)(void *), void *arg)
{
	Proc *p;
	static Pgrp *kpgrp;

	p = newproc();
	p->psstate = 0;
	p->procmode = 0640;
	p->kp = 1;
	p->noswap = 1;

	p->fpsave = up->fpsave;
	p->scallnr = up->scallnr;
	p->s = up->s;
	p->nerrlab = 0;
	p->slash = up->slash;
	p->dot = up->dot;
	if(p->dot)
		incref(p->dot);

	memmove(p->note, up->note, sizeof(p->note));
	p->nnote = up->nnote;
	p->notified = 0;
	p->lastnote = up->lastnote;
	p->notify = up->notify;
	p->ureg = 0;
	p->dbgreg = 0;

	procpriority(p, PriKproc, 0);

	kprocchild(p, func, arg);

	kstrdup(&p->user, eve);
	kstrdup(&p->text, name);
	if(kpgrp == 0)
		kpgrp = newpgrp();
	p->pgrp = kpgrp;
	incref(kpgrp);

	memset(p->time, 0, sizeof(p->time));
	p->time[TReal] = MACHP(0)->ticks;
	ready(p);
	/*
	 *  since the bss/data segments are now shareable,
	 *  any mmu info about this process is now stale
	 *  and has to be discarded.
	 */
	p->newtlb = 1;
	flushmmu();
}

/*
 *  called splhi() by notify().  See comment in notify for the
 *  reasoning.
 */
void
procctl(Proc *p)
{
	char *state;
	ulong s;

	switch(p->procctl) {
	case Proc_exitbig:
		spllo();
		pexit("Killed: Insufficient physical memory", 1);

	case Proc_exitme:
		spllo();		/* pexit has locks in it */
		pexit("Killed", 1);

	case Proc_traceme:
		if(p->nnote == 0)
			return;
		/* No break */

	case Proc_stopme:
		p->procctl = 0;
		state = p->psstate;
		p->psstate = "Stopped";
		/* free a waiting debugger */
		s = spllo();
		qlock(&p->debug);
		if(p->pdbg) {
			wakeup(&p->pdbg->sleep);
			p->pdbg = 0;
		}
		qunlock(&p->debug);
		splhi();
		p->state = Stopped;
		sched();
		p->psstate = state;
		splx(s);
		return;
	}
}

#include "errstr.h"

void
error(char *err)
{
	spllo();

	assert(up->nerrlab < NERR);
	kstrcpy(up->errstr, err, ERRMAX);
	setlabel(&up->errlab[NERR-1]);
	nexterror();
}

void
nexterror(void)
{
	gotolabel(&up->errlab[--up->nerrlab]);
}
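
/*
 * error() and nexterror() implement the kernel's exception idiom
 * together with the waserror() macro, which pushes an error label
 * and returns nonzero when a later error() unwinds to it.  The
 * canonical pattern (allocsomething/freesomething are illustrative
 * names):
 *
 *	b = allocsomething();
 *	if(waserror()){
 *		freesomething(b);	error unwound to here
 *		nexterror();		keep unwinding to the caller
 *	}
 *	...code that may call error()...
 *	poperror();			balance the waserror
 *
 * Every waserror() must be matched by exactly one poperror() on the
 * non-error path, as tsleep() and pexit() above demonstrate.
 */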

void
exhausted(char *resource)
{
	char buf[ERRMAX];

	sprint(buf, "no free %s", resource);
	iprint("%s\n", buf);
	error(buf);
}

void
killbig(char *why)
{
	int i;
	Segment *s;
	ulong l, max;
	Proc *p, *ep, *kp;

	max = 0;
	kp = 0;
	ep = procalloc.arena+conf.nproc;
	for(p = procalloc.arena; p < ep; p++) {
		if(p->state == Dead || p->kp)
			continue;
		l = 0;
		for(i=1; i<NSEG; i++) {
			s = p->seg[i];
			if(s != 0)
				l += s->top - s->base;
		}
		if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) {
			kp = p;
			max = l;
		}
	}
	if(kp == 0)	/* no candidate found; don't dereference nil below */
		return;

	print("%lud: %s killed: %s\n", kp->pid, kp->text, why);
	for(p = procalloc.arena; p < ep; p++) {
		if(p->state == Dead || p->kp)
			continue;
		if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG])
			p->procctl = Proc_exitbig;
	}
	kp->procctl = Proc_exitbig;
	for(i = 0; i < NSEG; i++) {
		s = kp->seg[i];
		if(s != 0 && canqlock(&s->lk)) {
			mfreeseg(s, s->base, (s->top - s->base)/BY2PG);
			qunlock(&s->lk);
		}
	}
}

/*
 *  change ownership to 'new' of all processes owned by 'old'.  Used when
 *  eve changes.
 */
void
renameuser(char *old, char *new)
{
	Proc *p, *ep;

	ep = procalloc.arena+conf.nproc;
	for(p = procalloc.arena; p < ep; p++)
		if(p->user!=nil && strcmp(old, p->user)==0)
			kstrdup(&p->user, new);
}

/*
 *  time accounting called by clock() splhi'd
 */
void
accounttime(void)
{
	Proc *p;
	ulong n, per;
	static ulong nrun;

	p = m->proc;
	if(p) {
		nrun++;
		p->time[p->insyscall]++;
	}

	/* calculate decaying duty cycles */
	n = perfticks();
	per = n - m->perf.last;
	m->perf.last = n;
	per = (m->perf.period*(HZ-1) + per)/HZ;
	if(per != 0)
		m->perf.period = per;

	m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
	m->perf.inidle = 0;

	m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
	m->perf.inintr = 0;

	/* only one processor gets to compute system load averages */
	if(m->machno != 0)
		return;

	/*
	 * calculate decaying load average.
	 * if we decay by (n-1)/n then it takes
	 * n clock ticks to go from load L to .36 L once
	 * things quiet down.  it takes about 5 n clock
	 * ticks to go to zero.  so using HZ means this is
	 * approximately the load over the last second,
	 * with a tail lasting about 5 seconds.
	 */
	n = nrun;
	nrun = 0;
	n = (nrdy+n)*1000;
	m->load = (m->load*(HZ-1)+n)/HZ;
}
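
/*
 * Worked example, again with HZ = 100 for illustration: a uniprocessor
 * running one proc continuously adds n = (nrdy+nrun)*1000 == 1000
 * each tick, so m->load converges to 1000 (1.0 procs).  Once idle,
 * load is multiplied by (HZ-1)/HZ per tick, and after one second it
 * has decayed to (99/100)^100 ~= .366 of its old value, the ".36 L"
 * figure in the comment above.
 */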

static void
pidhash(Proc *p)
{
	int h;

	h = p->pid % nelem(procalloc.ht);
	lock(&procalloc);
	p->pidhash = procalloc.ht[h];
	procalloc.ht[h] = p;
	unlock(&procalloc);
}

static void
pidunhash(Proc *p)
{
	int h;
	Proc **l;

	h = p->pid % nelem(procalloc.ht);
	lock(&procalloc);
	for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
		if(*l == p){
			*l = p->pidhash;
			break;
		}
	unlock(&procalloc);
}

int
procindex(ulong pid)
{
	Proc *p;
	int h;
	int s;

	s = -1;
	h = pid % nelem(procalloc.ht);
	lock(&procalloc);
	for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
		if(p->pid == pid){
			s = p - procalloc.arena;
			break;
		}
	unlock(&procalloc);
	return s;
}
1604