xref: /plan9-contrib/sys/src/9k/port/proc.c (revision 4498a2438a086713d814ef7bd3d66ca21846c7c5)
1 #include	<u.h>
2 #include	"../port/lib.h"
3 #include	"mem.h"
4 #include	"dat.h"
5 #include	"fns.h"
6 #include	"../port/error.h"
7 
8 #include	"../port/edf.h"
9 #include	"errstr.h"
10 #include	<ptrace.h>
11 
int	nrdy;			/* count of procs currently on the run queues */
Ref	noteidalloc;		/* allocator for note-group ids */

ulong delayedscheds;	/* statistics */
long skipscheds;
long preempts;

static Ref pidalloc;	/* allocator for process ids */

struct Procalloc procalloc;

extern Proc* psalloc(void);
extern void pshash(Proc*);
extern void psrelease(Proc*);
extern void psunhash(Proc*);

enum
{
	Scaling=2,	/* fixed-point scale applied to ticks in cpu accounting */
};

static int reprioritize(Proc*);
static void updatecpu(Proc*);
static int schedgain = 30;	/* units in seconds */

static void rebalance(void);
static ulong balancetime;	/* tick stamp of the last rebalance() pass */

Schedq	runq[Nrq];	/* one fifo per priority level */
ulong	runvec;		/* bit i set while runq[i] is non-empty */

char *statename[] =
{	/* BUG: generate automatically */
	"Dead",
	"Moribund",
	"Ready",
	"Scheding",
	"Running",
	"Queueing",
	"QueueingR",
	"QueueingW",
	"Wakeme",
	"Broken",
	"Stopped",
	"Rendez",
	"Waitrelease",
};
59 
60 /*
61  * Always splhi()'ed.
62  */
void
schedinit(void)		/* never returns */
{
	Edf *e;

	/* scheduler re-enters here every time sched() does gotolabel(&m->sched) */
	setlabel(&m->sched);
	if(up) {
		if((e = up->edf) && (e->flags & Admitted))
			edfrecord(up);
		updatecpu(up);	/* charge cpu time to the process we are leaving */
		m->proc = nil;
		switch(up->state) {
		case Running:
			ready(up);
			break;
		case Moribund:
			up->state = Dead;

			/*
			 * Holding locks from pexit:
			 * 	procalloc
			 *	palloc
			 */
			mmurelease(up);
			unlock(&palloc);

			psrelease(up);
			unlock(&procalloc);
			break;
		}
		/* clearing mach marks the state as fully saved (see dequeueproc) */
		up->mach = nil;
		up = nil;
	}
	sched();
}
98 
99 /*
100  *  If changing this routine, look also at sleep().  It
101  *  contains a copy of the guts of sched().
102  */
void
sched(void)
{
	Proc *p;

	/* switching while holding an ilock would deadlock the interrupt path */
	if(m->ilockdepth)
		panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
			m->machno,
			m->ilockdepth,
			up? up->lastilock: nil,
			(up && up->lastilock)? up->lastilock->pc: m->ilockpc,
			getcallerpc(&p+2));

	if(up){
		/*
		 * Delay the sched until the process gives up the locks
		 * it is holding.  This avoids dumb lock loops.
		 * Don't delay if the process is Moribund.
		 * It called sched to die.
		 * But do sched eventually.  This avoids a missing unlock
		 * from hanging the entire kernel.
		 * But don't reschedule procs holding palloc or procalloc.
		 * Those are far too important to be holding while asleep.
		 *
		 * This test is not exact.  There can still be a few
		 * instructions in the middle of taslock when a process
		 * holds a lock but Lock.p has not yet been initialized.
		 */
		if(up->nlocks)
		if(up->state != Moribund)
		if(up->delaysched < 20
		|| palloc.Lock.p == up
		|| procalloc.Lock.p == up){
			up->delaysched++;
			delayedscheds++;
			return;
		}
		up->delaysched = 0;

		splhi();

		/* statistics */
		m->cs++;

		procsave(up);
		/* setlabel returns nonzero when we are resumed via gotolabel */
		if(setlabel(&up->sched)){
			procrestore(up);
			spllo();
			return;
		}
		gotolabel(&m->sched);	/* jump into schedinit's scheduler loop */
	}
	p = runproc();
	if(!p->edf){
		updatecpu(p);
		p->priority = reprioritize(p);
	}
	/* a cooperatively readied proc inherits the remaining quantum */
	if(p != m->readied)
		m->schedticks = m->ticks + HZ/10;
	m->readied = nil;
	up = p;
	up->state = Running;
	up->mach = m;
	m->proc = up;
	mmuswitch(up);
	gotolabel(&up->sched);	/* resume the chosen process */
}
170 
int
anyready(void)
{
	/* nonzero when any process is ready; returns the runq bit vector itself */
	return runvec;
}
176 
int
anyhigher(void)
{
	/* nonzero when some ready proc has priority strictly above up's */
	return runvec & ~((1<<(up->priority+1))-1);
}
182 
183 /*
184  *  here once per clock tick to see if we should resched
185  */
186 void
hzsched(void)187 hzsched(void)
188 {
189 	/* once a second, rebalance will reprioritize ready procs */
190 	if(m->machno == 0)
191 		rebalance();
192 
193 	/* unless preempted, get to run for at least 100ms */
194 	if(anyhigher()
195 	|| (!up->fixedpri && m->ticks > m->schedticks && anyready())){
196 		m->readied = nil;	/* avoid cooperative scheduling */
197 		up->delaysched++;
198 	}
199 }
200 
201 /*
202  *  here at the end of non-clock interrupts to see if we should preempt the
203  *  current process.  Returns 1 if preempted, 0 otherwise.
204  */
205 int
preempted(void)206 preempted(void)
207 {
208 	if(up && up->state == Running)
209 	if(up->preempted == 0)
210 	if(anyhigher())
211 	if(!active.exiting){
212 		m->readied = nil;	/* avoid cooperative scheduling */
213 		up->preempted = 1;
214 		sched();
215 		splhi();
216 		up->preempted = 0;
217 		return 1;
218 	}
219 	return 0;
220 }
221 
222 /*
223  * Update the cpu time average for this particular process,
224  * which is about to change from up -> not up or vice versa.
225  * p->lastupdate is the last time an updatecpu happened.
226  *
227  * The cpu time average is a decaying average that lasts
228  * about D clock ticks.  D is chosen to be approximately
229  * the cpu time of a cpu-intensive "quick job".  A job has to run
230  * for approximately D clock ticks before we home in on its
231  * actual cpu usage.  Thus if you manage to get in and get out
232  * quickly, you won't be penalized during your burst.  Once you
233  * start using your share of the cpu for more than about D
234  * clock ticks though, your p->cpu hits 1000 (1.0) and you end up
235  * below all the other quick jobs.  Interactive tasks, because
236  * they basically always use less than their fair share of cpu,
237  * will be rewarded.
238  *
239  * If the process has not been running, then we want to
240  * apply the filter
241  *
242  *	cpu = cpu * (D-1)/D
243  *
244  * n times, yielding
245  *
246  *	cpu = cpu * ((D-1)/D)^n
247  *
248  * but D is big enough that this is approximately
249  *
250  * 	cpu = cpu * (D-n)/D
251  *
252  * so we use that instead.
253  *
254  * If the process has been running, we apply the filter to
255  * 1 - cpu, yielding a similar equation.  Note that cpu is
256  * stored in fixed point (* 1000).
257  *
258  * Updatecpu must be called before changing up, in order
259  * to maintain accurate cpu usage statistics.  It can be called
260  * at any time to bring the stats for a given proc up-to-date.
261  */
262 static void
updatecpu(Proc * p)263 updatecpu(Proc *p)
264 {
265 	int D, n, t, ocpu;
266 
267 	if(p->edf)
268 		return;
269 
270 	t = sys->ticks*Scaling + Scaling/2;
271 	n = t - p->lastupdate;
272 	p->lastupdate = t;
273 
274 	if(n == 0)
275 		return;
276 	D = schedgain*HZ*Scaling;
277 	if(n > D)
278 		n = D;
279 
280 	ocpu = p->cpu;
281 	if(p != up)
282 		p->cpu = (ocpu*(D-n))/D;
283 	else{
284 		t = 1000 - ocpu;
285 		t = (t*(D-n))/D;
286 		p->cpu = 1000 - t;
287 	}
288 
289 //iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
290 }
291 
292 /*
293  * On average, p has used p->cpu of a cpu recently.
294  * Its fair share is sys.nonline/m->load of a cpu.  If it has been getting
295  * too much, penalize it.  If it has been getting not enough, reward it.
296  * I don't think you can get much more than your fair share that
297  * often, so most of the queues are for using less.  Having a priority
298  * of 3 means you're just right.  Having a higher priority (up to p->basepri)
299  * means you're not using as much as you could.
300  */
301 static int
reprioritize(Proc * p)302 reprioritize(Proc *p)
303 {
304 	int fairshare, n, load, ratio;
305 
306 	load = sys->machptr[0]->load;
307 	if(load == 0)
308 		return p->basepri;
309 
310 	/*
311 	 *  fairshare = 1.000 * PROCMAX * 1.000/load,
312 	 * except the decimal point is moved three places
313 	 * on both load and fairshare.
314 	 */
315 	fairshare = (sys->nonline*1000*1000)/load;
316 	n = p->cpu;
317 	if(n == 0)
318 		n = 1;
319 	ratio = (fairshare+n/2) / n;
320 	if(ratio > p->basepri)
321 		ratio = p->basepri;
322 	if(ratio < 0)
323 		panic("reprioritize");
324 //iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
325 	return ratio;
326 }
327 
328 /*
329  * add a process to a scheduling queue
330  */
331 void
queueproc(Schedq * rq,Proc * p)332 queueproc(Schedq *rq, Proc *p)
333 {
334 	int pri;
335 
336 	pri = rq - runq;
337 	lock(runq);
338 	p->priority = pri;
339 	p->rnext = 0;
340 	if(rq->tail)
341 		rq->tail->rnext = p;
342 	else
343 		rq->head = p;
344 	rq->tail = p;
345 	rq->n++;
346 	nrdy++;
347 	runvec |= 1<<pri;
348 	unlock(runq);
349 }
350 
351 /*
352  *  try to remove a process from a scheduling queue (called splhi)
353  */
354 Proc*
dequeueproc(Schedq * rq,Proc * tp)355 dequeueproc(Schedq *rq, Proc *tp)
356 {
357 	Proc *l, *p;
358 
359 	if(!canlock(runq))
360 		return nil;
361 
362 	/*
363 	 *  the queue may have changed before we locked runq,
364 	 *  refind the target process.
365 	 */
366 	l = 0;
367 	for(p = rq->head; p; p = p->rnext){
368 		if(p == tp)
369 			break;
370 		l = p;
371 	}
372 
373 	/*
374 	 *  p->mach==0 only when process state is saved
375 	 */
376 	if(p == nil || p->mach){
377 		unlock(runq);
378 		return nil;
379 	}
380 	if(p->rnext == nil)
381 		rq->tail = l;
382 	if(l)
383 		l->rnext = p->rnext;
384 	else
385 		rq->head = p->rnext;
386 	if(rq->head == nil)
387 		runvec &= ~(1<<(rq-runq));
388 	rq->n--;
389 	nrdy--;
390 	if(p->state != Ready)
391 		print("dequeueproc %s %d %s\n", p->text, p->pid, statename[p->state]);
392 
393 	unlock(runq);
394 	return p;
395 }
396 
397 /*
398  *  ready(p) picks a new priority for a process and sticks it in the
399  *  runq for that priority.
400  */
401 void
ready(Proc * p)402 ready(Proc *p)
403 {
404 	Mreg s;
405 	int pri;
406 	Schedq *rq;
407 	void (*pt)(Proc*, int, vlong, vlong);
408 
409 	s = splhi();
410 	if(edfready(p)){
411 		splx(s);
412 		return;
413 	}
414 
415 	if(up != p && (p->wired == nil || p->wired == m))
416 		m->readied = p;	/* group scheduling */
417 
418 	updatecpu(p);
419 	pri = reprioritize(p);
420 	p->priority = pri;
421 	rq = &runq[pri];
422 	p->state = Ready;
423 	queueproc(rq, p);
424 	pt = proctrace;
425 	if(pt)
426 		pt(p, SReady, 0, 0);
427 	splx(s);
428 }
429 
430 /*
431  *  yield the processor and drop our priority
432  */
433 void
yield(void)434 yield(void)
435 {
436 	if(anyready()){
437 		/* pretend we just used 1/2 tick */
438 		up->lastupdate -= Scaling/2;
439 		sched();
440 	}
441 }
442 
443 /*
444  *  recalculate priorities once a second.  We need to do this
445  *  since priorities will otherwise only be recalculated when
446  *  the running process blocks.
447  */
448 static void
rebalance(void)449 rebalance(void)
450 {
451 	Mreg s;
452 	int pri, npri, t;
453 	Schedq *rq;
454 	Proc *p;
455 
456 	t = m->ticks;
457 	if(t - balancetime < HZ)
458 		return;
459 	balancetime = t;
460 
461 	for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
462 another:
463 		p = rq->head;
464 		if(p == nil)
465 			continue;
466 		if(p->mp != m)
467 			continue;
468 		if(pri == p->basepri)
469 			continue;
470 		updatecpu(p);
471 		npri = reprioritize(p);
472 		if(npri != pri){
473 			s = splhi();
474 			p = dequeueproc(rq, p);
475 			if(p)
476 				queueproc(&runq[npri], p);
477 			splx(s);
478 			goto another;
479 		}
480 	}
481 }
482 
483 
484 /*
485  *  pick a process to run
486  */
Proc*
runproc(void)
{
	Schedq *rq;
	Proc *p;
	ulong start, now;
	int i;
	void (*pt)(Proc*, int, vlong, vlong);

	start = perfticks();

	/* cooperative scheduling until the clock ticks */
	/* prefer the proc this cpu just readied, unless real-time work is queued */
	if((p=m->readied) && p->mach==0 && p->state==Ready
	&& (p->wired == nil || p->wired == m)
	&& runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
		skipscheds++;
		rq = &runq[p->priority];
		goto found;
	}

	preempts++;

loop:
	/*
	 *  find a process that last ran on this processor (affinity),
	 *  or one that hasn't moved in a while (load balancing).  Every
	 *  time around the loop affinity goes down.
	 */
	spllo();
	for(i = 0;; i++){
		/*
		 *  find the highest priority target process that this
		 *  processor can run given affinity constraints.
		 *
		 */
		for(rq = &runq[Nrq-1]; rq >= runq; rq--){
			for(p = rq->head; p; p = p->rnext){
				/* i > 0 relaxes affinity: steal unwired procs */
				if(p->mp == nil || p->mp == m
				|| (!p->wired && i > 0))
					goto found;
			}
		}

		/* waste time or halt the CPU */
		idlehands();

		/* remember how much time we're here */
		now = perfticks();
		m->perf.inidle += now-start;
		start = now;
	}

found:
	splhi();
	/* another cpu may have taken p first; if so, scan again */
	p = dequeueproc(rq, p);
	if(p == nil)
		goto loop;

	p->state = Scheding;
	p->mp = m;

	if(edflock(p)){
		edfrun(p, rq == &runq[PriEdf]);	/* start deadline timer and do admin */
		edfunlock();
	}
	pt = proctrace;
	if(pt)
		pt(p, SRun, 0, 0);
	return p;
}
557 
558 int
canpage(Proc * p)559 canpage(Proc *p)
560 {
561 	int ok;
562 
563 	splhi();
564 	lock(runq);
565 	/* Only reliable way to see if we are Running */
566 	if(p->mach == 0) {
567 		p->newtlb = 1;
568 		ok = 1;
569 	}
570 	else
571 		ok = 0;
572 	unlock(runq);
573 	spllo();
574 
575 	return ok;
576 }
577 
/*
 * allocate and initialize a fresh process structure; the caller
 * fills in segments, groups and starts it with ready().
 */
Proc*
newproc(void)
{
	Proc *p;

	p = psalloc();

	p->state = Scheding;
	p->psstate = "New";
	p->mach = nil;
	p->qnext = nil;
	p->nchild = 0;
	p->nwait = 0;
	p->waitq = nil;
	p->parent = nil;
	p->pgrp = nil;
	p->egrp = nil;
	p->fgrp = nil;
	p->rgrp = nil;
	p->pdbg = 0;
	p->kp = 0;
	/* inherit syscall tracing from the creating process */
	if(up != nil && up->procctl == Proc_tracesyscall)
		p->procctl = Proc_tracesyscall;
	else
		p->procctl = 0;
	p->syscalltrace = nil;
	p->notepending = 0;
	p->ureg = nil;
	p->privatemem = 0;
	p->errstr = p->errbuf0;
	p->syserrstr = p->errbuf1;
	p->errbuf0[0] = '\0';
	p->errbuf1[0] = '\0';
	p->nlocks = 0;
	p->delaysched = 0;
	p->trace = 0;
	kstrdup(&p->user, "*nouser");
	kstrdup(&p->text, "*notext");
	kstrdup(&p->args, "");
	p->nargs = 0;
	p->setargs = 0;
	memset(p->seg, 0, sizeof p->seg);
	/* assign the pid before hashing so the proc is findable by pid */
	p->pid = incref(&pidalloc);
	pshash(p);
	p->noteid = incref(&noteidalloc);
	if(p->pid <= 0 || p->noteid <= 0)
		panic("pidalloc");
	/* kernel stacks are reused across psalloc/psrelease cycles */
	if(p->kstack == nil)
		p->kstack = smalloc(KSTACK);

	/* sched params */
	p->mp = nil;
	p->wired = nil;
	procpriority(p, PriNormal, 0);
	p->cpu = 0;
	p->lastupdate = sys->ticks*Scaling;
	p->edf = nil;

	return p;
}
638 
639 /*
640  * wire this proc to a machine
641  */
642 void
procwired(Proc * p,int bm)643 procwired(Proc *p, int bm)
644 {
645 	Proc *pp;
646 	int i;
647 	char nwired[MACHMAX];
648 	Mach *wm, *mp;
649 
650 	if(bm < 0){
651 		/* pick a machine to wire to */
652 		memset(nwired, 0, sizeof(nwired));
653 		p->wired = nil;
654 		for(i=0; (pp = psincref(i)) != nil; i++){
655 			wm = pp->wired;
656 			if(wm && pp->pid)
657 				nwired[wm->machno]++;
658 			psdecref(pp);
659 		}
660 		bm = 0;
661 		for(i=0; i<MACHMAX; i++){
662 			if((mp = sys->machptr[i]) == nil || !mp->online)
663 				continue;
664 			if(nwired[i] < nwired[bm])
665 				bm = i;
666 		}
667 	} else {
668 		/* use the virtual machine requested */
669 		bm = bm % MACHMAX;
670 	}
671 
672 	p->wired = sys->machptr[bm];
673 	p->mp = p->wired;
674 }
675 
676 void
procpriority(Proc * p,int pri,int fixed)677 procpriority(Proc *p, int pri, int fixed)
678 {
679 	if(pri >= Npriq)
680 		pri = Npriq - 1;
681 	else if(pri < 0)
682 		pri = 0;
683 	p->basepri = pri;
684 	p->priority = pri;
685 	if(fixed){
686 		p->fixedpri = 1;
687 	} else {
688 		p->fixedpri = 0;
689 	}
690 }
691 
692 /*
693  *  sleep if a condition is not true.  Another process will
694  *  awaken us after it sets the condition.  When we awaken
695  *  the condition may no longer be true.
696  *
697  *  we lock both the process and the rendezvous to keep r->p
698  *  and p->r synchronized.
699  */
void
sleep(Rendez *r, int (*f)(void*), void *arg)
{
	Mreg s;
	void (*pt)(Proc*, int, vlong, vlong);

	s = splhi();

	/* sleeping with a spin lock held risks deadlocking the waker */
	if(up->nlocks)
		print("process %d sleeps with %d locks held, last lock %#p locked at pc %#p, sleep called from %#p\n",
			up->pid, up->nlocks, up->lastlock, up->lastlock->pc, getcallerpc(&r));
	lock(r);
	lock(&up->rlock);
	if(r->p){
		print("double sleep called from %#p, %d %d\n",
			getcallerpc(&r), r->p->pid, up->pid);
		dumpstack();
	}

	/*
	 *  Wakeup only knows there may be something to do by testing
	 *  r->p in order to get something to lock on.
	 *  Flush that information out to memory in case the sleep is
	 *  committed.
	 */
	r->p = up;

	if((*f)(arg) || up->notepending){
		/*
		 *  if condition happened or a note is pending
		 *  never mind
		 */
		r->p = nil;
		unlock(&up->rlock);
		unlock(r);
	} else {
		/*
		 *  now we are committed to
		 *  change state and call scheduler
		 */
		pt = proctrace;
		if(pt)
			pt(up, SSleep, 0, 0);
		up->state = Wakeme;
		up->r = r;

		/* statistics */
		m->cs++;

		procsave(up);
		/* this is the guts of sched(); see the comment there */
		if(setlabel(&up->sched)) {
			/*
			 *  here when the process is awakened
			 */
			procrestore(up);
			spllo();
		} else {
			/*
			 *  here to go to sleep (i.e. stop Running)
			 */
			unlock(&up->rlock);
			unlock(r);
			gotolabel(&m->sched);
		}
	}

	/* a posted note interrupts the sleep; report it as Eintr */
	if(up->notepending) {
		up->notepending = 0;
		splx(s);
		if(up->procctl == Proc_exitme && up->closingfgrp)
			forceclosefgrp();
		error(Eintr);
	}

	splx(s);
}
776 
static int
tfn(void *arg)
{
	/* sleep condition for tsleep: timer fired (trend cleared) or caller's fn holds */
	return up->trend == nil || up->tfn(arg);
}
782 
783 void
twakeup(Ureg *,Timer * t)784 twakeup(Ureg*, Timer *t)
785 {
786 	Proc *p;
787 	Rendez *trend;
788 
789 	p = t->ta;
790 	trend = p->trend;
791 	p->trend = 0;
792 	if(trend)
793 		wakeup(trend);
794 }
795 
/*
 * sleep as in sleep(), but wake up after at most ms milliseconds
 * even if the condition never becomes true.
 */
void
tsleep(Rendez *r, int (*fn)(void*), void *arg, long ms)
{
	if (up->tt){
		print("tsleep: timer active: mode %d, tf %#p\n",
			up->tmode, up->tf);
		timerdel(up);
	}
	up->tns = MS2NS(ms);
	up->tf = twakeup;	/* clears trend, making tfn true */
	up->tmode = Trelative;
	up->ta = up;
	up->trend = r;
	up->tfn = fn;
	timeradd(up);

	/* make sure the timer is removed even if sleep errors out */
	if(waserror()){
		timerdel(up);
		nexterror();
	}
	sleep(r, tfn, arg);
	if (up->tt)
		timerdel(up);
	up->twhen = 0;
	poperror();
}
822 
823 /*
824  *  Expects that only one process can call wakeup for any given Rendez.
825  *  We hold both locks to ensure that r->p and p->r remain consistent.
826  *  Richard Miller has a better solution that doesn't require both to
827  *  be held simultaneously, but I'm a paranoid - presotto.
828  */
829 Proc*
wakeup(Rendez * r)830 wakeup(Rendez *r)
831 {
832 	Mreg s;
833 	Proc *p;
834 
835 	s = splhi();
836 
837 	lock(r);
838 	p = r->p;
839 
840 	if(p != nil){
841 		lock(&p->rlock);
842 		if(p->state != Wakeme || p->r != r)
843 			panic("wakeup: state");
844 		r->p = nil;
845 		p->r = nil;
846 		ready(p);
847 		unlock(&p->rlock);
848 	}
849 	unlock(r);
850 
851 	splx(s);
852 
853 	return p;
854 }
855 
856 /*
857  *  if waking a sleeping process, this routine must hold both
858  *  p->rlock and r->lock.  However, it can't know them in
859  *  the same order as wakeup causing a possible lock ordering
860  *  deadlock.  We break the deadlock by giving up the p->rlock
861  *  lock if we can't get the r->lock and retrying.
862  */
int
postnote(Proc *p, int dolock, char *n, int flag)
{
	Mreg s;
	int ret;
	Rendez *r;
	Proc *d, **l;

	if(dolock && m->ilockdepth != 0)
		panic("postnote: caller %#p: ilockdepth %d note %s",
			getcallerpc(&p), m->ilockdepth, n);
	if(dolock)
		qlock(&p->debug);

	/* kernel notes to an unnotifiable proc flush any queued user notes */
	if(flag != NUser && (p->notify == nil || p->notified))
		p->nnote = 0;

	ret = 0;
	if(p->nnote < NNOTE) {
		strcpy(p->note[p->nnote].msg, n);
		p->note[p->nnote++].flag = flag;
		ret = 1;
	}
	p->notepending = 1;
	if(dolock)
		qunlock(&p->debug);

	/* this loop is to avoid lock ordering problems. */
	for(;;){
		s = splhi();
		lock(&p->rlock);
		r = p->r;

		/* waiting for a wakeup? */
		if(r == nil)
			break;	/* no */

		/* try for the second lock */
		if(canlock(r)){
			if(p->state != Wakeme || r->p != p)
				panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
			p->r = nil;
			r->p = nil;
			ready(p);
			unlock(r);
			break;
		}

		/* give other process time to get out of critical section and try again */
		unlock(&p->rlock);
		splx(s);
		sched();
	}
	/* both break paths above leave p->rlock held and s raised */
	unlock(&p->rlock);
	splx(s);

	if(p->state != Rendezvous)
		return ret;

	/* Try and pull out of a rendezvous */
	lock(p->rgrp);
	if(p->state == Rendezvous) {
		p->rendval = ~0;	/* rendezvous fails with this value */
		l = &REND(p->rgrp, p->rendtag);
		for(d = *l; d; d = d->rendhash) {
			if(d == p) {
				*l = p->rendhash;
				break;
			}
			l = &d->rendhash;
		}
		ready(p);
	}
	unlock(p->rgrp);
	return ret;
}
939 
940 /*
941  * weird thing: keep at most NBROKEN around
942  */
#define	NBROKEN 4
/* table of the most recently broken (crashed) procs, kept around for debugging */
struct
{
	QLock;
	int	n;		/* procs currently held in p[] */
	Proc	*p[NBROKEN];
}broken;
950 
951 void
addbroken(Proc * p)952 addbroken(Proc *p)
953 {
954 	qlock(&broken);
955 	if(broken.n == NBROKEN) {
956 		ready(broken.p[0]);
957 		memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
958 		--broken.n;
959 	}
960 	broken.p[broken.n++] = p;
961 	qunlock(&broken);
962 
963 	edfstop(up);
964 	p->state = Broken;
965 	p->psstate = 0;
966 	sched();
967 }
968 
969 void
unbreak(Proc * p)970 unbreak(Proc *p)
971 {
972 	int b;
973 
974 	qlock(&broken);
975 	for(b=0; b < broken.n; b++)
976 		if(broken.p[b] == p) {
977 			broken.n--;
978 			memmove(&broken.p[b], &broken.p[b+1],
979 					sizeof(Proc*)*(NBROKEN-(b+1)));
980 			ready(p);
981 			break;
982 		}
983 	qunlock(&broken);
984 }
985 
986 int
freebroken(void)987 freebroken(void)
988 {
989 	int i, n;
990 
991 	qlock(&broken);
992 	n = broken.n;
993 	for(i=0; i<n; i++) {
994 		ready(broken.p[i]);
995 		broken.p[i] = 0;
996 	}
997 	broken.n = 0;
998 	qunlock(&broken);
999 	return n;
1000 }
1001 
/*
 * terminate the current process: leave a wait record for the parent,
 * release all resources, then sched() away as Moribund (schedinit
 * finishes the teardown).  Never returns.
 */
void
pexit(char *exitstr, int freemem)
{
	Proc *p;
	Segment **s, **es;
	long utime, stime;
	Waitq *wq, *f, *next;
	Fgrp *fgrp;
	Egrp *egrp;
	Rgrp *rgrp;
	Pgrp *pgrp;
	Chan *dot;
	void (*pt)(Proc*, int, vlong, vlong);

	if(up->syscalltrace != nil)
		free(up->syscalltrace);
	up->syscalltrace = nil;
	up->alarm = 0;
	if (up->tt)
		timerdel(up);
	pt = proctrace;
	if(pt)
		pt(up, SDead, 0, 0);

	/* nil out all the resources under lock (free later) */
	qlock(&up->debug);
	fgrp = up->fgrp;
	up->fgrp = nil;
	egrp = up->egrp;
	up->egrp = nil;
	rgrp = up->rgrp;
	up->rgrp = nil;
	pgrp = up->pgrp;
	up->pgrp = nil;
	dot = up->dot;
	up->dot = nil;
	qunlock(&up->debug);

	if(fgrp)
		closefgrp(fgrp);
	if(egrp)
		closeegrp(egrp);
	if(rgrp)
		closergrp(rgrp);
	if(dot)
		cclose(dot);
	if(pgrp)
		closepgrp(pgrp);

	/*
	 * if not a kernel process and have a parent,
	 * do some housekeeping.
	 */
	if(up->kp == 0) {
		p = up->parent;
		if(p == nil) {
			if(exitstr == nil)
				exitstr = "unknown";
			panic("boot process died: %s", exitstr);
		}

		/* keep retrying the allocation; we cannot fail out of exit */
		while(waserror())
			;

		wq = smalloc(sizeof(Waitq));
		poperror();

		wq->w.pid = up->pid;
		utime = up->time[TUser] + up->time[TCUser];
		stime = up->time[TSys] + up->time[TCSys];
		wq->w.time[TUser] = tk2ms(utime);
		wq->w.time[TSys] = tk2ms(stime);
		wq->w.time[TReal] = tk2ms(sys->ticks - up->time[TReal]);
		if(exitstr && exitstr[0])
			snprint(wq->w.msg, sizeof(wq->w.msg), "%s %d: %s",
				up->text, up->pid, exitstr);
		else
			wq->w.msg[0] = '\0';

		lock(&p->exl);
		/*
		 * Check that parent is still alive.
		 */
		if(p->pid == up->parentpid && p->state != Broken) {
			p->nchild--;
			p->time[TCUser] += utime;
			p->time[TCSys] += stime;
			/*
			 * If there would be more than 128 wait records
			 * processes for my parent, then don't leave a wait
			 * record behind.  This helps prevent badly written
			 * daemon processes from accumulating lots of wait
			 * records.
		 	 */
			if(p->nwait < 128) {
				wq->next = p->waitq;
				p->waitq = wq;
				p->nwait++;
				wq = nil;
				wakeup(&p->waitr);
			}
		}
		unlock(&p->exl);
		if(wq)
			free(wq);
	}

	/* a broken proc is parked (addbroken scheds away) instead of freed */
	if(!freemem)
		addbroken(up);

	qlock(&up->seglock);
	es = &up->seg[NSEG];
	for(s = up->seg; s < es; s++) {
		if(*s) {
			putseg(*s);
			*s = 0;
		}
	}
	qunlock(&up->seglock);

	lock(&up->exl);		/* Prevent my children from leaving waits */
	psunhash(up);
	up->pid = 0;
	wakeup(&up->waitr);
	unlock(&up->exl);

	/* discard wait records my own children left behind */
	for(f = up->waitq; f; f = next) {
		next = f->next;
		free(f);
	}

	/* release debuggers */
	qlock(&up->debug);
	if(up->pdbg) {
		wakeup(&up->pdbg->sleep);
		up->pdbg = 0;
	}
	qunlock(&up->debug);

	edfstop(up);
	if(up->edf != nil){
		free(up->edf);
		up->edf = nil;
	}

	/* Sched must not loop for these locks */
	lock(&procalloc);
	lock(&palloc);

	/* schedinit unlocks palloc and procalloc after mmurelease/psrelease */
	up->state = Moribund;
	sched();
	panic("pexit");
}
1155 
1156 int
haswaitq(void * x)1157 haswaitq(void *x)
1158 {
1159 	Proc *p;
1160 
1161 	p = (Proc *)x;
1162 	return p->waitq != 0;
1163 }
1164 
/*
 * wait for a child to exit; fills *w (if non-nil) from the child's
 * wait record and returns the child's pid.  Errors with Einuse if
 * another pwait is in progress, Enochild if there is nothing to wait for.
 */
int
pwait(Waitmsg *w)
{
	int cpid;
	Waitq *wq;

	if(!canqlock(&up->qwaitr))
		error(Einuse);

	if(waserror()) {
		qunlock(&up->qwaitr);
		nexterror();
	}

	lock(&up->exl);
	if(up->nchild == 0 && up->waitq == nil) {
		unlock(&up->exl);
		error(Enochild);
	}
	unlock(&up->exl);

	sleep(&up->waitr, haswaitq, up);

	/* pop the first wait record under exl */
	lock(&up->exl);
	wq = up->waitq;
	up->waitq = wq->next;
	up->nwait--;
	unlock(&up->exl);

	qunlock(&up->qwaitr);
	poperror();

	if(w)
		memmove(w, &wq->w, sizeof(Waitmsg));
	cpid = wq->w.pid;
	free(wq);

	return cpid;
}
1204 
/*
 * print a one-line summary of p on the console.
 */
void
dumpaproc(Proc *p)
{
	uintptr bss;
	char *s;

	if(p == nil)
		return;

	bss = 0;
	if(p->seg[BSEG])
		bss = p->seg[BSEG]->top;

	/* prefer the transient state string (e.g. "New"), else the scheduler state */
	s = p->psstate;
	if(s == nil)
		s = statename[p->state];
	print("%3d:%10s pc %#p dbgpc %#p  %8s (%s) ut %ld st %ld bss %#p qpc %#p nl %d nd %lud lpc %#p pri %lud\n",
		p->pid, p->text, p->pc, dbgpc(p), s, statename[p->state],
		p->time[0], p->time[1], bss, p->qpc, p->nlocks,
		p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority);
}
1226 
1227 void
procdump(void)1228 procdump(void)
1229 {
1230 	int i;
1231 	Proc *p;
1232 
1233 	if(up != nil)
1234 		print("up %d\n", up->pid);
1235 	else
1236 		print("no current process\n");
1237 	for(i=0; (p = psincref(i)) != nil; i++) {
1238 		if(p->state != Dead)
1239 			dumpaproc(p);
1240 		psdecref(p);
1241 	}
1242 }
1243 
1244 /*
1245  *  wait till all processes have flushed their mmu
1246  *  state about segment s
1247  */
1248 void
procflushseg(Segment * s)1249 procflushseg(Segment *s)
1250 {
1251 	int i, ns, nm, nwait;
1252 	Proc *p;
1253 	Mach *mp;
1254 
1255 	/*
1256 	 *  tell all processes with this
1257 	 *  segment to flush their mmu's
1258 	 */
1259 	nwait = 0;
1260 	for(i=0; (p = psincref(i)) != nil; i++) {
1261 		if(p->state == Dead){
1262 			psdecref(p);
1263 			continue;
1264 		}
1265 		for(ns = 0; ns < NSEG; ns++){
1266 			if(p->seg[ns] == s){
1267 				p->newtlb = 1;
1268 				for(nm = 0; nm < MACHMAX; nm++){
1269 					if((mp = sys->machptr[nm]) == nil || !mp->online)
1270 						continue;
1271 					if(mp->proc == p){
1272 						mp->mmuflush = 1;
1273 						nwait++;
1274 					}
1275 				}
1276 				break;
1277 			}
1278 		}
1279 		psdecref(p);
1280 	}
1281 
1282 	if(nwait == 0)
1283 		return;
1284 
1285 	/*
1286 	 *  wait for all processors to take a clock interrupt
1287 	 *  and flush their mmu's
1288 	 */
1289 	for(i = 0; i < MACHMAX; i++){
1290 		if((mp = sys->machptr[i]) == nil || !mp->online || mp == m)
1291 			continue;
1292 		while(mp->mmuflush)
1293 			sched();
1294 	}
1295 }
1296 
1297 void
scheddump(void)1298 scheddump(void)
1299 {
1300 	Proc *p;
1301 	Schedq *rq;
1302 
1303 	for(rq = &runq[Nrq-1]; rq >= runq; rq--){
1304 		if(rq->head == 0)
1305 			continue;
1306 		print("rq%ld:", rq-runq);
1307 		for(p = rq->head; p; p = p->rnext)
1308 			print(" %d(%lud)", p->pid, m->ticks - p->readytime);
1309 		print("\n");
1310 		delay(150);
1311 	}
1312 	print("nrdy %d\n", nrdy);
1313 }
1314 
/*
 * create a kernel process running func(arg), inheriting namespace
 * pieces and notes from the current process; all kprocs share one
 * process group.
 */
void
kproc(char *name, void (*func)(void *), void *arg)
{
	Proc *p;
	static Pgrp *kpgrp;

	p = newproc();
	p->psstate = 0;
	p->procmode = 0640;
	p->kp = 1;

	p->scallnr = up->scallnr;
	memmove(p->arg, up->arg, sizeof(up->arg));
	p->nerrlab = 0;
	p->slash = up->slash;
	p->dot = up->dot;
	if(p->dot)
		incref(p->dot);

	memmove(p->note, up->note, sizeof(p->note));
	p->nnote = up->nnote;
	p->notified = 0;
	p->lastnote = up->lastnote;
	p->notify = up->notify;
	p->ureg = nil;
	p->dbgreg = nil;

	procpriority(p, PriKproc, 0);

	/* set up the kernel stack so p starts in func(arg) */
	kprocchild(p, func, arg);

	kstrdup(&p->user, eve);
	kstrdup(&p->text, name);
	/* lazily create the single pgrp shared by all kprocs */
	if(kpgrp == nil)
		kpgrp = newpgrp();
	p->pgrp = kpgrp;
	incref(kpgrp);

	memset(p->time, 0, sizeof(p->time));
	p->time[TReal] = sys->ticks;
	ready(p);	/* must be last: p may run immediately */
}
1357 
1358 /*
1359  *  called splhi() by notify().  See comment in notify for the
1360  *  reasoning.
1361  */
void
procctl(Proc *p)
{
	Mreg s;
	char *state;

	switch(p->procctl) {
	case Proc_exitbig:
		spllo();
		pexit("Killed: Insufficient physical memory", 1);

	case Proc_exitme:
		spllo();		/* pexit has locks in it */
		pexit("Killed", 1);

	case Proc_traceme:
		/* only stop when there is a note for the debugger to inspect */
		if(p->nnote == 0)
			return;
		/* No break */

	case Proc_stopme:
		p->procctl = 0;
		state = p->psstate;
		p->psstate = "Stopped";
		/* free a waiting debugger */
		s = spllo();
		qlock(&p->debug);
		if(p->pdbg) {
			wakeup(&p->pdbg->sleep);
			p->pdbg = 0;
		}
		qunlock(&p->debug);
		splhi();
		p->state = Stopped;
		sched();	/* stays Stopped until a debugger readies us */
		p->psstate = state;
		splx(s);
		return;
	}
}
1402 
/*
 * raise an error: record err in up->errstr and unwind to the most
 * recent waserror() frame.  Does not return.
 */
void
error(char *err)
{
	spllo();

	assert(up->nerrlab < NERR);
	kstrcpy(up->errstr, err, ERRMAX);
	/* save the raise point in the spare errlab slot, for debugging */
	setlabel(&up->errlab[NERR-1]);
	nexterror();
}
1413 
void
nexterror(void)
{
	/* unwind to the next-outer waserror() frame; does not return */
	gotolabel(&up->errlab[--up->nerrlab]);
}
1419 
/*
 * report a resource shortage on the console and raise it as an error.
 */
void
exhausted(char *resource)
{
	char buf[ERRMAX];

	snprint(buf, sizeof buf, "no free %s", resource);
	iprint("%s\n", buf);
	error(buf);
}
1429 
/*
 * out-of-memory killer: find the largest killable user process,
 * mark it (and any process sharing its bss) for exit, and reclaim
 * its segment pages immediately.
 */
void
killbig(char *why)
{
	int i, x;
	Segment *s;
	ulong l, max;
	Proc *p, *kp;

	/* pass 1: find the process with the largest total segment size */
	max = 0;
	kp = nil;
	for(x = 0; (p = psincref(x)) != nil; x++) {
		if(p->state == Dead || p->kp){
			psdecref(p);
			continue;
		}
		l = 0;
		for(i=1; i<NSEG; i++) {
			s = p->seg[i];
			if(s != 0)
				l += s->top - s->base;
		}
		/* never pick an unwritable proc owned by eve */
		if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) {
			if(kp != nil)
				psdecref(kp);
			kp = p;	/* keep the ref on the current candidate */
			max = l;
		}
		else
			psdecref(p);
	}
	if(kp == nil)
		return;

	print("%d: %s killed: %s\n", kp->pid, kp->text, why);
	/* pass 2: also kill anyone sharing the victim's bss segment */
	for(x = 0; (p = psincref(x)) != nil; x++) {
		if(p->state == Dead || p->kp){
			psdecref(p);
			continue;
		}
		if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG])
			p->procctl = Proc_exitbig;
		psdecref(p);
	}

	kp->procctl = Proc_exitbig;
	/* reclaim pages now rather than waiting for the proc to exit */
	for(i = 0; i < NSEG; i++) {
		s = kp->seg[i];
		if(s != nil && canqlock(&s->lk)) {
			mfreeseg(s, s->base, s->top);
			qunlock(&s->lk);
		}
	}
	psdecref(kp);
}
1484 
1485 /*
1486  *  change ownership to 'new' of all processes owned by 'old'.  Used when
1487  *  eve changes.
1488  */
1489 void
renameuser(char * old,char * new)1490 renameuser(char *old, char *new)
1491 {
1492 	int i;
1493 	Proc *p;
1494 
1495 	for(i = 0; (p = psincref(i)) != nil; i++){
1496 		if(p->user!=nil && strcmp(old, p->user)==0)
1497 			kstrdup(&p->user, new);
1498 		psdecref(p);
1499 	}
1500 }
1501 
1502 /*
1503  *  time accounting called by clock() splhi'd
1504  */
void
accounttime(void)
{
	Proc *p;
	ulong n, per;
	static ulong nrun;	/* ticks with a proc running since last load calc */

	p = m->proc;
	if(p != nil) {
		nrun++;
		/* charge this tick to user or system time per insyscall */
		p->time[p->insyscall]++;
	}

	/* calculate decaying duty cycles */
	n = perfticks();
	per = n - m->perf.last;
	m->perf.last = n;
	per = (m->perf.period*(HZ-1) + per)/HZ;
	if(per != 0)
		m->perf.period = per;

	m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
	m->perf.inidle = 0;

	m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
	m->perf.inintr = 0;

	/* only one processor gets to compute system load averages */
	if(m->machno != 0)
		return;

	/*
	 * calculate decaying load average.
	 * if we decay by (n-1)/n then it takes
	 * n clock ticks to go from load L to .36 L once
	 * things quiet down.  it takes about 5 n clock
	 * ticks to go to zero.  so using HZ means this is
	 * approximately the load over the last second,
	 * with a tail lasting about 5 seconds.
	 */
	n = nrun;
	nrun = 0;
	n = (nrdy+n)*1000;	/* load is in units of milli-procs */
	m->load = (m->load*(HZ-1)+n)/HZ;
}
1550 
1551