2 #include "../port/lib.h"
6 #include "../port/error.h"
10 int schedgain = 30; /* units in seconds */
14 void updatecpu(Proc*);
15 int reprioritize(Proc*);
17 ulong delayedscheds; /* statistics */
24 static struct Procalloc
43 { /* BUG: generate automatically */
59 static void pidhash(Proc*);
60 static void pidunhash(Proc*);
61 static void rebalance(void);
67 schedinit(void) /* never returns */
73 if((e = up->edf) && (e->flags & Admitted))
88 * Holding locks from pexit:
94 up->qnext = procalloc.free;
109 * If changing this routine, look also at sleep(). It
110 * contains a copy of the guts of sched().
118 panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
121 up? up->lastilock: nil,
122 (up && up->lastilock)? up->lastilock->pc: 0,
126 * Delay the sched until the process gives up the locks
127 * it is holding. This avoids dumb lock loops.
128 * Don't delay if the process is Moribund.
129 * It called sched to die.
130 * But do sched eventually. This avoids a missing unlock
131 * from hanging the entire kernel.
132 * But don't reschedule procs holding palloc or procalloc.
133 * Those are far too important to be holding while asleep.
135 * This test is not exact. There can still be a few instructions
136 * in the middle of taslock when a process holds a lock
137 * but Lock.p has not yet been initialized.
140 if(up->state != Moribund)
141 if(up->delaysched < 20
142 || palloc.Lock.p == up
143 || procalloc.Lock.p == up){
156 if(setlabel(&up->sched)){
161 gotolabel(&m->sched);
166 p->priority = reprioritize(p);
169 m->schedticks = m->ticks + HZ/10;
173 up->mach = MACHP(m->machno);
176 gotolabel(&up->sched);
188 return runvec & ~((1<<(up->priority+1))-1);
192 * here once per clock tick to see if we should resched
197 /* once a second, rebalance will reprioritize ready procs */
201 /* unless preempted, get to run for at least 100ms */
203 || (!up->fixedpri && m->ticks > m->schedticks && anyready())){
204 m->readied = nil; /* avoid cooperative scheduling */
210 * here at the end of non-clock interrupts to see if we should preempt the
211 * current process. Returns 1 if preempted, 0 otherwise.
216 if(up && up->state == Running)
217 if(up->preempted == 0)
220 m->readied = nil; /* avoid cooperative scheduling */
231 * Update the cpu time average for this particular process,
232 * which is about to change from up -> not up or vice versa.
233 * p->lastupdate is the last time an updatecpu happened.
235 * The cpu time average is a decaying average that lasts
236 * about D clock ticks. D is chosen to be approximately
237 * the cpu time of a cpu-intensive "quick job". A job has to run
238 * for approximately D clock ticks before we home in on its
239 * actual cpu usage. Thus if you manage to get in and get out
240 * quickly, you won't be penalized during your burst. Once you
241 * start using your share of the cpu for more than about D
242 * clock ticks though, your p->cpu hits 1000 (1.0) and you end up
243 * below all the other quick jobs. Interactive tasks, because
244 * they basically always use less than their fair share of cpu,
247 * If the process has not been running, then we want to
250 * cpu = cpu * (D-1)/D
254 * cpu = cpu * ((D-1)/D)^n
256 * but D is big enough that this is approximately
258 * cpu = cpu * (D-n)/D
260 * so we use that instead.
262 * If the process has been running, we apply the filter to
263 * 1 - cpu, yielding a similar equation. Note that cpu is
264 * stored in fixed point (* 1000).
266 * Updatecpu must be called before changing up, in order
267 * to maintain accurate cpu usage statistics. It can be called
268 * at any time to bring the stats for a given proc up-to-date.
274 int D = schedgain*HZ*Scaling;
279 t = MACHP(0)->ticks*Scaling + Scaling/2;
280 n = t - p->lastupdate;
290 p->cpu = (ocpu*(D-n))/D;
297 //iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
301 * On average, p has used p->cpu of a cpu recently.
302 * Its fair share is conf.nmach/m->load of a cpu. If it has been getting
303 * too much, penalize it. If it has been getting not enough, reward it.
304 * I don't think you can get much more than your fair share that
305 * often, so most of the queues are for using less. Having a priority
306 * of 3 means you're just right. Having a higher priority (up to p->basepri)
307 * means you're not using as much as you could.
310 reprioritize(Proc *p)
312 int fairshare, n, load, ratio;
314 load = MACHP(0)->load;
320 * fairshare = 1.000 * conf.nmach * 1.000/load,
320 * except the decimal point is moved three places
321 * on both load and fairshare.
323 fairshare = (conf.nmach*1000*1000)/load;
327 ratio = (fairshare+n/2) / n;
328 if(ratio > p->basepri)
331 panic("reprioritize");
332 //iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
337 * add a process to a scheduling queue
340 queueproc(Schedq *rq, Proc *p)
360 * try to remove a process from a scheduling queue (called splhi)
363 dequeueproc(Schedq *rq, Proc *tp)
371 * the queue may have changed before we locked runq,
372 * refind the target process.
375 for(p = rq->head; p; p = p->rnext){
382 * p->mach==0 only when process state is saved
384 if(p == 0 || p->mach){
395 runvec &= ~(1<<(rq-runq));
398 if(p->state != Ready)
399 print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]);
406 * ready(p) picks a new priority for a process and sticks it in the
407 * runq for that priority.
414 void (*pt)(Proc*, int, vlong);
423 m->readied = p; /* group scheduling */
426 pri = reprioritize(p);
438 * yield the processor and drop our priority
444 /* pretend we just used 1/2 tick */
445 up->lastupdate -= Scaling/2;
451 * recalculate priorities once a second. We need to do this
452 * since priorities will otherwise only be recalculated when
453 * the running process blocks.
465 if(t - balancetime < HZ)
469 for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
474 if(p->mp != MACHP(m->machno))
476 if(pri == p->basepri)
479 npri = reprioritize(p);
482 p = dequeueproc(rq, p);
484 queueproc(&runq[npri], p);
493 * pick a process to run
502 void (*pt)(Proc*, int, vlong);
506 /* cooperative scheduling until the clock ticks */
507 if((p=m->readied) && p->mach==0 && p->state==Ready
508 && runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
510 rq = &runq[p->priority];
518 * find a process that last ran on this processor (affinity),
519 * or one that hasn't moved in a while (load balancing). Every
520 * time around the loop affinity goes down.
525 * find the highest priority target process that this
526 * processor can run given affinity constraints.
529 for(rq = &runq[Nrq-1]; rq >= runq; rq--){
530 for(p = rq->head; p; p = p->rnext){
531 if(p->mp == nil || p->mp == MACHP(m->machno)
532 || (!p->wired && i > 0))
537 /* waste time or halt the CPU */
540 /* remember how much time we're here */
542 m->perf.inidle += now-start;
548 p = dequeueproc(rq, p);
553 p->mp = MACHP(m->machno);
556 edfrun(p, rq == &runq[PriEdf]); /* start deadline timer and do admin */
572 /* Only reliable way to see if we are Running */
591 if(p = procalloc.free)
594 snprint(msg, sizeof msg, "no procs; %s forking",
595 up? up->text: "kernel");
600 procalloc.free = p->qnext;
618 if(up && up->procctl == Proc_tracesyscall)
619 p->procctl = Proc_tracesyscall;
627 p->errstr = p->errbuf0;
628 p->syserrstr = p->errbuf1;
629 p->errbuf0[0] = '\0';
630 p->errbuf1[0] = '\0';
634 kstrdup(&p->user, "*nouser");
635 kstrdup(&p->text, "*notext");
636 kstrdup(&p->args, "");
639 memset(p->seg, 0, sizeof p->seg);
640 p->pid = incref(&pidalloc);
642 p->noteid = incref(¬eidalloc);
643 if(p->pid==0 || p->noteid==0)
646 p->kstack = smalloc(KSTACK);
651 procpriority(p, PriNormal, 0);
653 p->lastupdate = MACHP(0)->ticks*Scaling;
660 * wire this proc to a machine
663 procwired(Proc *p, int bm)
667 char nwired[MAXMACH];
671 /* pick a machine to wire to */
672 memset(nwired, 0, sizeof(nwired));
675 for(i=0; i<conf.nproc; i++, pp++){
678 nwired[wm->machno]++;
681 for(i=0; i<conf.nmach; i++)
682 if(nwired[i] < nwired[bm])
685 /* use the virtual machine requested */
686 bm = bm % conf.nmach;
689 p->wired = MACHP(bm);
694 procpriority(Proc *p, int pri, int fixed)
710 procinit0(void) /* bad planning - clashes with devproc.c */
715 procalloc.free = xalloc(conf.nproc*sizeof(Proc));
716 if(procalloc.free == nil){
718 panic("cannot allocate %lud procs (%ludMB)\n", conf.nproc, conf.nproc*sizeof(Proc)/(1024*1024));
720 procalloc.arena = procalloc.free;
723 for(i=0; i<conf.nproc-1; i++,p++)
729 * sleep if a condition is not true. Another process will
730 * awaken us after it sets the condition. When we awaken
731 * the condition may no longer be true.
733 * we lock both the process and the rendezvous to keep r->p
734 * and p->r synchronized.
737 sleep(Rendez *r, int (*f)(void*), void *arg)
740 void (*pt)(Proc*, int, vlong);
745 print("process %lud sleeps with %lud locks held, last lock %#p locked at pc %#lux, sleep called from %#p\n",
746 up->pid, up->nlocks.ref, up->lastlock, up->lastlock->pc, getcallerpc(&r));
750 print("double sleep called from %#p, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
755 * Wakeup only knows there may be something to do by testing
756 * r->p in order to get something to lock on.
757 * Flush that information out to memory in case the sleep is
762 if((*f)(arg) || up->notepending){
764 * if condition happened or a note is pending
772 * now we are committed to
773 * change state and call scheduler
785 if(setlabel(&up->sched)) {
787 * here when the process is awakened
793 * here to go to sleep (i.e. stop Running)
797 gotolabel(&m->sched);
801 if(up->notepending) {
804 if(up->procctl == Proc_exitme && up->closingfgrp)
815 return up->trend == nil || up->tfn(arg);
819 twakeup(Ureg*, Timer *t)
832 tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms)
835 print("tsleep: timer active: mode %d, tf %#p\n", up->tmode, up->tf);
840 up->tmode = Trelative;
858 * Expects that only one process can call wakeup for any given Rendez.
859 * We hold both locks to ensure that r->p and p->r remain consistent.
860 * Richard Miller has a better solution that doesn't require both to
861 * be held simultaneously, but I'm a paranoid - presotto.
876 if(p->state != Wakeme || p->r != r){
877 iprint("%p %p %d\n", p->r, r, p->state);
878 panic("wakeup: state");
893 * if waking a sleeping process, this routine must hold both
894 * p->rlock and r->lock. However, it can't know them in
895 * the same order as wakeup causing a possible lock ordering
896 * deadlock. We break the deadlock by giving up the p->rlock
897 * lock if we can't get the r->lock and retrying.
900 postnote(Proc *p, int dolock, char *n, int flag)
909 if(flag != NUser && (p->notify == 0 || p->notified))
913 if(p->nnote < NNOTE) {
914 strcpy(p->note[p->nnote].msg, n);
915 p->note[p->nnote++].flag = flag;
922 /* this loop is to avoid lock ordering problems. */
928 /* waiting for a wakeup? */
932 /* try for the second lock */
934 if(p->state != Wakeme || r->p != p)
935 panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
943 /* give other process time to get out of critical section and try again */
951 if(p->state != Rendezvous)
954 /* Try and pull out of a rendezvous */
956 if(p->state == Rendezvous) {
958 l = &REND(p->rgrp, p->rendtag);
959 for(d = *l; d; d = d->rendhash) {
973 * weird thing: keep at most NBROKEN around
987 if(broken.n == NBROKEN) {
989 memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
992 broken.p[broken.n++] = p;
1007 for(b=0; b < broken.n; b++)
1008 if(broken.p[b] == p) {
1010 memmove(&broken.p[b], &broken.p[b+1],
1011 sizeof(Proc*)*(NBROKEN-(b+1)));
1025 for(i=0; i<n; i++) {
1035 pexit(char *exitstr, int freemem)
1040 Waitq *wq, *f, *next;
1046 void (*pt)(Proc*, int, vlong);
1048 if(up->syscalltrace)
1049 free(up->syscalltrace);
1057 /* nil out all the resources under lock (free later) */
1069 qunlock(&up->debug);
1083 * if not a kernel process and have a parent,
1084 * do some housekeeping.
1090 exitstr = "unknown";
1091 panic("boot process died: %s", exitstr);
1097 wq = smalloc(sizeof(Waitq));
1100 wq->w.pid = up->pid;
1101 utime = up->time[TUser] + up->time[TCUser];
1102 stime = up->time[TSys] + up->time[TCSys];
1103 wq->w.time[TUser] = tk2ms(utime);
1104 wq->w.time[TSys] = tk2ms(stime);
1105 wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]);
1106 if(exitstr && exitstr[0])
1107 snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr);
1109 wq->w.msg[0] = '\0';
1113 * Check that parent is still alive.
1115 if(p->pid == up->parentpid && p->state != Broken) {
1117 p->time[TCUser] += utime;
1118 p->time[TCSys] += stime;
1120 * If there would be more than 128 wait records
1121 * outstanding for my parent, then don't leave a wait
1122 * record behind. This helps prevent badly written
1123 * daemon processes from accumulating lots of wait
1126 if(p->nwait < 128) {
1127 wq->next = p->waitq;
1142 qlock(&up->seglock);
1143 es = &up->seg[NSEG];
1144 for(s = up->seg; s < es; s++) {
1150 qunlock(&up->seglock);
1152 lock(&up->exl); /* Prevent my children from leaving waits */
1158 for(f = up->waitq; f; f = next) {
1163 /* release debuggers */
1166 wakeup(&up->pdbg->sleep);
1169 qunlock(&up->debug);
1171 /* Sched must not loop for these locks */
1176 up->state = Moribund;
1187 return p->waitq != 0;
1196 if(!canqlock(&up->qwaitr))
1200 qunlock(&up->qwaitr);
1205 if(up->nchild == 0 && up->waitq == 0) {
1211 sleep(&up->waitr, haswaitq, up);
1215 up->waitq = wq->next;
1219 qunlock(&up->qwaitr);
1223 memmove(w, &wq->w, sizeof(Waitmsg));
1232 return &procalloc.arena[i];
1246 bss = p->seg[BSEG]->top;
1250 s = statename[p->state];
1251 print("%3lud:%10s pc %8lux dbgpc %8lux %8s (%s) ut %ld st %ld bss %lux qpc %lux nl %lud nd %lud lpc %lux pri %lud\n",
1252 p->pid, p->text, p->pc, dbgpc(p), s, statename[p->state],
1253 p->time[0], p->time[1], bss, p->qpc, p->nlocks.ref, p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority);
1263 print("up %lud\n", up->pid);
1265 print("no current process\n");
1266 for(i=0; i<conf.nproc; i++) {
1267 p = &procalloc.arena[i];
1268 if(p->state == Dead)
1276 * wait till all processes have flushed their mmu
1277 * state about segment s
1280 procflushseg(Segment *s)
1282 int i, ns, nm, nwait;
1286 * tell all processes with this
1287 * segment to flush their mmu's
1290 for(i=0; i<conf.nproc; i++) {
1291 p = &procalloc.arena[i];
1292 if(p->state == Dead)
1294 for(ns = 0; ns < NSEG; ns++)
1295 if(p->seg[ns] == s){
1297 for(nm = 0; nm < conf.nmach; nm++){
1298 if(MACHP(nm)->proc == p){
1299 MACHP(nm)->flushmmu = 1;
1311 * wait for all processors to take a clock interrupt
1312 * and flush their mmu's
1314 for(nm = 0; nm < conf.nmach; nm++)
1316 while(MACHP(nm)->flushmmu)
1326 for(rq = &runq[Nrq-1]; rq >= runq; rq--){
1329 print("rq%ld:", rq-runq);
1330 for(p = rq->head; p; p = p->rnext)
1331 print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
1335 print("nrdy %d\n", nrdy);
1339 kproc(char *name, void (*func)(void *), void *arg)
1350 p->fpsave = up->fpsave;
1351 p->scallnr = up->scallnr;
1354 p->slash = up->slash;
1359 memmove(p->note, up->note, sizeof(p->note));
1360 p->nnote = up->nnote;
1362 p->lastnote = up->lastnote;
1363 p->notify = up->notify;
1367 procpriority(p, PriKproc, 0);
1369 kprocchild(p, func, arg);
1371 kstrdup(&p->user, eve);
1372 kstrdup(&p->text, name);
1378 memset(p->time, 0, sizeof(p->time));
1379 p->time[TReal] = MACHP(0)->ticks;
1384 * called splhi() by notify(). See comment in notify for the
1393 switch(p->procctl) {
1396 pexit("Killed: Insufficient physical memory", 1);
1399 spllo(); /* pexit has locks in it */
1410 p->psstate = "Stopped";
1411 /* free a waiting debugger */
1415 wakeup(&p->pdbg->sleep);
1435 assert(up->nerrlab < NERR);
1436 kstrcpy(up->errstr, err, ERRMAX);
1437 setlabel(&up->errlab[NERR-1]);
1444 gotolabel(&up->errlab[--up->nerrlab]);
1448 exhausted(char *resource)
1452 snprint(buf, sizeof buf, "no free %s", resource);
1453 iprint("%s\n", buf);
1467 ep = procalloc.arena+conf.nproc;
1468 for(p = procalloc.arena; p < ep; p++) {
1469 if(p->state == Dead || p->kp)
1472 for(i=1; i<NSEG; i++) {
1475 l += s->top - s->base;
1477 if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) {
1483 print("%lud: %s killed: %s\n", kp->pid, kp->text, why);
1484 for(p = procalloc.arena; p < ep; p++) {
1485 if(p->state == Dead || p->kp)
1487 if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG])
1488 p->procctl = Proc_exitbig;
1490 kp->procctl = Proc_exitbig;
1491 for(i = 0; i < NSEG; i++) {
1493 if(s != 0 && canqlock(&s->lk)) {
1494 mfreeseg(s, s->base, (s->top - s->base)/BY2PG);
1501 * change ownership to 'new' of all processes owned by 'old'. Used when
1505 renameuser(char *old, char *new)
1509 ep = procalloc.arena+conf.nproc;
1510 for(p = procalloc.arena; p < ep; p++)
1511 if(p->user!=nil && strcmp(old, p->user)==0)
1512 kstrdup(&p->user, new);
1516 * time accounting called by clock() splhi'd
1528 p->time[p->insyscall]++;
1531 /* calculate decaying duty cycles */
1533 per = n - m->perf.last;
1535 per = (m->perf.period*(HZ-1) + per)/HZ;
1537 m->perf.period = per;
1539 m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
1542 m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
1545 /* only one processor gets to compute system load averages */
1550 * calculate decaying load average.
1551 * if we decay by (n-1)/n then it takes
1552 * n clock ticks to go from load L to .36 L once
1553 * things quiet down. it takes about 5 n clock
1554 * ticks to go to zero. so using HZ means this is
1555 * approximately the load over the last second,
1556 * with a tail lasting about 5 seconds.
1561 m->load = (m->load*(HZ-1)+n)/HZ;
1569 h = p->pid % nelem(procalloc.ht);
1571 p->pidhash = procalloc.ht[h];
1572 procalloc.ht[h] = p;
1582 h = p->pid % nelem(procalloc.ht);
1584 for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
1593 procindex(ulong pid)
1600 h = pid % nelem(procalloc.ht);
1602 for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
1604 s = p - procalloc.arena;