2 #include "../port/lib.h"
6 #include "../port/error.h"
/*
 * NOTE(review): this file appears to be a sparsely sampled excerpt of a
 * Plan 9 kernel scheduler (port/proc.c); most lines are elided and each
 * surviving line carries a stray numeric prefix from the extraction.
 * Comments below are hedged accordingly -- confirm against full source.
 */
10 int schedgain = 30; /* units in seconds */
/* forward declarations for scheduler helpers defined further down */
13 void updatecpu(Proc*);
14 int reprioritize(Proc*);
16 ulong delayedscheds; /* statistics */
/* static Proc arena allocator state (struct body elided in this view) */
21 static struct Procalloc
40 { /* BUG: generate automatically */
56 static void pidfree(Proc*);
57 static void rebalance(void);
/*
 * schedinit (fragment): per-processor scheduler entry; from the visible
 * lines it reaps a dying process (holding locks from pexit), pushes it
 * back onto procalloc.free, and clears procalloc.Lock.p so a later
 * unlock() cannot touch the freed proc. Body heavily elided here.
 */
63 schedinit(void) /* never returns */
69 if((e = up->edf) != nil && (e->flags & Admitted))
84 * Holding locks from pexit:
94 up->qnext = procalloc.free;
97 /* proc is free now, make sure unlock() wont touch it */
98 up = procalloc.Lock.p = nil;
/*
 * sched (fragment): give up the cpu. Saves context via
 * setlabel(&up->sched), jumps to the per-mach scheduler context with
 * gotolabel(&m->sched); execution resumes after setlabel when the proc
 * is next dispatched. The delaysched test postpones rescheduling while
 * locks are held -- except past the 20-delay cap, and never while the
 * proc holds palloc/fscache/procalloc (must not sleep holding those).
 */
110 * If changing this routine, look also at sleep(). It
111 * contains a copy of the guts of sched().
119 panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
122 up != nil ? up->lastilock: nil,
123 (up != nil && up->lastilock != nil) ? up->lastilock->pc: 0,
127 * Delay the sched until the process gives up the locks
128 * it is holding. This avoids dumb lock loops.
129 * Don't delay if the process is Moribund.
130 * It called sched to die.
131 * But do sched eventually. This avoids a missing unlock
132 * from hanging the entire kernel.
133 * But don't reschedule procs holding palloc or procalloc.
134 * Those are far too important to be holding while asleep.
136 * This test is not exact. There can still be a few instructions
137 * in the middle of taslock when a process holds a lock
138 * but Lock.p has not yet been initialized.
141 if(up->state != Moribund)
142 if(up->delaysched < 20
143 || palloc.Lock.p == up
144 || fscache.Lock.p == up
145 || procalloc.Lock.p == up){
/* resume point: nonzero setlabel return means we were just rescheduled */
158 if(setlabel(&up->sched)){
163 gotolabel(&m->sched);
/* scheduler side: pick p, recompute its priority, grant ~100ms quantum */
168 p->priority = reprioritize(p);
171 m->schedticks = m->ticks + HZ/10;
175 up->mach = MACHP(m->machno);
178 gotolabel(&up->sched);
/* presumably anyhigher(): mask of ready queues strictly above up's
 * priority -- TODO confirm enclosing function (signature elided) */
190 return runvec & ~((1<<(up->priority+1))-1);
/* hzsched (fragment): called once per clock tick */
194 * here once per clock tick to see if we should resched
199 /* once a second, rebalance will reprioritize ready procs */
203 /* unless preempted, get to run for at least 100ms */
205 || (!up->fixedpri && (long)(m->ticks - m->schedticks) > 0 && anyready())){
206 m->readied = nil; /* avoid cooperative scheduling */
/* preempted (fragment): preemption check after non-clock interrupts */
212 * here at the end of non-clock interrupts to see if we should preempt the
213 * current process. Returns 1 if preempted, 0 otherwise.
218 if(up != nil && up->state == Running)
219 if(up->preempted == 0)
222 m->readied = nil; /* avoid cooperative scheduling */
233 * Update the cpu time average for this particular process,
234 * which is about to change from up -> not up or vice versa.
235 * p->lastupdate is the last time an updatecpu happened.
237 * The cpu time average is a decaying average that lasts
238 * about D clock ticks. D is chosen to be approximately
239 * the cpu time of a cpu-intensive "quick job". A job has to run
240 * for approximately D clock ticks before we home in on its
241 * actual cpu usage. Thus if you manage to get in and get out
242 * quickly, you won't be penalized during your burst. Once you
243 * start using your share of the cpu for more than about D
244 * clock ticks though, your p->cpu hits 1000 (1.0) and you end up
245 * below all the other quick jobs. Interactive tasks, because
246 * they basically always use less than their fair share of cpu,
249 * If the process has not been running, then we want to
252 * cpu = cpu * (D-1)/D
256 * cpu = cpu * ((D-1)/D)^n
258 * but D is big enough that this is approximately
260 * cpu = cpu * (D-n)/D
262 * so we use that instead.
264 * If the process has been running, we apply the filter to
265 * 1 - cpu, yielding a similar equation. Note that cpu is
266 * stored in fixed point (* 1000).
268 * Updatecpu must be called before changing up, in order
269 * to maintain accurate cpu usage statistics. It can be called
270 * at any time to bring the stats for a given proc up-to-date.
/* updatecpu (fragment): fixed-point (x1000) decaying average -- see
 * derivation above. n = scaled ticks since last update. */
280 t = MACHP(0)->ticks*Scaling + Scaling/2;
281 n = t - p->lastupdate;
286 D = schedgain*HZ*Scaling;
292 p->cpu = (ocpu*(D-n))/D;
298 //iprint("pid %lud %s for %lud cpu %lud -> %lud\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
302 * On average, p has used p->cpu of a cpu recently.
303 * Its fair share is conf.nmach/m->load of a cpu. If it has been getting
304 * too much, penalize it. If it has been getting not enough, reward it.
305 * I don't think you can get much more than your fair share that
306 * often, so most of the queues are for using less. Having a priority
307 * of 3 means you're just right. Having a higher priority (up to p->basepri)
308 * means you're not using as much as you could.
/* reprioritize (fragment): returns a queue priority clamped by basepri */
311 reprioritize(Proc *p)
313 int fairshare, n, load, ratio;
315 load = MACHP(0)->load;
320 * fairshare = 1.000 * conf.nmach * 1.000/load,
321 * except the decimal point is moved three places
322 * on both load and fairshare.
324 fairshare = (conf.nmach*1000*1000)/load;
328 ratio = (fairshare+n/2) / n;
329 if(ratio > p->basepri)
332 panic("reprioritize");
333 //iprint("pid %lud cpu %lud load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
338 * add a process to a scheduling queue
341 queueproc(Schedq *rq, Proc *p)
/* dequeueproc (fragment): re-find tp after taking the runq lock, since
 * the queue may have changed; skip procs whose state is still in flight */
361 * try to remove a process from a scheduling queue (called splhi)
364 dequeueproc(Schedq *rq, Proc *tp)
372 * the queue may have changed before we locked runq,
373 * refind the target process.
376 for(p = rq->head; p != nil; p = p->rnext){
383 * p->mach==0 only when process state is saved
385 if(p == nil || p->mach != nil){
/* clear this queue's bit in runvec when it drains */
396 runvec &= ~(1<<(rq-runq));
399 if(p->state != Ready)
400 print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]);
407 * ready(p) picks a new priority for a process and sticks it in the
408 * runq for that priority.
415 void (*pt)(Proc*, int, vlong);
/* guard against readying an already-Ready proc (double ready bug) */
417 if(p->state == Ready){
418 print("double ready %s %lud pc %p\n", p->text, p->pid, getcallerpc(&p));
/* hint cooperative handoff if p may run on this mach (wiring permits) */
428 if(up != p && (p->wired == nil || p->wired == MACHP(m->machno)))
429 m->readied = p; /* group scheduling */
432 pri = reprioritize(p);
/* yield (fragment): charge half a tick so priority drops slightly */
444 * yield the processor and drop our priority
450 /* pretend we just used 1/2 tick */
451 up->lastupdate -= Scaling/2;
457 * recalculate priorities once a second. We need to do this
458 * since priorities will otherwise only be recalculated when
459 * the running process blocks.
/* rate-limit: at most once per HZ ticks */
472 if(t - balancetime < HZ)
473 return; /* NOTE(review): elided in this view -- confirm; shown here only as context */
476 for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
/* procs already at basepri need no recomputation */
481 if(pri == p->basepri)
484 npri = reprioritize(p);
/* move proc to the queue matching its recomputed priority */
487 p = dequeueproc(rq, p);
489 queueproc(&runq[npri], p);
498 * pick a process to run
507 void (*pt)(Proc*, int, vlong);
/* fast path: take m->readied directly if it is still Ready, not running
 * elsewhere, wiring permits, and no real-time queues are occupied */
511 /* cooperative scheduling until the clock ticks */
512 if((p = m->readied) != nil && p->mach == nil && p->state == Ready
513 && (p->wired == nil || p->wired == MACHP(m->machno))
514 && runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
516 rq = &runq[p->priority];
524 * find a process that last ran on this processor (affinity),
525 * or one that hasn't moved in a while (load balancing). Every
526 * time around the loop affinity goes down.
531 * find the highest priority target process that this
532 * processor can run given affinity constraints.
/* scan queues highest-priority first; honor affinity (p->mp) until the
 * loop counter i relaxes it for unwired procs */
535 for(rq = &runq[Nrq-1]; rq >= runq; rq--){
536 for(p = rq->head; p != nil; p = p->rnext){
537 if(p->mp == nil || p->mp == MACHP(m->machno)
538 || (p->wired == nil && i > 0))
543 /* waste time or halt the CPU */
546 /* remember how much time we're here */
548 m->perf.inidle += now-start;
/* found one: pull it off the queue and record its new home mach */
554 p = dequeueproc(rq, p);
559 p->mp = MACHP(m->machno);
562 edfrun(p, rq == &runq[PriEdf]); /* start deadline timer and do admin */
578 /* Only reliable way to see if we are Running */
/* newproc (fragment): allocate a Proc from the free list; panic-style
 * message if the arena is exhausted, naming the forking proc */
597 if((p = procalloc.free) != nil)
600 snprint(msg, sizeof msg, "no procs; %s forking",
601 up != nil ? up->text: "kernel");
606 procalloc.free = p->qnext;
/* reset per-proc state for reuse */
626 p->syscalltrace = nil;
631 p->errstr = p->errbuf0;
632 p->syserrstr = p->errbuf1;
633 p->errbuf0[0] = '\0';
634 p->errbuf1[0] = '\0';
638 kstrdup(&p->user, "*nouser");
639 kstrdup(&p->text, "*notext");
640 kstrdup(&p->args, "");
643 memset(p->seg, 0, sizeof p->seg);
645 p->noteid = pidalloc(p);
647 p->kstack = smalloc(KSTACK);
652 procpriority(p, PriNormal, 0);
/* seed the cpu-average clock so updatecpu starts from "now" */
654 p->lastupdate = MACHP(0)->ticks*Scaling;
661 * wire this proc to a machine
664 procwired(Proc *p, int bm)
668 char nwired[MAXMACH];
/* bm < 0 (presumably): count wired procs per mach and pick the least
 * loaded one -- TODO confirm, the selection branch is elided */
672 /* pick a machine to wire to */
673 memset(nwired, 0, sizeof(nwired));
676 for(i=0; i<conf.nproc; i++, pp++){
678 if(wm != nil && pp->pid)
679 nwired[wm->machno]++;
682 for(i=0; i<conf.nmach; i++)
683 if(nwired[i] < nwired[bm])
686 /* use the virtual machine requested */
687 bm = bm % conf.nmach;
690 p->wired = MACHP(bm);
/* procpriority: set base/fixed priority (body elided) */
695 procpriority(Proc *p, int pri, int fixed)
/* procinit0: allocate the static Proc arena and chain the free list */
711 procinit0(void) /* bad planning - clashes with devproc.c */
716 p = xalloc(conf.nproc*sizeof(Proc));
719 panic("cannot allocate %lud procs (%ludMB)", conf.nproc, conf.nproc*sizeof(Proc)/(1024*1024));
723 for(i=0; i<conf.nproc-1; i++, p++)
729 * sleep if a condition is not true. Another process will
730 * awaken us after it sets the condition. When we awaken
731 * the condition may no longer be true.
733 * we lock both the process and the rendezvous to keep r->p
734 * and p->r synchronized.
737 sleep(Rendez *r, int (*f)(void*), void *arg)
740 void (*pt)(Proc*, int, vlong);
/* diagnostics: sleeping with locks held or on an already-claimed Rendez */
745 print("process %lud sleeps with %d locks held, last lock %#p locked at pc %#p, sleep called from %#p\n",
746 up->pid, up->nlocks, up->lastlock, up->lastlock->pc, getcallerpc(&r));
750 print("double sleep called from %#p, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
755 * Wakeup only knows there may be something to do by testing
756 * r->p in order to get something to lock on.
757 * Flush that information out to memory in case the sleep is
/* condition already true or note pending: don't sleep at all */
762 if((*f)(arg) || up->notepending){
764 * if condition happened or a note is pending
772 * now we are committed to
773 * change state and call scheduler
/* inlined guts of sched(): save context, switch to scheduler */
785 if(setlabel(&up->sched)) {
787 * here when the process is awakened
793 * here to go to sleep (i.e. stop Running)
797 gotolabel(&m->sched);
/* after wakeup: a pending note interrupts the sleep (Eintr path) */
801 if(up->notepending) {
813 if(up->procctl == Proc_exitme && up->closingfgrp != nil)
/* presumably tfn(): tsleep's wrapped condition -- timer fired or fn true */
821 return up->trend == nil || up->tfn(arg);
/* twakeup: timer callback that ends a tsleep (body elided) */
825 twakeup(Ureg*, Timer *t)
/* tsleep: sleep with a millisecond timeout via a relative-mode Timer */
839 tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms)
842 print("%s %lud: tsleep timer active: mode %d, tf %#p, pc %#p\n",
843 up->text, up->pid, up->tmode, up->tf, getcallerpc(&r));
848 up->tmode = Trelative;
866 * Expects that only one process can call wakeup for any given Rendez.
867 * We hold both locks to ensure that r->p and p->r remain consistent.
868 * Richard Miller has a better solution that doesn't require both to
869 * be held simultaneously, but I'm a paranoid - presotto.
/* sanity: the sleeper must still be in Wakeme on this very Rendez */
884 if(p->state != Wakeme || p->r != r){
885 iprint("%p %p %d\n", p->r, r, p->state);
886 panic("wakeup: state");
901 * if waking a sleeping process, this routine must hold both
902 * p->rlock and r->lock. However, it can't know them in
903 * the same order as wakeup causing a possible lock ordering
904 * deadlock. We break the deadlock by giving up the p->rlock
905 * lock if we can't get the r->lock and retrying.
908 postnote(Proc *p, int dolock, char *n, int flag)
/* suppress non-user notes when target has no handler or is mid-notify */
925 if(n != nil && flag != NUser && (p->notify == 0 || p->notified))
929 if(p->nnote < NNOTE && n != nil) {
930 kstrcpy(p->note[p->nnote].msg, n, ERRMAX);
931 p->note[p->nnote++].flag = flag;
938 /* this loop is to avoid lock ordering problems. */
946 /* waiting for a wakeup? */
950 /* try for the second lock */
952 if(p->state != Wakeme || r->p != p)
953 panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
961 /* give other process time to get out of critical section and try again */
/* unhook target from an eqlock queue so the note can interrupt it */
971 /* Try and pull out of a eqlock */
972 if((q = p->eql) != nil){
974 if(p->state == Queueing && p->eql == q){
977 for(l = nil, d = q->head; d != nil; l = d, d = d->qnext){
986 p->eql = nil; /* not taken */
996 /* Try and pull out of a rendezvous */
998 if(p->state == Rendezvous) {
1001 l = &REND(p->rgrp, p->rendtag);
1002 for(d = *l; d != nil; d = d->rendhash) {
/* broken-process bookkeeping: bounded FIFO of at most NBROKEN procs */
1019 * weird thing: keep at most NBROKEN around
1033 if(broken.n == NBROKEN) {
1035 memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
1038 broken.p[broken.n++] = p;
/* unbreak (fragment): remove p from the broken list by shifting down */
1053 for(b=0; b < broken.n; b++)
1054 if(broken.p[b] == p) {
1056 memmove(&broken.p[b], &broken.p[b+1],
1057 sizeof(Proc*)*(NBROKEN-(b+1)));
/* freebroken (fragment, presumably): ready all broken procs -- confirm */
1071 for(i=0; i<n; i++) {
1081 pexit(char *exitstr, int freemem)
1092 void (*pt)(Proc*, int, vlong);
1100 /* nil out all the resources under lock (free later) */
1112 qunlock(&up->debug);
1126 * if not a kernel process and have a parent,
1127 * do some housekeeping.
/* build the Waitmsg for the parent: pid, u/s/real times, exit string */
1129 if(up->kp == 0 && up->parentpid != 0) {
1130 wq = smalloc(sizeof(Waitq));
1131 wq->w.pid = up->pid;
1132 utime = up->time[TUser] + up->time[TCUser];
1133 stime = up->time[TSys] + up->time[TCSys];
1134 wq->w.time[TUser] = tk2ms(utime);
1135 wq->w.time[TSys] = tk2ms(stime);
1136 wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]);
1137 if(exitstr != nil && exitstr[0])
1138 snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr);
1140 wq->w.msg[0] = '\0';
1145 * Check that parent is still alive.
/* charge child's times to the parent and queue the wait record */
1147 if(p->pid == up->parentpid && p->state != Broken) {
1149 p->time[TCUser] += utime;
1150 p->time[TCSys] += stime;
1152 * If there would be more than 128 wait records
1153 * processes for my parent, then don't leave a wait
1154 * record behind. This helps prevent badly written
1155 * daemon processes from accumulating lots of wait
1158 if(p->nwait < 128) {
1159 wq->next = p->waitq;
1170 else if(up->kp == 0 && up->parent == nil){
1172 exitstr = "unknown";
1173 panic("boot process died: %s", exitstr);
/* release all memory segments */
1179 qlock(&up->seglock);
1180 es = &up->seg[NSEG];
1181 for(s = up->seg; s < es; s++) {
1187 qunlock(&up->seglock);
1189 lock(&up->exl); /* Prevent my children from leaving waits */
/* discard any wait records my own children left behind */
1195 while((wq = up->waitq) != nil){
1196 up->waitq = wq->next;
1200 /* release debuggers */
1202 if(up->pdbg != nil) {
1203 wakeup(&up->pdbg->sleep);
1206 if(up->syscalltrace != nil) {
1207 free(up->syscalltrace);
1208 up->syscalltrace = nil;
1210 if(up->watchpt != nil){
1215 qunlock(&up->debug);
1217 /* Sched must not loop for these locks */
1222 up->state = Moribund;
/* presumably haswaitq(): sleep condition for pwait -- TODO confirm */
1230 return ((Proc*)x)->waitq != nil;
/* pwait (fragment): block until a child's wait record is available */
1239 if(!canqlock(&up->qwaitr))
1243 qunlock(&up->qwaitr);
1248 while(up->waitq == nil) {
1249 if(up->nchild == 0) {
1254 sleep(&up->waitr, haswaitq, up);
1258 up->waitq = wq->next;
1262 qunlock(&up->qwaitr);
1266 memmove(w, &wq->w, sizeof(Waitmsg));
/* presumably proctab(i): index into the static Proc arena -- confirm */
1275 return &procalloc.arena[i];
/* dumpaproc (fragment): one-line human-readable dump of a Proc */
1288 if(p->seg[BSEG] != nil)
1289 bss = p->seg[BSEG]->top;
1293 s = statename[p->state];
1294 print("%3lud:%10s pc %#p dbgpc %#p %8s (%s) ut %ld st %ld bss %lux qpc %#p nl %d nd %lud lpc %#p pri %lud\n",
1295 p->pid, p->text, p->pc, dbgpc(p), s, statename[p->state],
1296 p->time[0], p->time[1], bss, p->qpc, p->nlocks, p->delaysched,
1297 p->lastlock ? p->lastlock->pc : 0, p->priority);
/* procdump (fragment): dump every non-Dead proc in the arena */
1307 print("up %lud\n", up->pid);
1309 print("no current process\n");
1310 for(i=0; i<conf.nproc; i++) {
1311 p = &procalloc.arena[i];
1312 if(p->state != Dead)
1318 * wait till all matching processes have flushed their mmu
1321 procflushmmu(int (*match)(Proc*, void*), void *a)
1327 * tell all matching processes to flush their mmu's
/* mark matching procs, and the machs currently running them */
1330 for(i=0; i<conf.nproc; i++) {
1331 p = &procalloc.arena[i];
1332 if(p->state != Dead && (*match)(p, a)){
1334 for(nm = 0; nm < conf.nmach; nm++){
1335 if(MACHP(nm)->proc == p){
1336 MACHP(nm)->flushmmu = 1;
1347 * wait for all other processors to take a clock interrupt
1348 * and flush their mmu's
/* spin until every other mach clears its flushmmu flag */
1350 for(nm = 0; nm < conf.nmach; nm++)
1351 while(m->machno != nm && MACHP(nm)->flushmmu)
/* matchseg: true if p maps segment a (fragment) */
1356 matchseg(Proc *p, void *a)
1360 for(ns = 0; ns < NSEG; ns++){
1367 procflushseg(Segment *s)
1369 procflushmmu(matchseg, s);
/* matchpseg: true if any of p's segments uses physseg a (fragment) */
1373 matchpseg(Proc *p, void *a)
1378 for(ns = 0; ns < NSEG; ns++){
1380 if(s != nil && s->pseg == a)
1386 procflushpseg(Physseg *ps)
1388 procflushmmu(matchpseg, ps);
/* run-queue dump (fragment): print each queue's procs and ready ages */
1397 for(rq = &runq[Nrq-1]; rq >= runq; rq--){
1400 print("rq%zd:", rq-runq);
1401 for(p = rq->head; p != nil; p = p->rnext)
1402 print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
1406 print("nrdy %d\n", nrdy);
/* kproc (fragment): spawn a kernel process running func(arg); inherits
 * note state from up but deliberately not the dot (see line below) */
1410 kproc(char *name, void (*func)(void *), void *arg)
1421 p->scallnr = up->scallnr;
1424 p->slash = up->slash;
1425 p->dot = up->slash; /* unlike fork, do not inherit the dot for kprocs */
1429 memmove(p->note, up->note, sizeof(p->note));
1430 p->nnote = up->nnote;
1432 p->lastnote = up->lastnote;
1433 p->notify = up->notify;
1437 procpriority(p, PriKproc, 0);
1439 kprocchild(p, func, arg);
1441 kstrdup(&p->user, eve);
1442 kstrdup(&p->text, name);
/* fresh time accounting; TReal anchored to current tick count */
1448 memset(p->time, 0, sizeof(p->time));
1449 p->time[TReal] = MACHP(0)->ticks;
1454 * called splhi() by notify(). See comment in notify for the
/* procctl (fragment): act on up->procctl -- forced exits, Stop, etc. */
1463 switch(up->procctl) {
1466 pprint("Killed: Insufficient physical memory\n");
1467 pexit("Killed: Insufficient physical memory", 1);
1470 spllo(); /* pexit has locks in it */
/* Proc_stopme path (presumably): park in Stopped, wake any debugger */
1480 state = up->psstate;
1481 up->psstate = "Stopped";
1482 /* free a waiting debugger */
1485 if(up->pdbg != nil) {
1486 wakeup(&up->pdbg->sleep);
1489 qunlock(&up->debug);
1491 up->state = Stopped;
1493 up->psstate = state;
/* presumably error(err): record errstr and longjmp to the top errlab */
1506 assert(up->nerrlab < NERR);
1507 kstrcpy(up->errstr, err, ERRMAX);
1508 setlabel(&up->errlab[NERR-1]);
/* presumably nexterror(): pop and jump to the previous error label */
1515 assert(up->nerrlab > 0);
1516 gotolabel(&up->errlab[--up->nerrlab]);
/* exhausted: report resource exhaustion (then presumably error(buf)) */
1520 exhausted(char *resource)
1524 snprint(buf, sizeof buf, "no free %s", resource);
1525 iprint("%s\n", buf);
/* procpagecount (fragment): sum resident pages over p's segments,
 * under p->seglock */
1530 procpagecount(Proc *p)
1536 eqlock(&p->seglock);
1538 qunlock(&p->seglock);
1542 for(i=0; i<NSEG; i++){
1543 if((s = p->seg[i]) != nil){
1545 pages += mcountseg(s);
1549 qunlock(&p->seglock);
/* killbig (fragment, presumably): find the biggest user proc, kill it
 * and any proc sharing its BSS, and free its segments -- confirm */
1565 ep = procalloc.arena+conf.nproc;
1566 for(p = procalloc.arena; p < ep; p++) {
1567 if(p->state == Dead || p->kp)
/* skip noswap/locked-down procs owned by eve */
1569 if((p->noswap || (p->procmode & 0222) == 0) && strcmp(eve, p->user) == 0)
1571 l = procpagecount(p);
1579 print("%lud: %s killed: %s\n", kp->pid, kp->text, why);
1580 qlock(&kp->seglock);
1581 for(p = procalloc.arena; p < ep; p++) {
1582 if(p->state == Dead || p->kp)
1584 if(p != kp && p->seg[BSEG] != nil && p->seg[BSEG] == kp->seg[BSEG])
1585 p->procctl = Proc_exitbig;
1587 kp->procctl = Proc_exitbig;
1588 for(i = 0; i < NSEG; i++) {
1592 switch(s->type & SG_TYPE){
1600 mfreeseg(s, s->base, (s->top - s->base)/BY2PG);
1603 qunlock(&kp->seglock);
1607 * change ownership to 'new' of all processes owned by 'old'. Used when
/* walk the whole arena; kstrdup replaces the owner string in place */
1611 renameuser(char *old, char *new)
1615 ep = procalloc.arena+conf.nproc;
1616 for(p = procalloc.arena; p < ep; p++)
1617 if(p->user != nil && strcmp(old, p->user) == 0)
1618 kstrdup(&p->user, new);
1622 * time accounting called by clock() splhi'd
/* charge this tick to user or system time depending on insyscall */
1634 p->time[p->insyscall]++;
1637 /* calculate decaying duty cycles */
1639 per = n - m->perf.last;
/* exponentially decayed averages over ~HZ ticks (see comment below) */
1641 per = ((uvlong)m->perf.period*(HZ-1) + per)/HZ;
1643 m->perf.period = per;
1645 m->perf.avg_inidle = ((uvlong)m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
1648 m->perf.avg_inintr = ((uvlong)m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
1651 /* only one processor gets to compute system load averages */
1656 * calculate decaying load average.
1657 * if we decay by (n-1)/n then it takes
1658 * n clock ticks to go from load L to .36 L once
1659 * things quiet down. it takes about 5 n clock
1660 * ticks to go to zero. so using HZ means this is
1661 * approximately the load over the last second,
1662 * with a tail lasting about 5 seconds.
1666 n = (nrdy+n)*1000*100;
1667 load = ((uvlong)load*(HZ-1)+n)/HZ;
/* pidalloc (fragment): hand out 31-bit positive pids from a counter,
 * hashing each proc into procalloc.ht by pid */
1674 static int gen, wrapped;
1680 pid = ++gen & 0x7FFFFFFF;
1685 h = pid % nelem(procalloc.ht);
/* collision scan: the dedup/retry action on a hit is elided here */
1687 for(x = procalloc.ht[h]; x != nil; x = x->pidhash)
1692 p->pidhash = procalloc.ht[h];
1693 procalloc.ht[h] = p;
/* pidfree (fragment): unlink p from its pid hash chain */
1705 h = p->pid % nelem(procalloc.ht);
1707 for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
/* procindex: map a pid to its arena slot via the hash table */
1716 procindex(ulong pid)
1723 h = pid % nelem(procalloc.ht);
1725 for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
1727 s = p - procalloc.arena;