sys/src/9/port/proc.c

   1 #include        <u.h>
   2 #include        "../port/lib.h"
   3 #include        "mem.h"
   4 #include        "dat.h"
   5 #include        "fns.h"
   6 #include        "../port/error.h"
   7 #include        "edf.h"
   8 #include        <trace.h>
   9
  10 int     schedgain = 30; /* units in seconds */
  11 int     nrdy;
  12
  13 void updatecpu(Proc*);
  14 int reprioritize(Proc*);
  15
  16 ulong   delayedscheds;  /* statistics */
  17 long skipscheds;
  18 long preempts;
  19 ulong load;
  20
  21 static struct Procalloc
  22 {
  23         Lock;
  24         Proc*   ht[128];
  25         Proc*   arena;
  26         Proc*   free;
  27 } procalloc;
  28
  29 enum
  30 {
  31         Q=10,
  32         DQ=4,
  33         Scaling=2,
  34 };
  35
  36 Schedq  runq[Nrq];
  37 ulong   runvec;
  38
  39 char *statename[] =
  40 {       /* BUG: generate automatically */
  41         "Dead",
  42         "Moribund",
  43         "Ready",
  44         "Scheding",
  45         "Running",
  46         "Queueing",
  47         "QueueingR",
  48         "QueueingW",
  49         "Wakeme",
  50         "Broken",
  51         "Stopped",
  52         "Rendez",
  53         "Waitrelease",
  54 };
  55
  56 static void pidfree(Proc*);
  57 static void rebalance(void);
  58
  59 /*
  60  * Always splhi()'ed.
  61  */
  62 void
  63 schedinit(void)         /* never returns */
  64 {
  65         Edf *e;
  66
  67         setlabel(&m->sched);
  68         if(up) {
  69                 if((e = up->edf) && (e->flags & Admitted))
  70                         edfrecord(up);
  71                 m->proc = 0;
  72                 switch(up->state) {
  73                 case Running:
  74                         ready(up);
  75                         break;
  76                 case Moribund:
  77                         up->state = Dead;
  78                         edfstop(up);
  79                         if (up->edf)
  80                                 free(up->edf);
  81                         up->edf = nil;
  82
  83                         /*
  84                          * Holding locks from pexit:
  85                          *      procalloc
  86                          *      palloc
  87                          */
  88                         mmurelease(up);
  89                         unlock(&palloc);
  90
  91                         up->mach = nil;
  92                         updatecpu(up);
  93
  94                         up->qnext = procalloc.free;
  95                         procalloc.free = up;
  96
  97                         /* proc is free now, make sure unlock() wont touch it */
  98                         up = procalloc.Lock.p = nil;
  99                         unlock(&procalloc);
 100                         sched();
 101                 }
 102                 up->mach = nil;
 103                 updatecpu(up);
 104                 up = nil;
 105         }
 106         sched();
 107 }
 108
 109 /*
 110  *  If changing this routine, look also at sleep().  It
 111  *  contains a copy of the guts of sched().
 112  */
 113 void
 114 sched(void)
 115 {
 116         Proc *p;
 117
 118         if(m->ilockdepth)
 119                 panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
 120                         m->machno,
 121                         m->ilockdepth,
 122                         up? up->lastilock: nil,
 123                         (up && up->lastilock)? up->lastilock->pc: 0,
 124                         getcallerpc(&p+2));
 125         if(up){
 126                 /*
 127                  * Delay the sched until the process gives up the locks
 128                  * it is holding.  This avoids dumb lock loops.
 129                  * Don't delay if the process is Moribund.
 130                  * It called sched to die.
 131                  * But do sched eventually.  This avoids a missing unlock
 132                  * from hanging the entire kernel.
 133                  * But don't reschedule procs holding palloc or procalloc.
 134                  * Those are far too important to be holding while asleep.
 135                  *
 136                  * This test is not exact.  There can still be a few instructions
 137                  * in the middle of taslock when a process holds a lock
 138                  * but Lock.p has not yet been initialized.
 139                  */
 140                 if(up->nlocks.ref)
 141                 if(up->state != Moribund)
 142                 if(up->delaysched < 20
 143                 || palloc.Lock.p == up
 144                 || procalloc.Lock.p == up){
 145                         up->delaysched++;
 146                         delayedscheds++;
 147                         return;
 148                 }
 149                 up->delaysched = 0;
 150
 151                 splhi();
 152
 153                 /* statistics */
 154                 m->cs++;
 155
 156                 procsave(up);
 157                 if(setlabel(&up->sched)){
 158                         procrestore(up);
 159                         spllo();
 160                         return;
 161                 }
 162                 gotolabel(&m->sched);
 163         }
 164         p = runproc();
 165         if(!p->edf){
 166                 updatecpu(p);
 167                 p->priority = reprioritize(p);
 168         }
 169         if(p != m->readied)
 170                 m->schedticks = m->ticks + HZ/10;
 171         m->readied = 0;
 172         up = p;
 173         up->state = Running;
 174         up->mach = MACHP(m->machno);
 175         m->proc = up;
 176         mmuswitch(up);
 177         gotolabel(&up->sched);
 178 }
 179
 180 int
 181 anyready(void)
 182 {
 183         return runvec;
 184 }
 185
 186 int
 187 anyhigher(void)
 188 {
 189         return runvec & ~((1<<(up->priority+1))-1);
 190 }
 191
 192 /*
 193  *  here once per clock tick to see if we should resched
 194  */
 195 void
 196 hzsched(void)
 197 {
 198         /* once a second, rebalance will reprioritize ready procs */
 199         if(m->machno == 0)
 200                 rebalance();
 201
 202         /* unless preempted, get to run for at least 100ms */
 203         if(anyhigher()
 204         || (!up->fixedpri && m->ticks > m->schedticks && anyready())){
 205                 m->readied = nil;       /* avoid cooperative scheduling */
 206                 up->delaysched++;
 207         }
 208 }
 209
 210 /*
 211  *  here at the end of non-clock interrupts to see if we should preempt the
 212  *  current process.  Returns 1 if preempted, 0 otherwise.
 213  */
 214 int
 215 preempted(void)
 216 {
 217         if(up && up->state == Running)
 218         if(up->preempted == 0)
 219         if(anyhigher())
 220         if(!active.exiting){
 221                 m->readied = nil;       /* avoid cooperative scheduling */
 222                 up->preempted = 1;
 223                 sched();
 224                 splhi();
 225                 up->preempted = 0;
 226                 return 1;
 227         }
 228         return 0;
 229 }
 230
 231 /*
 232  * Update the cpu time average for this particular process,
 233  * which is about to change from up -> not up or vice versa.
 234  * p->lastupdate is the last time an updatecpu happened.
 235  *
 236  * The cpu time average is a decaying average that lasts
 237  * about D clock ticks.  D is chosen to be approximately
 238  * the cpu time of a cpu-intensive "quick job".  A job has to run
 239  * for approximately D clock ticks before we home in on its
 240  * actual cpu usage.  Thus if you manage to get in and get out
 241  * quickly, you won't be penalized during your burst.  Once you
 242  * start using your share of the cpu for more than about D
 243  * clock ticks though, your p->cpu hits 1000 (1.0) and you end up
 244  * below all the other quick jobs.  Interactive tasks, because
 245  * they basically always use less than their fair share of cpu,
 246  * will be rewarded.
 247  *
 248  * If the process has not been running, then we want to
 249  * apply the filter
 250  *
 251  *      cpu = cpu * (D-1)/D
 252  *
 253  * n times, yielding
 254  *
 255  *      cpu = cpu * ((D-1)/D)^n
 256  *
 257  * but D is big enough that this is approximately
 258  *
 259  *      cpu = cpu * (D-n)/D
 260  *
 261  * so we use that instead.
 262  *
 263  * If the process has been running, we apply the filter to
 264  * 1 - cpu, yielding a similar equation.  Note that cpu is
 265  * stored in fixed point (* 1000).
 266  *
 267  * Updatecpu must be called before changing up, in order
 268  * to maintain accurate cpu usage statistics.  It can be called
 269  * at any time to bring the stats for a given proc up-to-date.
 270  */
 271 void
 272 updatecpu(Proc *p)
 273 {
 274         int n, t, ocpu;
 275         int D = schedgain*HZ*Scaling;
 276
 277         if(p->edf)
 278                 return;
 279
 280         t = MACHP(0)->ticks*Scaling + Scaling/2;
 281         n = t - p->lastupdate;
 282         p->lastupdate = t;
 283
 284         if(n == 0)
 285                 return;
 286         if(n > D)
 287                 n = D;
 288
 289         ocpu = p->cpu;
 290         if(p != up)
 291                 p->cpu = (ocpu*(D-n))/D;
 292         else{
 293                 t = 1000 - ocpu;
 294                 t = (t*(D-n))/D;
 295                 p->cpu = 1000 - t;
 296         }
 297
 298 //iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
 299 }
 300
 301 /*
 302  * On average, p has used p->cpu of a cpu recently.
 303  * Its fair share is conf.nmach/m->load of a cpu.  If it has been getting
 304  * too much, penalize it.  If it has been getting not enough, reward it.
 305  * I don't think you can get much more than your fair share that
 306  * often, so most of the queues are for using less.  Having a priority
 307  * of 3 means you're just right.  Having a higher priority (up to p->basepri)
 308  * means you're not using as much as you could.
 309  */
 310 int
 311 reprioritize(Proc *p)
 312 {
 313         int fairshare, n, load, ratio;
 314
 315         load = MACHP(0)->load;
 316         if(load == 0)
 317                 return p->basepri;
 318
 319         /*
 320          * fairshare = 1.000 * conf.nmach * 1.000/load,
 321          * except the decimal point is moved three places
 322          * on both load and fairshare.
 323          */
 324         fairshare = (conf.nmach*1000*1000)/load;
 325         n = p->cpu;
 326         if(n == 0)
 327                 n = 1;
 328         ratio = (fairshare+n/2) / n;
 329         if(ratio > p->basepri)
 330                 ratio = p->basepri;
 331         if(ratio < 0)
 332                 panic("reprioritize");
 333 //iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
 334         return ratio;
 335 }
 336
 337 /*
 338  * add a process to a scheduling queue
 339  */
 340 void
 341 queueproc(Schedq *rq, Proc *p)
 342 {
 343         int pri;
 344
 345         pri = rq - runq;
 346         lock(runq);
 347         p->priority = pri;
 348         p->rnext = 0;
 349         if(rq->tail)
 350                 rq->tail->rnext = p;
 351         else
 352                 rq->head = p;
 353         rq->tail = p;
 354         rq->n++;
 355         nrdy++;
 356         runvec |= 1<<pri;
 357         unlock(runq);
 358 }
 359
 360 /*
 361  *  try to remove a process from a scheduling queue (called splhi)
 362  */
 363 Proc*
 364 dequeueproc(Schedq *rq, Proc *tp)
 365 {
 366         Proc *l, *p;
 367
 368         if(!canlock(runq))
 369                 return nil;
 370
 371         /*
 372          *  the queue may have changed before we locked runq,
 373          *  refind the target process.
 374          */
 375         l = 0;
 376         for(p = rq->head; p; p = p->rnext){
 377                 if(p == tp)
 378                         break;
 379                 l = p;
 380         }
 381
 382         /*
 383          *  p->mach==0 only when process state is saved
 384          */
 385         if(p == 0 || p->mach){
 386                 unlock(runq);
 387                 return nil;
 388         }
 389         if(p->rnext == 0)
 390                 rq->tail = l;
 391         if(l)
 392                 l->rnext = p->rnext;
 393         else
 394                 rq->head = p->rnext;
 395         if(rq->head == nil)
 396                 runvec &= ~(1<<(rq-runq));
 397         rq->n--;
 398         nrdy--;
 399         if(p->state != Ready)
 400                 print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]);
 401
 402         unlock(runq);
 403         return p;
 404 }
 405
 406 /*
 407  *  ready(p) picks a new priority for a process and sticks it in the
 408  *  runq for that priority.
 409  */
 410 void
 411 ready(Proc *p)
 412 {
 413         int s, pri;
 414         Schedq *rq;
 415         void (*pt)(Proc*, int, vlong);
 416
 417         if(p->state == Ready){
 418                 print("double ready %s %lud pc %p\n", p->text, p->pid, getcallerpc(&p));
 419                 return;
 420         }
 421
 422         s = splhi();
 423         if(edfready(p)){
 424                 splx(s);
 425                 return;
 426         }
 427
 428         if(up != p)
 429                 m->readied = p; /* group scheduling */
 430
 431         updatecpu(p);
 432         pri = reprioritize(p);
 433         p->priority = pri;
 434         rq = &runq[pri];
 435         p->state = Ready;
 436         queueproc(rq, p);
 437         pt = proctrace;
 438         if(pt)
 439                 pt(p, SReady, 0);
 440         splx(s);
 441 }
 442
 443 /*
 444  *  yield the processor and drop our priority
 445  */
 446 void
 447 yield(void)
 448 {
 449         if(anyready()){
 450                 /* pretend we just used 1/2 tick */
 451                 up->lastupdate -= Scaling/2;
 452                 sched();
 453         }
 454 }
 455
 456 /*
 457  *  recalculate priorities once a second.  We need to do this
 458  *  since priorities will otherwise only be recalculated when
 459  *  the running process blocks.
 460  */
 461 ulong balancetime;
 462
 463 static void
 464 rebalance(void)
 465 {
 466         int pri, npri, t, x;
 467         Schedq *rq;
 468         Proc *p;
 469
 470         t = m->ticks;
 471         if(t - balancetime < HZ)
 472                 return;
 473         balancetime = t;
 474
 475         for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
 476 another:
 477                 p = rq->head;
 478                 if(p == nil)
 479                         continue;
 480                 if(p->mp != MACHP(m->machno))
 481                         continue;
 482                 if(pri == p->basepri)
 483                         continue;
 484                 updatecpu(p);
 485                 npri = reprioritize(p);
 486                 if(npri != pri){
 487                         x = splhi();
 488                         p = dequeueproc(rq, p);
 489                         if(p)
 490                                 queueproc(&runq[npri], p);
 491                         splx(x);
 492                         goto another;
 493                 }
 494         }
 495 }
 496
 497
 498 /*
 499  *  pick a process to run
 500  */
 501 Proc*
 502 runproc(void)
 503 {
 504         Schedq *rq;
 505         Proc *p;
 506         ulong start, now;
 507         int i;
 508         void (*pt)(Proc*, int, vlong);
 509
 510         start = perfticks();
 511
 512         /* cooperative scheduling until the clock ticks */
 513         if((p=m->readied) && p->mach==0 && p->state==Ready
 514         && runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
 515                 skipscheds++;
 516                 rq = &runq[p->priority];
 517                 goto found;
 518         }
 519
 520         preempts++;
 521
 522 loop:
 523         /*
 524          *  find a process that last ran on this processor (affinity),
 525          *  or one that hasn't moved in a while (load balancing).  Every
 526          *  time around the loop affinity goes down.
 527          */
 528         spllo();
 529         for(i = 0;; i++){
 530                 /*
 531                  *  find the highest priority target process that this
 532                  *  processor can run given affinity constraints.
 533                  *
 534                  */
 535                 for(rq = &runq[Nrq-1]; rq >= runq; rq--){
 536                         for(p = rq->head; p; p = p->rnext){
 537                                 if(p->mp == nil || p->mp == MACHP(m->machno)
 538                                 || (!p->wired && i > 0))
 539                                         goto found;
 540                         }
 541                 }
 542
 543                 /* waste time or halt the CPU */
 544                 idlehands();
 545
 546                 /* remember how much time we're here */
 547                 now = perfticks();
 548                 m->perf.inidle += now-start;
 549                 start = now;
 550         }
 551
 552 found:
 553         splhi();
 554         p = dequeueproc(rq, p);
 555         if(p == nil)
 556                 goto loop;
 557
 558         p->state = Scheding;
 559         p->mp = MACHP(m->machno);
 560
 561         if(edflock(p)){
 562                 edfrun(p, rq == &runq[PriEdf]); /* start deadline timer and do admin */
 563                 edfunlock();
 564         }
 565         pt = proctrace;
 566         if(pt)
 567                 pt(p, SRun, 0);
 568         return p;
 569 }
 570
 571 int
 572 canpage(Proc *p)
 573 {
 574         int ok = 0;
 575
 576         splhi();
 577         lock(runq);
 578         /* Only reliable way to see if we are Running */
 579         if(p->mach == 0) {
 580                 p->newtlb = 1;
 581                 ok = 1;
 582         }
 583         unlock(runq);
 584         spllo();
 585
 586         return ok;
 587 }
 588
 589 Proc*
 590 newproc(void)
 591 {
 592         char msg[64];
 593         Proc *p;
 594
 595         lock(&procalloc);
 596         for(;;) {
 597                 if(p = procalloc.free)
 598                         break;
 599
 600                 snprint(msg, sizeof msg, "no procs; %s forking",
 601                         up? up->text: "kernel");
 602                 unlock(&procalloc);
 603                 resrcwait(msg);
 604                 lock(&procalloc);
 605         }
 606         procalloc.free = p->qnext;
 607         unlock(&procalloc);
 608
 609         p->state = Scheding;
 610         p->psstate = "New";
 611         p->mach = 0;
 612         p->eql = 0;
 613         p->qnext = 0;
 614         p->nchild = 0;
 615         p->nwait = 0;
 616         p->waitq = 0;
 617         p->parent = 0;
 618         p->pgrp = 0;
 619         p->egrp = 0;
 620         p->fgrp = 0;
 621         p->rgrp = 0;
 622         p->pdbg = 0;
 623         p->fpstate = FPinit;
 624         p->kp = 0;
 625         p->procctl = 0;
 626         p->syscalltrace = 0;
 627         p->notepending = 0;
 628         p->ureg = 0;
 629         p->privatemem = 0;
 630         p->noswap = 0;
 631         p->errstr = p->errbuf0;
 632         p->syserrstr = p->errbuf1;
 633         p->errbuf0[0] = '\0';
 634         p->errbuf1[0] = '\0';
 635         p->nlocks.ref = 0;
 636         p->delaysched = 0;
 637         p->trace = 0;
 638         kstrdup(&p->user, "*nouser");
 639         kstrdup(&p->text, "*notext");
 640         kstrdup(&p->args, "");
 641         p->nargs = 0;
 642         p->setargs = 0;
 643         memset(p->seg, 0, sizeof p->seg);
 644         p->parentpid = 0;
 645         p->noteid = pidalloc(p);
 646         if(p->kstack == 0)
 647                 p->kstack = smalloc(KSTACK);
 648
 649         /* sched params */
 650         p->mp = 0;
 651         p->wired = 0;
 652         procpriority(p, PriNormal, 0);
 653         p->cpu = 0;
 654         p->lastupdate = MACHP(0)->ticks*Scaling;
 655         p->edf = nil;
 656
 657         return p;
 658 }
 659
 660 /*
 661  * wire this proc to a machine
 662  */
 663 void
 664 procwired(Proc *p, int bm)
 665 {
 666         Proc *pp;
 667         int i;
 668         char nwired[MAXMACH];
 669         Mach *wm;
 670
 671         if(bm < 0){
 672                 /* pick a machine to wire to */
 673                 memset(nwired, 0, sizeof(nwired));
 674                 p->wired = 0;
 675                 pp = proctab(0);
 676                 for(i=0; i<conf.nproc; i++, pp++){
 677                         wm = pp->wired;
 678                         if(wm && pp->pid)
 679                                 nwired[wm->machno]++;
 680                 }
 681                 bm = 0;
 682                 for(i=0; i<conf.nmach; i++)
 683                         if(nwired[i] < nwired[bm])
 684                                 bm = i;
 685         } else {
 686                 /* use the virtual machine requested */
 687                 bm = bm % conf.nmach;
 688         }
 689
 690         p->wired = MACHP(bm);
 691         p->mp = p->wired;
 692 }
 693
 694 void
 695 procpriority(Proc *p, int pri, int fixed)
 696 {
 697         if(pri >= Npriq)
 698                 pri = Npriq - 1;
 699         else if(pri < 0)
 700                 pri = 0;
 701         p->basepri = pri;
 702         p->priority = pri;
 703         if(fixed){
 704                 p->fixedpri = 1;
 705         } else {
 706                 p->fixedpri = 0;
 707         }
 708 }
 709
 710 void
 711 procinit0(void)         /* bad planning - clashes with devproc.c */
 712 {
 713         Proc *p;
 714         int i;
 715
 716         procalloc.free = xalloc(conf.nproc*sizeof(Proc));
 717         if(procalloc.free == nil){
 718                 xsummary();
 719                 panic("cannot allocate %lud procs (%ludMB)", conf.nproc, conf.nproc*sizeof(Proc)/(1024*1024));
 720         }
 721         procalloc.arena = procalloc.free;
 722
 723         p = procalloc.free;
 724         for(i=0; i<conf.nproc-1; i++,p++)
 725                 p->qnext = p+1;
 726         p->qnext = 0;
 727 }
 728
 729 /*
 730  *  sleep if a condition is not true.  Another process will
 731  *  awaken us after it sets the condition.  When we awaken
 732  *  the condition may no longer be true.
 733  *
 734  *  we lock both the process and the rendezvous to keep r->p
 735  *  and p->r synchronized.
 736  */
 737 void
 738 sleep(Rendez *r, int (*f)(void*), void *arg)
 739 {
 740         int s;
 741         void (*pt)(Proc*, int, vlong);
 742
 743         s = splhi();
 744
 745         if(up->nlocks.ref)
 746                 print("process %lud sleeps with %lud locks held, last lock %#p locked at pc %#lux, sleep called from %#p\n",
 747                         up->pid, up->nlocks.ref, up->lastlock, up->lastlock->pc, getcallerpc(&r));
 748         lock(r);
 749         lock(&up->rlock);
 750         if(r->p){
 751                 print("double sleep called from %#p, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
 752                 dumpstack();
 753         }
 754
 755         /*
 756          *  Wakeup only knows there may be something to do by testing
 757          *  r->p in order to get something to lock on.
 758          *  Flush that information out to memory in case the sleep is
 759          *  committed.
 760          */
 761         r->p = up;
 762
 763         if((*f)(arg) || up->notepending){
 764                 /*
 765                  *  if condition happened or a note is pending
 766                  *  never mind
 767                  */
 768                 r->p = nil;
 769                 unlock(&up->rlock);
 770                 unlock(r);
 771         } else {
 772                 /*
 773                  *  now we are committed to
 774                  *  change state and call scheduler
 775                  */
 776                 pt = proctrace;
 777                 if(pt)
 778                         pt(up, SSleep, 0);
 779                 up->state = Wakeme;
 780                 up->r = r;
 781
 782                 /* statistics */
 783                 m->cs++;
 784
 785                 procsave(up);
 786                 if(setlabel(&up->sched)) {
 787                         /*
 788                          *  here when the process is awakened
 789                          */
 790                         procrestore(up);
 791                         spllo();
 792                 } else {
 793                         /*
 794                          *  here to go to sleep (i.e. stop Running)
 795                          */
 796                         unlock(&up->rlock);
 797                         unlock(r);
 798                         gotolabel(&m->sched);
 799                 }
 800         }
 801
 802         if(up->notepending) {
 803                 up->notepending = 0;
 804                 splx(s);
 805                 if(up->procctl == Proc_exitme && up->closingfgrp)
 806                         forceclosefgrp();
 807                 error(Eintr);
 808         }
 809
 810         splx(s);
 811 }
 812
 813 static int
 814 tfn(void *arg)
 815 {
 816         return up->trend == nil || up->tfn(arg);
 817 }
 818
 819 void
 820 twakeup(Ureg*, Timer *t)
 821 {
 822         Proc *p;
 823         Rendez *trend;
 824
 825         p = t->ta;
 826         trend = p->trend;
 827         p->trend = 0;
 828         if(trend)
 829                 wakeup(trend);
 830 }
 831
 832 void
 833 tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms)
 834 {
 835         if (up->tt){
 836                 print("tsleep: timer active: mode %d, tf %#p\n", up->tmode, up->tf);
 837                 timerdel(up);
 838         }
 839         up->tns = MS2NS(ms);
 840         up->tf = twakeup;
 841         up->tmode = Trelative;
 842         up->ta = up;
 843         up->trend = r;
 844         up->tfn = fn;
 845         timeradd(up);
 846
 847         if(waserror()){
 848                 timerdel(up);
 849                 nexterror();
 850         }
 851         sleep(r, tfn, arg);
 852         if (up->tt)
 853                 timerdel(up);
 854         up->twhen = 0;
 855         poperror();
 856 }
 857
 858 /*
 859  *  Expects that only one process can call wakeup for any given Rendez.
 860  *  We hold both locks to ensure that r->p and p->r remain consistent.
 861  *  Richard Miller has a better solution that doesn't require both to
 862  *  be held simultaneously, but I'm a paranoid - presotto.
 863  */
 864 Proc*
 865 wakeup(Rendez *r)
 866 {
 867         Proc *p;
 868         int s;
 869
 870         s = splhi();
 871
 872         lock(r);
 873         p = r->p;
 874
 875         if(p != nil){
 876                 lock(&p->rlock);
 877                 if(p->state != Wakeme || p->r != r){
 878                         iprint("%p %p %d\n", p->r, r, p->state);
 879                         panic("wakeup: state");
 880                 }
 881                 r->p = nil;
 882                 p->r = nil;
 883                 ready(p);
 884                 unlock(&p->rlock);
 885         }
 886         unlock(r);
 887
 888         splx(s);
 889
 890         return p;
 891 }
 892
 893 /*
 894  *  if waking a sleeping process, this routine must hold both
 895  *  p->rlock and r->lock.  However, it can't know them in
 896  *  the same order as wakeup causing a possible lock ordering
 897  *  deadlock.  We break the deadlock by giving up the p->rlock
 898  *  lock if we can't get the r->lock and retrying.
 899  */
 900 int
 901 postnote(Proc *p, int dolock, char *n, int flag)
 902 {
 903         int s, ret;
 904         QLock *q;
 905
 906         if(p == nil)
 907                 return 0;
 908
 909         if(dolock)
 910                 qlock(&p->debug);
 911
 912         if(p->pid == 0){
 913                 if(dolock)
 914                         qunlock(&p->debug);
 915                 return 0;
 916         }
 917
 918         if(n != nil && flag != NUser && (p->notify == 0 || p->notified))
 919                 p->nnote = 0;
 920
 921         ret = 0;
 922         if(p->nnote < NNOTE && n != nil) {
 923                 kstrcpy(p->note[p->nnote].msg, n, ERRMAX);
 924                 p->note[p->nnote++].flag = flag;
 925                 ret = 1;
 926         }
 927         p->notepending = 1;
 928         if(dolock)
 929                 qunlock(&p->debug);
 930
 931         /* this loop is to avoid lock ordering problems. */
 932         for(;;){
 933                 Rendez *r;
 934
 935                 s = splhi();
 936                 lock(&p->rlock);
 937                 r = p->r;
 938
 939                 /* waiting for a wakeup? */
 940                 if(r == nil)
 941                         break;  /* no */
 942
 943                 /* try for the second lock */
 944                 if(canlock(r)){
 945                         if(p->state != Wakeme || r->p != p)
 946                                 panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
 947                         p->r = nil;
 948                         r->p = nil;
 949                         ready(p);
 950                         unlock(r);
 951                         break;
 952                 }
 953
 954                 /* give other process time to get out of critical section and try again */
 955                 unlock(&p->rlock);
 956                 splx(s);
 957                 sched();
 958         }
 959         unlock(&p->rlock);
 960         splx(s);
 961
 962         switch(p->state){
 963         case Queueing:
 964                 /* Try and pull out of a eqlock */
 965                 if(q = p->eql){
 966                         lock(&q->use);
 967                         if(p->state == Queueing && p->eql == q){
 968                                 Proc *d, *l;
 969
 970                                 for(l = nil, d = q->head; d; l = d, d = d->qnext){
 971                                         if(d == p){
 972                                                 if(l)
 973                                                         l->qnext = p->qnext;
 974                                                 else
 975                                                         q->head = p->qnext;
 976                                                 if(p->qnext == 0)
 977                                                         q->tail = l;
 978                                                 p->qnext = 0;
 979                                                 p->eql = 0;     /* not taken */
 980                                                 ready(p);
 981                                                 break;
 982                                         }
 983                                 }
 984                         }
 985                         unlock(&q->use);
 986                 }
 987                 break;
 988         case Rendezvous:
 989                 /* Try and pull out of a rendezvous */
 990                 lock(p->rgrp);
 991                 if(p->state == Rendezvous) {
 992                         Proc *d, **l;
 993
 994                         l = &REND(p->rgrp, p->rendtag);
 995                         for(d = *l; d; d = d->rendhash) {
 996                                 if(d == p) {
 997                                         *l = p->rendhash;
 998                                         p->rendval = ~0;
 999                                         ready(p);
1000                                         break;
1001                                 }
1002                                 l = &d->rendhash;
1003                         }
1004                 }
1005                 unlock(p->rgrp);
1006                 break;
1007         }
1008         return ret;
1009 }
1010
1011 /*
1012  * weird thing: keep at most NBROKEN around
1013  */
1014 #define NBROKEN 4
1015 struct
1016 {
1017         QLock;
1018         int     n;
1019         Proc    *p[NBROKEN];
1020 }broken;
1021
1022 void
1023 addbroken(Proc *p)
1024 {
1025         qlock(&broken);
1026         if(broken.n == NBROKEN) {
1027                 ready(broken.p[0]);
1028                 memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
1029                 --broken.n;
1030         }
1031         broken.p[broken.n++] = p;
1032         qunlock(&broken);
1033
1034         edfstop(up);
1035         p->state = Broken;
1036         p->psstate = 0;
1037         sched();
1038 }
1039
1040 void
1041 unbreak(Proc *p)
1042 {
1043         int b;
1044
1045         qlock(&broken);
1046         for(b=0; b < broken.n; b++)
1047                 if(broken.p[b] == p) {
1048                         broken.n--;
1049                         memmove(&broken.p[b], &broken.p[b+1],
1050                                         sizeof(Proc*)*(NBROKEN-(b+1)));
1051                         ready(p);
1052                         break;
1053                 }
1054         qunlock(&broken);
1055 }
1056
1057 int
1058 freebroken(void)
1059 {
1060         int i, n;
1061
1062         qlock(&broken);
1063         n = broken.n;
1064         for(i=0; i<n; i++) {
1065                 ready(broken.p[i]);
1066                 broken.p[i] = 0;
1067         }
1068         broken.n = 0;
1069         qunlock(&broken);
1070         return n;
1071 }
1072
1073 void
1074 pexit(char *exitstr, int freemem)
1075 {
1076         Proc *p;
1077         Segment **s, **es;
1078         long utime, stime;
1079         Waitq *wq;
1080         Fgrp *fgrp;
1081         Egrp *egrp;
1082         Rgrp *rgrp;
1083         Pgrp *pgrp;
1084         Chan *dot;
1085         void (*pt)(Proc*, int, vlong);
1086
1087         up->alarm = 0;
1088         if (up->tt)
1089                 timerdel(up);
1090         pt = proctrace;
1091         if(pt)
1092                 pt(up, SDead, 0);
1093
1094         /* nil out all the resources under lock (free later) */
1095         qlock(&up->debug);
1096         fgrp = up->fgrp;
1097         up->fgrp = nil;
1098         egrp = up->egrp;
1099         up->egrp = nil;
1100         rgrp = up->rgrp;
1101         up->rgrp = nil;
1102         pgrp = up->pgrp;
1103         up->pgrp = nil;
1104         dot = up->dot;
1105         up->dot = nil;
1106         qunlock(&up->debug);
1107
1108         if(fgrp)
1109                 closefgrp(fgrp);
1110         if(egrp)
1111                 closeegrp(egrp);
1112         if(rgrp)
1113                 closergrp(rgrp);
1114         if(dot)
1115                 cclose(dot);
1116         if(pgrp)
1117                 closepgrp(pgrp);
1118
1119         /*
1120          * if not a kernel process and have a parent,
1121          * do some housekeeping.
1122          */
1123         if(up->kp == 0 && up->parentpid != 0) {
1124                 wq = smalloc(sizeof(Waitq));
1125                 wq->w.pid = up->pid;
1126                 utime = up->time[TUser] + up->time[TCUser];
1127                 stime = up->time[TSys] + up->time[TCSys];
1128                 wq->w.time[TUser] = tk2ms(utime);
1129                 wq->w.time[TSys] = tk2ms(stime);
1130                 wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]);
1131                 if(exitstr && exitstr[0])
1132                         snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr);
1133                 else
1134                         wq->w.msg[0] = '\0';
1135
1136                 p = up->parent;
1137                 lock(&p->exl);
1138                 /*
1139                  * Check that parent is still alive.
1140                  */
1141                 if(p->pid == up->parentpid && p->state != Broken) {
1142                         p->nchild--;
1143                         p->time[TCUser] += utime;
1144                         p->time[TCSys] += stime;
1145                         /*
1146                          * If there would be more than 128 wait records
1147                          * processes for my parent, then don't leave a wait
1148                          * record behind.  This helps prevent badly written
1149                          * daemon processes from accumulating lots of wait
1150                          * records.
1151                          */
1152                         if(p->nwait < 128) {
1153                                 wq->next = p->waitq;
1154                                 p->waitq = wq;
1155                                 p->nwait++;
1156                                 wq = nil;
1157                                 wakeup(&p->waitr);
1158                         }
1159                 }
1160                 unlock(&p->exl);
1161                 if(wq)
1162                         free(wq);
1163         }
1164         else if(up->kp == 0 && up->parent == 0){
1165                 if(exitstr == 0)
1166                         exitstr = "unknown";
1167                 panic("boot process died: %s", exitstr);
1168         }
1169
1170         if(!freemem)
1171                 addbroken(up);
1172
1173         qlock(&up->seglock);
1174         es = &up->seg[NSEG];
1175         for(s = up->seg; s < es; s++) {
1176                 if(*s) {
1177                         putseg(*s);
1178                         *s = 0;
1179                 }
1180         }
1181         qunlock(&up->seglock);
1182
1183         lock(&up->exl);         /* Prevent my children from leaving waits */
1184         pidfree(up);
1185         up->pid = 0;
1186         wakeup(&up->waitr);
1187         unlock(&up->exl);
1188
1189         while((wq = up->waitq) != 0){
1190                 up->waitq = wq->next;
1191                 free(wq);
1192         }
1193
1194         /* release debuggers */
1195         qlock(&up->debug);
1196         if(up->pdbg) {
1197                 wakeup(&up->pdbg->sleep);
1198                 up->pdbg = 0;
1199         }
1200         if(up->syscalltrace) {
1201                 free(up->syscalltrace);
1202                 up->syscalltrace = 0;
1203         }
1204         qunlock(&up->debug);
1205
1206         /* Sched must not loop for these locks */
1207         lock(&procalloc);
1208         lock(&palloc);
1209
1210         edfstop(up);
1211         up->state = Moribund;
1212         sched();
1213         panic("pexit");
1214 }
1215
1216 static int
1217 haswaitq(void *x)
1218 {
1219         Proc *p;
1220
1221         p = (Proc *)x;
1222         return p->waitq != 0;
1223 }
1224
1225 ulong
1226 pwait(Waitmsg *w)
1227 {
1228         ulong cpid;
1229         Waitq *wq;
1230
1231         if(!canqlock(&up->qwaitr))
1232                 error(Einuse);
1233
1234         if(waserror()) {
1235                 qunlock(&up->qwaitr);
1236                 nexterror();
1237         }
1238
1239         lock(&up->exl);
1240         while(up->waitq == 0) {
1241                 if(up->nchild == 0) {
1242                         unlock(&up->exl);
1243                         error(Enochild);
1244                 }
1245                 unlock(&up->exl);
1246                 sleep(&up->waitr, haswaitq, up);
1247                 lock(&up->exl);
1248         }
1249         wq = up->waitq;
1250         up->waitq = wq->next;
1251         up->nwait--;
1252         unlock(&up->exl);
1253
1254         qunlock(&up->qwaitr);
1255         poperror();
1256
1257         if(w)
1258                 memmove(w, &wq->w, sizeof(Waitmsg));
1259         cpid = wq->w.pid;
1260         free(wq);
1261         return cpid;
1262 }
1263
1264 Proc*
1265 proctab(int i)
1266 {
1267         return &procalloc.arena[i];
1268 }
1269
1270 void
1271 dumpaproc(Proc *p)
1272 {
1273         ulong bss;
1274         char *s;
1275
1276         if(p == 0)
1277                 return;
1278
1279         bss = 0;
1280         if(p->seg[BSEG])
1281                 bss = p->seg[BSEG]->top;
1282
1283         s = p->psstate;
1284         if(s == 0)
1285                 s = statename[p->state];
1286         print("%3lud:%10s pc %8lux dbgpc %8lux  %8s (%s) ut %ld st %ld bss %lux qpc %lux nl %lud nd %lud lpc %lux pri %lud\n",
1287                 p->pid, p->text, p->pc, dbgpc(p),  s, statename[p->state],
1288                 p->time[0], p->time[1], bss, p->qpc, p->nlocks.ref, p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority);
1289 }
1290
1291 void
1292 procdump(void)
1293 {
1294         int i;
1295         Proc *p;
1296
1297         if(up)
1298                 print("up %lud\n", up->pid);
1299         else
1300                 print("no current process\n");
1301         for(i=0; i<conf.nproc; i++) {
1302                 p = &procalloc.arena[i];
1303                 if(p->state == Dead)
1304                         continue;
1305
1306                 dumpaproc(p);
1307         }
1308 }
1309
1310 /*
1311  *  wait till all processes have flushed their mmu
1312  *  state about segement s
1313  */
1314 void
1315 procflushseg(Segment *s)
1316 {
1317         int i, ns, nm, nwait;
1318         Proc *p;
1319
1320         /*
1321          *  tell all processes with this
1322          *  segment to flush their mmu's
1323          */
1324         nwait = 0;
1325         for(i=0; i<conf.nproc; i++) {
1326                 p = &procalloc.arena[i];
1327                 if(p->state == Dead)
1328                         continue;
1329                 for(ns = 0; ns < NSEG; ns++)
1330                         if(p->seg[ns] == s){
1331                                 p->newtlb = 1;
1332                                 for(nm = 0; nm < conf.nmach; nm++){
1333                                         if(MACHP(nm)->proc == p){
1334                                                 MACHP(nm)->flushmmu = 1;
1335                                                 nwait++;
1336                                         }
1337                                 }
1338                                 break;
1339                         }
1340         }
1341
1342         if(nwait == 0)
1343                 return;
1344
1345         /*
1346          *  wait for all processors to take a clock interrupt
1347          *  and flush their mmu's
1348          */
1349         for(nm = 0; nm < conf.nmach; nm++)
1350                 if(MACHP(nm) != m)
1351                         while(MACHP(nm)->flushmmu)
1352                                 sched();
1353 }
1354
1355 void
1356 scheddump(void)
1357 {
1358         Proc *p;
1359         Schedq *rq;
1360
1361         for(rq = &runq[Nrq-1]; rq >= runq; rq--){
1362                 if(rq->head == 0)
1363                         continue;
1364                 print("rq%ld:", rq-runq);
1365                 for(p = rq->head; p; p = p->rnext)
1366                         print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
1367                 print("\n");
1368                 delay(150);
1369         }
1370         print("nrdy %d\n", nrdy);
1371 }
1372
1373 void
1374 kproc(char *name, void (*func)(void *), void *arg)
1375 {
1376         Proc *p;
1377         static Pgrp *kpgrp;
1378
1379         p = newproc();
1380         p->psstate = 0;
1381         p->procmode = 0640;
1382         p->kp = 1;
1383         p->noswap = 1;
1384
1385         p->scallnr = up->scallnr;
1386         p->s = up->s;
1387         p->nerrlab = 0;
1388         p->slash = up->slash;
1389         p->dot = up->dot;
1390         if(p->dot)
1391                 incref(p->dot);
1392
1393         memmove(p->note, up->note, sizeof(p->note));
1394         p->nnote = up->nnote;
1395         p->notified = 0;
1396         p->lastnote = up->lastnote;
1397         p->notify = up->notify;
1398         p->ureg = 0;
1399         p->dbgreg = 0;
1400
1401         procpriority(p, PriKproc, 0);
1402
1403         kprocchild(p, func, arg);
1404
1405         kstrdup(&p->user, eve);
1406         kstrdup(&p->text, name);
1407         if(kpgrp == 0)
1408                 kpgrp = newpgrp();
1409         p->pgrp = kpgrp;
1410         incref(kpgrp);
1411
1412         memset(p->time, 0, sizeof(p->time));
1413         p->time[TReal] = MACHP(0)->ticks;
1414         ready(p);
1415 }
1416
1417 /*
1418  *  called splhi() by notify().  See comment in notify for the
1419  *  reasoning.
1420  */
1421 void
1422 procctl(Proc *p)
1423 {
1424         char *state;
1425         ulong s;
1426
1427         switch(p->procctl) {
1428         case Proc_exitbig:
1429                 spllo();
1430                 pprint("Killed: Insufficient physical memory\n");
1431                 pexit("Killed: Insufficient physical memory", 1);
1432
1433         case Proc_exitme:
1434                 spllo();                /* pexit has locks in it */
1435                 pexit("Killed", 1);
1436
1437         case Proc_traceme:
1438                 if(p->nnote == 0)
1439                         return;
1440                 /* No break */
1441
1442         case Proc_stopme:
1443                 p->procctl = 0;
1444                 state = p->psstate;
1445                 p->psstate = "Stopped";
1446                 /* free a waiting debugger */
1447                 s = spllo();
1448                 qlock(&p->debug);
1449                 if(p->pdbg) {
1450                         wakeup(&p->pdbg->sleep);
1451                         p->pdbg = 0;
1452                 }
1453                 qunlock(&p->debug);
1454                 splhi();
1455                 p->state = Stopped;
1456                 sched();
1457                 p->psstate = state;
1458                 splx(s);
1459                 return;
1460         }
1461 }
1462
1463 #include "errstr.h"
1464
1465 void
1466 error(char *err)
1467 {
1468         spllo();
1469
1470         assert(up->nerrlab < NERR);
1471         kstrcpy(up->errstr, err, ERRMAX);
1472         setlabel(&up->errlab[NERR-1]);
1473         nexterror();
1474 }
1475
1476 void
1477 nexterror(void)
1478 {
1479         assert(up->nerrlab > 0);
1480         gotolabel(&up->errlab[--up->nerrlab]);
1481 }
1482
1483 void
1484 exhausted(char *resource)
1485 {
1486         char buf[ERRMAX];
1487
1488         snprint(buf, sizeof buf, "no free %s", resource);
1489         iprint("%s\n", buf);
1490         error(buf);
1491 }
1492
1493 void
1494 killbig(char *why)
1495 {
1496         int i;
1497         Segment *s;
1498         ulong l, max;
1499         Proc *p, *ep, *kp;
1500
1501         max = 0;
1502         kp = 0;
1503         ep = procalloc.arena+conf.nproc;
1504         for(p = procalloc.arena; p < ep; p++) {
1505                 if(p->state == Dead || p->kp || !canqlock(&p->seglock))
1506                         continue;
1507                 l = 0;
1508                 for(i=1; i<NSEG; i++) {
1509                         s = p->seg[i];
1510                         if(s == 0 || !canqlock(&s->lk))
1511                                 continue;
1512                         l += (ulong)mcountseg(s);
1513                         qunlock(&s->lk);
1514                 }
1515                 qunlock(&p->seglock);
1516                 if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) {
1517                         kp = p;
1518                         max = l;
1519                 }
1520         }
1521         if(kp == 0 || !canqlock(&kp->seglock))
1522                 return;
1523         print("%lud: %s killed: %s\n", kp->pid, kp->text, why);
1524         for(p = procalloc.arena; p < ep; p++) {
1525                 if(p->state == Dead || p->kp)
1526                         continue;
1527                 if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG])
1528                         p->procctl = Proc_exitbig;
1529         }
1530         kp->procctl = Proc_exitbig;
1531         for(i = 0; i < NSEG; i++) {
1532                 s = kp->seg[i];
1533                 if(s != 0 && canqlock(&s->lk)) {
1534                         mfreeseg(s, s->base, (s->top - s->base)/BY2PG);
1535                         qunlock(&s->lk);
1536                 }
1537         }
1538         qunlock(&kp->seglock);
1539 }
1540
1541 /*
1542  *  change ownership to 'new' of all processes owned by 'old'.  Used when
1543  *  eve changes.
1544  */
1545 void
1546 renameuser(char *old, char *new)
1547 {
1548         Proc *p, *ep;
1549
1550         ep = procalloc.arena+conf.nproc;
1551         for(p = procalloc.arena; p < ep; p++)
1552                 if(p->user!=nil && strcmp(old, p->user)==0)
1553                         kstrdup(&p->user, new);
1554 }
1555
1556 /*
1557  *  time accounting called by clock() splhi'd
1558  */
1559 void
1560 accounttime(void)
1561 {
1562         Proc *p;
1563         ulong n, per;
1564         static ulong nrun;
1565
1566         p = m->proc;
1567         if(p) {
1568                 nrun++;
1569                 p->time[p->insyscall]++;
1570         }
1571
1572         /* calculate decaying duty cycles */
1573         n = perfticks();
1574         per = n - m->perf.last;
1575         m->perf.last = n;
1576         per = (m->perf.period*(HZ-1) + per)/HZ;
1577         if(per != 0)
1578                 m->perf.period = per;
1579
1580         m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
1581         m->perf.inidle = 0;
1582
1583         m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
1584         m->perf.inintr = 0;
1585
1586         /* only one processor gets to compute system load averages */
1587         if(m->machno != 0)
1588                 return;
1589
1590         /*
1591          * calculate decaying load average.
1592          * if we decay by (n-1)/n then it takes
1593          * n clock ticks to go from load L to .36 L once
1594          * things quiet down.  it takes about 5 n clock
1595          * ticks to go to zero.  so using HZ means this is
1596          * approximately the load over the last second,
1597          * with a tail lasting about 5 seconds.
1598          */
1599         n = nrun;
1600         nrun = 0;
1601         n = (nrdy+n)*1000;
1602         m->load = (m->load*(HZ-1)+n)/HZ;
1603 }
1604
1605 int
1606 pidalloc(Proc *p)
1607 {
1608         static int gen, wrapped;
1609         int pid, h;
1610         Proc *x;
1611
1612         lock(&procalloc);
1613 Retry:
1614         pid = ++gen & 0x7FFFFFFF;
1615         if(pid == 0){
1616                 wrapped = 1;
1617                 goto Retry;
1618         }
1619         h = pid % nelem(procalloc.ht);
1620         if(wrapped)
1621                 for(x = procalloc.ht[h]; x != nil; x = x->pidhash)
1622                         if(x->pid == pid)
1623                                 goto Retry;
1624         if(p){
1625                 p->pid = pid;
1626                 p->pidhash = procalloc.ht[h];
1627                 procalloc.ht[h] = p;
1628         }
1629         unlock(&procalloc);
1630         return pid;
1631 }
1632
1633 static void
1634 pidfree(Proc *p)
1635 {
1636         int h;
1637         Proc **l;
1638
1639         h = p->pid % nelem(procalloc.ht);
1640         lock(&procalloc);
1641         for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
1642                 if(*l == p){
1643                         *l = p->pidhash;
1644                         break;
1645                 }
1646         unlock(&procalloc);
1647 }
1648
1649 int
1650 procindex(ulong pid)
1651 {
1652         Proc *p;
1653         int h;
1654         int s;
1655
1656         s = -1;
1657         h = pid % nelem(procalloc.ht);
1658         lock(&procalloc);
1659         for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
1660                 if(p->pid == pid){
1661                         s = p - procalloc.arena;
1662                         break;
1663                 }
1664         unlock(&procalloc);
1665         return s;
1666 }