2 #include "../port/lib.h"
6 #include "../port/error.h"
10 int schedgain = 30; /* units in seconds */
14 void updatecpu(Proc*);
15 int reprioritize(Proc*);
17 ulong delayedscheds; /* statistics */
24 static struct Procalloc
43 { /* BUG: generate automatically */
59 static void pidhash(Proc*);
60 static void pidunhash(Proc*);
61 static void rebalance(void);
67 schedinit(void) /* never returns */
73 if((e = up->edf) && (e->flags & Admitted))
88 * Holding locks from pexit:
94 up->qnext = procalloc.free;
109 * If changing this routine, look also at sleep(). It
110 * contains a copy of the guts of sched().
118 panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p",
121 up? up->lastilock: nil,
122 (up && up->lastilock)? up->lastilock->pc: 0,
126 * Delay the sched until the process gives up the locks
127 * it is holding. This avoids dumb lock loops.
128 * Don't delay if the process is Moribund.
129 * It called sched to die.
130 * But do sched eventually. This avoids a missing unlock
131 * from hanging the entire kernel.
132 * But don't reschedule procs holding palloc or procalloc.
133 * Those are far too important to be holding while asleep.
135 * This test is not exact. There can still be a few instructions
136 * in the middle of taslock when a process holds a lock
137 * but Lock.p has not yet been initialized.
140 if(up->state != Moribund)
141 if(up->delaysched < 20
142 || palloc.Lock.p == up
143 || procalloc.Lock.p == up){
156 if(setlabel(&up->sched)){
161 gotolabel(&m->sched);
166 p->priority = reprioritize(p);
169 m->schedticks = m->ticks + HZ/10;
173 up->mach = MACHP(m->machno);
176 gotolabel(&up->sched);
188 return runvec & ~((1<<(up->priority+1))-1);
192 * here once per clock tick to see if we should resched
197 /* once a second, rebalance will reprioritize ready procs */
201 /* unless preempted, get to run for at least 100ms */
203 || (!up->fixedpri && m->ticks > m->schedticks && anyready())){
204 m->readied = nil; /* avoid cooperative scheduling */
210 * here at the end of non-clock interrupts to see if we should preempt the
211 * current process. Returns 1 if preempted, 0 otherwise.
216 if(up && up->state == Running)
217 if(up->preempted == 0)
220 m->readied = nil; /* avoid cooperative scheduling */
231 * Update the cpu time average for this particular process,
232 * which is about to change from up -> not up or vice versa.
233 * p->lastupdate is the last time an updatecpu happened.
235 * The cpu time average is a decaying average that lasts
236 * about D clock ticks. D is chosen to be approximately
237 * the cpu time of a cpu-intensive "quick job". A job has to run
238 * for approximately D clock ticks before we home in on its
239 * actual cpu usage. Thus if you manage to get in and get out
240 * quickly, you won't be penalized during your burst. Once you
241 * start using your share of the cpu for more than about D
242 * clock ticks though, your p->cpu hits 1000 (1.0) and you end up
243 * below all the other quick jobs. Interactive tasks, because
244 * they basically always use less than their fair share of cpu,
247 * If the process has not been running, then we want to
250 * cpu = cpu * (D-1)/D
254 * cpu = cpu * ((D-1)/D)^n
256 * but D is big enough that this is approximately
258 * cpu = cpu * (D-n)/D
260 * so we use that instead.
262 * If the process has been running, we apply the filter to
263 * 1 - cpu, yielding a similar equation. Note that cpu is
264 * stored in fixed point (* 1000).
266 * Updatecpu must be called before changing up, in order
267 * to maintain accurate cpu usage statistics. It can be called
268 * at any time to bring the stats for a given proc up-to-date.
274 int D = schedgain*HZ*Scaling;
279 t = MACHP(0)->ticks*Scaling + Scaling/2;
280 n = t - p->lastupdate;
290 p->cpu = (ocpu*(D-n))/D;
297 //iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu);
301 * On average, p has used p->cpu of a cpu recently.
302 * Its fair share is conf.nmach/m->load of a cpu. If it has been getting
303 * too much, penalize it. If it has been getting not enough, reward it.
304 * I don't think you can get much more than your fair share that
305 * often, so most of the queues are for using less. Having a priority
306 * of 3 means you're just right. Having a higher priority (up to p->basepri)
307 * means you're not using as much as you could.
310 reprioritize(Proc *p)
312 int fairshare, n, load, ratio;
314 load = MACHP(0)->load;
320 * fairshare = 1.000 * conf.nmach * 1.000/load,
320 * except the decimal point is moved three places
321 * on both load and fairshare.
323 fairshare = (conf.nmach*1000*1000)/load;
327 ratio = (fairshare+n/2) / n;
328 if(ratio > p->basepri)
331 panic("reprioritize");
332 //iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio);
337 * add a process to a scheduling queue
340 queueproc(Schedq *rq, Proc *p)
360 * try to remove a process from a scheduling queue (called splhi)
363 dequeueproc(Schedq *rq, Proc *tp)
371 * the queue may have changed before we locked runq,
372 * refind the target process.
375 for(p = rq->head; p; p = p->rnext){
382 * p->mach==0 only when process state is saved
384 if(p == 0 || p->mach){
395 runvec &= ~(1<<(rq-runq));
398 if(p->state != Ready)
399 print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]);
406 * ready(p) picks a new priority for a process and sticks it in the
407 * runq for that priority.
414 void (*pt)(Proc*, int, vlong);
423 m->readied = p; /* group scheduling */
426 pri = reprioritize(p);
438 * yield the processor and drop our priority
444 /* pretend we just used 1/2 tick */
445 up->lastupdate -= Scaling/2;
451 * recalculate priorities once a second. We need to do this
452 * since priorities will otherwise only be recalculated when
453 * the running process blocks.
465 if(t - balancetime < HZ)
469 for(pri=0, rq=runq; pri<Npriq; pri++, rq++){
474 if(p->mp != MACHP(m->machno))
476 if(pri == p->basepri)
479 npri = reprioritize(p);
482 p = dequeueproc(rq, p);
484 queueproc(&runq[npri], p);
493 * pick a process to run
502 void (*pt)(Proc*, int, vlong);
506 /* cooperative scheduling until the clock ticks */
507 if((p=m->readied) && p->mach==0 && p->state==Ready
508 && runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){
510 rq = &runq[p->priority];
518 * find a process that last ran on this processor (affinity),
519 * or one that hasn't moved in a while (load balancing). Every
520 * time around the loop affinity goes down.
525 * find the highest priority target process that this
526 * processor can run given affinity constraints.
529 for(rq = &runq[Nrq-1]; rq >= runq; rq--){
530 for(p = rq->head; p; p = p->rnext){
531 if(p->mp == nil || p->mp == MACHP(m->machno)
532 || (!p->wired && i > 0))
537 /* waste time or halt the CPU */
540 /* remember how much time we're here */
542 m->perf.inidle += now-start;
548 p = dequeueproc(rq, p);
553 p->mp = MACHP(m->machno);
556 edfrun(p, rq == &runq[PriEdf]); /* start deadline timer and do admin */
572 /* Only reliable way to see if we are Running */
591 if(p = procalloc.free)
594 snprint(msg, sizeof msg, "no procs; %s forking",
595 up? up->text: "kernel");
600 procalloc.free = p->qnext;
618 if(up && up->procctl == Proc_tracesyscall)
619 p->procctl = Proc_tracesyscall;
627 p->errstr = p->errbuf0;
628 p->syserrstr = p->errbuf1;
629 p->errbuf0[0] = '\0';
630 p->errbuf1[0] = '\0';
634 kstrdup(&p->user, "*nouser");
635 kstrdup(&p->text, "*notext");
636 kstrdup(&p->args, "");
639 memset(p->seg, 0, sizeof p->seg);
640 p->pid = incref(&pidalloc);
642 p->noteid = incref(¬eidalloc);
643 if(p->pid==0 || p->noteid==0)
646 p->kstack = smalloc(KSTACK);
651 procpriority(p, PriNormal, 0);
653 p->lastupdate = MACHP(0)->ticks*Scaling;
660 * wire this proc to a machine
663 procwired(Proc *p, int bm)
667 char nwired[MAXMACH];
671 /* pick a machine to wire to */
672 memset(nwired, 0, sizeof(nwired));
675 for(i=0; i<conf.nproc; i++, pp++){
678 nwired[wm->machno]++;
681 for(i=0; i<conf.nmach; i++)
682 if(nwired[i] < nwired[bm])
685 /* use the virtual machine requested */
686 bm = bm % conf.nmach;
689 p->wired = MACHP(bm);
694 procpriority(Proc *p, int pri, int fixed)
710 procinit0(void) /* bad planning - clashes with devproc.c */
715 procalloc.free = xalloc(conf.nproc*sizeof(Proc));
716 if(procalloc.free == nil){
718 panic("cannot allocate %lud procs (%ludMB)\n", conf.nproc, conf.nproc*sizeof(Proc)/(1024*1024));
720 procalloc.arena = procalloc.free;
723 for(i=0; i<conf.nproc-1; i++,p++)
729 * sleep if a condition is not true. Another process will
730 * awaken us after it sets the condition. When we awaken
731 * the condition may no longer be true.
733 * we lock both the process and the rendezvous to keep r->p
734 * and p->r synchronized.
737 sleep(Rendez *r, int (*f)(void*), void *arg)
740 void (*pt)(Proc*, int, vlong);
745 print("process %lud sleeps with %lud locks held, last lock %#p locked at pc %#lux, sleep called from %#p\n",
746 up->pid, up->nlocks.ref, up->lastlock, up->lastlock->pc, getcallerpc(&r));
750 print("double sleep called from %#p, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid);
755 * Wakeup only knows there may be something to do by testing
756 * r->p in order to get something to lock on.
757 * Flush that information out to memory in case the sleep is
762 if((*f)(arg) || up->notepending){
764 * if condition happened or a note is pending
772 * now we are committed to
773 * change state and call scheduler
785 if(setlabel(&up->sched)) {
787 * here when the process is awakened
793 * here to go to sleep (i.e. stop Running)
797 gotolabel(&m->sched);
801 if(up->notepending) {
804 if(up->procctl == Proc_exitme && up->closingfgrp)
815 return up->trend == nil || up->tfn(arg);
819 twakeup(Ureg*, Timer *t)
832 tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms)
835 print("tsleep: timer active: mode %d, tf %#p\n", up->tmode, up->tf);
840 up->tmode = Trelative;
858 * Expects that only one process can call wakeup for any given Rendez.
859 * We hold both locks to ensure that r->p and p->r remain consistent.
860 * Richard Miller has a better solution that doesn't require both to
861 * be held simultaneously, but I'm a paranoid - presotto.
876 if(p->state != Wakeme || p->r != r){
877 iprint("%p %p %d\n", p->r, r, p->state);
878 panic("wakeup: state");
893 * if waking a sleeping process, this routine must hold both
894 * p->rlock and r->lock. However, it can't know them in
895 * the same order as wakeup causing a possible lock ordering
896 * deadlock. We break the deadlock by giving up the p->rlock
897 * lock if we can't get the r->lock and retrying.
900 postnote(Proc *p, int dolock, char *n, int flag)
909 if(flag != NUser && (p->notify == 0 || p->notified))
913 if(p->nnote < NNOTE) {
914 strcpy(p->note[p->nnote].msg, n);
915 p->note[p->nnote++].flag = flag;
922 /* this loop is to avoid lock ordering problems. */
928 /* waiting for a wakeup? */
932 /* try for the second lock */
934 if(p->state != Wakeme || r->p != p)
935 panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state);
943 /* give other process time to get out of critical section and try again */
951 if(p->state != Rendezvous)
954 /* Try and pull out of a rendezvous */
956 if(p->state == Rendezvous) {
958 l = &REND(p->rgrp, p->rendtag);
959 for(d = *l; d; d = d->rendhash) {
973 * weird thing: keep at most NBROKEN around
987 if(broken.n == NBROKEN) {
989 memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1));
992 broken.p[broken.n++] = p;
1007 for(b=0; b < broken.n; b++)
1008 if(broken.p[b] == p) {
1010 memmove(&broken.p[b], &broken.p[b+1],
1011 sizeof(Proc*)*(NBROKEN-(b+1)));
1025 for(i=0; i<n; i++) {
1035 pexit(char *exitstr, int freemem)
1040 Waitq *wq, *f, *next;
1046 void (*pt)(Proc*, int, vlong);
1048 if(up->syscalltrace)
1049 free(up->syscalltrace);
1057 /* nil out all the resources under lock (free later) */
1069 qunlock(&up->debug);
1083 * if not a kernel process and have a parent,
1084 * do some housekeeping.
1090 exitstr = "unknown";
1091 panic("boot process died: %s", exitstr);
1097 wq = smalloc(sizeof(Waitq));
1100 wq->w.pid = up->pid;
1101 utime = up->time[TUser] + up->time[TCUser];
1102 stime = up->time[TSys] + up->time[TCSys];
1103 wq->w.time[TUser] = tk2ms(utime);
1104 wq->w.time[TSys] = tk2ms(stime);
1105 wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]);
1106 if(exitstr && exitstr[0])
1107 snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr);
1109 wq->w.msg[0] = '\0';
1113 * Check that parent is still alive.
1115 if(p->pid == up->parentpid && p->state != Broken) {
1117 p->time[TCUser] += utime;
1118 p->time[TCSys] += stime;
1120 * If there would be more than 128 wait records
1121 * outstanding for my parent, then don't leave a wait
1122 * record behind. This helps prevent badly written
1123 * daemon processes from accumulating lots of wait
1126 if(p->nwait < 128) {
1127 wq->next = p->waitq;
1142 qlock(&up->seglock);
1143 es = &up->seg[NSEG];
1144 for(s = up->seg; s < es; s++) {
1150 qunlock(&up->seglock);
1152 lock(&up->exl); /* Prevent my children from leaving waits */
1158 for(f = up->waitq; f; f = next) {
1163 /* release debuggers */
1166 wakeup(&up->pdbg->sleep);
1169 qunlock(&up->debug);
1171 /* Sched must not loop for these locks */
1176 up->state = Moribund;
1187 return p->waitq != 0;
1196 if(!canqlock(&up->qwaitr))
1200 qunlock(&up->qwaitr);
1205 if(up->nchild == 0 && up->waitq == 0) {
1211 sleep(&up->waitr, haswaitq, up);
1215 up->waitq = wq->next;
1219 qunlock(&up->qwaitr);
1223 memmove(w, &wq->w, sizeof(Waitmsg));
1232 return &procalloc.arena[i];
1246 bss = p->seg[BSEG]->top;
1250 s = statename[p->state];
1251 print("%3lud:%10s pc %8lux dbgpc %8lux %8s (%s) ut %ld st %ld bss %lux qpc %lux nl %lud nd %lud lpc %lux pri %lud\n",
1252 p->pid, p->text, p->pc, dbgpc(p), s, statename[p->state],
1253 p->time[0], p->time[1], bss, p->qpc, p->nlocks.ref, p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority);
1263 print("up %lud\n", up->pid);
1265 print("no current process\n");
1266 for(i=0; i<conf.nproc; i++) {
1267 p = &procalloc.arena[i];
1268 if(p->state == Dead)
1276 * wait till all processes have flushed their mmu
1277 * state about segment s
1280 procflushseg(Segment *s)
1282 int i, ns, nm, nwait;
1286 * tell all processes with this
1287 * segment to flush their mmu's
1290 for(i=0; i<conf.nproc; i++) {
1291 p = &procalloc.arena[i];
1292 if(p->state == Dead)
1294 for(ns = 0; ns < NSEG; ns++)
1295 if(p->seg[ns] == s){
1297 for(nm = 0; nm < conf.nmach; nm++){
1298 if(MACHP(nm)->proc == p){
1299 MACHP(nm)->flushmmu = 1;
1311 * wait for all processors to take a clock interrupt
1312 * and flush their mmu's
1314 for(nm = 0; nm < conf.nmach; nm++)
1316 while(MACHP(nm)->flushmmu)
1326 for(rq = &runq[Nrq-1]; rq >= runq; rq--){
1329 print("rq%ld:", rq-runq);
1330 for(p = rq->head; p; p = p->rnext)
1331 print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
1335 print("nrdy %d\n", nrdy);
1339 kproc(char *name, void (*func)(void *), void *arg)
1350 p->fpsave = up->fpsave;
1351 p->scallnr = up->scallnr;
1354 p->slash = up->slash;
1359 memmove(p->note, up->note, sizeof(p->note));
1360 p->nnote = up->nnote;
1362 p->lastnote = up->lastnote;
1363 p->notify = up->notify;
1367 procpriority(p, PriKproc, 0);
1369 kprocchild(p, func, arg);
1371 kstrdup(&p->user, eve);
1372 kstrdup(&p->text, name);
1378 memset(p->time, 0, sizeof(p->time));
1379 p->time[TReal] = MACHP(0)->ticks;
1384 * called splhi() by notify(). See comment in notify for the
1393 switch(p->procctl) {
1396 pexit("Killed: Insufficient physical memory", 1);
1399 spllo(); /* pexit has locks in it */
1410 p->psstate = "Stopped";
1411 /* free a waiting debugger */
1415 wakeup(&p->pdbg->sleep);
1435 assert(up->nerrlab < NERR);
1436 kstrcpy(up->errstr, err, ERRMAX);
1437 setlabel(&up->errlab[NERR-1]);
1444 gotolabel(&up->errlab[--up->nerrlab]);
1448 exhausted(char *resource)
1452 snprint(buf, sizeof buf, "no free %s", resource);
1453 iprint("%s\n", buf);
1467 ep = procalloc.arena+conf.nproc;
1468 for(p = procalloc.arena; p < ep; p++) {
1469 if(p->state == Dead || p->kp)
1472 for(i=1; i<NSEG; i++) {
1475 l += s->top - s->base;
1477 if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) {
1483 print("%lud: %s killed: %s\n", kp->pid, kp->text, why);
1484 for(p = procalloc.arena; p < ep; p++) {
1485 if(p->state == Dead || p->kp)
1487 if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG])
1488 p->procctl = Proc_exitbig;
1490 kp->procctl = Proc_exitbig;
1491 for(i = 0; i < NSEG; i++) {
1493 if(s != 0 && canqlock(&s->lk)) {
1494 mfreeseg(s, s->base, (s->top - s->base)/BY2PG);
1501 * change ownership to 'new' of all processes owned by 'old'. Used when
1505 renameuser(char *old, char *new)
1509 ep = procalloc.arena+conf.nproc;
1510 for(p = procalloc.arena; p < ep; p++)
1511 if(p->user!=nil && strcmp(old, p->user)==0)
1512 kstrdup(&p->user, new);
1516 * time accounting called by clock() splhi'd
1528 p->time[p->insyscall]++;
1531 /* calculate decaying duty cycles */
1533 per = n - m->perf.last;
1535 per = (m->perf.period*(HZ-1) + per)/HZ;
1537 m->perf.period = per;
1539 m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ;
1542 m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ;
1545 /* only one processor gets to compute system load averages */
1550 * calculate decaying load average.
1551 * if we decay by (n-1)/n then it takes
1552 * n clock ticks to go from load L to .36 L once
1553 * things quiet down. it takes about 5 n clock
1554 * ticks to go to zero. so using HZ means this is
1555 * approximately the load over the last second,
1556 * with a tail lasting about 5 seconds.
1561 m->load = (m->load*(HZ-1)+n)/HZ;
1569 h = p->pid % nelem(procalloc.ht);
1571 p->pidhash = procalloc.ht[h];
1572 procalloc.ht[h] = p;
1582 h = p->pid % nelem(procalloc.ht);
1584 for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
1593 procindex(ulong pid)
1600 h = pid % nelem(procalloc.ht);
1602 for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
1604 s = p - procalloc.arena;